From: Arne Fitzenreiter Date: Fri, 25 Jul 2008 07:18:15 +0000 (+0200) Subject: Changed openswan to 2.4.12 X-Git-Tag: v2.3-beta2~6^2~1 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=45ff5fa6dcf56f5545eaa1c60625fae2f3cd2c6b;p=people%2Fstevee%2Fipfire-2.x.git Changed openswan to 2.4.12 add dnsmasq to updater fix removal of obsolete packages some cleanings --- diff --git a/config/rootfiles/updater/filelists/dnsmasq b/config/rootfiles/updater/filelists/dnsmasq new file mode 100644 index 0000000000..1e900122dc --- /dev/null +++ b/config/rootfiles/updater/filelists/dnsmasq @@ -0,0 +1,2 @@ +usr/sbin/dnsmasq +#usr/share/man/man8/dnsmasq.8 diff --git a/config/rootfiles/updater/update.sh b/config/rootfiles/updater/update.sh index d720b01781..4096c1d2fa 100755 --- a/config/rootfiles/updater/update.sh +++ b/config/rootfiles/updater/update.sh @@ -40,9 +40,9 @@ echo # # check if we the backup file already exist if [ -e /var/ipfire/backup/update_$OLDVERSION-$NEWVERSION.tar.bz2 ]; then - echo Error! The backupfile of this update already exist!!! - echo Have you already installed this update? - exit 3 + echo Moving backup to backup-old ... + mv -f /var/ipfire/backup/update_$OLDVERSION-$NEWVERSION.tar.bz2 \ + /var/ipfire/backup/update_$OLDVERSION-$NEWVERSION-old.tar.bz2 fi echo First we made a backup of all files that was inside of the echo update archive. This may take a while ... 
@@ -116,14 +116,14 @@ perl -e "require '/var/ipfire/lang.pl'; &Lang::BuildCacheLang" # # Remove obsolete packages # -echo '#!/bin/sh' > /tmp/remove_obsolete_paks +echo '#!/bin/bash' > /tmp/remove_obsolete_paks echo 'while [ "$(ps -A | grep " update.sh")" != "" ]; do' >> /tmp/remove_obsolete_paks echo ' sleep 2' >> /tmp/remove_obsolete_paks echo 'done' >> /tmp/remove_obsolete_paks echo 'while [ "$(ps -A | grep " pakfire")" != "" ]; do' >> /tmp/remove_obsolete_paks echo ' sleep 2' >> /tmp/remove_obsolete_paks echo 'done' >> /tmp/remove_obsolete_paks -echo 'pakfire remove zaptel -y' >> /tmp/remove_obsolete_paks +echo '/opt/pakfire/pakfire remove zaptel -y' >> /tmp/remove_obsolete_paks echo 'echo' >> /tmp/remove_obsolete_paks echo 'echo Update to IPFire $NEWVERSION finished. Please reboot... ' >> /tmp/remove_obsolete_paks echo 'echo' >> /tmp/remove_obsolete_paks diff --git a/doc/packages-list.txt b/doc/packages-list.txt index 9f9d17c8d4..89dfac415b 100644 --- a/doc/packages-list.txt +++ b/doc/packages-list.txt @@ -216,8 +216,8 @@ * openmailadmin-1.0.0 * openssh-4.7p1 * openssl-0.9.8g -* openswan-2.4.13 -* openswan-2.4.13-kmod +* openswan-2.4.12 +* openswan-2.4.12-kmod * openvpn-2.0.9 * pam_mysql-0.7RC1 * patch-2.5.4 diff --git a/lfs/atl1 b/lfs/atl1 deleted file mode 100644 index 399b7f6b9c..0000000000 --- a/lfs/atl1 +++ /dev/null @@ -1,90 +0,0 @@ -############################################################################### -# # -# IPFire.org - A linux based firewall # -# Copyright (C) 2007 Michael Tremer & Christian Schmidt # -# # -# This program is free software: you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation, either version 3 of the License, or # -# (at your option) any later version. 
# -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program. If not, see . # -# # -############################################################################### - -############################################################################### -# Definitions -############################################################################### - -include Config - -VER = 1.2.40.2 - -THISAPP = atl1-$(VER) -DL_FILE = $(THISAPP).tar.bz2 -DL_FROM = $(URL_IPFIRE) -DIR_APP = $(DIR_SRC)/$(THISAPP) -ifeq "$(SMP)" "1" - TARGET = $(DIR_INFO)/$(THISAPP)-smp -else - TARGET = $(DIR_INFO)/$(THISAPP) -endif - - -############################################################################### -# Top-level Rules -############################################################################### - -objects = $(DL_FILE) - -$(DL_FILE) = $(DL_FROM)/$(DL_FILE) - -$(DL_FILE)_MD5 = b9f30f9d3c9ab2e98309f8d229713b27 - -install : $(TARGET) - -check : $(patsubst %,$(DIR_CHK)/%,$(objects)) - -download :$(patsubst %,$(DIR_DL)/%,$(objects)) - -md5 : $(subst %,%_MD5,$(objects)) - -dist: - $(PAK) - -############################################################################### -# Downloading, checking, md5sum -############################################################################### - -$(patsubst %,$(DIR_CHK)/%,$(objects)) : - @$(CHECK) - -$(patsubst %,$(DIR_DL)/%,$(objects)) : - @$(LOAD) - -$(subst %,%_MD5,$(objects)) : - @$(MD5) - -############################################################################### -# Installation Details -############################################################################### - -$(TARGET) : $(patsubst %,$(DIR_DL)/%,$(objects)) - @$(PREBUILD) - @rm -rf $(DIR_APP) 
&& cd $(DIR_SRC) && tar jxf $(DIR_DL)/$(DL_FILE) - -ifeq "$(SMP)" "1" - cd $(DIR_APP)/src && make -C /lib/modules/$(KVER)-ipfire-smp/build/ SUBDIRS=$(DIR_APP)/src modules - cd $(DIR_APP)/src && install -m 644 atl1.ko /lib/modules/$(KVER)-ipfire-smp/kernel/drivers/net -else - cd $(DIR_APP)/src && make -C /lib/modules/$(KVER)-ipfire/build/ SUBDIRS=$(DIR_APP)/src modules - cd $(DIR_APP)/src && install -m 644 atl1.ko /lib/modules/$(KVER)-ipfire/kernel/drivers/net -endif - @rm -rf $(DIR_APP) - @$(POSTBUILD) diff --git a/lfs/linux b/lfs/linux index 4714db5f8a..ca756f410b 100644 --- a/lfs/linux +++ b/lfs/linux @@ -97,7 +97,7 @@ $(TARGET) : $(patsubst %,$(DIR_DL)/%,$(objects)) # Security fix for CIFS & Netfilter SNMP cd $(DIR_APP) && patch -Np1 < $(DIR_SRC)/src/patches/linux-2.6.20.21-additional_check_on_BER_decoding.patch - # Openswan nat-t + # Openswan cd $(DIR_APP) && patch -Np1 < $(DIR_SRC)/src/patches/openswan-2.4.x.kernel-2.6.23-natt.patch # Reiser4 diff --git a/lfs/openswan b/lfs/openswan index 76eb5c90d3..d6e71b2148 100644 --- a/lfs/openswan +++ b/lfs/openswan @@ -24,7 +24,7 @@ include Config -VER = 2.4.13 +VER = 2.4.12 THISAPP = openswan-$(VER) DL_FILE = $(THISAPP).tar.gz @@ -48,7 +48,7 @@ objects = $(DL_FILE) $(DL_FILE) = $(DL_FROM)/$(DL_FILE) -$(DL_FILE)_MD5 = 0c2505cf2639a7de051e815f41e8e1f4 +$(DL_FILE)_MD5 = 0bca0cc205d2d83eff64a7cea825ce7a install : $(TARGET) diff --git a/src/patches/openswan-2.4.12.kernel-2.6.20.21-natt.patch b/src/patches/openswan-2.4.12.kernel-2.6.20.21-natt.patch deleted file mode 100644 index 471eb3296d..0000000000 --- a/src/patches/openswan-2.4.12.kernel-2.6.20.21-natt.patch +++ /dev/null @@ -1,122 +0,0 @@ -packaging/utils/nattpatch 2.6 ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ nat-t/include/net/xfrmudp.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,10 @@ -+/* -+ * pointer to function for type that xfrm4_input wants, to permit -+ * decoupling of XFRM from udp.c -+ */ -+#define HAVE_XFRM4_UDP_REGISTER -+ -+typedef int 
(*xfrm4_rcv_encap_t)(struct sk_buff *skb, __u16 encap_type); -+extern int udp4_register_esp_rcvencap(xfrm4_rcv_encap_t func -+ , xfrm4_rcv_encap_t *oldfunc); -+extern int udp4_unregister_esp_rcvencap(xfrm4_rcv_encap_t func); ---- /distros/kernel/linux-2.6.11.2/net/ipv4/Kconfig 2005-03-09 03:12:33.000000000 -0500 -+++ swan26/net/ipv4/Kconfig 2005-04-04 18:46:13.000000000 -0400 -@@ -351,2 +351,8 @@ - -+config IPSEC_NAT_TRAVERSAL -+ bool "IPSEC NAT-Traversal (KLIPS compatible)" -+ depends on INET -+ ---help--- -+ Includes support for RFC3947/RFC3948 NAT-Traversal of ESP over UDP. -+ - config IP_TCPDIAG ---- plain26/net/ipv4/udp.c.orig 2006-01-02 22:21:10.000000000 -0500 -+++ plain26/net/ipv4/udp.c 2006-01-12 20:18:57.000000000 -0500 -@@ -108,6 +108,7 @@ - */ - - DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; -+#include - - struct hlist_head udp_hash[UDP_HTABLE_SIZE]; - DEFINE_RWLOCK(udp_hash_lock); -@@ -914,6 +915,44 @@ - return 0; - } - -+#if defined(CONFIG_XFRM) || defined(CONFIG_IPSEC_NAT_TRAVERSAL) -+ -+/* if XFRM isn't a module, then register it directly. 
*/ -+#if !defined(CONFIG_XFRM_MODULE) -+static xfrm4_rcv_encap_t xfrm4_rcv_encap_func = xfrm4_rcv_encap; -+#else -+static xfrm4_rcv_encap_t xfrm4_rcv_encap_func = NULL; -+#endif -+ -+static xfrm4_rcv_encap_t xfrm4_rcv_encap_func; -+ -+int udp4_register_esp_rcvencap(xfrm4_rcv_encap_t func -+ , xfrm4_rcv_encap_t *oldfunc) -+{ -+ if(oldfunc != NULL) { -+ *oldfunc = xfrm4_rcv_encap_func; -+ } -+ -+#if 0 -+ if(xfrm4_rcv_encap_func != NULL) -+ return -1; -+#endif -+ -+ xfrm4_rcv_encap_func = func; -+ return 0; -+} -+ -+int udp4_unregister_esp_rcvencap(xfrm4_rcv_encap_t func) -+{ -+ if(xfrm4_rcv_encap_func != func) -+ return -1; -+ -+ xfrm4_rcv_encap_func = NULL; -+ return 0; -+} -+#endif /* CONFIG_XFRM || defined(CONFIG_IPSEC_NAT_TRAVERSAL)*/ -+ -+ - /* return: - * 1 if the the UDP system should process it - * 0 if we should drop this packet -@@ -921,9 +960,9 @@ - */ - static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) - { --#ifndef CONFIG_XFRM -+#if !defined(CONFIG_XFRM) && !defined(CONFIG_IPSEC_NAT_TRAVERSAL) - return 1; --#else -+#else /* either CONFIG_XFRM or CONFIG_IPSEC_NAT_TRAVERSAL */ - struct udp_sock *up = udp_sk(sk); - struct udphdr *uh; - struct iphdr *iph; -@@ -1049,11 +1088,15 @@ - kfree_skb(skb); - return 0; - } -- if (ret < 0) { -- /* process the ESP packet */ -- ret = xfrm4_rcv_encap(skb, up->encap_type); -- UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag); -- return -ret; -+ if (ret < 0) { -+ if(xfrm4_rcv_encap_func != NULL) { -+ ret = (*xfrm4_rcv_encap_func)(skb, up->encap_type); -+ UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag); -+ } else { -+ UDP_INC_STATS_BH(UDP_MIB_INERRORS, up->pcflag); -+ ret = 1; -+ } -+ return ret; - } - /* FALLTHROUGH -- it's a UDP Packet */ - } -@@ -1732,3 +1775,8 @@ - EXPORT_SYMBOL(udp_proc_register); - EXPORT_SYMBOL(udp_proc_unregister); - #endif -+ -+#if defined(CONFIG_IPSEC_NAT_TRAVERSAL) -+EXPORT_SYMBOL(udp4_register_esp_rcvencap); -+EXPORT_SYMBOL(udp4_unregister_esp_rcvencap); -+#endif diff --git 
a/src/patches/openswan-2.6.14-kernel-2.6.24.7-natt.patch b/src/patches/openswan-2.6.14-kernel-2.6.24.7-natt.patch deleted file mode 100644 index c84e996f5b..0000000000 --- a/src/patches/openswan-2.6.14-kernel-2.6.24.7-natt.patch +++ /dev/null @@ -1,129 +0,0 @@ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ nat-t/include/net/xfrmudp.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,10 @@ -+/* -+ * pointer to function for type that xfrm4_input wants, to permit -+ * decoupling of XFRM from udp.c -+ */ -+#define HAVE_XFRM4_UDP_REGISTER -+ -+typedef int (*xfrm4_rcv_encap_t)(struct sk_buff *skb, __u16 encap_type); -+extern int udp4_register_esp_rcvencap(xfrm4_rcv_encap_t func -+ , xfrm4_rcv_encap_t *oldfunc); -+extern int udp4_unregister_esp_rcvencap(xfrm4_rcv_encap_t func); ---- /distros/kernel/linux-2.6.11.2/net/ipv4/Kconfig 2005-03-09 03:12:33.000000000 -0500 -+++ swan26/net/ipv4/Kconfig 2005-04-04 18:46:13.000000000 -0400 -@@ -351,2 +351,8 @@ - -+config IPSEC_NAT_TRAVERSAL -+ bool "IPSEC NAT-Traversal (KLIPS compatible)" -+ depends on INET -+ ---help--- -+ Includes support for RFC3947/RFC3948 NAT-Traversal of ESP over UDP. 
-+ - config IP_TCPDIAG ---- plain26/net/ipv4/udp.c.orig 2006-12-28 20:53:17.000000000 -0500 -+++ plain26/net/ipv4/udp.c 2007-05-11 10:22:50.000000000 -0400 -@@ -108,6 +108,7 @@ - #include - #include - #include -+#include - - /* - * Snmp MIB for the UDP layer -@@ -881,6 +882,31 @@ - sk_common_release(sk); - } - -+#if defined(CONFIG_XFRM) || defined(CONFIG_IPSEC_NAT_TRAVERSAL) -+ -+static xfrm4_rcv_encap_t xfrm4_rcv_encap_func = NULL; -+int udp4_register_esp_rcvencap(xfrm4_rcv_encap_t func -+ , xfrm4_rcv_encap_t *oldfunc) -+{ -+ if(oldfunc != NULL) { -+ *oldfunc = xfrm4_rcv_encap_func; -+ } -+ -+ xfrm4_rcv_encap_func = func; -+ return 0; -+} -+ -+int udp4_unregister_esp_rcvencap(xfrm4_rcv_encap_t func) -+{ -+ if(xfrm4_rcv_encap_func != func) -+ return -1; -+ -+ xfrm4_rcv_encap_func = NULL; -+ return 0; -+} -+#endif /* CONFIG_XFRM_MODULE || CONFIG_IPSEC_NAT_TRAVERSAL */ -+ -+ - /* return: - * 1 if the the UDP system should process it - * 0 if we should drop this packet -@@ -888,9 +914,9 @@ - */ - static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) - { --#ifndef CONFIG_XFRM -+#if !defined(CONFIG_XFRM) && !defined(CONFIG_IPSEC_NAT_TRAVERSAL) - return 1; --#else -+#else /* either CONFIG_XFRM or CONFIG_IPSEC_NAT_TRAVERSAL */ - struct udp_sock *up = udp_sk(sk); - struct udphdr *uh; - struct iphdr *iph; -@@ -1018,10 +1044,27 @@ - return 0; - } - if (ret < 0) { -- /* process the ESP packet */ -- ret = xfrm4_rcv_encap(skb, up->encap_type); -- UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS); -- return -ret; -+ if(xfrm4_rcv_encap_func != NULL) -+ ret = (*xfrm4_rcv_encap_func)(skb, up->encap_type); -+ -+ switch(ret) { -+ case 1: -+ /* FALLTHROUGH to send-up */; -+ break; -+ -+ case 0: -+ /* PROCESSED, free it */ -+ UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS); -+ return 0; -+ -+ case -1: -+ /* PACKET wasn't for _func, or no func, pass it -+ * to stock function -+ */ -+ ret = xfrm4_rcv_encap(skb, up->encap_type); -+ UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS); -+ return -ret; -+ } - } - /* 
FALLTHROUGH -- it's a UDP Packet */ - } -@@ -1110,7 +1153,6 @@ - /* - * All we need to do is get the socket, and then do a checksum. - */ -- - int udp_rcv(struct sk_buff *skb) - { - struct sock *sk; -@@ -1599,3 +1641,9 @@ - EXPORT_SYMBOL(udp_proc_register); - EXPORT_SYMBOL(udp_proc_unregister); - #endif -+ -+#if defined(CONFIG_IPSEC_NAT_TRAVERSAL) -+EXPORT_SYMBOL(udp4_register_esp_rcvencap); -+EXPORT_SYMBOL(udp4_unregister_esp_rcvencap); -+#endif -+ -make[1]: Leaving directory `/usr/src/openswan-2.6.14' diff --git a/src/patches/openswan-2.6.14-startklips-1.patch b/src/patches/openswan-2.6.14-startklips-1.patch deleted file mode 100644 index 910a9cd19b..0000000000 --- a/src/patches/openswan-2.6.14-startklips-1.patch +++ /dev/null @@ -1,55 +0,0 @@ ---- _startklips.orig 2008-07-11 01:55:19.000000000 +0200 -+++ _startklips 2008-07-12 09:11:56.000000000 +0200 -@@ -149,23 +149,35 @@ - - # figure out ifconfig for interface - addr= -- eval `ifconfig $phys | -- awk '$1 == "inet" && $2 ~ /^addr:/ && $NF ~ /^Mask:/ { -- gsub(/:/, " ", $0) -- print "addr=" $3 -- other = $5 -- if ($4 == "Bcast") -- print "type=broadcast" -- else if ($4 == "P-t-P") -- print "type=pointopoint" -- else if (NF == 5) { -- print "type=" -- other = "" -- } else -- print "type=unknown" -- print "otheraddr=" other -- print "mask=" $NF -- }'` -+ eval `ip addr show $phys | awk '$3 ~ /BROADCAST|POINTOPOINT/ { -+ if ($3 ~ /BROADCAST/) -+ print "type=broadcast"; -+ else if ($3 ~ /POINTOPOINT/) -+ print "type=pointopoint"; -+ else { -+ print "type="; -+ } -+ }'` -+ -+ if [ "$type" == "broadcast" ]; then -+ eval `ip addr show $phys | awk '$1 == "inet" { gsub(/\//, " "); -+ print "addr=" $2; -+ print "mask=" $3; -+ print "otheraddr=" $5; -+ }'` -+ elif [ "$type" == "pointopoint" ]; then -+ eval `ip addr show $phys | awk '$1 == "inet" { gsub(/\//, " "); -+ print "addr=" $2; -+ print "mask=" $5; -+ print "otheraddr=" $4; -+ }'` -+ else -+ type="unknown" -+ otheraddr= -+ fi -+ -+ eval `whatmask /$mask | awk -F': ' 
'$1 ~ /^Netmask =/ { print "mask=" $2 }'` -+ - if test " $addr" = " " - then - echo "unable to determine address of \`$phys'" diff --git a/src/patches/openswan-2.6.14-updown-1.patch b/src/patches/openswan-2.6.14-updown-1.patch deleted file mode 100644 index ac38b7bb33..0000000000 --- a/src/patches/openswan-2.6.14-updown-1.patch +++ /dev/null @@ -1,30 +0,0 @@ ---- _updown.klips.orig 2008-07-11 01:55:19.000000000 +0200 -+++ _updown.klips 2008-07-12 09:20:26.000000000 +0200 -@@ -407,8 +407,8 @@ - # opportunistic encryption work around - # need to provide route that eclipses default, without - # replacing it. -- it="ip route $1 0.0.0.0/1 $parms2 $parms3 && -- ip route $1 128.0.0.0/1 $parms2 $parms3" -+ #it="ip route $1 0.0.0.0/1 $parms2 $parms3 && -+ # ip route $1 128.0.0.0/1 $parms2 $parms3" - ;; - *) it="ip route $1 $parms $parms2 $parms3" - ;; -@@ -432,13 +432,13 @@ - prepare-host:*|prepare-client:*) - # delete possibly-existing route (preliminary to adding a route) - case "$PLUTO_PEER_CLIENT" in -- "0.0.0.0/0") -+ "0.0.0.0/0") - # need to provide route that eclipses default, without - # replacing it. 
- parms1="0.0.0.0/1" - parms2="128.0.0.0/1" -- it="ip route delete $parms1 $IPROUTEARGS 2>&1 ; ip route delete $parms2 $IPROUTEARGS 2>&1" -- oops="`ip route delete $parms1 $IPROUTEARGS 2>&1 ; ip route delete $parms2 $IPROUTEARGS 2>&1`" -+ # it="ip route delete $parms1 $IPROUTEARGS 2>&1 ; ip route delete $parms2 $IPROUTEARGS 2>&1" -+ # oops="`ip route delete $parms1 $IPROUTEARGS 2>&1 ; ip route delete $parms2 $IPROUTEARGS 2>&1`" - ;; - *) - parms="$PLUTO_PEER_CLIENT $IPROUTEARGS" diff --git a/src/patches/openswan-2.6.16dr2-2.6.24-kernel.patch b/src/patches/openswan-2.6.16dr2-2.6.24-kernel.patch deleted file mode 100644 index faf66e7917..0000000000 --- a/src/patches/openswan-2.6.16dr2-2.6.24-kernel.patch +++ /dev/null @@ -1,56013 +0,0 @@ -packaging/utils/kernelpatch 2.6 ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/README.openswan-2 Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,112 @@ -+* -+* RCSID $Id: README.openswan-2,v 1.1 2003/12/10 01:07:49 mcr Exp $ -+* -+ -+ **************************************** -+ * IPSEC for Linux, Release 2.xx series * -+ **************************************** -+ -+ -+ -+1. Files -+ -+The contents of linux/net/ipsec/ (see below) join the linux kernel source tree. -+as provided for higher up. -+ -+The programs/ directory contains the user-level utilities which you need -+to run IPSEC. See the top-level top/INSTALL to compile and install them. -+ -+The testing/ directory contains test scripts. -+ -+The doc/ directory contains -- what else -- documentation. -+ -+1.1. Kernel files -+ -+The following are found in net/ipsec/: -+ -+Makefile The Makefile -+Config.in The configuration script for make menuconfig -+defconfig Configuration defaults for first time. -+ -+radij.c General-purpose radix-tree operations -+ -+ipsec_ipcomp.c IPCOMP encapsulate/decapsulate code. -+ipsec_ah.c Authentication Header (AH) encapsulate/decapsulate code. -+ipsec_esp.c Encapsulated Security Payload (ESP) encap/decap code. 
-+ -+pfkey_v2.c PF_KEYv2 socket interface code. -+pfkey_v2_parser.c PF_KEYv2 message parsing and processing code. -+ -+ipsec_init.c Initialization code, /proc interface. -+ipsec_radij.c Interface with the radix tree code. -+ipsec_netlink.c Interface with the netlink code. -+ipsec_xform.c Routines and structures common to transforms. -+ipsec_tunnel.c The outgoing packet processing code. -+ipsec_rcv.c The incoming packet processing code. -+ipsec_md5c.c Somewhat modified RSADSI MD5 C code. -+ipsec_sha1.c Somewhat modified Steve Reid SHA-1 C code. -+ -+sysctl_net_ipsec.c /proc/sys/net/ipsec/* variable definitions. -+ -+version.c symbolic link to project version. -+ -+radij.h Headers for radij.c -+ -+ipcomp.h Headers used by IPCOMP code. -+ -+ipsec_radij.h Interface with the radix tree code. -+ipsec_netlink.h Headers used by the netlink interface. -+ipsec_encap.h Headers defining encapsulation structures. -+ipsec_xform.h Transform headers. -+ipsec_tunnel.h Headers used by tunneling code. -+ipsec_ipe4.h Headers for the IP-in-IP code. -+ipsec_ah.h Headers common to AH transforms. -+ipsec_md5h.h RSADSI MD5 headers. -+ipsec_sha1.h SHA-1 headers. -+ipsec_esp.h Headers common to ESP transfroms. -+ipsec_rcv.h Headers for incoming packet processing code. -+ -+1.2. User-level files. -+ -+The following are found in utils/: -+ -+eroute.c Create an "extended route" source code -+spi.c Set up Security Associations source code -+spigrp.c Link SPIs together source code. -+tncfg.c Configure the tunneling features of the virtual interface -+ source code -+klipsdebug.c Set/reset klips debugging features source code. -+version.c symbolic link to project version. 
-+ -+eroute.8 Create an "extended route" manual page -+spi.8 Set up Security Associations manual page -+spigrp.8 Link SPIs together manual page -+tncfg.8 Configure the tunneling features of the virtual interface -+ manual page -+klipsdebug.8 Set/reset klips debugging features manual page -+ -+eroute.5 /proc/net/ipsec_eroute format manual page -+spi.5 /proc/net/ipsec_spi format manual page -+spigrp.5 /proc/net/ipsec_spigrp format manual page -+tncfg.5 /proc/net/ipsec_tncfg format manual page -+klipsdebug.5 /proc/net/ipsec_klipsdebug format manual page -+version.5 /proc/net/ipsec_version format manual page -+pf_key.5 /proc/net/pf_key format manual page -+ -+Makefile Utilities makefile. -+ -+*.8 Manpages for the respective utils. -+ -+ -+1.3. Test files -+ -+The test scripts are locate in testing/ and and documentation is found -+at doc/src/umltesting.html. Automated testing via "make check" is available -+provided that the User-Mode-Linux patches are available. -+ -+* -+* $Log: README.openswan-2,v $ -+* Revision 1.1 2003/12/10 01:07:49 mcr -+* documentation for additions. -+* -+* ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/des/des_locl.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,511 @@ -+/* crypto/des/des_locl.org */ -+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) -+ * All rights reserved. -+ * -+ * This package is an SSL implementation written -+ * by Eric Young (eay@cryptsoft.com). -+ * The implementation was written so as to conform with Netscapes SSL. -+ * -+ * This library is free for commercial and non-commercial use as long as -+ * the following conditions are aheared to. The following conditions -+ * apply to all code found in this distribution, be it the RC4, RSA, -+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation -+ * included with this distribution is covered by the same copyright terms -+ * except that the holder is Tim Hudson (tjh@cryptsoft.com). 
-+ * -+ * Copyright remains Eric Young's, and as such any Copyright notices in -+ * the code are not to be removed. -+ * If this package is used in a product, Eric Young should be given attribution -+ * as the author of the parts of the library used. -+ * This can be in the form of a textual message at program startup or -+ * in documentation (online or textual) provided with the package. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. All advertising materials mentioning features or use of this software -+ * must display the following acknowledgement: -+ * "This product includes cryptographic software written by -+ * Eric Young (eay@cryptsoft.com)" -+ * The word 'cryptographic' can be left out if the rouines from the library -+ * being used are not cryptographic related :-). -+ * 4. If you include any Windows specific code (or a derivative thereof) from -+ * the apps directory (application code) you must include an acknowledgement: -+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * The licence and distribution terms for any publically available version or -+ * derivative of this code cannot be changed. i.e. this code cannot simply be -+ * copied and put under another distribution licence -+ * [including the GNU Public Licence.] -+ */ -+ -+/* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING -+ * -+ * Always modify des_locl.org since des_locl.h is automatically generated from -+ * it during SSLeay configuration. -+ * -+ * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING -+ */ -+ -+#ifndef HEADER_DES_LOCL_H -+#define HEADER_DES_LOCL_H -+ -+#if defined(WIN32) || defined(WIN16) -+#ifndef MSDOS -+#define MSDOS -+#endif -+#endif -+ -+#include "klips-crypto/des.h" -+#ifdef OCF_ASSIST -+#include "klips-crypto/ocf_assist.h" -+#endif -+ -+#ifndef DES_DEFAULT_OPTIONS -+/* the following is tweaked from a config script, that is why it is a -+ * protected undef/define */ -+#ifndef DES_PTR -+#define DES_PTR -+#endif -+ -+/* This helps C compiler generate the correct code for multiple functional -+ * units. It reduces register dependancies at the expense of 2 more -+ * registers */ -+#ifndef DES_RISC1 -+#define DES_RISC1 -+#endif -+ -+#ifndef DES_RISC2 -+#undef DES_RISC2 -+#endif -+ -+#if defined(DES_RISC1) && defined(DES_RISC2) -+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!! -+#endif -+ -+/* Unroll the inner loop, this sometimes helps, sometimes hinders. 
-+ * Very mucy CPU dependant */ -+#ifndef DES_UNROLL -+#define DES_UNROLL -+#endif -+ -+/* These default values were supplied by -+ * Peter Gutman -+ * They are only used if nothing else has been defined */ -+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL) -+/* Special defines which change the way the code is built depending on the -+ CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find -+ even newer MIPS CPU's, but at the moment one size fits all for -+ optimization options. Older Sparc's work better with only UNROLL, but -+ there's no way to tell at compile time what it is you're running on */ -+ -+#if defined( sun ) /* Newer Sparc's */ -+ #define DES_PTR -+ #define DES_RISC1 -+ #define DES_UNROLL -+#elif defined( __ultrix ) /* Older MIPS */ -+ #define DES_PTR -+ #define DES_RISC2 -+ #define DES_UNROLL -+#elif defined( __osf1__ ) /* Alpha */ -+ #define DES_PTR -+ #define DES_RISC2 -+#elif defined ( _AIX ) /* RS6000 */ -+ /* Unknown */ -+#elif defined( __hpux ) /* HP-PA */ -+ /* Unknown */ -+#elif defined( __aux ) /* 68K */ -+ /* Unknown */ -+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */ -+ #define DES_UNROLL -+#elif defined( __sgi ) /* Newer MIPS */ -+ #define DES_PTR -+ #define DES_RISC2 -+ #define DES_UNROLL -+#elif defined( i386 ) /* x86 boxes, should be gcc */ -+ #define DES_PTR -+ #define DES_RISC1 -+ #define DES_UNROLL -+#endif /* Systems-specific speed defines */ -+#endif -+ -+#endif /* DES_DEFAULT_OPTIONS */ -+ -+#ifdef MSDOS /* Visual C++ 2.1 (Windows NT/95) */ -+#include -+#include -+#include -+#include -+#ifndef RAND -+#define RAND -+#endif -+#undef NOPROTO -+#endif -+ -+#if defined(__STDC__) || defined(VMS) || defined(M_XENIX) || defined(MSDOS) -+#ifndef __KERNEL__ -+#include -+#else -+#include -+#endif -+#endif -+ -+#ifndef RAND -+#define RAND -+#endif -+ -+#ifdef linux -+#undef RAND -+#endif -+ -+#ifdef MSDOS -+#define getpid() 2 -+#define RAND -+#undef NOPROTO -+#endif -+ 
-+#if defined(NOCONST) -+#define const -+#endif -+ -+#ifdef __STDC__ -+#undef NOPROTO -+#endif -+ -+#define ITERATIONS 16 -+#define HALF_ITERATIONS 8 -+ -+/* used in des_read and des_write */ -+#define MAXWRITE (1024*16) -+#define BSIZE (MAXWRITE+4) -+ -+#define c2l(c,l) (l =((DES_LONG)(*((c)++))) , \ -+ l|=((DES_LONG)(*((c)++)))<< 8L, \ -+ l|=((DES_LONG)(*((c)++)))<<16L, \ -+ l|=((DES_LONG)(*((c)++)))<<24L) -+ -+/* NOTE - c is not incremented as per c2l */ -+#define c2ln(c,l1,l2,n) { \ -+ c+=n; \ -+ l1=l2=0; \ -+ switch (n) { \ -+ case 8: l2 =((DES_LONG)(*(--(c))))<<24L; \ -+ case 7: l2|=((DES_LONG)(*(--(c))))<<16L; \ -+ case 6: l2|=((DES_LONG)(*(--(c))))<< 8L; \ -+ case 5: l2|=((DES_LONG)(*(--(c)))); \ -+ case 4: l1 =((DES_LONG)(*(--(c))))<<24L; \ -+ case 3: l1|=((DES_LONG)(*(--(c))))<<16L; \ -+ case 2: l1|=((DES_LONG)(*(--(c))))<< 8L; \ -+ case 1: l1|=((DES_LONG)(*(--(c)))); \ -+ } \ -+ } -+ -+#define l2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ -+ *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ -+ *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ -+ *((c)++)=(unsigned char)(((l)>>24L)&0xff)) -+ -+/* replacements for htonl and ntohl since I have no idea what to do -+ * when faced with machines with 8 byte longs. 
*/ -+#define HDRSIZE 4 -+ -+#define n2l(c,l) (l =((DES_LONG)(*((c)++)))<<24L, \ -+ l|=((DES_LONG)(*((c)++)))<<16L, \ -+ l|=((DES_LONG)(*((c)++)))<< 8L, \ -+ l|=((DES_LONG)(*((c)++)))) -+ -+#define l2n(l,c) (*((c)++)=(unsigned char)(((l)>>24L)&0xff), \ -+ *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ -+ *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ -+ *((c)++)=(unsigned char)(((l) )&0xff)) -+ -+/* NOTE - c is not incremented as per l2c */ -+#define l2cn(l1,l2,c,n) { \ -+ c+=n; \ -+ switch (n) { \ -+ case 8: *(--(c))=(unsigned char)(((l2)>>24L)&0xff); \ -+ case 7: *(--(c))=(unsigned char)(((l2)>>16L)&0xff); \ -+ case 6: *(--(c))=(unsigned char)(((l2)>> 8L)&0xff); \ -+ case 5: *(--(c))=(unsigned char)(((l2) )&0xff); \ -+ case 4: *(--(c))=(unsigned char)(((l1)>>24L)&0xff); \ -+ case 3: *(--(c))=(unsigned char)(((l1)>>16L)&0xff); \ -+ case 2: *(--(c))=(unsigned char)(((l1)>> 8L)&0xff); \ -+ case 1: *(--(c))=(unsigned char)(((l1) )&0xff); \ -+ } \ -+ } -+ -+#define ROTATE(a,n) (((a)>>(n))+((a)<<(32-(n)))) -+ -+/* Don't worry about the LOAD_DATA() stuff, that is used by -+ * fcrypt() to add it's little bit to the front */ -+ -+#ifdef DES_FCRYPT -+ -+#define LOAD_DATA_tmp(R,S,u,t,E0,E1) \ -+ { DES_LONG tmp; LOAD_DATA(R,S,u,t,E0,E1,tmp); } -+ -+#define LOAD_DATA(R,S,u,t,E0,E1,tmp) \ -+ t=R^(R>>16L); \ -+ u=t&E0; t&=E1; \ -+ tmp=(u<<16); u^=R^s[S ]; u^=tmp; \ -+ tmp=(t<<16); t^=R^s[S+1]; t^=tmp -+#else -+#define LOAD_DATA_tmp(a,b,c,d,e,f) LOAD_DATA(a,b,c,d,e,f,g) -+#define LOAD_DATA(R,S,u,t,E0,E1,tmp) \ -+ u=R^s[S ]; \ -+ t=R^s[S+1] -+#endif -+ -+/* The changes to this macro may help or hinder, depending on the -+ * compiler and the achitecture. gcc2 always seems to do well :-). -+ * Inspired by Dana How -+ * DO NOT use the alternative version on machines with 8 byte longs. 
-+ * It does not seem to work on the Alpha, even when DES_LONG is 4 -+ * bytes, probably an issue of accessing non-word aligned objects :-( */ -+#ifdef DES_PTR -+ -+/* It recently occured to me that 0^0^0^0^0^0^0 == 0, so there -+ * is no reason to not xor all the sub items together. This potentially -+ * saves a register since things can be xored directly into L */ -+ -+#if defined(DES_RISC1) || defined(DES_RISC2) -+#ifdef DES_RISC1 -+#define D_ENCRYPT(LL,R,S) { \ -+ unsigned int u1,u2,u3; \ -+ LOAD_DATA(R,S,u,t,E0,E1,u1); \ -+ u2=(int)u>>8L; \ -+ u1=(int)u&0xfc; \ -+ u2&=0xfc; \ -+ t=ROTATE(t,4); \ -+ u>>=16L; \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP +u1); \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x200+u2); \ -+ u3=(int)(u>>8L); \ -+ u1=(int)u&0xfc; \ -+ u3&=0xfc; \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x400+u1); \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x600+u3); \ -+ u2=(int)t>>8L; \ -+ u1=(int)t&0xfc; \ -+ u2&=0xfc; \ -+ t>>=16L; \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x100+u1); \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x300+u2); \ -+ u3=(int)t>>8L; \ -+ u1=(int)t&0xfc; \ -+ u3&=0xfc; \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x500+u1); \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x700+u3); } -+#endif -+#ifdef DES_RISC2 -+#define D_ENCRYPT(LL,R,S) { \ -+ unsigned int u1,u2,s1,s2; \ -+ LOAD_DATA(R,S,u,t,E0,E1,u1); \ -+ u2=(int)u>>8L; \ -+ u1=(int)u&0xfc; \ -+ u2&=0xfc; \ -+ t=ROTATE(t,4); \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP +u1); \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x200+u2); \ -+ s1=(int)(u>>16L); \ -+ s2=(int)(u>>24L); \ -+ s1&=0xfc; \ -+ s2&=0xfc; \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x400+s1); \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x600+s2); \ -+ u2=(int)t>>8L; \ -+ u1=(int)t&0xfc; \ -+ u2&=0xfc; \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x100+u1); \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x300+u2); \ -+ s1=(int)(t>>16L); \ -+ s2=(int)(t>>24L); 
\ -+ s1&=0xfc; \ -+ s2&=0xfc; \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x500+s1); \ -+ LL^= *(DES_LONG *)((unsigned char *)des_SP+0x700+s2); } -+#endif -+#else -+#define D_ENCRYPT(LL,R,S) { \ -+ LOAD_DATA_tmp(R,S,u,t,E0,E1); \ -+ t=ROTATE(t,4); \ -+ LL^= \ -+ *(DES_LONG *)((unsigned char *)des_SP +((u )&0xfc))^ \ -+ *(DES_LONG *)((unsigned char *)des_SP+0x200+((u>> 8L)&0xfc))^ \ -+ *(DES_LONG *)((unsigned char *)des_SP+0x400+((u>>16L)&0xfc))^ \ -+ *(DES_LONG *)((unsigned char *)des_SP+0x600+((u>>24L)&0xfc))^ \ -+ *(DES_LONG *)((unsigned char *)des_SP+0x100+((t )&0xfc))^ \ -+ *(DES_LONG *)((unsigned char *)des_SP+0x300+((t>> 8L)&0xfc))^ \ -+ *(DES_LONG *)((unsigned char *)des_SP+0x500+((t>>16L)&0xfc))^ \ -+ *(DES_LONG *)((unsigned char *)des_SP+0x700+((t>>24L)&0xfc)); } -+#endif -+ -+#else /* original version */ -+ -+#if defined(DES_RISC1) || defined(DES_RISC2) -+#ifdef DES_RISC1 -+#define D_ENCRYPT(LL,R,S) {\ -+ unsigned int u1,u2,u3; \ -+ LOAD_DATA(R,S,u,t,E0,E1,u1); \ -+ u>>=2L; \ -+ t=ROTATE(t,6); \ -+ u2=(int)u>>8L; \ -+ u1=(int)u&0x3f; \ -+ u2&=0x3f; \ -+ u>>=16L; \ -+ LL^=des_SPtrans[0][u1]; \ -+ LL^=des_SPtrans[2][u2]; \ -+ u3=(int)u>>8L; \ -+ u1=(int)u&0x3f; \ -+ u3&=0x3f; \ -+ LL^=des_SPtrans[4][u1]; \ -+ LL^=des_SPtrans[6][u3]; \ -+ u2=(int)t>>8L; \ -+ u1=(int)t&0x3f; \ -+ u2&=0x3f; \ -+ t>>=16L; \ -+ LL^=des_SPtrans[1][u1]; \ -+ LL^=des_SPtrans[3][u2]; \ -+ u3=(int)t>>8L; \ -+ u1=(int)t&0x3f; \ -+ u3&=0x3f; \ -+ LL^=des_SPtrans[5][u1]; \ -+ LL^=des_SPtrans[7][u3]; } -+#endif -+#ifdef DES_RISC2 -+#define D_ENCRYPT(LL,R,S) {\ -+ unsigned int u1,u2,s1,s2; \ -+ LOAD_DATA(R,S,u,t,E0,E1,u1); \ -+ u>>=2L; \ -+ t=ROTATE(t,6); \ -+ u2=(int)u>>8L; \ -+ u1=(int)u&0x3f; \ -+ u2&=0x3f; \ -+ LL^=des_SPtrans[0][u1]; \ -+ LL^=des_SPtrans[2][u2]; \ -+ s1=(int)u>>16L; \ -+ s2=(int)u>>24L; \ -+ s1&=0x3f; \ -+ s2&=0x3f; \ -+ LL^=des_SPtrans[4][s1]; \ -+ LL^=des_SPtrans[6][s2]; \ -+ u2=(int)t>>8L; \ -+ u1=(int)t&0x3f; \ -+ u2&=0x3f; \ -+ LL^=des_SPtrans[1][u1]; \ -+ 
LL^=des_SPtrans[3][u2]; \ -+ s1=(int)t>>16; \ -+ s2=(int)t>>24L; \ -+ s1&=0x3f; \ -+ s2&=0x3f; \ -+ LL^=des_SPtrans[5][s1]; \ -+ LL^=des_SPtrans[7][s2]; } -+#endif -+ -+#else -+ -+#define D_ENCRYPT(LL,R,S) {\ -+ LOAD_DATA_tmp(R,S,u,t,E0,E1); \ -+ t=ROTATE(t,4); \ -+ LL^=\ -+ des_SPtrans[0][(u>> 2L)&0x3f]^ \ -+ des_SPtrans[2][(u>>10L)&0x3f]^ \ -+ des_SPtrans[4][(u>>18L)&0x3f]^ \ -+ des_SPtrans[6][(u>>26L)&0x3f]^ \ -+ des_SPtrans[1][(t>> 2L)&0x3f]^ \ -+ des_SPtrans[3][(t>>10L)&0x3f]^ \ -+ des_SPtrans[5][(t>>18L)&0x3f]^ \ -+ des_SPtrans[7][(t>>26L)&0x3f]; } -+#endif -+#endif -+ -+ /* IP and FP -+ * The problem is more of a geometric problem that random bit fiddling. -+ 0 1 2 3 4 5 6 7 62 54 46 38 30 22 14 6 -+ 8 9 10 11 12 13 14 15 60 52 44 36 28 20 12 4 -+ 16 17 18 19 20 21 22 23 58 50 42 34 26 18 10 2 -+ 24 25 26 27 28 29 30 31 to 56 48 40 32 24 16 8 0 -+ -+ 32 33 34 35 36 37 38 39 63 55 47 39 31 23 15 7 -+ 40 41 42 43 44 45 46 47 61 53 45 37 29 21 13 5 -+ 48 49 50 51 52 53 54 55 59 51 43 35 27 19 11 3 -+ 56 57 58 59 60 61 62 63 57 49 41 33 25 17 9 1 -+ -+ The output has been subject to swaps of the form -+ 0 1 -> 3 1 but the odd and even bits have been put into -+ 2 3 2 0 -+ different words. 
The main trick is to remember that -+ t=((l>>size)^r)&(mask); -+ r^=t; -+ l^=(t<<size); -+ can be used to swap and move bits between words. -+ -+ So l = 0 1 2 3 r = 16 17 18 19 -+ 4 5 6 7 20 21 22 23 -+ 8 9 10 11 24 25 26 27 -+ 12 13 14 15 28 29 30 31 -+ becomes (for size == 2 and mask == 0x3333) -+ t = 2^16 3^17 -- -- l = 0 1 16 17 r = 2 3 18 19 -+ 6^20 7^21 -- -- 4 5 20 21 6 7 22 23 -+ 10^24 11^25 -- -- 8 9 24 25 10 11 24 25 -+ 14^28 15^29 -- -- 12 13 28 29 14 15 28 29 -+ -+ Thanks for hints from Richard Outerbridge - he told me IP&FP -+ could be done in 15 xor, 10 shifts and 5 ands. -+ When I finally started to think of the problem in 2D -+ I first got ~42 operations without xors. When I remembered -+ how to use xors :-) I got it to its final state. -+ */ -+#define PERM_OP(a,b,t,n,m) ((t)=((((a)>>(n))^(b))&(m)),\ -+ (b)^=(t),\ -+ (a)^=((t)<<(n))) -+ -+#define IP(l,r) \ -+ { \ -+ register DES_LONG tt; \ -+ PERM_OP(r,l,tt, 4,0x0f0f0f0fL); \ -+ PERM_OP(l,r,tt,16,0x0000ffffL); \ -+ PERM_OP(r,l,tt, 2,0x33333333L); \ -+ PERM_OP(l,r,tt, 8,0x00ff00ffL); \ -+ PERM_OP(r,l,tt, 1,0x55555555L); \ -+ } -+ -+#define FP(l,r) \ -+ { \ -+ register DES_LONG tt; \ -+ PERM_OP(l,r,tt, 1,0x55555555L); \ -+ PERM_OP(r,l,tt, 8,0x00ff00ffL); \ -+ PERM_OP(l,r,tt, 2,0x33333333L); \ -+ PERM_OP(r,l,tt,16,0x0000ffffL); \ -+ PERM_OP(l,r,tt, 4,0x0f0f0f0fL); \ -+ } -+ -+extern const DES_LONG des_SPtrans[8][64]; -+ -+#ifndef NO_FCRYPT -+#ifndef NOPROTO -+void fcrypt_body(DES_LONG *out,des_key_schedule ks, -+ DES_LONG Eswap0, DES_LONG Eswap1); -+#else -+void fcrypt_body(); -+#endif -+#endif /* NO_FCRYPT */ -+ -+#endif ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/des/des_ver.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,60 @@ -+/* crypto/des/des_ver.h */ -+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) -+ * All rights reserved. -+ * -+ * This package is an SSL implementation written -+ * by Eric Young (eay@cryptsoft.com). -+ * The implementation was written so as to conform with Netscapes SSL. -+ * -+ * This library is free for commercial and non-commercial use as long as -+ * the following conditions are aheared to. The following conditions -+ * apply to all code found in this distribution, be it the RC4, RSA, -+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation -+ * included with this distribution is covered by the same copyright terms -+ * except that the holder is Tim Hudson (tjh@cryptsoft.com). -+ * -+ * Copyright remains Eric Young's, and as such any Copyright notices in -+ * the code are not to be removed. -+ * If this package is used in a product, Eric Young should be given attribution -+ * as the author of the parts of the library used.
-+ * This can be in the form of a textual message at program startup or -+ * in documentation (online or textual) provided with the package. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. All advertising materials mentioning features or use of this software -+ * must display the following acknowledgement: -+ * "This product includes cryptographic software written by -+ * Eric Young (eay@cryptsoft.com)" -+ * The word 'cryptographic' can be left out if the rouines from the library -+ * being used are not cryptographic related :-). -+ * 4. If you include any Windows specific code (or a derivative thereof) from -+ * the apps directory (application code) you must include an acknowledgement: -+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * The licence and distribution terms for any publically available version or -+ * derivative of this code cannot be changed. i.e. this code cannot simply be -+ * copied and put under another distribution licence -+ * [including the GNU Public Licence.] -+ */ -+ -+extern char *DES_version; /* SSLeay version string */ -+extern char *libdes_version; /* old libdes version string */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/des/podd.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,75 @@ -+/* crypto/des/podd.h */ -+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) -+ * All rights reserved. -+ * -+ * This package is an SSL implementation written -+ * by Eric Young (eay@cryptsoft.com). -+ * The implementation was written so as to conform with Netscapes SSL. -+ * -+ * This library is free for commercial and non-commercial use as long as -+ * the following conditions are aheared to. The following conditions -+ * apply to all code found in this distribution, be it the RC4, RSA, -+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation -+ * included with this distribution is covered by the same copyright terms -+ * except that the holder is Tim Hudson (tjh@cryptsoft.com). -+ * -+ * Copyright remains Eric Young's, and as such any Copyright notices in -+ * the code are not to be removed. 
-+ * If this package is used in a product, Eric Young should be given attribution -+ * as the author of the parts of the library used. -+ * This can be in the form of a textual message at program startup or -+ * in documentation (online or textual) provided with the package. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. All advertising materials mentioning features or use of this software -+ * must display the following acknowledgement: -+ * "This product includes cryptographic software written by -+ * Eric Young (eay@cryptsoft.com)" -+ * The word 'cryptographic' can be left out if the rouines from the library -+ * being used are not cryptographic related :-). -+ * 4. If you include any Windows specific code (or a derivative thereof) from -+ * the apps directory (application code) you must include an acknowledgement: -+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * The licence and distribution terms for any publically available version or -+ * derivative of this code cannot be changed. i.e. this code cannot simply be -+ * copied and put under another distribution licence -+ * [including the GNU Public Licence.] -+ */ -+ -+static const unsigned char odd_parity[256]={ -+ 1, 1, 2, 2, 4, 4, 7, 7, 8, 8, 11, 11, 13, 13, 14, 14, -+ 16, 16, 19, 19, 21, 21, 22, 22, 25, 25, 26, 26, 28, 28, 31, 31, -+ 32, 32, 35, 35, 37, 37, 38, 38, 41, 41, 42, 42, 44, 44, 47, 47, -+ 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 59, 59, 61, 61, 62, 62, -+ 64, 64, 67, 67, 69, 69, 70, 70, 73, 73, 74, 74, 76, 76, 79, 79, -+ 81, 81, 82, 82, 84, 84, 87, 87, 88, 88, 91, 91, 93, 93, 94, 94, -+ 97, 97, 98, 98,100,100,103,103,104,104,107,107,109,109,110,110, -+112,112,115,115,117,117,118,118,121,121,122,122,124,124,127,127, -+128,128,131,131,133,133,134,134,137,137,138,138,140,140,143,143, -+145,145,146,146,148,148,151,151,152,152,155,155,157,157,158,158, -+161,161,162,162,164,164,167,167,168,168,171,171,173,173,174,174, -+176,176,179,179,181,181,182,182,185,185,186,186,188,188,191,191, -+193,193,194,194,196,196,199,199,200,200,203,203,205,205,206,206, -+208,208,211,211,213,213,214,214,217,217,218,218,220,220,223,223, -+224,224,227,227,229,229,230,230,233,233,234,234,236,236,239,239, -+241,241,242,242,244,244,247,247,248,248,251,251,253,253,254,254}; ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/des/sk.h Mon Feb 9 
13:51:03 2004 -@@ -0,0 +1,204 @@ -+/* crypto/des/sk.h */ -+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) -+ * All rights reserved. -+ * -+ * This package is an SSL implementation written -+ * by Eric Young (eay@cryptsoft.com). -+ * The implementation was written so as to conform with Netscapes SSL. -+ * -+ * This library is free for commercial and non-commercial use as long as -+ * the following conditions are aheared to. The following conditions -+ * apply to all code found in this distribution, be it the RC4, RSA, -+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation -+ * included with this distribution is covered by the same copyright terms -+ * except that the holder is Tim Hudson (tjh@cryptsoft.com). -+ * -+ * Copyright remains Eric Young's, and as such any Copyright notices in -+ * the code are not to be removed. -+ * If this package is used in a product, Eric Young should be given attribution -+ * as the author of the parts of the library used. -+ * This can be in the form of a textual message at program startup or -+ * in documentation (online or textual) provided with the package. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. 
All advertising materials mentioning features or use of this software -+ * must display the following acknowledgement: -+ * "This product includes cryptographic software written by -+ * Eric Young (eay@cryptsoft.com)" -+ * The word 'cryptographic' can be left out if the rouines from the library -+ * being used are not cryptographic related :-). -+ * 4. If you include any Windows specific code (or a derivative thereof) from -+ * the apps directory (application code) you must include an acknowledgement: -+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * The licence and distribution terms for any publically available version or -+ * derivative of this code cannot be changed. i.e. this code cannot simply be -+ * copied and put under another distribution licence -+ * [including the GNU Public Licence.] 
-+ */ -+ -+static const DES_LONG des_skb[8][64]={ -+{ -+/* for C bits (numbered as per FIPS 46) 1 2 3 4 5 6 */ -+0x00000000L,0x00000010L,0x20000000L,0x20000010L, -+0x00010000L,0x00010010L,0x20010000L,0x20010010L, -+0x00000800L,0x00000810L,0x20000800L,0x20000810L, -+0x00010800L,0x00010810L,0x20010800L,0x20010810L, -+0x00000020L,0x00000030L,0x20000020L,0x20000030L, -+0x00010020L,0x00010030L,0x20010020L,0x20010030L, -+0x00000820L,0x00000830L,0x20000820L,0x20000830L, -+0x00010820L,0x00010830L,0x20010820L,0x20010830L, -+0x00080000L,0x00080010L,0x20080000L,0x20080010L, -+0x00090000L,0x00090010L,0x20090000L,0x20090010L, -+0x00080800L,0x00080810L,0x20080800L,0x20080810L, -+0x00090800L,0x00090810L,0x20090800L,0x20090810L, -+0x00080020L,0x00080030L,0x20080020L,0x20080030L, -+0x00090020L,0x00090030L,0x20090020L,0x20090030L, -+0x00080820L,0x00080830L,0x20080820L,0x20080830L, -+0x00090820L,0x00090830L,0x20090820L,0x20090830L, -+},{ -+/* for C bits (numbered as per FIPS 46) 7 8 10 11 12 13 */ -+0x00000000L,0x02000000L,0x00002000L,0x02002000L, -+0x00200000L,0x02200000L,0x00202000L,0x02202000L, -+0x00000004L,0x02000004L,0x00002004L,0x02002004L, -+0x00200004L,0x02200004L,0x00202004L,0x02202004L, -+0x00000400L,0x02000400L,0x00002400L,0x02002400L, -+0x00200400L,0x02200400L,0x00202400L,0x02202400L, -+0x00000404L,0x02000404L,0x00002404L,0x02002404L, -+0x00200404L,0x02200404L,0x00202404L,0x02202404L, -+0x10000000L,0x12000000L,0x10002000L,0x12002000L, -+0x10200000L,0x12200000L,0x10202000L,0x12202000L, -+0x10000004L,0x12000004L,0x10002004L,0x12002004L, -+0x10200004L,0x12200004L,0x10202004L,0x12202004L, -+0x10000400L,0x12000400L,0x10002400L,0x12002400L, -+0x10200400L,0x12200400L,0x10202400L,0x12202400L, -+0x10000404L,0x12000404L,0x10002404L,0x12002404L, -+0x10200404L,0x12200404L,0x10202404L,0x12202404L, -+},{ -+/* for C bits (numbered as per FIPS 46) 14 15 16 17 19 20 */ -+0x00000000L,0x00000001L,0x00040000L,0x00040001L, -+0x01000000L,0x01000001L,0x01040000L,0x01040001L, 
-+0x00000002L,0x00000003L,0x00040002L,0x00040003L, -+0x01000002L,0x01000003L,0x01040002L,0x01040003L, -+0x00000200L,0x00000201L,0x00040200L,0x00040201L, -+0x01000200L,0x01000201L,0x01040200L,0x01040201L, -+0x00000202L,0x00000203L,0x00040202L,0x00040203L, -+0x01000202L,0x01000203L,0x01040202L,0x01040203L, -+0x08000000L,0x08000001L,0x08040000L,0x08040001L, -+0x09000000L,0x09000001L,0x09040000L,0x09040001L, -+0x08000002L,0x08000003L,0x08040002L,0x08040003L, -+0x09000002L,0x09000003L,0x09040002L,0x09040003L, -+0x08000200L,0x08000201L,0x08040200L,0x08040201L, -+0x09000200L,0x09000201L,0x09040200L,0x09040201L, -+0x08000202L,0x08000203L,0x08040202L,0x08040203L, -+0x09000202L,0x09000203L,0x09040202L,0x09040203L, -+},{ -+/* for C bits (numbered as per FIPS 46) 21 23 24 26 27 28 */ -+0x00000000L,0x00100000L,0x00000100L,0x00100100L, -+0x00000008L,0x00100008L,0x00000108L,0x00100108L, -+0x00001000L,0x00101000L,0x00001100L,0x00101100L, -+0x00001008L,0x00101008L,0x00001108L,0x00101108L, -+0x04000000L,0x04100000L,0x04000100L,0x04100100L, -+0x04000008L,0x04100008L,0x04000108L,0x04100108L, -+0x04001000L,0x04101000L,0x04001100L,0x04101100L, -+0x04001008L,0x04101008L,0x04001108L,0x04101108L, -+0x00020000L,0x00120000L,0x00020100L,0x00120100L, -+0x00020008L,0x00120008L,0x00020108L,0x00120108L, -+0x00021000L,0x00121000L,0x00021100L,0x00121100L, -+0x00021008L,0x00121008L,0x00021108L,0x00121108L, -+0x04020000L,0x04120000L,0x04020100L,0x04120100L, -+0x04020008L,0x04120008L,0x04020108L,0x04120108L, -+0x04021000L,0x04121000L,0x04021100L,0x04121100L, -+0x04021008L,0x04121008L,0x04021108L,0x04121108L, -+},{ -+/* for D bits (numbered as per FIPS 46) 1 2 3 4 5 6 */ -+0x00000000L,0x10000000L,0x00010000L,0x10010000L, -+0x00000004L,0x10000004L,0x00010004L,0x10010004L, -+0x20000000L,0x30000000L,0x20010000L,0x30010000L, -+0x20000004L,0x30000004L,0x20010004L,0x30010004L, -+0x00100000L,0x10100000L,0x00110000L,0x10110000L, -+0x00100004L,0x10100004L,0x00110004L,0x10110004L, 
-+0x20100000L,0x30100000L,0x20110000L,0x30110000L, -+0x20100004L,0x30100004L,0x20110004L,0x30110004L, -+0x00001000L,0x10001000L,0x00011000L,0x10011000L, -+0x00001004L,0x10001004L,0x00011004L,0x10011004L, -+0x20001000L,0x30001000L,0x20011000L,0x30011000L, -+0x20001004L,0x30001004L,0x20011004L,0x30011004L, -+0x00101000L,0x10101000L,0x00111000L,0x10111000L, -+0x00101004L,0x10101004L,0x00111004L,0x10111004L, -+0x20101000L,0x30101000L,0x20111000L,0x30111000L, -+0x20101004L,0x30101004L,0x20111004L,0x30111004L, -+},{ -+/* for D bits (numbered as per FIPS 46) 8 9 11 12 13 14 */ -+0x00000000L,0x08000000L,0x00000008L,0x08000008L, -+0x00000400L,0x08000400L,0x00000408L,0x08000408L, -+0x00020000L,0x08020000L,0x00020008L,0x08020008L, -+0x00020400L,0x08020400L,0x00020408L,0x08020408L, -+0x00000001L,0x08000001L,0x00000009L,0x08000009L, -+0x00000401L,0x08000401L,0x00000409L,0x08000409L, -+0x00020001L,0x08020001L,0x00020009L,0x08020009L, -+0x00020401L,0x08020401L,0x00020409L,0x08020409L, -+0x02000000L,0x0A000000L,0x02000008L,0x0A000008L, -+0x02000400L,0x0A000400L,0x02000408L,0x0A000408L, -+0x02020000L,0x0A020000L,0x02020008L,0x0A020008L, -+0x02020400L,0x0A020400L,0x02020408L,0x0A020408L, -+0x02000001L,0x0A000001L,0x02000009L,0x0A000009L, -+0x02000401L,0x0A000401L,0x02000409L,0x0A000409L, -+0x02020001L,0x0A020001L,0x02020009L,0x0A020009L, -+0x02020401L,0x0A020401L,0x02020409L,0x0A020409L, -+},{ -+/* for D bits (numbered as per FIPS 46) 16 17 18 19 20 21 */ -+0x00000000L,0x00000100L,0x00080000L,0x00080100L, -+0x01000000L,0x01000100L,0x01080000L,0x01080100L, -+0x00000010L,0x00000110L,0x00080010L,0x00080110L, -+0x01000010L,0x01000110L,0x01080010L,0x01080110L, -+0x00200000L,0x00200100L,0x00280000L,0x00280100L, -+0x01200000L,0x01200100L,0x01280000L,0x01280100L, -+0x00200010L,0x00200110L,0x00280010L,0x00280110L, -+0x01200010L,0x01200110L,0x01280010L,0x01280110L, -+0x00000200L,0x00000300L,0x00080200L,0x00080300L, -+0x01000200L,0x01000300L,0x01080200L,0x01080300L, 
-+0x00000210L,0x00000310L,0x00080210L,0x00080310L, -+0x01000210L,0x01000310L,0x01080210L,0x01080310L, -+0x00200200L,0x00200300L,0x00280200L,0x00280300L, -+0x01200200L,0x01200300L,0x01280200L,0x01280300L, -+0x00200210L,0x00200310L,0x00280210L,0x00280310L, -+0x01200210L,0x01200310L,0x01280210L,0x01280310L, -+},{ -+/* for D bits (numbered as per FIPS 46) 22 23 24 25 27 28 */ -+0x00000000L,0x04000000L,0x00040000L,0x04040000L, -+0x00000002L,0x04000002L,0x00040002L,0x04040002L, -+0x00002000L,0x04002000L,0x00042000L,0x04042000L, -+0x00002002L,0x04002002L,0x00042002L,0x04042002L, -+0x00000020L,0x04000020L,0x00040020L,0x04040020L, -+0x00000022L,0x04000022L,0x00040022L,0x04040022L, -+0x00002020L,0x04002020L,0x00042020L,0x04042020L, -+0x00002022L,0x04002022L,0x00042022L,0x04042022L, -+0x00000800L,0x04000800L,0x00040800L,0x04040800L, -+0x00000802L,0x04000802L,0x00040802L,0x04040802L, -+0x00002800L,0x04002800L,0x00042800L,0x04042800L, -+0x00002802L,0x04002802L,0x00042802L,0x04042802L, -+0x00000820L,0x04000820L,0x00040820L,0x04040820L, -+0x00000822L,0x04000822L,0x00040822L,0x04040822L, -+0x00002820L,0x04002820L,0x00042820L,0x04042820L, -+0x00002822L,0x04002822L,0x00042822L,0x04042822L, -+}}; ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/des/spr.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,204 @@ -+/* crypto/des/spr.h */ -+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) -+ * All rights reserved. -+ * -+ * This package is an SSL implementation written -+ * by Eric Young (eay@cryptsoft.com). -+ * The implementation was written so as to conform with Netscapes SSL. -+ * -+ * This library is free for commercial and non-commercial use as long as -+ * the following conditions are aheared to. The following conditions -+ * apply to all code found in this distribution, be it the RC4, RSA, -+ * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation -+ * included with this distribution is covered by the same copyright terms -+ * except that the holder is Tim Hudson (tjh@cryptsoft.com). -+ * -+ * Copyright remains Eric Young's, and as such any Copyright notices in -+ * the code are not to be removed. -+ * If this package is used in a product, Eric Young should be given attribution -+ * as the author of the parts of the library used. -+ * This can be in the form of a textual message at program startup or -+ * in documentation (online or textual) provided with the package. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. All advertising materials mentioning features or use of this software -+ * must display the following acknowledgement: -+ * "This product includes cryptographic software written by -+ * Eric Young (eay@cryptsoft.com)" -+ * The word 'cryptographic' can be left out if the rouines from the library -+ * being used are not cryptographic related :-). -+ * 4. If you include any Windows specific code (or a derivative thereof) from -+ * the apps directory (application code) you must include an acknowledgement: -+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * The licence and distribution terms for any publically available version or -+ * derivative of this code cannot be changed. i.e. this code cannot simply be -+ * copied and put under another distribution licence -+ * [including the GNU Public Licence.] -+ */ -+ -+const DES_LONG des_SPtrans[8][64]={ -+{ -+/* nibble 0 */ -+0x02080800L, 0x00080000L, 0x02000002L, 0x02080802L, -+0x02000000L, 0x00080802L, 0x00080002L, 0x02000002L, -+0x00080802L, 0x02080800L, 0x02080000L, 0x00000802L, -+0x02000802L, 0x02000000L, 0x00000000L, 0x00080002L, -+0x00080000L, 0x00000002L, 0x02000800L, 0x00080800L, -+0x02080802L, 0x02080000L, 0x00000802L, 0x02000800L, -+0x00000002L, 0x00000800L, 0x00080800L, 0x02080002L, -+0x00000800L, 0x02000802L, 0x02080002L, 0x00000000L, -+0x00000000L, 0x02080802L, 0x02000800L, 0x00080002L, -+0x02080800L, 0x00080000L, 0x00000802L, 0x02000800L, -+0x02080002L, 0x00000800L, 0x00080800L, 0x02000002L, -+0x00080802L, 0x00000002L, 0x02000002L, 0x02080000L, -+0x02080802L, 0x00080800L, 0x02080000L, 0x02000802L, -+0x02000000L, 0x00000802L, 0x00080002L, 0x00000000L, -+0x00080000L, 0x02000000L, 0x02000802L, 0x02080800L, -+0x00000002L, 0x02080002L, 0x00000800L, 0x00080802L, -+},{ -+/* nibble 1 */ -+0x40108010L, 0x00000000L, 0x00108000L, 0x40100000L, -+0x40000010L, 0x00008010L, 0x40008000L, 0x00108000L, -+0x00008000L, 0x40100010L, 0x00000010L, 0x40008000L, -+0x00100010L, 0x40108000L, 0x40100000L, 0x00000010L, -+0x00100000L, 0x40008010L, 
0x40100010L, 0x00008000L, -+0x00108010L, 0x40000000L, 0x00000000L, 0x00100010L, -+0x40008010L, 0x00108010L, 0x40108000L, 0x40000010L, -+0x40000000L, 0x00100000L, 0x00008010L, 0x40108010L, -+0x00100010L, 0x40108000L, 0x40008000L, 0x00108010L, -+0x40108010L, 0x00100010L, 0x40000010L, 0x00000000L, -+0x40000000L, 0x00008010L, 0x00100000L, 0x40100010L, -+0x00008000L, 0x40000000L, 0x00108010L, 0x40008010L, -+0x40108000L, 0x00008000L, 0x00000000L, 0x40000010L, -+0x00000010L, 0x40108010L, 0x00108000L, 0x40100000L, -+0x40100010L, 0x00100000L, 0x00008010L, 0x40008000L, -+0x40008010L, 0x00000010L, 0x40100000L, 0x00108000L, -+},{ -+/* nibble 2 */ -+0x04000001L, 0x04040100L, 0x00000100L, 0x04000101L, -+0x00040001L, 0x04000000L, 0x04000101L, 0x00040100L, -+0x04000100L, 0x00040000L, 0x04040000L, 0x00000001L, -+0x04040101L, 0x00000101L, 0x00000001L, 0x04040001L, -+0x00000000L, 0x00040001L, 0x04040100L, 0x00000100L, -+0x00000101L, 0x04040101L, 0x00040000L, 0x04000001L, -+0x04040001L, 0x04000100L, 0x00040101L, 0x04040000L, -+0x00040100L, 0x00000000L, 0x04000000L, 0x00040101L, -+0x04040100L, 0x00000100L, 0x00000001L, 0x00040000L, -+0x00000101L, 0x00040001L, 0x04040000L, 0x04000101L, -+0x00000000L, 0x04040100L, 0x00040100L, 0x04040001L, -+0x00040001L, 0x04000000L, 0x04040101L, 0x00000001L, -+0x00040101L, 0x04000001L, 0x04000000L, 0x04040101L, -+0x00040000L, 0x04000100L, 0x04000101L, 0x00040100L, -+0x04000100L, 0x00000000L, 0x04040001L, 0x00000101L, -+0x04000001L, 0x00040101L, 0x00000100L, 0x04040000L, -+},{ -+/* nibble 3 */ -+0x00401008L, 0x10001000L, 0x00000008L, 0x10401008L, -+0x00000000L, 0x10400000L, 0x10001008L, 0x00400008L, -+0x10401000L, 0x10000008L, 0x10000000L, 0x00001008L, -+0x10000008L, 0x00401008L, 0x00400000L, 0x10000000L, -+0x10400008L, 0x00401000L, 0x00001000L, 0x00000008L, -+0x00401000L, 0x10001008L, 0x10400000L, 0x00001000L, -+0x00001008L, 0x00000000L, 0x00400008L, 0x10401000L, -+0x10001000L, 0x10400008L, 0x10401008L, 0x00400000L, -+0x10400008L, 0x00001008L, 
0x00400000L, 0x10000008L, -+0x00401000L, 0x10001000L, 0x00000008L, 0x10400000L, -+0x10001008L, 0x00000000L, 0x00001000L, 0x00400008L, -+0x00000000L, 0x10400008L, 0x10401000L, 0x00001000L, -+0x10000000L, 0x10401008L, 0x00401008L, 0x00400000L, -+0x10401008L, 0x00000008L, 0x10001000L, 0x00401008L, -+0x00400008L, 0x00401000L, 0x10400000L, 0x10001008L, -+0x00001008L, 0x10000000L, 0x10000008L, 0x10401000L, -+},{ -+/* nibble 4 */ -+0x08000000L, 0x00010000L, 0x00000400L, 0x08010420L, -+0x08010020L, 0x08000400L, 0x00010420L, 0x08010000L, -+0x00010000L, 0x00000020L, 0x08000020L, 0x00010400L, -+0x08000420L, 0x08010020L, 0x08010400L, 0x00000000L, -+0x00010400L, 0x08000000L, 0x00010020L, 0x00000420L, -+0x08000400L, 0x00010420L, 0x00000000L, 0x08000020L, -+0x00000020L, 0x08000420L, 0x08010420L, 0x00010020L, -+0x08010000L, 0x00000400L, 0x00000420L, 0x08010400L, -+0x08010400L, 0x08000420L, 0x00010020L, 0x08010000L, -+0x00010000L, 0x00000020L, 0x08000020L, 0x08000400L, -+0x08000000L, 0x00010400L, 0x08010420L, 0x00000000L, -+0x00010420L, 0x08000000L, 0x00000400L, 0x00010020L, -+0x08000420L, 0x00000400L, 0x00000000L, 0x08010420L, -+0x08010020L, 0x08010400L, 0x00000420L, 0x00010000L, -+0x00010400L, 0x08010020L, 0x08000400L, 0x00000420L, -+0x00000020L, 0x00010420L, 0x08010000L, 0x08000020L, -+},{ -+/* nibble 5 */ -+0x80000040L, 0x00200040L, 0x00000000L, 0x80202000L, -+0x00200040L, 0x00002000L, 0x80002040L, 0x00200000L, -+0x00002040L, 0x80202040L, 0x00202000L, 0x80000000L, -+0x80002000L, 0x80000040L, 0x80200000L, 0x00202040L, -+0x00200000L, 0x80002040L, 0x80200040L, 0x00000000L, -+0x00002000L, 0x00000040L, 0x80202000L, 0x80200040L, -+0x80202040L, 0x80200000L, 0x80000000L, 0x00002040L, -+0x00000040L, 0x00202000L, 0x00202040L, 0x80002000L, -+0x00002040L, 0x80000000L, 0x80002000L, 0x00202040L, -+0x80202000L, 0x00200040L, 0x00000000L, 0x80002000L, -+0x80000000L, 0x00002000L, 0x80200040L, 0x00200000L, -+0x00200040L, 0x80202040L, 0x00202000L, 0x00000040L, -+0x80202040L, 0x00202000L, 
0x00200000L, 0x80002040L, -+0x80000040L, 0x80200000L, 0x00202040L, 0x00000000L, -+0x00002000L, 0x80000040L, 0x80002040L, 0x80202000L, -+0x80200000L, 0x00002040L, 0x00000040L, 0x80200040L, -+},{ -+/* nibble 6 */ -+0x00004000L, 0x00000200L, 0x01000200L, 0x01000004L, -+0x01004204L, 0x00004004L, 0x00004200L, 0x00000000L, -+0x01000000L, 0x01000204L, 0x00000204L, 0x01004000L, -+0x00000004L, 0x01004200L, 0x01004000L, 0x00000204L, -+0x01000204L, 0x00004000L, 0x00004004L, 0x01004204L, -+0x00000000L, 0x01000200L, 0x01000004L, 0x00004200L, -+0x01004004L, 0x00004204L, 0x01004200L, 0x00000004L, -+0x00004204L, 0x01004004L, 0x00000200L, 0x01000000L, -+0x00004204L, 0x01004000L, 0x01004004L, 0x00000204L, -+0x00004000L, 0x00000200L, 0x01000000L, 0x01004004L, -+0x01000204L, 0x00004204L, 0x00004200L, 0x00000000L, -+0x00000200L, 0x01000004L, 0x00000004L, 0x01000200L, -+0x00000000L, 0x01000204L, 0x01000200L, 0x00004200L, -+0x00000204L, 0x00004000L, 0x01004204L, 0x01000000L, -+0x01004200L, 0x00000004L, 0x00004004L, 0x01004204L, -+0x01000004L, 0x01004200L, 0x01004000L, 0x00004004L, -+},{ -+/* nibble 7 */ -+0x20800080L, 0x20820000L, 0x00020080L, 0x00000000L, -+0x20020000L, 0x00800080L, 0x20800000L, 0x20820080L, -+0x00000080L, 0x20000000L, 0x00820000L, 0x00020080L, -+0x00820080L, 0x20020080L, 0x20000080L, 0x20800000L, -+0x00020000L, 0x00820080L, 0x00800080L, 0x20020000L, -+0x20820080L, 0x20000080L, 0x00000000L, 0x00820000L, -+0x20000000L, 0x00800000L, 0x20020080L, 0x20800080L, -+0x00800000L, 0x00020000L, 0x20820000L, 0x00000080L, -+0x00800000L, 0x00020000L, 0x20000080L, 0x20820080L, -+0x00020080L, 0x20000000L, 0x00000000L, 0x00820000L, -+0x20800080L, 0x20020080L, 0x20020000L, 0x00800080L, -+0x20820000L, 0x00000080L, 0x00800080L, 0x20020000L, -+0x20820080L, 0x00800000L, 0x20800000L, 0x20000080L, -+0x00820000L, 0x00020080L, 0x20020080L, 0x20800000L, -+0x00000080L, 0x20820000L, 0x00820080L, 0x00000000L, -+0x20000000L, 0x20800080L, 0x00020000L, 0x00820080L, -+}}; ---- /dev/null Tue Mar 11 
13:02:56 2003 -+++ linux/include/klips-crypto/aes.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,97 @@ -+// I retain copyright in this code but I encourage its free use provided -+// that I don't carry any responsibility for the results. I am especially -+// happy to see it used in free and open source software. If you do use -+// it I would appreciate an acknowledgement of its origin in the code or -+// the product that results and I would also appreciate knowing a little -+// about the use to which it is being put. I am grateful to Frank Yellin -+// for some ideas that are used in this implementation. -+// -+// Dr B. R. Gladman 6th April 2001. -+// -+// This is an implementation of the AES encryption algorithm (Rijndael) -+// designed by Joan Daemen and Vincent Rijmen. This version is designed -+// to provide both fixed and dynamic block and key lengths and can also -+// run with either big or little endian internal byte order (see aes.h). -+// It inputs block and key lengths in bytes with the legal values being -+// 16, 24 and 32. -+ -+/* -+ * Modified by Jari Ruusu, May 1 2001 -+ * - Fixed some compile warnings, code was ok but gcc warned anyway. -+ * - Changed basic types: byte -> unsigned char, word -> u_int32_t -+ * - Major name space cleanup: Names visible to outside now begin -+ * with "aes_" or "AES_". A lot of stuff moved from aes.h to aes.c -+ * - Removed C++ and DLL support as part of name space cleanup. -+ * - Eliminated unnecessary recomputation of tables. (actual bug fix) -+ * - Merged precomputed constant tables to aes.c file. -+ * - Removed data alignment restrictions for portability reasons. -+ * - Made block and key lengths accept bit count (128/192/256) -+ * as well byte count (16/24/32). -+ * - Removed all error checks. This change also eliminated the need -+ * to preinitialize the context struct to zero. -+ * - Removed some totally unused constants. 
-+ */ -+ -+#ifndef _AES_H -+#define _AES_H -+ -+#if defined(__linux__) && defined(__KERNEL__) -+# include -+#else -+# include -+#endif -+ -+// CONFIGURATION OPTIONS (see also aes.c) -+// -+// Define AES_BLOCK_SIZE to set the cipher block size (16, 24 or 32) or -+// leave this undefined for dynamically variable block size (this will -+// result in much slower code). -+// IMPORTANT NOTE: AES_BLOCK_SIZE is in BYTES (16, 24, 32 or undefined). If -+// left undefined a slower version providing variable block length is compiled -+ -+#define AES_BLOCK_SIZE 16 -+ -+// The number of key schedule words for different block and key lengths -+// allowing for method of computation which requires the length to be a -+// multiple of the key length -+// -+// Nk = 4 6 8 -+// ------------- -+// Nb = 4 | 60 60 64 -+// 6 | 96 90 96 -+// 8 | 120 120 120 -+ -+#if !defined(AES_BLOCK_SIZE) || (AES_BLOCK_SIZE == 32) -+#define AES_KS_LENGTH 120 -+#define AES_RC_LENGTH 29 -+#else -+#define AES_KS_LENGTH 4 * AES_BLOCK_SIZE -+#define AES_RC_LENGTH (9 * AES_BLOCK_SIZE) / 8 - 8 -+#endif -+ -+typedef struct -+{ -+ u_int32_t aes_Nkey; // the number of words in the key input block -+ u_int32_t aes_Nrnd; // the number of cipher rounds -+ u_int32_t aes_e_key[AES_KS_LENGTH]; // the encryption key schedule -+ u_int32_t aes_d_key[AES_KS_LENGTH]; // the decryption key schedule -+#if !defined(AES_BLOCK_SIZE) -+ u_int32_t aes_Ncol; // the number of columns in the cipher state -+#endif -+} aes_context; -+ -+// THE CIPHER INTERFACE -+ -+#if !defined(AES_BLOCK_SIZE) -+extern void aes_set_blk(aes_context *, const int); -+#endif -+extern void aes_set_key(aes_context *, const unsigned char [], const int, const int); -+extern void aes_encrypt(const aes_context *, const unsigned char [], unsigned char []); -+extern void aes_decrypt(const aes_context *, const unsigned char [], unsigned char []); -+ -+// The block length inputs to aes_set_block and aes_set_key are in numbers -+// of bytes or bits. 
The calls to subroutines must be made in the above -+// order but multiple calls can be made without repeating earlier calls -+// if their parameters have not changed. -+ -+#endif // _AES_H ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/klips-crypto/aes_cbc.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,4 @@ -+/* Glue header */ -+#include "aes.h" -+int AES_set_key(aes_context *aes_ctx, const u_int8_t * key, int keysize); -+int AES_cbc_encrypt(aes_context *ctx, const u_int8_t * in, u_int8_t * out, int ilen, const u_int8_t * iv, int encrypt); ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/klips-crypto/aes_xcbc_mac.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,12 @@ -+#ifndef _AES_XCBC_MAC_H -+#define _AES_XCBC_MAC_H -+ -+typedef u_int32_t aes_block[4]; -+typedef struct { -+ aes_context ctx_k1; -+ aes_block k2; -+ aes_block k3; -+} aes_context_mac; -+int AES_xcbc_mac_set_key(aes_context_mac *ctxm, const u_int8_t *key, int keylen); -+int AES_xcbc_mac_hash(const aes_context_mac *ctxm, const u_int8_t * in, int ilen, u_int8_t hash[16]); -+#endif /* _AES_XCBC_MAC_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/klips-crypto/cbc_generic.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,110 @@ -+#ifndef _CBC_GENERIC_H -+#define _CBC_GENERIC_H -+/* -+ * CBC macro helpers -+ * -+ * Author: JuanJo Ciarlante -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ */ -+ -+/* -+ * Heavily inspired in loop_AES -+ */ -+#define CBC_IMPL_BLK16(name, ctx_type, addr_type, enc_func, dec_func) \ -+int name(ctx_type *ctx, const u_int8_t * in, u_int8_t * out, int ilen, const u_int8_t * iv, int encrypt) { \ -+ int ret=ilen, pos; \ -+ const u_int32_t *iv_i; \ -+ if ((ilen) % 16) return 0; \ -+ if (encrypt) { \ -+ pos=0; \ -+ while(pos=0) { \ -+ dec_func(ctx, (const addr_type) in, (addr_type) out); \ -+ if (pos==0) \ -+ iv_i=(const u_int32_t*) (iv); \ -+ else \ -+ iv_i=(const u_int32_t*) (in-16); \ -+ *((u_int32_t *)(&out[ 0])) ^= iv_i[0]; \ -+ *((u_int32_t *)(&out[ 4])) ^= iv_i[1]; \ -+ *((u_int32_t *)(&out[ 8])) ^= iv_i[2]; \ -+ *((u_int32_t *)(&out[12])) ^= iv_i[3]; \ -+ in-=16; \ -+ out-=16; \ -+ pos-=16; \ -+ } \ -+ } \ -+ return ret; \ -+} -+#define CBC_IMPL_BLK8(name, ctx_type, addr_type, enc_func, dec_func) \ -+int name(ctx_type *ctx, u_int8_t * in, u_int8_t * out, int ilen, const u_int8_t * iv, int encrypt) { \ -+ int ret=ilen, pos; \ -+ const u_int32_t *iv_i; \ -+ if ((ilen) % 8) return 0; \ -+ if (encrypt) { \ -+ pos=0; \ -+ while(pos=0) { \ -+ dec_func(ctx, (const addr_type)in, (addr_type)out); \ -+ if (pos==0) \ -+ iv_i=(const u_int32_t*) (iv); \ -+ else \ -+ iv_i=(const u_int32_t*) (in-8); \ -+ *((u_int32_t *)(&out[ 0])) ^= iv_i[0]; \ -+ *((u_int32_t *)(&out[ 4])) ^= iv_i[1]; \ -+ in-=8; \ -+ out-=8; \ -+ pos-=8; \ -+ } \ -+ } \ -+ return ret; \ -+} -+#define CBC_DECL(name, ctx_type) \ -+int name(ctx_type *ctx, u_int8_t * in, u_int8_t * out, int ilen, const u_int8_t * iv, int encrypt) -+/* -+Eg.: -+CBC_IMPL_BLK16(AES_cbc_encrypt, aes_context, u_int8_t *, aes_encrypt, aes_decrypt); -+CBC_DECL(AES_cbc_encrypt, aes_context); -+*/ -+#endif /* _CBC_GENERIC_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/klips-crypto/des.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,286 @@ -+/* crypto/des/des.org */ -+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) -+ * All rights reserved. 
-+ * -+ * This package is an SSL implementation written -+ * by Eric Young (eay@cryptsoft.com). -+ * The implementation was written so as to conform with Netscapes SSL. -+ * -+ * This library is free for commercial and non-commercial use as long as -+ * the following conditions are aheared to. The following conditions -+ * apply to all code found in this distribution, be it the RC4, RSA, -+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation -+ * included with this distribution is covered by the same copyright terms -+ * except that the holder is Tim Hudson (tjh@cryptsoft.com). -+ * -+ * Copyright remains Eric Young's, and as such any Copyright notices in -+ * the code are not to be removed. -+ * If this package is used in a product, Eric Young should be given attribution -+ * as the author of the parts of the library used. -+ * This can be in the form of a textual message at program startup or -+ * in documentation (online or textual) provided with the package. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. All advertising materials mentioning features or use of this software -+ * must display the following acknowledgement: -+ * "This product includes cryptographic software written by -+ * Eric Young (eay@cryptsoft.com)" -+ * The word 'cryptographic' can be left out if the rouines from the library -+ * being used are not cryptographic related :-). -+ * 4. 
If you include any Windows specific code (or a derivative thereof) from -+ * the apps directory (application code) you must include an acknowledgement: -+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * The licence and distribution terms for any publically available version or -+ * derivative of this code cannot be changed. i.e. this code cannot simply be -+ * copied and put under another distribution licence -+ * [including the GNU Public Licence.] -+ */ -+ -+/* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING -+ * -+ * Always modify des.org since des.h is automatically generated from -+ * it during SSLeay configuration. -+ * -+ * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING -+ */ -+ -+#ifndef HEADER_DES_H -+#define HEADER_DES_H -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+ -+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a -+ * %20 speed up (longs are 8 bytes, int's are 4). 
*/ -+/* Must be unsigned int on ia64/Itanium or DES breaks badly */ -+ -+#ifdef __KERNEL__ -+#include -+#else -+#include -+#endif -+ -+#ifndef DES_LONG -+#define DES_LONG u_int32_t -+#endif -+ -+typedef unsigned char des_cblock[8]; -+typedef struct { des_cblock ks; } des_key_schedule[16]; -+ -+#define DES_KEY_SZ (sizeof(des_cblock)) -+#define DES_SCHEDULE_SZ (sizeof(des_key_schedule)) -+ -+#define DES_ENCRYPT 1 -+#define DES_DECRYPT 0 -+ -+#define DES_CBC_MODE 0 -+#define DES_PCBC_MODE 1 -+ -+#define des_ecb2_encrypt(i,o,k1,k2,e) \ -+ des_ecb3_encrypt((i),(o),(k1),(k2),(k1),(e)) -+ -+#define des_ede2_cbc_encrypt(i,o,l,k1,k2,iv,e) \ -+ des_ede3_cbc_encrypt((i),(o),(l),(k1),(k2),(k1),(iv),(e)) -+ -+#define des_ede2_cfb64_encrypt(i,o,l,k1,k2,iv,n,e) \ -+ des_ede3_cfb64_encrypt((i),(o),(l),(k1),(k2),(k1),(iv),(n),(e)) -+ -+#define des_ede2_ofb64_encrypt(i,o,l,k1,k2,iv,n) \ -+ des_ede3_ofb64_encrypt((i),(o),(l),(k1),(k2),(k1),(iv),(n)) -+ -+#define C_Block des_cblock -+#define Key_schedule des_key_schedule -+#ifdef KERBEROS -+#define ENCRYPT DES_ENCRYPT -+#define DECRYPT DES_DECRYPT -+#endif -+#define KEY_SZ DES_KEY_SZ -+#define string_to_key des_string_to_key -+#define read_pw_string des_read_pw_string -+#define random_key des_random_key -+#define pcbc_encrypt des_pcbc_encrypt -+#define set_key des_set_key -+#define key_sched des_key_sched -+#define ecb_encrypt des_ecb_encrypt -+#define cbc_encrypt des_cbc_encrypt -+#define ncbc_encrypt des_ncbc_encrypt -+#define xcbc_encrypt des_xcbc_encrypt -+#define cbc_cksum des_cbc_cksum -+#define quad_cksum des_quad_cksum -+ -+/* For compatibility with the MIT lib - eay 20/05/92 */ -+typedef des_key_schedule bit_64; -+#define des_fixup_key_parity des_set_odd_parity -+#define des_check_key_parity check_parity -+ -+extern int des_check_key; /* defaults to false */ -+extern int des_rw_mode; /* defaults to DES_PCBC_MODE */ -+ -+/* The next line is used to disable full ANSI prototypes, if your -+ * compiler has problems with the 
prototypes, make sure this line always -+ * evaluates to true :-) */ -+#if defined(MSDOS) || defined(__STDC__) -+#undef NOPROTO -+#endif -+#ifndef NOPROTO -+char *des_options(void); -+void des_ecb3_encrypt(des_cblock *input,des_cblock *output, -+ des_key_schedule ks1,des_key_schedule ks2, -+ des_key_schedule ks3, int enc); -+DES_LONG des_cbc_cksum(des_cblock *input,des_cblock *output, -+ long length,des_key_schedule schedule,des_cblock *ivec); -+void des_cbc_encrypt(des_cblock *input,des_cblock *output,long length, -+ des_key_schedule schedule,des_cblock *ivec,int enc); -+void des_ncbc_encrypt(des_cblock *input,des_cblock *output,long length, -+ des_key_schedule schedule,des_cblock *ivec,int enc); -+void des_xcbc_encrypt(des_cblock *input,des_cblock *output,long length, -+ des_key_schedule schedule,des_cblock *ivec, -+ des_cblock *inw,des_cblock *outw,int enc); -+void des_cfb_encrypt(unsigned char *in,unsigned char *out,int numbits, -+ long length,des_key_schedule schedule,des_cblock *ivec,int enc); -+void des_ecb_encrypt(des_cblock *input,des_cblock *output, -+ des_key_schedule ks,int enc); -+void des_encrypt(DES_LONG *data,des_key_schedule ks, int enc); -+void des_encrypt2(DES_LONG *data,des_key_schedule ks, int enc); -+void des_encrypt3(DES_LONG *data, des_key_schedule ks1, -+ des_key_schedule ks2, des_key_schedule ks3); -+void des_decrypt3(DES_LONG *data, des_key_schedule ks1, -+ des_key_schedule ks2, des_key_schedule ks3); -+void des_ede3_cbc_encrypt(des_cblock *input, des_cblock *output, -+ long length, des_key_schedule ks1, des_key_schedule ks2, -+ des_key_schedule ks3, des_cblock *ivec, int enc); -+void des_ede3_cfb64_encrypt(unsigned char *in, unsigned char *out, -+ long length, des_key_schedule ks1, des_key_schedule ks2, -+ des_key_schedule ks3, des_cblock *ivec, int *num, int enc); -+void des_ede3_ofb64_encrypt(unsigned char *in, unsigned char *out, -+ long length, des_key_schedule ks1, des_key_schedule ks2, -+ des_key_schedule ks3, des_cblock *ivec, int 
*num); -+ -+void des_xwhite_in2out(des_cblock (*des_key), des_cblock (*in_white), -+ des_cblock (*out_white)); -+ -+int des_enc_read(int fd,char *buf,int len,des_key_schedule sched, -+ des_cblock *iv); -+int des_enc_write(int fd,char *buf,int len,des_key_schedule sched, -+ des_cblock *iv); -+char *des_fcrypt(const char *buf,const char *salt, char *ret); -+ -+void des_ofb_encrypt(unsigned char *in,unsigned char *out, -+ int numbits,long length,des_key_schedule schedule,des_cblock *ivec); -+void des_pcbc_encrypt(des_cblock *input,des_cblock *output,long length, -+ des_key_schedule schedule,des_cblock *ivec,int enc); -+DES_LONG des_quad_cksum(des_cblock *input,des_cblock *output, -+ long length,int out_count,des_cblock *seed); -+void des_random_seed(des_cblock key); -+void des_random_key(des_cblock ret); -+int des_read_password(des_cblock *key,char *prompt,int verify); -+int des_read_2passwords(des_cblock *key1,des_cblock *key2, -+ char *prompt,int verify); -+int des_read_pw_string(char *buf,int length,char *prompt,int verify); -+void des_set_odd_parity(des_cblock *key); -+int des_is_weak_key(des_cblock *key); -+int des_set_key(des_cblock *key,des_key_schedule schedule); -+int des_key_sched(des_cblock *key,des_key_schedule schedule); -+void des_string_to_key(char *str,des_cblock *key); -+void des_string_to_2keys(char *str,des_cblock *key1,des_cblock *key2); -+void des_cfb64_encrypt(unsigned char *in, unsigned char *out, long length, -+ des_key_schedule schedule, des_cblock *ivec, int *num, int enc); -+void des_ofb64_encrypt(unsigned char *in, unsigned char *out, long length, -+ des_key_schedule schedule, des_cblock *ivec, int *num); -+int des_read_pw(char *buf, char *buff, int size, char *prompt, int verify); -+ -+/* Extra functions from Mark Murray */ -+/* The following functions are not in the normal unix build or the -+ * SSLeay build. When using the SSLeay build, use RAND_seed() -+ * and RAND_bytes() instead. 
*/ -+int des_new_random_key(des_cblock *key); -+void des_init_random_number_generator(des_cblock *key); -+void des_set_random_generator_seed(des_cblock *key); -+void des_set_sequence_number(des_cblock new_sequence_number); -+void des_generate_random_block(des_cblock *block); -+ -+#else -+ -+char *des_options(); -+void des_ecb3_encrypt(); -+DES_LONG des_cbc_cksum(); -+void des_cbc_encrypt(); -+void des_ncbc_encrypt(); -+void des_xcbc_encrypt(); -+void des_cfb_encrypt(); -+void des_ede3_cfb64_encrypt(); -+void des_ede3_ofb64_encrypt(); -+void des_ecb_encrypt(); -+void des_encrypt(); -+void des_encrypt2(); -+void des_encrypt3(); -+void des_decrypt3(); -+void des_ede3_cbc_encrypt(); -+int des_enc_read(); -+int des_enc_write(); -+char *des_fcrypt(); -+#ifdef PERL5 -+char *des_crypt(); -+#else -+char *crypt(); -+#endif -+void des_ofb_encrypt(); -+void des_pcbc_encrypt(); -+DES_LONG des_quad_cksum(); -+void des_random_seed(); -+void des_random_key(); -+int des_read_password(); -+int des_read_2passwords(); -+int des_read_pw_string(); -+void des_set_odd_parity(); -+int des_is_weak_key(); -+int des_set_key(); -+int des_key_sched(); -+void des_string_to_key(); -+void des_string_to_2keys(); -+void des_cfb64_encrypt(); -+void des_ofb64_encrypt(); -+int des_read_pw(); -+void des_xwhite_in2out(); -+ -+/* Extra functions from Mark Murray */ -+/* The following functions are not in the normal unix build or the -+ * SSLeay build. When using the SSLeay build, use RAND_seed() -+ * and RAND_bytes() instead. 
*/ -+#ifdef FreeBSD -+int des_new_random_key(); -+void des_init_random_number_generator(); -+void des_set_random_generator_seed(); -+void des_set_sequence_number(); -+void des_generate_random_block(); -+#endif -+ -+#endif -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/klips-crypto/ocf_assist.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,63 @@ -+#ifndef _OCF_ASSIST_H -+#define _OCF_ASSIST_H 1 -+/****************************************************************************/ -+/* The various hw_assist functions return these bits */ -+ -+#define OCF_PROVIDES_AES 0x0001 -+#define OCF_PROVIDES_DES_3DES 0x0002 -+ -+/****************************************************************************/ -+#if !defined(OCF_ASSIST) -+/****************************************************************************/ -+/* -+ * stub it all out just in case -+ */ -+ -+#define ocf_aes_assist() (0) -+#define ocf_aes_set_key(a1,a2,a3,a4) -+#define ocf_aes_cbc_encrypt(a1,a2,a3,a4,a5,a6) -+ -+#define ocf_des_assist() (0) -+#define ocf_des_set_key(a, b) -+#define ocf_des_cbc_encrypt(a1,a2,a3,a4,a5,a6) -+#define ocf_des_encrypt(a1,a2,a3) -+#define ocf_des_ede3_cbc_encrypt(a1,a2,a3,a4,a5,a6,a7,a8) -+#define ocf_des_ncbc_encrypt(a1,a2,a3,a4,a5,a6) -+#define ocf_des_ecb_encrypt(a1,a2,a3,a4) -+ -+/****************************************************************************/ -+#else -+/****************************************************************************/ -+ -+#include -+#include "aes.h" -+#include "des.h" -+ -+extern int ocf_aes_assist(void); -+extern void ocf_aes_set_key(aes_context *cx, const unsigned char in_key[], -+ int n_bytes, const int f); -+extern int ocf_aes_cbc_encrypt(aes_context *ctx, const u_int8_t *input, -+ u_int8_t *output, -+ long length, -+ const u_int8_t *ivec, int enc); -+ -+extern int ocf_des_assist(void); -+extern int ocf_des_set_key(des_cblock *key, des_key_schedule schedule); -+extern void ocf_des_cbc_encrypt(des_cblock 
*input, des_cblock *output, -+ long length, des_key_schedule schedule, -+ des_cblock *ivec, int enc); -+extern void ocf_des_encrypt(DES_LONG *data, des_key_schedule ks, int enc); -+extern void ocf_des_ede3_cbc_encrypt(des_cblock *input, des_cblock *output, -+ long length, des_key_schedule ks1, -+ des_key_schedule ks2, des_key_schedule ks3, -+ des_cblock *ivec, int enc); -+extern void ocf_des_ncbc_encrypt(des_cblock *input, des_cblock *output, -+ long length, des_key_schedule schedule, -+ des_cblock *ivec, int enc); -+extern void ocf_des_ecb_encrypt(des_cblock *input, des_cblock *output, -+ des_key_schedule ks, int enc); -+ -+/****************************************************************************/ -+#endif /* !defined(OCF_ASSIST) */ -+/****************************************************************************/ -+#endif /* _OCF_ASSIST_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,569 @@ -+#ifndef _OPENSWAN_H -+/* -+ * header file for FreeS/WAN library functions -+ * Copyright (C) 1998, 1999, 2000 Henry Spencer. -+ * Copyright (C) 1999, 2000, 2001 Richard Guy Briggs -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: openswan.h,v 1.95 2005/08/25 01:24:40 paul Exp $ -+ */ -+#define _OPENSWAN_H /* seen it, no need to see it again */ -+ -+/* you'd think this should be builtin to compiler... 
*/ -+#ifndef TRUE -+#define TRUE 1 -+#endif -+ -+#ifndef FALSE -+#define FALSE 0 -+#endif -+ -+/* -+ * When using uclibc, malloc(0) returns NULL instead of success. This is -+ * to make it use the inbuilt work-around. -+ * See: http://osdir.com/ml/network.freeswan.devel/2003-11/msg00009.html -+ */ -+#ifdef __UCLIBC__ -+# if !defined(__MALLOC_GLIBC_COMPAT__) && !defined(MALLOC_GLIBC_COMPAT) -+# warning Please compile uclibc with GLIBC_COMPATIBILITY defined -+# endif -+#endif -+ -+ -+/* -+ * We've just got to have some datatypes defined... And annoyingly, just -+ * where we get them depends on whether we're in userland or not. -+ */ -+/* things that need to come from one place or the other, depending */ -+#if defined(linux) -+#if defined(__KERNEL__) -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#define user_assert(foo) /*nothing*/ -+ -+#else /* NOT in kernel */ -+#include -+#include -+#include -+#include -+#include -+#define user_assert(foo) assert(foo) -+#include -+ -+# define uint8_t u_int8_t -+# define uint16_t u_int16_t -+# define uint32_t u_int32_t -+# define uint64_t u_int64_t -+ -+ -+ -+#endif /* __KERNEL__ */ -+ -+#endif /* linux */ -+ -+#define DEBUG_NO_STATIC static -+ -+/* -+ * Yes Virginia, we have started a windows port. -+ */ -+#if defined(__CYGWIN32__) -+#if !defined(WIN32_KERNEL) -+/* get windows equivalents */ -+#include -+#include -+#include -+#include -+#include -+#include -+#define user_assert(foo) assert(foo) -+#endif /* _KERNEL */ -+#endif /* WIN32 */ -+ -+/* -+ * Kovacs? A macosx port? 
-+ */ -+#if defined(macintosh) || (defined(__MACH__) && defined(__APPLE__)) -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#define user_assert(foo) assert(foo) -+#define __u32 unsigned int -+#define __u8 unsigned char -+#define s6_addr16 __u6_addr.__u6_addr16 -+#define DEBUG_NO_STATIC static -+#endif -+ -+/* -+ * FreeBSD -+ */ -+#if defined(__FreeBSD__) -+# define DEBUG_NO_STATIC static -+#include -+#include -+#include -+#include -+#include -+#include -+#define user_assert(foo) assert(foo) -+/* apparently this way to deal with an IPv6 address is not standard. */ -+#define s6_addr16 __u6_addr.__u6_addr16 -+#endif -+ -+ -+#ifndef IPPROTO_COMP -+# define IPPROTO_COMP 108 -+#endif /* !IPPROTO_COMP */ -+ -+#ifndef IPPROTO_INT -+# define IPPROTO_INT 61 -+#endif /* !IPPROTO_INT */ -+ -+#if !defined(ESPINUDP_WITH_NON_IKE) -+#define ESPINUDP_WITH_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ -+#define ESPINUDP_WITH_NON_ESP 2 /* draft-ietf-ipsec-nat-t-ike-02 */ -+#endif -+ -+/* -+ * Basic data types for the address-handling functions. -+ * ip_address and ip_subnet are supposed to be opaque types; do not -+ * use their definitions directly, they are subject to change! 
-+ */ -+ -+/* first, some quick fakes in case we're on an old system with no IPv6 */ -+#if !defined(s6_addr16) && defined(__CYGWIN32__) -+struct in6_addr { -+ union -+ { -+ u_int8_t u6_addr8[16]; -+ u_int16_t u6_addr16[8]; -+ u_int32_t u6_addr32[4]; -+ } in6_u; -+#define s6_addr in6_u.u6_addr8 -+#define s6_addr16 in6_u.u6_addr16 -+#define s6_addr32 in6_u.u6_addr32 -+}; -+struct sockaddr_in6 { -+ unsigned short int sin6_family; /* AF_INET6 */ -+ __u16 sin6_port; /* Transport layer port # */ -+ __u32 sin6_flowinfo; /* IPv6 flow information */ -+ struct in6_addr sin6_addr; /* IPv6 address */ -+ __u32 sin6_scope_id; /* scope id (new in RFC2553) */ -+}; -+#endif /* !s6_addr16 */ -+ -+/* then the main types */ -+typedef struct { -+ union { -+ struct sockaddr_in v4; -+ struct sockaddr_in6 v6; -+ } u; -+} ip_address; -+typedef struct { -+ ip_address addr; -+ int maskbits; -+} ip_subnet; -+ -+/* and the SA ID stuff */ -+#ifdef __KERNEL__ -+typedef __u32 ipsec_spi_t; -+#else -+typedef u_int32_t ipsec_spi_t; -+#endif -+typedef struct { /* to identify an SA, we need: */ -+ ip_address dst; /* A. destination host */ -+ ipsec_spi_t spi; /* B. 32-bit SPI, assigned by dest. host */ -+# define SPI_PASS 256 /* magic values... */ -+# define SPI_DROP 257 /* ...for use... */ -+# define SPI_REJECT 258 /* ...with SA_INT */ -+# define SPI_HOLD 259 -+# define SPI_TRAP 260 -+# define SPI_TRAPSUBNET 261 -+ int proto; /* C. 
protocol */ -+# define SA_ESP 50 /* IPPROTO_ESP */ -+# define SA_AH 51 /* IPPROTO_AH */ -+# define SA_IPIP 4 /* IPPROTO_IPIP */ -+# define SA_COMP 108 /* IPPROTO_COMP */ -+# define SA_INT 61 /* IANA reserved for internal use */ -+} ip_said; -+ -+/* misc */ -+typedef const char *err_t; /* error message, or NULL for success */ -+struct prng { /* pseudo-random-number-generator guts */ -+ unsigned char sbox[256]; -+ int i, j; -+ unsigned long count; -+}; -+ -+ -+/* -+ * definitions for user space, taken from freeswan/ipsec_sa.h -+ */ -+typedef uint32_t IPsecSAref_t; -+ -+/* Translation to/from nfmark. -+ * -+ * use bits 16-31. Leave bit 32 as a indicate that IPsec processing -+ * has already been done. -+ */ -+#define IPSEC_SA_REF_TABLE_IDX_WIDTH 15 -+#define IPSEC_SA_REF_TABLE_OFFSET 16 -+#define IPSEC_SA_REF_MAASK ((1<> IPSEC_SA_REF_TABLE_OFFSET)&IPSEC_SA_REF_MASK) -+ -+#define IPSEC_SAREF_NULL ((IPsecSAref_t)0) -+#define IPSEC_SAREF_NA ((IPsecSAref_t)0xffff0001) -+ -+/* GCC magic for use in function definitions! */ -+#ifdef GCC_LINT -+# define PRINTF_LIKE(n) __attribute__ ((format(printf, n, n+1))) -+# define NEVER_RETURNS __attribute__ ((noreturn)) -+# define UNUSED __attribute__ ((unused)) -+# define BLANK_FORMAT " " /* GCC_LINT whines about empty formats */ -+#else -+# define PRINTF_LIKE(n) /* ignore */ -+# define NEVER_RETURNS /* ignore */ -+# define UNUSED /* ignore */ -+# define BLANK_FORMAT "" -+#endif -+ -+ -+/* -+ * function to log stuff from libraries that may be used in multiple -+ * places. 
-+ */ -+typedef int (*openswan_keying_debug_func_t)(const char *message, ...); -+ -+ -+ -+/* -+ * new IPv6-compatible functions -+ */ -+ -+/* text conversions */ -+err_t ttoul(const char *src, size_t srclen, int format, unsigned long *dst); -+size_t ultot(unsigned long src, int format, char *buf, size_t buflen); -+#define ULTOT_BUF (22+1) /* holds 64 bits in octal */ -+ -+/* looks up names in DNS */ -+err_t ttoaddr(const char *src, size_t srclen, int af, ip_address *dst); -+ -+/* does not look up names in DNS */ -+err_t ttoaddr_num(const char *src, size_t srclen, int af, ip_address *dst); -+ -+err_t tnatoaddr(const char *src, size_t srclen, int af, ip_address *dst); -+size_t addrtot(const ip_address *src, int format, char *buf, size_t buflen); -+/* RFC 1886 old IPv6 reverse-lookup format is the bulkiest */ -+#define ADDRTOT_BUF (32*2 + 3 + 1 + 3 + 1 + 1) -+err_t ttosubnet(const char *src, size_t srclen, int af, ip_subnet *dst); -+size_t subnettot(const ip_subnet *src, int format, char *buf, size_t buflen); -+#define SUBNETTOT_BUF (ADDRTOT_BUF + 1 + 3) -+size_t subnetporttot(const ip_subnet *src, int format, char *buf, size_t buflen); -+#define SUBNETPROTOTOT_BUF (SUBNETTOTO_BUF + ULTOT_BUF) -+err_t ttosa(const char *src, size_t srclen, ip_said *dst); -+size_t satot(const ip_said *src, int format, char *bufptr, size_t buflen); -+#define SATOT_BUF (5 + ULTOA_BUF + 1 + ADDRTOT_BUF) -+err_t ttodata(const char *src, size_t srclen, int base, char *buf, -+ size_t buflen, size_t *needed); -+err_t ttodatav(const char *src, size_t srclen, int base, -+ char *buf, size_t buflen, size_t *needed, -+ char *errp, size_t errlen, unsigned int flags); -+#define TTODATAV_BUF 40 /* ttodatav's largest non-literal message */ -+#define TTODATAV_IGNORESPACE (1<<1) /* ignore spaces in base64 encodings*/ -+#define TTODATAV_SPACECOUNTS 0 /* do not ignore spaces in base64 */ -+ -+size_t datatot(const unsigned char *src, size_t srclen, int format -+ , char *buf, size_t buflen); -+size_t 
keyblobtoid(const unsigned char *src, size_t srclen, char *dst, -+ size_t dstlen); -+size_t splitkeytoid(const unsigned char *e, size_t elen, const unsigned char *m, -+ size_t mlen, char *dst, size_t dstlen); -+#define KEYID_BUF 10 /* up to 9 text digits plus NUL */ -+err_t ttoprotoport(char *src, size_t src_len, u_int8_t *proto, u_int16_t *port, -+ int *has_port_wildcard); -+ -+/* initializations */ -+void initsaid(const ip_address *addr, ipsec_spi_t spi, int proto, ip_said *dst); -+err_t loopbackaddr(int af, ip_address *dst); -+err_t unspecaddr(int af, ip_address *dst); -+err_t anyaddr(int af, ip_address *dst); -+err_t initaddr(const unsigned char *src, size_t srclen, int af, ip_address *dst); -+err_t add_port(int af, ip_address *addr, unsigned short port); -+err_t initsubnet(const ip_address *addr, int maskbits, int clash, ip_subnet *dst); -+err_t addrtosubnet(const ip_address *addr, ip_subnet *dst); -+ -+/* misc. conversions and related */ -+err_t rangetosubnet(const ip_address *from, const ip_address *to, ip_subnet *dst); -+int addrtypeof(const ip_address *src); -+int subnettypeof(const ip_subnet *src); -+size_t addrlenof(const ip_address *src); -+size_t addrbytesptr(const ip_address *src, const unsigned char **dst); -+size_t addrbytesptr_write(ip_address *src, unsigned char **dst); -+size_t addrbytesof(const ip_address *src, unsigned char *dst, size_t dstlen); -+int masktocount(const ip_address *src); -+void networkof(const ip_subnet *src, ip_address *dst); -+void maskof(const ip_subnet *src, ip_address *dst); -+ -+/* tests */ -+int sameaddr(const ip_address *a, const ip_address *b); -+int addrcmp(const ip_address *a, const ip_address *b); -+int samesubnet(const ip_subnet *a, const ip_subnet *b); -+int addrinsubnet(const ip_address *a, const ip_subnet *s); -+int subnetinsubnet(const ip_subnet *a, const ip_subnet *b); -+int subnetishost(const ip_subnet *s); -+int samesaid(const ip_said *a, const ip_said *b); -+int sameaddrtype(const ip_address *a, const 
ip_address *b); -+int samesubnettype(const ip_subnet *a, const ip_subnet *b); -+int isvalidsubnet(const ip_subnet *a); -+int isanyaddr(const ip_address *src); -+int isunspecaddr(const ip_address *src); -+int isloopbackaddr(const ip_address *src); -+ -+/* low-level grot */ -+int portof(const ip_address *src); -+void setportof(int port, ip_address *dst); -+struct sockaddr *sockaddrof(ip_address *src); -+size_t sockaddrlenof(const ip_address *src); -+ -+/* PRNG */ -+void prng_init(struct prng *prng, const unsigned char *key, size_t keylen); -+void prng_bytes(struct prng *prng, unsigned char *dst, size_t dstlen); -+unsigned long prng_count(struct prng *prng); -+void prng_final(struct prng *prng); -+ -+/* odds and ends */ -+const char *ipsec_version_code(void); -+const char *ipsec_version_string(void); -+const char **ipsec_copyright_notice(void); -+ -+const char *dns_string_rr(int rr, char *buf, int bufsize); -+const char *dns_string_datetime(time_t seconds, -+ char *buf, -+ int bufsize); -+ -+ -+/* -+ * old functions, to be deleted eventually -+ */ -+ -+/* unsigned long */ -+const char * /* NULL for success, else string literal */ -+atoul( -+ const char *src, -+ size_t srclen, /* 0 means strlen(src) */ -+ int base, /* 0 means figure it out */ -+ unsigned long *resultp -+); -+size_t /* space needed for full conversion */ -+ultoa( -+ unsigned long n, -+ int base, -+ char *dst, -+ size_t dstlen -+); -+#define ULTOA_BUF 21 /* just large enough for largest result, */ -+ /* assuming 64-bit unsigned long! 
*/ -+ -+/* Internet addresses */ -+const char * /* NULL for success, else string literal */ -+atoaddr( -+ const char *src, -+ size_t srclen, /* 0 means strlen(src) */ -+ struct in_addr *addr -+); -+size_t /* space needed for full conversion */ -+addrtoa( -+ struct in_addr addr, -+ int format, /* character; 0 means default */ -+ char *dst, -+ size_t dstlen -+); -+#define ADDRTOA_BUF 16 /* just large enough for largest result */ -+ -+/* subnets */ -+const char * /* NULL for success, else string literal */ -+atosubnet( -+ const char *src, -+ size_t srclen, /* 0 means strlen(src) */ -+ struct in_addr *addr, -+ struct in_addr *mask -+); -+size_t /* space needed for full conversion */ -+subnettoa( -+ struct in_addr addr, -+ struct in_addr mask, -+ int format, /* character; 0 means default */ -+ char *dst, -+ size_t dstlen -+); -+#define SUBNETTOA_BUF 32 /* large enough for worst case result */ -+ -+/* ranges */ -+const char * /* NULL for success, else string literal */ -+atoasr( -+ const char *src, -+ size_t srclen, /* 0 means strlen(src) */ -+ char *type, /* 'a', 's', 'r' */ -+ struct in_addr *addrs /* two-element array */ -+); -+size_t /* space needed for full conversion */ -+rangetoa( -+ struct in_addr *addrs, /* two-element array */ -+ int format, /* character; 0 means default */ -+ char *dst, -+ size_t dstlen -+); -+#define RANGETOA_BUF 34 /* large enough for worst case result */ -+ -+/* data types for SA conversion functions */ -+ -+/* generic data, e.g. 
keys */ -+const char * /* NULL for success, else string literal */ -+atobytes( -+ const char *src, -+ size_t srclen, /* 0 means strlen(src) */ -+ char *dst, -+ size_t dstlen, -+ size_t *lenp /* NULL means don't bother telling me */ -+); -+size_t /* 0 failure, else true size */ -+bytestoa( -+ const unsigned char *src, -+ size_t srclen, -+ int format, /* character; 0 means default */ -+ char *dst, -+ size_t dstlen -+); -+ -+/* old versions of generic-data functions; deprecated */ -+size_t /* 0 failure, else true size */ -+atodata( -+ const char *src, -+ size_t srclen, /* 0 means strlen(src) */ -+ char *dst, -+ size_t dstlen -+); -+size_t /* 0 failure, else true size */ -+datatoa( -+ const unsigned char *src, -+ size_t srclen, -+ int format, /* character; 0 means default */ -+ char *dst, -+ size_t dstlen -+); -+ -+/* part extraction and special addresses */ -+struct in_addr -+subnetof( -+ struct in_addr addr, -+ struct in_addr mask -+); -+struct in_addr -+hostof( -+ struct in_addr addr, -+ struct in_addr mask -+); -+struct in_addr -+broadcastof( -+ struct in_addr addr, -+ struct in_addr mask -+); -+ -+/* mask handling */ -+int -+goodmask( -+ struct in_addr mask -+); -+extern int masktobits(struct in_addr mask); -+extern struct in_addr bitstomask(int n); -+extern struct in6_addr bitstomask6(int n); -+ -+ -+ -+/* -+ * ENUM of klips debugging values. Not currently used in klips. -+ * debug flag is actually 32 -bits, but only one bit is ever used, -+ * so we can actually pack it all into a single 32-bit word. 
-+ */ -+enum klips_debug_flags { -+ KDF_VERBOSE = 0, -+ KDF_XMIT = 1, -+ KDF_NETLINK = 2, /* obsolete */ -+ KDF_XFORM = 3, -+ KDF_EROUTE = 4, -+ KDF_SPI = 5, -+ KDF_RADIJ = 6, -+ KDF_ESP = 7, -+ KDF_AH = 8, /* obsolete */ -+ KDF_RCV = 9, -+ KDF_TUNNEL = 10, -+ KDF_PFKEY = 11, -+ KDF_COMP = 12, -+ KDF_NATT = 13, -+}; -+ -+ -+/* -+ * Debugging levels for pfkey_lib_debug -+ */ -+#define PF_KEY_DEBUG_PARSE_NONE 0 -+#define PF_KEY_DEBUG_PARSE_PROBLEM 1 -+#define PF_KEY_DEBUG_PARSE_STRUCT 2 -+#define PF_KEY_DEBUG_PARSE_FLOW 4 -+#define PF_KEY_DEBUG_BUILD 8 -+#define PF_KEY_DEBUG_PARSE_MAX 15 -+ -+extern unsigned int pfkey_lib_debug; /* bits selecting what to report */ -+ -+/* -+ * pluto and lwdnsq need to know the maximum size of the commands to, -+ * and replies from lwdnsq. -+ */ -+ -+#define LWDNSQ_CMDBUF_LEN 1024 -+#define LWDNSQ_RESULT_LEN_MAX 4096 -+ -+ -+/* syntax for passthrough SA */ -+#ifndef PASSTHROUGHNAME -+#define PASSTHROUGHNAME "%passthrough" -+#define PASSTHROUGH4NAME "%passthrough4" -+#define PASSTHROUGH6NAME "%passthrough6" -+#define PASSTHROUGHIS "tun0@0.0.0.0" -+#define PASSTHROUGH4IS "tun0@0.0.0.0" -+#define PASSTHROUGH6IS "tun0@::" -+#define PASSTHROUGHTYPE "tun" -+#define PASSTHROUGHSPI 0 -+#define PASSTHROUGHDST 0 -+#endif -+ -+ -+ -+#endif /* _OPENSWAN_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipcomp.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,58 @@ -+/* -+ * IPCOMP zlib interface code. -+ * Copyright (C) 2000 Svenning Soerensen -+ * Copyright (C) 2000, 2001 Richard Guy Briggs -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . 
-+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ -+ RCSID $Id: ipcomp.h,v 1.14 2004/07/10 19:08:41 mcr Exp $ -+ -+ */ -+ -+/* SSS */ -+ -+#ifndef _IPCOMP_H -+#define _IPCOMP_H -+ -+/* Prefix all global deflate symbols with "ipcomp_" to avoid collisions with ppp_deflate & ext2comp */ -+#ifndef IPCOMP_PREFIX -+#define IPCOMP_PREFIX -+#endif /* IPCOMP_PREFIX */ -+ -+#ifndef IPPROTO_COMP -+#define IPPROTO_COMP 108 -+#endif /* IPPROTO_COMP */ -+ -+#include "openswan/ipsec_sysctl.h" -+ -+struct ipcomphdr { /* IPCOMP header */ -+ __u8 ipcomp_nh; /* Next header (protocol) */ -+ __u8 ipcomp_flags; /* Reserved, must be 0 */ -+ __u16 ipcomp_cpi; /* Compression Parameter Index */ -+}; -+ -+extern struct inet_protocol comp_protocol; -+ -+#define IPCOMP_UNCOMPRESSABLE 0x000000001 -+#define IPCOMP_COMPRESSIONERROR 0x000000002 -+#define IPCOMP_PARMERROR 0x000000004 -+#define IPCOMP_DECOMPRESSIONERROR 0x000000008 -+ -+#define IPCOMP_ADAPT_INITIAL_TRIES 8 -+#define IPCOMP_ADAPT_INITIAL_SKIP 4 -+#define IPCOMP_ADAPT_SUBSEQ_TRIES 2 -+#define IPCOMP_ADAPT_SUBSEQ_SKIP 8 -+ -+/* Function prototypes */ -+struct sk_buff *skb_compress(struct sk_buff *skb, struct ipsec_sa *ips, unsigned int *flags); -+struct sk_buff *skb_decompress(struct sk_buff *skb, struct ipsec_sa *ips, unsigned int *flags); -+ -+#endif /* _IPCOMP_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_ah.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,201 @@ -+/* -+ * Authentication Header declarations -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs. 
-+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: ipsec_ah.h,v 1.26 2004/09/13 02:22:10 mcr Exp $ -+ */ -+ -+#include "ipsec_md5h.h" -+#include "ipsec_sha1.h" -+ -+#ifndef IPPROTO_AH -+#define IPPROTO_AH 51 -+#endif /* IPPROTO_AH */ -+ -+#include "ipsec_auth.h" -+ -+#ifdef __KERNEL__ -+ -+#ifndef CONFIG_XFRM_ALTERNATE_STACK -+extern struct inet_protocol ah_protocol; -+#endif /* CONFIG_XFRM_ALTERNATE_STACK */ -+ -+struct options; -+ -+struct ahhdr /* Generic AH header */ -+{ -+ __u8 ah_nh; /* Next header (protocol) */ -+ __u8 ah_hl; /* AH length, in 32-bit words */ -+ __u16 ah_rv; /* reserved, must be 0 */ -+ __u32 ah_spi; /* Security Parameters Index */ -+ __u32 ah_rpl; /* Replay prevention */ -+ __u8 ah_data[AHHMAC_HASHLEN];/* Authentication hash */ -+}; -+#define AH_BASIC_LEN 8 /* basic AH header is 8 bytes, nh,hl,rv,spi -+ * and the ah_hl, says how many bytes after that -+ * to cover. */ -+ -+extern struct xform_functions ah_xform_funcs[]; -+ -+#include "openswan/ipsec_sysctl.h" -+ -+#endif /* __KERNEL__ */ -+ -+/* -+ * $Log: ipsec_ah.h,v $ -+ * Revision 1.26 2004/09/13 02:22:10 mcr -+ * #define inet_protocol if necessary. -+ * -+ * Revision 1.25 2004/09/06 18:35:41 mcr -+ * 2.6.8.1 gets rid of inet_protocol->net_protocol compatibility, -+ * so adjust for that. -+ * -+ * Revision 1.24 2004/07/10 19:08:41 mcr -+ * CONFIG_IPSEC -> CONFIG_KLIPS. 
-+ * -+ * Revision 1.23 2004/04/05 19:55:04 mcr -+ * Moved from linux/include/freeswan/ipsec_ah.h,v -+ * -+ * Revision 1.22 2004/04/05 19:41:05 mcr -+ * merged alg-branch code. -+ * -+ * Revision 1.21 2003/12/13 19:10:16 mcr -+ * refactored rcv and xmit code - same as FS 2.05. -+ * -+ * Revision 1.22 2003/12/11 20:14:58 mcr -+ * refactored the xmit code, to move all encapsulation -+ * code into protocol functions. Note that all functions -+ * are essentially done by a single function, which is probably -+ * wrong. -+ * the rcv_functions structures are renamed xform_functions. -+ * -+ * Revision 1.21 2003/12/06 21:21:19 mcr -+ * split up receive path into per-transform files, for -+ * easier later removal. -+ * -+ * Revision 1.20.8.1 2003/12/22 15:25:52 jjo -+ * Merged algo-0.8.1-rc11-test1 into alg-branch -+ * -+ * Revision 1.20 2003/02/06 02:21:34 rgb -+ * -+ * Moved "struct auth_alg" from ipsec_rcv.c to ipsec_ah.h . -+ * Changed "struct ah" to "struct ahhdr" and "struct esp" to "struct esphdr". -+ * Removed "#ifdef INBOUND_POLICY_CHECK_eroute" dead code. -+ * -+ * Revision 1.19 2002/09/16 21:19:13 mcr -+ * fixes for west-ah-icmp-01 - length of AH header must be -+ * calculated properly, and next_header field properly copied. -+ * -+ * Revision 1.18 2002/05/14 02:37:02 rgb -+ * Change reference from _TDB to _IPSA. -+ * -+ * Revision 1.17 2002/04/24 07:36:46 mcr -+ * Moved from ./klips/net/ipsec/ipsec_ah.h,v -+ * -+ * Revision 1.16 2002/02/20 01:27:06 rgb -+ * Ditched a pile of structs only used by the old Netlink interface. -+ * -+ * Revision 1.15 2001/12/11 02:35:57 rgb -+ * Change "struct net_device" to "struct device" for 2.2 compatibility. -+ * -+ * Revision 1.14 2001/11/26 09:23:47 rgb -+ * Merge MCR's ipsec_sa, eroute, proc and struct lifetime changes. -+ * -+ * Revision 1.13.2.1 2001/09/25 02:18:24 mcr -+ * replace "struct device" with "struct netdevice" -+ * -+ * Revision 1.13 2001/06/14 19:35:08 rgb -+ * Update copyright date. 
-+ * -+ * Revision 1.12 2000/09/12 03:21:20 rgb -+ * Cleared out unused htonq. -+ * -+ * Revision 1.11 2000/09/08 19:12:55 rgb -+ * Change references from DEBUG_IPSEC to CONFIG_IPSEC_DEBUG. -+ * -+ * Revision 1.10 2000/01/21 06:13:10 rgb -+ * Tidied up spacing. -+ * Added macros for HMAC padding magic numbers.(kravietz) -+ * -+ * Revision 1.9 1999/12/07 18:16:23 rgb -+ * Fixed comments at end of #endif lines. -+ * -+ * Revision 1.8 1999/04/11 00:28:56 henry -+ * GPL boilerplate -+ * -+ * Revision 1.7 1999/04/06 04:54:25 rgb -+ * Fix/Add RCSID Id: and Log: bits to make PHMDs happy. This includes -+ * patch shell fixes. -+ * -+ * Revision 1.6 1999/01/26 02:06:01 rgb -+ * Removed CONFIG_IPSEC_ALGO_SWITCH macro. -+ * -+ * Revision 1.5 1999/01/22 06:17:49 rgb -+ * Updated macro comments. -+ * Added context types to support algorithm switch code. -+ * 64-bit clean-up -- converting 'u long long' to __u64. -+ * -+ * Revision 1.4 1998/07/14 15:54:56 rgb -+ * Add #ifdef __KERNEL__ to protect kernel-only structures. -+ * -+ * Revision 1.3 1998/06/30 18:05:16 rgb -+ * Comment out references to htonq. -+ * -+ * Revision 1.2 1998/06/25 19:33:46 rgb -+ * Add prototype for protocol receive function. -+ * Rearrange for more logical layout. -+ * -+ * Revision 1.1 1998/06/18 21:27:43 henry -+ * move sources from klips/src to klips/net/ipsec, to keep stupid -+ * kernel-build scripts happier in the presence of symlinks -+ * -+ * Revision 1.4 1998/05/18 22:28:43 rgb -+ * Disable key printing facilities from /proc/net/ipsec_*. -+ * -+ * Revision 1.3 1998/04/21 21:29:07 rgb -+ * Rearrange debug switches to change on the fly debug output from user -+ * space. Only kernel changes checked in at this time. radij.c was also -+ * changed to temporarily remove buggy debugging code in rj_delete causing -+ * an OOPS and hence, netlink device open errors. 
-+ * -+ * Revision 1.2 1998/04/12 22:03:17 rgb -+ * Updated ESP-3DES-HMAC-MD5-96, -+ * ESP-DES-HMAC-MD5-96, -+ * AH-HMAC-MD5-96, -+ * AH-HMAC-SHA1-96 since Henry started freeswan cvs repository -+ * from old standards (RFC182[5-9] to new (as of March 1998) drafts. -+ * -+ * Fixed eroute references in /proc/net/ipsec*. -+ * -+ * Started to patch module unloading memory leaks in ipsec_netlink and -+ * radij tree unloading. -+ * -+ * Revision 1.1 1998/04/09 03:05:55 henry -+ * sources moved up from linux/net/ipsec -+ * -+ * Revision 1.1.1.1 1998/04/08 05:35:02 henry -+ * RGB's ipsec-0.8pre2.tar.gz ipsec-0.8 -+ * -+ * Revision 0.4 1997/01/15 01:28:15 ji -+ * Added definitions for new AH transforms. -+ * -+ * Revision 0.3 1996/11/20 14:35:48 ji -+ * Minor Cleanup. -+ * Rationalized debugging code. -+ * -+ * Revision 0.2 1996/11/02 00:18:33 ji -+ * First limited release. -+ * -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_alg.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,248 @@ -+/* -+ * Modular extensions service and registration functions interface -+ * -+ * Author: JuanJo Ciarlante -+ * -+ * ipsec_alg.h,v 1.1.2.1 2003/11/21 18:12:23 jjo Exp -+ * -+ */ -+/* -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ */ -+#ifndef IPSEC_ALG_H -+#define IPSEC_ALG_H -+ -+/* -+ * gcc >= 3.2 has removed __FUNCTION__, replaced by C99 __func__ -+ * *BUT* its a compiler variable. 
-+ */ -+#if (__GNUC__ >= 3) -+#ifndef __FUNCTION__ -+#define __FUNCTION__ __func__ -+#endif -+#endif -+ -+/* Version 0.8.1-0 */ -+#define IPSEC_ALG_VERSION 0x00080100 -+ -+#include -+#include -+#include -+#include -+ -+/* -+ * The following structs are used via pointers in ipsec_alg object to -+ * avoid ipsec_alg.h coupling with freeswan headers, thus simplifying -+ * module development -+ */ -+struct ipsec_sa; -+struct esp; -+ -+/************************************** -+ * -+ * Main registration object -+ * -+ *************************************/ -+#define IPSEC_ALG_VERSION_QUAD(v) \ -+ (v>>24),((v>>16)&0xff),((v>>8)&0xff),(v&0xff) -+/* -+ * Main ipsec_alg objects: "OOPrograming wannabe" -+ * Hierachy (carefully handled with _minimal_ cast'ing): -+ * -+ * ipsec_alg+ -+ * +->ipsec_alg_enc (ixt_alg_type=SADB_EXT_SUPPORTED_ENCRYPT) -+ * +->ipsec_alg_auth (ixt_alg_type=SADB_EXT_SUPPORTED_AUTH) -+ */ -+ -+/*************************************************************** -+ * -+ * INTERFACE object: struct ipsec_alg -+ * -+ ***************************************************************/ -+ -+#define ixt_alg_type ixt_support.ias_exttype -+#define ixt_alg_id ixt_support.ias_id -+ -+#define IPSEC_ALG_ST_SUPP 0x01 -+#define IPSEC_ALG_ST_REGISTERED 0x02 -+#define IPSEC_ALG_ST_EXCL 0x04 -+struct ipsec_alg { -+ unsigned ixt_version; /* only allow this version (or 'near')*/ \ -+ struct list_head ixt_list; /* dlinked list */ \ -+ struct module *ixt_module; /* THIS_MODULE */ \ -+ unsigned ixt_state; /* state flags */ \ -+ atomic_t ixt_refcnt; /* ref. count when pointed from ipsec_sa */ \ -+ char ixt_name[16]; /* descriptive short name, eg. "3des" */ \ -+ void *ixt_data; /* private for algo implementation */ \ -+ uint8_t ixt_blocksize; /* blocksize in bytes */ \ -+ -+ struct ipsec_alg_supported ixt_support; -+}; -+/* -+ * Note the const in cbc_encrypt IV arg: -+ * some ciphers like to toast passed IV (eg. 
3DES): make a local IV copy -+ */ -+struct ipsec_alg_enc { -+ struct ipsec_alg ixt_common; -+ unsigned ixt_e_keylen; /* raw key length in bytes */ -+ unsigned ixt_e_ctx_size; /* sa_p->key_e_size */ -+ int (*ixt_e_set_key)(struct ipsec_alg_enc *alg, __u8 *key_e, const __u8 *key, size_t keysize); -+ __u8 *(*ixt_e_new_key)(struct ipsec_alg_enc *alg, const __u8 *key, size_t keysize); -+ void (*ixt_e_destroy_key)(struct ipsec_alg_enc *alg, __u8 *key_e); -+ int (*ixt_e_cbc_encrypt)(struct ipsec_alg_enc *alg, __u8 *key_e, __u8 *in, int ilen, __u8 *iv, int encrypt); -+}; -+struct ipsec_alg_auth { -+ struct ipsec_alg ixt_common; -+ unsigned ixt_a_keylen; /* raw key length in bytes */ -+ unsigned ixt_a_ctx_size; /* sa_p->key_a_size */ -+ unsigned ixt_a_authlen; /* 'natural' auth. hash len (bytes) */ -+ int (*ixt_a_hmac_set_key)(struct ipsec_alg_auth *alg, __u8 *key_a, const __u8 *key, int keylen); -+ int (*ixt_a_hmac_hash)(struct ipsec_alg_auth *alg, __u8 *key_a, const __u8 *dat, int len, __u8 *hash, int hashlen); -+}; -+/* -+ * These are _copies_ of SADB_EXT_SUPPORTED_{AUTH,ENCRYPT}, -+ * to avoid header coupling for true constants -+ * about headers ... 
"cp is your friend" --Linus -+ */ -+#define IPSEC_ALG_TYPE_AUTH 14 -+#define IPSEC_ALG_TYPE_ENCRYPT 15 -+ -+/*************************************************************** -+ * -+ * INTERFACE for module loading,testing, and unloading -+ * -+ ***************************************************************/ -+/* - registration calls */ -+int register_ipsec_alg(struct ipsec_alg *); -+int unregister_ipsec_alg(struct ipsec_alg *); -+/* - optional (simple test) for algos */ -+int ipsec_alg_test(unsigned alg_type, unsigned alg_id, int testparm); -+/* inline wrappers (usefull for type validation */ -+static inline int register_ipsec_alg_enc(struct ipsec_alg_enc *ixt) { -+ return register_ipsec_alg((struct ipsec_alg*)ixt); -+} -+static inline int unregister_ipsec_alg_enc(struct ipsec_alg_enc *ixt) { -+ return unregister_ipsec_alg((struct ipsec_alg*)ixt); -+} -+static inline int register_ipsec_alg_auth(struct ipsec_alg_auth *ixt) { -+ return register_ipsec_alg((struct ipsec_alg*)ixt); -+} -+static inline int unregister_ipsec_alg_auth(struct ipsec_alg_auth *ixt) { -+ return unregister_ipsec_alg((struct ipsec_alg*)ixt); -+} -+ -+/***************************************************************** -+ * -+ * INTERFACE for ENC services: key creation, encrypt function -+ * -+ *****************************************************************/ -+ -+#define IPSEC_ALG_ENCRYPT 1 -+#define IPSEC_ALG_DECRYPT 0 -+ -+/* encryption key context creation function */ -+int ipsec_alg_enc_key_create(struct ipsec_sa *sa_p); -+/* -+ * ipsec_alg_esp_encrypt(): encrypt ilen bytes in idat returns -+ * 0 or ERR<0 -+ */ -+int ipsec_alg_esp_encrypt(struct ipsec_sa *sa_p, __u8 *idat, int ilen, __u8 *iv, int action); -+ -+/*************************************************************** -+ * -+ * INTERFACE for AUTH services: key creation, hash functions -+ * -+ ***************************************************************/ -+int ipsec_alg_auth_key_create(struct ipsec_sa *sa_p); -+int 
ipsec_alg_sa_esp_hash(const struct ipsec_sa *sa_p, const __u8 *espp, int len, __u8 *hash, int hashlen) ; -+#define ipsec_alg_sa_esp_update(c,k,l) ipsec_alg_sa_esp_hash(c,k,l,NULL,0) -+ -+/* only called from ipsec_init.c */ -+int ipsec_alg_init(void); -+ -+/* algo module glue for static algos */ -+void ipsec_alg_static_init(void); -+typedef int (*ipsec_alg_init_func_t) (void); -+ -+/********************************************** -+ * -+ * INTERFACE for ipsec_sa init and wipe -+ * -+ **********************************************/ -+ -+/* returns true if ipsec_sa has ipsec_alg obj attached */ -+/* -+ * Initializes ipsec_sa's ipsec_alg object, using already loaded -+ * proto, authalg, encalg.; links ipsec_alg objects (enc, auth) -+ */ -+int ipsec_alg_sa_init(struct ipsec_sa *sa_p); -+/* -+ * Destroys ipsec_sa's ipsec_alg object -+ * unlinking ipsec_alg objects -+ */ -+int ipsec_alg_sa_wipe(struct ipsec_sa *sa_p); -+ -+#define IPSEC_ALG_MODULE_INIT_MOD( func_name ) \ -+ static int func_name(void); \ -+ module_init(func_name); \ -+ static int __init func_name(void) -+#define IPSEC_ALG_MODULE_EXIT_MOD( func_name ) \ -+ static void func_name(void); \ -+ module_exit(func_name); \ -+ static void __exit func_name(void) -+ -+#define IPSEC_ALG_MODULE_INIT_STATIC( func_name ) \ -+ extern int func_name(void); \ -+ int func_name(void) -+#define IPSEC_ALG_MODULE_EXIT_STATIC( func_name ) \ -+ extern void func_name(void); \ -+ void func_name(void) -+ -+/********************************************** -+ * -+ * 2.2 backport for some 2.4 useful module stuff -+ * -+ **********************************************/ -+#ifdef MODULE -+#ifndef THIS_MODULE -+#define THIS_MODULE (&__this_module) -+#endif -+#ifndef module_init -+typedef int (*__init_module_func_t)(void); -+typedef void (*__cleanup_module_func_t)(void); -+ -+#define module_init(x) \ -+ int init_module(void) __attribute__((alias(#x))); \ -+ static inline __init_module_func_t __init_module_inline(void) \ -+ { return x; } -+#define 
module_exit(x) \ -+ void cleanup_module(void) __attribute__((alias(#x))); \ -+ static inline __cleanup_module_func_t __cleanup_module_inline(void) \ -+ { return x; } -+#endif -+#define IPSEC_ALG_MODULE_INIT( func_name ) IPSEC_ALG_MODULE_INIT_MOD( func_name ) -+#define IPSEC_ALG_MODULE_EXIT( func_name ) IPSEC_ALG_MODULE_EXIT_MOD( func_name ) -+ -+#else /* not MODULE */ -+#ifndef THIS_MODULE -+#define THIS_MODULE NULL -+#endif -+/* -+ * I only want module_init() magic -+ * when algo.c file *is THE MODULE*, in all other -+ * cases, initialization is called explicitely from ipsec_alg_init() -+ */ -+#define IPSEC_ALG_MODULE_INIT( func_name ) IPSEC_ALG_MODULE_INIT_STATIC(func_name) -+#define IPSEC_ALG_MODULE_EXIT( func_name ) IPSEC_ALG_MODULE_EXIT_STATIC(func_name) -+#endif -+ -+#endif /* IPSEC_ALG_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_alg_3des.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,12 @@ -+struct TripleDES_context { -+ des_key_schedule s1; -+ des_key_schedule s2; -+ des_key_schedule s3; -+}; -+typedef struct TripleDES_context TripleDES_context; -+ -+#define ESP_3DES_KEY_SZ 3*(sizeof(des_cblock)) -+#define ESP_3DES_CBC_BLK_LEN 8 -+ -+ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_auth.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,100 @@ -+/* -+ * Authentication Header declarations -+ * Copyright (C) 2003 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ * RCSID $Id: ipsec_auth.h,v 1.3 2004/04/06 02:49:08 mcr Exp $ -+ */ -+ -+#include "ipsec_md5h.h" -+#include "ipsec_sha1.h" -+ -+#ifndef IPSEC_AUTH_H -+#define IPSEC_AUTH_H -+ -+#define AH_FLENGTH 12 /* size of fixed part */ -+#define AHMD5_KMAX 64 /* MD5 max 512 bits key */ -+#define AHMD5_AMAX 12 /* MD5 96 bits of authenticator */ -+ -+#define AHMD596_KLEN 16 /* MD5 128 bits key */ -+#define AHSHA196_KLEN 20 /* SHA1 160 bits key */ -+ -+#define AHMD596_ALEN 16 /* MD5 128 bits authentication length */ -+#define AHSHA196_ALEN 20 /* SHA1 160 bits authentication length */ -+ -+#define AHMD596_BLKLEN 64 /* MD5 block length */ -+#define AHSHA196_BLKLEN 64 /* SHA1 block length */ -+#define AHSHA2_256_BLKLEN 64 /* SHA2-256 block length */ -+#define AHSHA2_384_BLKLEN 128 /* SHA2-384 block length (?) */ -+#define AHSHA2_512_BLKLEN 128 /* SHA2-512 block length */ -+ -+#define AH_BLKLEN_MAX 128 /* keep up to date! */ -+ -+ -+#define AH_AMAX AHSHA196_ALEN /* keep up to date! */ -+#define AHHMAC_HASHLEN 12 /* authenticator length of 96bits */ -+#define AHHMAC_RPLLEN 4 /* 32 bit replay counter */ -+ -+#define DB_AH_PKTRX 0x0001 -+#define DB_AH_PKTRX2 0x0002 -+#define DB_AH_DMP 0x0004 -+#define DB_AH_IPSA 0x0010 -+#define DB_AH_XF 0x0020 -+#define DB_AH_INAU 0x0040 -+#define DB_AH_REPLAY 0x0100 -+ -+#ifdef __KERNEL__ -+ -+/* General HMAC algorithm is described in RFC 2104 */ -+ -+#define HMAC_IPAD 0x36 -+#define HMAC_OPAD 0x5C -+ -+struct md5_ctx { -+ MD5_CTX ictx; /* context after H(K XOR ipad) */ -+ MD5_CTX octx; /* context after H(K XOR opad) */ -+}; -+ -+struct sha1_ctx { -+ SHA1_CTX ictx; /* context after H(K XOR ipad) */ -+ SHA1_CTX octx; /* context after H(K XOR opad) */ -+}; -+ -+struct auth_alg { -+ void (*init)(void *ctx); -+ void (*update)(void *ctx, unsigned char *bytes, __u32 len); -+ void (*final)(unsigned char *hash, void *ctx); -+ int hashlen; -+}; -+ -+struct options; -+ -+#endif /* __KERNEL__ */ -+#endif /* IPSEC_AUTH_H */ -+ -+/* -+ * $Log: 
ipsec_auth.h,v $ -+ * Revision 1.3 2004/04/06 02:49:08 mcr -+ * pullup of algo code from alg-branch. -+ * -+ * Revision 1.2 2004/04/05 19:55:04 mcr -+ * Moved from linux/include/freeswan/ipsec_auth.h,v -+ * -+ * Revision 1.1 2003/12/13 19:10:16 mcr -+ * refactored rcv and xmit code - same as FS 2.05. -+ * -+ * Revision 1.1 2003/12/06 21:21:19 mcr -+ * split up receive path into per-transform files, for -+ * easier later removal. -+ * -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_encap.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,149 @@ -+/* -+ * declarations relevant to encapsulation-like operations -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ * RCSID $Id: ipsec_encap.h,v 1.19 2004/04/05 19:55:04 mcr Exp $ -+ */ -+ -+#ifndef _IPSEC_ENCAP_H_ -+ -+#define SENT_IP4 16 /* data is two struct in_addr + proto + ports*/ -+ /* (2 * sizeof(struct in_addr)) */ -+ /* sizeof(struct sockaddr_encap) -+ - offsetof(struct sockaddr_encap, Sen.Sip4.Src) */ -+ -+struct sockaddr_encap -+{ -+ __u8 sen_len; /* length */ -+ __u8 sen_family; /* AF_ENCAP */ -+ __u16 sen_type; /* see SENT_* */ -+ union -+ { -+ struct /* SENT_IP4 */ -+ { -+ struct in_addr Src; -+ struct in_addr Dst; -+ __u8 Proto; -+ __u16 Sport; -+ __u16 Dport; -+ } Sip4; -+ } Sen; -+}; -+ -+#define sen_ip_src Sen.Sip4.Src -+#define sen_ip_dst Sen.Sip4.Dst -+#define sen_proto Sen.Sip4.Proto -+#define sen_sport Sen.Sip4.Sport -+#define sen_dport Sen.Sip4.Dport -+ -+#ifndef AF_ENCAP -+#define AF_ENCAP 26 -+#endif /* AF_ENCAP */ -+ -+#define _IPSEC_ENCAP_H_ -+#endif /* _IPSEC_ENCAP_H_ */ -+ -+/* -+ * $Log: ipsec_encap.h,v $ -+ * Revision 1.19 2004/04/05 19:55:04 mcr -+ * Moved from linux/include/freeswan/ipsec_encap.h,v -+ * -+ * Revision 1.18 2003/10/31 02:27:05 mcr -+ * pulled up port-selector patches and sa_id elimination. -+ * -+ * Revision 1.17.30.1 2003/09/21 13:59:38 mcr -+ * pre-liminary X.509 patch - does not yet pass tests. -+ * -+ * Revision 1.17 2002/04/24 07:36:46 mcr -+ * Moved from ./klips/net/ipsec/ipsec_encap.h,v -+ * -+ * Revision 1.16 2001/11/26 09:23:47 rgb -+ * Merge MCR's ipsec_sa, eroute, proc and struct lifetime changes. -+ * -+ * Revision 1.15.2.1 2001/09/25 02:18:54 mcr -+ * struct eroute moved to ipsec_eroute.h -+ * -+ * Revision 1.15 2001/09/14 16:58:36 rgb -+ * Added support for storing the first and last packets through a HOLD. -+ * -+ * Revision 1.14 2001/09/08 21:13:31 rgb -+ * Added pfkey ident extension support for ISAKMPd. (NetCelo) -+ * -+ * Revision 1.13 2001/06/14 19:35:08 rgb -+ * Update copyright date. 
-+ * -+ * Revision 1.12 2001/05/27 06:12:10 rgb -+ * Added structures for pid, packet count and last access time to eroute. -+ * Added packet count to beginning of /proc/net/ipsec_eroute. -+ * -+ * Revision 1.11 2000/09/08 19:12:56 rgb -+ * Change references from DEBUG_IPSEC to CONFIG_IPSEC_DEBUG. -+ * -+ * Revision 1.10 2000/03/22 16:15:36 rgb -+ * Fixed renaming of dev_get (MB). -+ * -+ * Revision 1.9 2000/01/21 06:13:26 rgb -+ * Added a macro for AF_ENCAP -+ * -+ * Revision 1.8 1999/12/31 14:56:55 rgb -+ * MB fix for 2.3 dev-use-count. -+ * -+ * Revision 1.7 1999/11/18 04:09:18 rgb -+ * Replaced all kernel version macros to shorter, readable form. -+ * -+ * Revision 1.6 1999/09/24 00:34:13 rgb -+ * Add Marc Boucher's support for 2.3.xx+. -+ * -+ * Revision 1.5 1999/04/11 00:28:57 henry -+ * GPL boilerplate -+ * -+ * Revision 1.4 1999/04/06 04:54:25 rgb -+ * Fix/Add RCSID Id: and Log: bits to make PHMDs happy. This includes -+ * patch shell fixes. -+ * -+ * Revision 1.3 1998/10/19 14:44:28 rgb -+ * Added inclusion of freeswan.h. -+ * sa_id structure implemented and used: now includes protocol. -+ * -+ * Revision 1.2 1998/07/14 18:19:33 rgb -+ * Added #ifdef __KERNEL__ directives to restrict scope of header. -+ * -+ * Revision 1.1 1998/06/18 21:27:44 henry -+ * move sources from klips/src to klips/net/ipsec, to keep stupid -+ * kernel-build scripts happier in the presence of symlinks -+ * -+ * Revision 1.2 1998/04/21 21:29:10 rgb -+ * Rearrange debug switches to change on the fly debug output from user -+ * space. Only kernel changes checked in at this time. radij.c was also -+ * changed to temporarily remove buggy debugging code in rj_delete causing -+ * an OOPS and hence, netlink device open errors. -+ * -+ * Revision 1.1 1998/04/09 03:05:58 henry -+ * sources moved up from linux/net/ipsec -+ * -+ * Revision 1.1.1.1 1998/04/08 05:35:02 henry -+ * RGB's ipsec-0.8pre2.tar.gz ipsec-0.8 -+ * -+ * Revision 0.4 1997/01/15 01:28:15 ji -+ * Minor cosmetic changes. 
-+ * -+ * Revision 0.3 1996/11/20 14:35:48 ji -+ * Minor Cleanup. -+ * Rationalized debugging code. -+ * -+ * Revision 0.2 1996/11/02 00:18:33 ji -+ * First limited release. -+ * -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_eroute.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,112 @@ -+/* -+ * @(#) declarations of eroute structures -+ * -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs -+ * Copyright (C) 2001 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: ipsec_eroute.h,v 1.5 2004/04/05 19:55:05 mcr Exp $ -+ * -+ * derived from ipsec_encap.h 1.15 on 2001/9/18 by mcr. -+ * -+ */ -+ -+#ifndef _IPSEC_EROUTE_H_ -+ -+#include "radij.h" -+#include "ipsec_encap.h" -+#include "ipsec_radij.h" -+ -+/* -+ * The "type" is really part of the address as far as the routing -+ * system is concerned. By using only one bit in the type field -+ * for each type, we sort-of make sure that different types of -+ * encapsulation addresses won't be matched against the wrong type. 
-+ */ -+ -+/* -+ * An entry in the radix tree -+ */ -+ -+struct rjtentry -+{ -+ struct radij_node rd_nodes[2]; /* tree glue, and other values */ -+#define rd_key(r) ((struct sockaddr_encap *)((r)->rd_nodes->rj_key)) -+#define rd_mask(r) ((struct sockaddr_encap *)((r)->rd_nodes->rj_mask)) -+ short rd_flags; -+ short rd_count; -+}; -+ -+struct ident -+{ -+ __u16 type; /* identity type */ -+ __u64 id; /* identity id */ -+ __u8 len; /* identity len */ -+ caddr_t data; /* identity data */ -+}; -+ -+/* -+ * An encapsulation route consists of a pointer to a -+ * radix tree entry and a SAID (a destination_address/SPI/protocol triple). -+ */ -+ -+struct eroute -+{ -+ struct rjtentry er_rjt; -+ ip_said er_said; -+ uint32_t er_pid; -+ uint32_t er_count; -+ uint64_t er_lasttime; -+ struct sockaddr_encap er_eaddr; /* MCR get rid of _encap, it is silly*/ -+ struct sockaddr_encap er_emask; -+ struct ident er_ident_s; -+ struct ident er_ident_d; -+ struct sk_buff* er_first; -+ struct sk_buff* er_last; -+}; -+ -+#define er_dst er_said.dst -+#define er_spi er_said.spi -+ -+#define _IPSEC_EROUTE_H_ -+#endif /* _IPSEC_EROUTE_H_ */ -+ -+/* -+ * $Log: ipsec_eroute.h,v $ -+ * Revision 1.5 2004/04/05 19:55:05 mcr -+ * Moved from linux/include/freeswan/ipsec_eroute.h,v -+ * -+ * Revision 1.4 2003/10/31 02:27:05 mcr -+ * pulled up port-selector patches and sa_id elimination. -+ * -+ * Revision 1.3.30.2 2003/10/29 01:10:19 mcr -+ * elimited "struct sa_id" -+ * -+ * Revision 1.3.30.1 2003/09/21 13:59:38 mcr -+ * pre-liminary X.509 patch - does not yet pass tests. -+ * -+ * Revision 1.3 2002/04/24 07:36:46 mcr -+ * Moved from ./klips/net/ipsec/ipsec_eroute.h,v -+ * -+ * Revision 1.2 2001/11/26 09:16:13 rgb -+ * Merge MCR's ipsec_sa, eroute, proc and struct lifetime changes. 
-+ * -+ * Revision 1.1.2.1 2001/09/25 02:18:54 mcr -+ * struct eroute moved to ipsec_eroute.h -+ * -+ * -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_errs.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,53 @@ -+/* -+ * @(#) definition of ipsec_errs structure -+ * -+ * Copyright (C) 2001 Richard Guy Briggs -+ * and Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: ipsec_errs.h,v 1.4 2004/04/05 19:55:05 mcr Exp $ -+ * -+ */ -+ -+/* -+ * This file describes the errors/statistics that FreeSWAN collects. -+ * -+ */ -+ -+struct ipsec_errs { -+ __u32 ips_alg_errs; /* number of algorithm errors */ -+ __u32 ips_auth_errs; /* # of authentication errors */ -+ __u32 ips_encsize_errs; /* # of encryption size errors*/ -+ __u32 ips_encpad_errs; /* # of encryption pad errors*/ -+ __u32 ips_replaywin_errs; /* # of pkt sequence errors */ -+}; -+ -+/* -+ * $Log: ipsec_errs.h,v $ -+ * Revision 1.4 2004/04/05 19:55:05 mcr -+ * Moved from linux/include/freeswan/ipsec_errs.h,v -+ * -+ * Revision 1.3 2002/04/24 07:36:46 mcr -+ * Moved from ./klips/net/ipsec/ipsec_errs.h,v -+ * -+ * Revision 1.2 2001/11/26 09:16:13 rgb -+ * Merge MCR's ipsec_sa, eroute, proc and struct lifetime changes. -+ * -+ * Revision 1.1.2.1 2001/09/25 02:25:57 mcr -+ * lifetime structure created and common functions created. 
-+ * -+ * -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_esp.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,161 @@ -+/* -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: ipsec_esp.h,v 1.28 2004/09/13 02:22:10 mcr Exp $ -+ */ -+ -+#include "openswan/ipsec_md5h.h" -+#include "openswan/ipsec_sha1.h" -+ -+#include "klips-crypto/des.h" -+ -+#ifndef IPPROTO_ESP -+#define IPPROTO_ESP 50 -+#endif /* IPPROTO_ESP */ -+ -+#define ESP_HEADER_LEN 8 /* 64 bits header (spi+rpl)*/ -+ -+#define EMT_ESPDESCBC_ULEN 20 /* coming from user mode */ -+#define EMT_ESPDES_KMAX 64 /* 512 bit secret key enough? 
*/ -+#define EMT_ESPDES_KEY_SZ 8 /* 56 bit secret key with parity = 64 bits */ -+#define EMT_ESP3DES_KEY_SZ 24 /* 168 bit secret key with parity = 192 bits */ -+#define EMT_ESPDES_IV_SZ 8 /* IV size */ -+#define ESP_DESCBC_BLKLEN 8 /* DES-CBC block size */ -+ -+#define ESP_IV_MAXSZ 16 /* This is _critical_ */ -+#define ESP_IV_MAXSZ_INT (ESP_IV_MAXSZ/sizeof(int)) -+ -+#define DB_ES_PKTRX 0x0001 -+#define DB_ES_PKTRX2 0x0002 -+#define DB_ES_IPSA 0x0010 -+#define DB_ES_XF 0x0020 -+#define DB_ES_IPAD 0x0040 -+#define DB_ES_INAU 0x0080 -+#define DB_ES_OINFO 0x0100 -+#define DB_ES_OINFO2 0x0200 -+#define DB_ES_OH 0x0400 -+#define DB_ES_REPLAY 0x0800 -+ -+#ifdef __KERNEL__ -+struct des_eks { -+ des_key_schedule ks; -+}; -+ -+#ifndef CONFIG_XFRM_ALTERNATE_STACK -+extern struct inet_protocol esp_protocol; -+#endif /* CONFIG_XFRM_ALTERNATE_STACK */ -+ -+struct options; -+ -+struct esphdr -+{ -+ __u32 esp_spi; /* Security Parameters Index */ -+ __u32 esp_rpl; /* Replay counter */ -+ __u8 esp_iv[8]; /* iv */ -+}; -+ -+extern struct xform_functions esp_xform_funcs[]; -+ -+extern enum ipsec_rcv_value ipsec_rcv_esp_post_decrypt(struct ipsec_rcv_state *irs); -+ -+#ifdef CONFIG_KLIPS_DEBUG -+extern int debug_esp; -+#endif /* CONFIG_KLIPS_DEBUG */ -+#endif /* __KERNEL__ */ -+ -+/* -+ * $Log: ipsec_esp.h,v $ -+ * Revision 1.28 2004/09/13 02:22:10 mcr -+ * #define inet_protocol if necessary. -+ * -+ * Revision 1.27 2004/09/06 18:35:41 mcr -+ * 2.6.8.1 gets rid of inet_protocol->net_protocol compatibility, -+ * so adjust for that. -+ * -+ * Revision 1.26 2004/07/10 19:08:41 mcr -+ * CONFIG_IPSEC -> CONFIG_KLIPS. -+ * -+ * Revision 1.25 2004/04/06 02:49:08 mcr -+ * pullup of algo code from alg-branch. -+ * -+ * Revision 1.24 2004/04/05 19:55:05 mcr -+ * Moved from linux/include/freeswan/ipsec_esp.h,v -+ * -+ * Revision 1.23 2004/04/05 19:41:05 mcr -+ * merged alg-branch code. -+ * -+ * Revision 1.22 2003/12/13 19:10:16 mcr -+ * refactored rcv and xmit code - same as FS 2.05. 
-+ * -+ * Revision 1.23 2003/12/11 20:14:58 mcr -+ * refactored the xmit code, to move all encapsulation -+ * code into protocol functions. Note that all functions -+ * are essentially done by a single function, which is probably -+ * wrong. -+ * the rcv_functions structures are renamed xform_functions. -+ * -+ * Revision 1.22 2003/12/06 21:21:19 mcr -+ * split up receive path into per-transform files, for -+ * easier later removal. -+ * -+ * Revision 1.21.8.1 2003/12/22 15:25:52 jjo -+ * Merged algo-0.8.1-rc11-test1 into alg-branch -+ * -+ * Revision 1.21 2003/02/06 02:21:34 rgb -+ * -+ * Moved "struct auth_alg" from ipsec_rcv.c to ipsec_ah.h . -+ * Changed "struct ah" to "struct ahhdr" and "struct esp" to "struct esphdr". -+ * Removed "#ifdef INBOUND_POLICY_CHECK_eroute" dead code. -+ * -+ * Revision 1.20 2002/05/14 02:37:02 rgb -+ * Change reference from _TDB to _IPSA. -+ * -+ * Revision 1.19 2002/04/24 07:55:32 mcr -+ * #include patches and Makefiles for post-reorg compilation. -+ * -+ * Revision 1.18 2002/04/24 07:36:46 mcr -+ * Moved from ./klips/net/ipsec/ipsec_esp.h,v -+ * -+ * Revision 1.17 2002/02/20 01:27:07 rgb -+ * Ditched a pile of structs only used by the old Netlink interface. -+ * -+ * Revision 1.16 2001/12/11 02:35:57 rgb -+ * Change "struct net_device" to "struct device" for 2.2 compatibility. -+ * -+ * Revision 1.15 2001/11/26 09:23:48 rgb -+ * Merge MCR's ipsec_sa, eroute, proc and struct lifetime changes. -+ * -+ * Revision 1.14.2.3 2001/10/23 04:16:42 mcr -+ * get definition of des_key_schedule from des.h -+ * -+ * Revision 1.14.2.2 2001/10/22 20:33:13 mcr -+ * use "des_key_schedule" structure instead of cooking our own. -+ * -+ * Revision 1.14.2.1 2001/09/25 02:18:25 mcr -+ * replace "struct device" with "struct netdevice" -+ * -+ * Revision 1.14 2001/06/14 19:35:08 rgb -+ * Update copyright date. -+ * -+ * Revision 1.13 2000/09/08 19:12:56 rgb -+ * Change references from DEBUG_IPSEC to CONFIG_IPSEC_DEBUG. 
-+ * -+ * Revision 1.12 2000/08/01 14:51:50 rgb -+ * Removed _all_ remaining traces of DES. -+ * -+ * Revision 1.11 2000/01/10 16:36:20 rgb -+ * Ditch last of EME option flags, including initiator. -+ * -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_ipcomp.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,97 @@ -+/* -+ * IP compression header declations -+ * -+ * Copyright (C) 2003 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: ipsec_ipcomp.h,v 1.4 2004/07/10 19:08:41 mcr Exp $ -+ */ -+ -+#ifndef IPSEC_IPCOMP_H -+#define IPSEC_IPCOMP_H -+ -+#include "openswan/ipsec_auth.h" -+ -+/* Prefix all global deflate symbols with "ipcomp_" to avoid collisions with ppp_deflate & ext2comp */ -+#ifndef IPCOMP_PREFIX -+#define IPCOMP_PREFIX -+#endif /* IPCOMP_PREFIX */ -+ -+#ifndef IPPROTO_COMP -+#define IPPROTO_COMP 108 -+#endif /* IPPROTO_COMP */ -+ -+#ifdef CONFIG_KLIPS_DEBUG -+extern int sysctl_ipsec_debug_ipcomp; -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+struct ipcomphdr { /* IPCOMP header */ -+ __u8 ipcomp_nh; /* Next header (protocol) */ -+ __u8 ipcomp_flags; /* Reserved, must be 0 */ -+ __u16 ipcomp_cpi; /* Compression Parameter Index */ -+}; -+ -+#ifndef CONFIG_XFRM_ALTERNATE_STACK -+extern struct inet_protocol comp_protocol; -+#endif /* CONFIG_XFRM_ALTERNATE_STACK */ -+ -+extern int sysctl_ipsec_debug_ipcomp; -+ -+#define IPCOMP_UNCOMPRESSABLE 0x000000001 -+#define IPCOMP_COMPRESSIONERROR 0x000000002 -+#define IPCOMP_PARMERROR 
0x000000004 -+#define IPCOMP_DECOMPRESSIONERROR 0x000000008 -+ -+#define IPCOMP_ADAPT_INITIAL_TRIES 8 -+#define IPCOMP_ADAPT_INITIAL_SKIP 4 -+#define IPCOMP_ADAPT_SUBSEQ_TRIES 2 -+#define IPCOMP_ADAPT_SUBSEQ_SKIP 8 -+ -+/* Function prototypes */ -+struct sk_buff *skb_compress(struct sk_buff *skb, struct ipsec_sa *ips, unsigned int *flags); -+struct sk_buff *skb_decompress(struct sk_buff *skb, struct ipsec_sa *ips, unsigned int *flags); -+ -+extern struct xform_functions ipcomp_xform_funcs[]; -+ -+#endif /* IPSEC_IPCOMP_H */ -+ -+/* -+ * $Log: ipsec_ipcomp.h,v $ -+ * Revision 1.4 2004/07/10 19:08:41 mcr -+ * CONFIG_IPSEC -> CONFIG_KLIPS. -+ * -+ * Revision 1.3 2004/04/06 02:49:08 mcr -+ * pullup of algo code from alg-branch. -+ * -+ * Revision 1.2 2004/04/05 19:55:05 mcr -+ * Moved from linux/include/freeswan/ipsec_ipcomp.h,v -+ * -+ * Revision 1.1 2003/12/13 19:10:16 mcr -+ * refactored rcv and xmit code - same as FS 2.05. -+ * -+ * Revision 1.2 2003/12/11 20:14:58 mcr -+ * refactored the xmit code, to move all encapsulation -+ * code into protocol functions. Note that all functions -+ * are essentially done by a single function, which is probably -+ * wrong. -+ * the rcv_functions structures are renamed xform_functions. -+ * -+ * Revision 1.1 2003/12/06 21:21:19 mcr -+ * split up receive path into per-transform files, for -+ * easier later removal. -+ * -+ * -+ * -+ */ -+ -+ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_ipe4.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,68 @@ -+/* -+ * IP-in-IP Header declarations -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . 
-+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: ipsec_ipe4.h,v 1.6 2004/04/05 19:55:05 mcr Exp $ -+ */ -+ -+/* The packet header is an IP header! */ -+ -+struct ipe4_xdata /* transform table data */ -+{ -+ struct in_addr i4_src; -+ struct in_addr i4_dst; -+}; -+ -+#define EMT_IPE4_ULEN 8 /* coming from user mode */ -+ -+ -+/* -+ * $Log: ipsec_ipe4.h,v $ -+ * Revision 1.6 2004/04/05 19:55:05 mcr -+ * Moved from linux/include/freeswan/ipsec_ipe4.h,v -+ * -+ * Revision 1.5 2002/04/24 07:36:46 mcr -+ * Moved from ./klips/net/ipsec/ipsec_ipe4.h,v -+ * -+ * Revision 1.4 2001/06/14 19:35:08 rgb -+ * Update copyright date. -+ * -+ * Revision 1.3 1999/04/11 00:28:57 henry -+ * GPL boilerplate -+ * -+ * Revision 1.2 1999/04/06 04:54:25 rgb -+ * Fix/Add RCSID Id: and Log: bits to make PHMDs happy. This includes -+ * patch shell fixes. -+ * -+ * Revision 1.1 1998/06/18 21:27:47 henry -+ * move sources from klips/src to klips/net/ipsec, to keep stupid -+ * kernel-build scripts happier in the presence of symlinks -+ * -+ * Revision 1.1 1998/04/09 03:06:07 henry -+ * sources moved up from linux/net/ipsec -+ * -+ * Revision 1.1.1.1 1998/04/08 05:35:03 henry -+ * RGB's ipsec-0.8pre2.tar.gz ipsec-0.8 -+ * -+ * Revision 0.4 1997/01/15 01:28:15 ji -+ * No changes. -+ * -+ * Revision 0.3 1996/11/20 14:48:53 ji -+ * Release update only. -+ * -+ * Revision 0.2 1996/11/02 00:18:33 ji -+ * First limited release. 
-+ * -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_ipip.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,45 @@ -+/* -+ * Copyright (C) 2003 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: ipsec_ipip.h,v 1.2 2004/04/05 19:55:05 mcr Exp $ -+ */ -+ -+#ifndef _IPSEC_IPIP_H_ -+ -+#ifndef IPPROTO_IPIP -+#define IPPROTO_IPIP 4 -+#endif /* IPPROTO_ESP */ -+ -+extern struct xform_functions ipip_xform_funcs[]; -+ -+#define _IPSEC_IPIP_H_ -+ -+#endif /* _IPSEC_IPIP_H_ */ -+ -+/* -+ * $Log: ipsec_ipip.h,v $ -+ * Revision 1.2 2004/04/05 19:55:05 mcr -+ * Moved from linux/include/freeswan/ipsec_ipip.h,v -+ * -+ * Revision 1.1 2003/12/13 19:10:16 mcr -+ * refactored rcv and xmit code - same as FS 2.05. -+ * -+ * Revision 1.1 2003/12/11 20:14:58 mcr -+ * refactored the xmit code, to move all encapsulation -+ * code into protocol functions. Note that all functions -+ * are essentially done by a single function, which is probably -+ * wrong. -+ * the rcv_functions structures are renamed xform_functions. -+ * -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_kern24.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,92 @@ -+/* -+ * @(#) routines to makes kernel 2.4 compatible with 2.6 usage. 
-+ * -+ * Copyright (C) 2004 Michael Richardson -+ * Copyright (C) 2005 - 2008 Paul Wouters -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ */ -+ -+#ifndef _IPSEC_KERN24_H -+ -+ -+#ifdef NETDEV_23 -+#if 0 -+#ifndef NETDEV_25 -+#define device net_device -+#endif -+#endif -+ -+# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+# define __ipsec_dev_get(x) __dev_get_by_name(&init_net, x) -+# define ipsec_dev_get(x) dev_get_by_name(&init_net, x) -+# else -+# define ipsec_dev_get(x) __dev_get_by_name(x) -+# define __ipsec_dev_get(x) __dev_get_by_name(x) -+# endif -+ -+# define ipsec_dev_put(x) dev_put(x) -+# define __ipsec_dev_put(x) __dev_put(x) -+# define ipsec_dev_hold(x) dev_hold(x) -+#else /* NETDEV_23 */ -+# define ipsec_dev_get dev_get -+# define __ipsec_dev_put(x) -+# define ipsec_dev_put(x) -+# define ipsec_dev_hold(x) -+#endif /* NETDEV_23 */ -+ -+#ifndef HAVE_NETDEV_PRINTK -+#define netdev_printk(sevlevel, netdev, msglevel, format, arg...) 
\ -+ printk(sevlevel "%s: " format , netdev->name , ## arg) -+#endif -+ -+#ifndef NET_26 -+#define sk_receive_queue receive_queue -+#define sk_destruct destruct -+#define sk_reuse reuse -+#define sk_zapped zapped -+#define sk_family family -+#define sk_protocol protocol -+#define sk_protinfo protinfo -+#define sk_sleep sleep -+#define sk_state_change state_change -+#define sk_shutdown shutdown -+#define sk_err err -+#define sk_stamp stamp -+#define sk_socket socket -+#define sk_sndbuf sndbuf -+#define sock_flag(sk, flag) sk->dead -+#define sk_for_each(sk, node, plist) for(sk=*plist; sk!=NULL; sk = sk->next) -+#endif -+ -+/* deal with 2.4 vs 2.6 issues with module counts */ -+ -+/* in 2.6, all refcounts are maintained *outside* of the -+ * module to deal with race conditions. -+ */ -+ -+#ifdef NET_26 -+#define KLIPS_INC_USE /* nothing */ -+#define KLIPS_DEC_USE /* nothing */ -+ -+#else -+#define KLIPS_INC_USE MOD_INC_USE_COUNT -+#define KLIPS_DEC_USE MOD_DEC_USE_COUNT -+#endif -+ -+extern int printk_ratelimit(void); -+ -+ -+#define _IPSEC_KERN24_H 1 -+ -+#endif /* _IPSEC_KERN24_H */ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_kversion.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,441 @@ -+#ifndef _OPENSWAN_KVERSIONS_H -+/* -+ * header file for Openswan library functions -+ * Copyright (C) 1998, 1999, 2000 Henry Spencer. -+ * Copyright (C) 1999, 2000, 2001 Richard Guy Briggs -+ * Copyright (C) 2003 - 2008 Paul Wouters -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public -+ * License for more details. -+ * -+ */ -+#define _OPENSWAN_KVERSIONS_H /* seen it, no need to see it again */ -+ -+/* -+ * this file contains a series of atomic defines that depend upon -+ * kernel version numbers. The kernel versions are arranged -+ * in version-order number (which is often not chronological) -+ * and each clause enables or disables a feature. -+ */ -+ -+/* -+ * First, assorted kernel-version-dependent trickery. -+ */ -+#include -+#ifndef KERNEL_VERSION -+# define KERNEL_VERSION(x,y,z) (((x)<<16)+((y)<<8)+(z)) -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,1,0) -+# define HEADER_CACHE_BIND_21 -+# error "KLIPS is no longer supported on Linux 2.0. Sorry" -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,0) -+# define SPINLOCK -+# define PROC_FS_21 -+# define NETLINK_SOCK -+# define NET_21 -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,1,19) -+# define net_device_stats enet_statistics -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,0) -+# define SPINLOCK_23 -+# define NETDEV_23 -+# ifndef CONFIG_IP_ALIAS -+# define CONFIG_IP_ALIAS -+# endif -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,25) -+# define PROC_FS_2325 -+# undef PROC_FS_21 -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,30) -+# define PROC_NO_DUMMY -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,35) -+# define SKB_COPY_EXPAND -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,37) -+# define IP_SELECT_IDENT -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,2) -+# define IP_SELECT_IDENT_NEW -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,4) -+# define IPH_is_SKB_PULLED -+# define SKB_COW_NEW -+# define PROTO_HANDLER_SINGLE_PARM -+# define IP_FRAGMENT_LINEARIZE 1 -+#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,4) */ -+# ifdef REDHAT_BOGOSITY -+# define IP_SELECT_IDENT_NEW -+# define IPH_is_SKB_PULLED -+# define SKB_COW_NEW -+# define 
PROTO_HANDLER_SINGLE_PARM -+# endif /* REDHAT_BOGOSITY */ -+#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,4) */ -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,9) -+# define MALLOC_SLAB -+# define LINUX_KERNEL_HAS_SNPRINTF -+#endif -+ -+/* API changes are documented at: http://lwn.net/Articles/2.6-kernel-api/ */ -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -+# define HAVE_NETDEV_PRINTK 1 -+# define NET_26 -+# define NETDEV_25 -+# define NEED_SPINLOCK_TYPES -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,8) -+# define NEED_INET_PROTOCOL -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12) -+# define HAVE_SOCK_ZAPPED -+# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+# define NET_26_24_SKALLOC -+# else -+# define NET_26_12_SKALLOC -+# endif -+#endif -+#endif -+ -+/* see */ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,13) -+# define HAVE_SOCK_SECURITY -+/* skb->nf_debug disappared completely in 2.6.13 */ -+# define ipsec_nf_debug_reset(skb) ((skb)->nf_debug = 0) -+#else -+# define ipsec_nf_debug_reset(skb) -+#endif -+ -+/* how to reset an skb we are reusing after encrpytion/decryption etc */ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,17) -+# define ipsec_nf_reset(skb) nf_reset((skb)) -+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,50) && defined(CONFIG_NETFILTER) -+# define ipsec_nf_reset(skb) do { \ -+ nf_conntrack_put((skb)->nfct); \ -+ (skb)->nfct=NULL; \ -+ ipsec_nf_debug_reset(skb); \ -+ } while(0) -+#else -+# define ipsec_nf_reset(skb) /**/ -+#endif -+ -+/* skb->stamp changed to skb->tstamp in 2.6.14 */ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14) -+# define HAVE_TSTAMP -+# define HAVE_INET_SK_SPORT -+#else -+# define HAVE_SKB_LIST -+#endif -+ -+/* it seems 2.6.14 accidentally removed sysctl_ip_default_ttl */ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14) -+# define SYSCTL_IPSEC_DEFAULT_TTL IPSEC_DEFAULT_TTL -+#else -+# define SYSCTL_IPSEC_DEFAULT_TTL sysctl_ip_default_ttl -+#endif -+ -+/* -+ The obsolete 
MODULE_PARM() macro is gone forevermore [in 2.6.17+] -+ It was introduced in 2.6.0 -+ Zero-filled memory can now be allocated from slab caches with -+ kmem_cache_zalloc(). There is also a new slab debugging option -+ to produce a /proc/slab_allocators file with detailed allocation -+ information. -+ */ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -+# define module_param(a,b,c) MODULE_PARM(#a,"i") -+/* note below is only true for our current calls to module_param_array */ -+# define module_param_array(a,b,c,d) MODULE_PARM(#a,"1-2i") -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) -+/* -+ The skb_linearize() function has been reworked, and no longer has a -+ GFP flags argument. There is also a new skb_linearize_cow() function -+ which ensures that the resulting SKB is writable. -+ Network drivers should no longer manipulate the xmit_lock spinlock -+ in the net_device structure; instead, the following new functions -+ should be used: -+ int netif_tx_lock(struct net_device *dev); -+ int netif_tx_lock_bh(struct net_device *dev); -+ void netif_tx_unlock(struct net_device *dev); -+ void netif_tx_unlock_bh(struct net_device *dev); -+ int netif_tx_trylock(struct net_device *dev); -+ A number of crypto API changes have been merged, the biggest being -+ a change to most algorithm-specific functions to take a pointer to -+ the crypto_tfm structure, rather than the old "context" pointer. This -+ change was necessary to support parameterized algorithms. -+*/ -+ -+# define HAVE_NEW_SKB_LINEARIZE -+#endif -+ -+/* this is the best we can do to detect XEN, which makes -+ * patches to linux/skbuff.h, making it look like 2.6.18 version -+ */ -+#ifdef CONFIG_XEN -+# define HAVE_NEW_SKB_LINEARIZE -+#endif -+ -+/* And the same for SuSe kernels who have it before it got into the -+ * linus kernel. 
-+ */ -+#ifdef SLE_VERSION_CODE -+# if SLE_VERSION_CODE >= 655616 -+# define HAVE_NEW_SKB_LINEARIZE -+# else -+# warning "A Suse kernel was detected, but we are unsure if it requires HAVE_NEW_SKB_LINEARIZE" -+# endif -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19) -+# define VOID_SOCK_UNREGISTER -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) -+/* skb->nfmark changed to skb->mark in 2.6.20 */ -+# define nfmark mark -+#else -+# define HAVE_KMEM_CACHE_T -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,21) -+/* -+ Significant changes have been made to the crypto support interface. -+ The sysctl code has been heavily reworked, leading to a number of -+ internal API changes. -+*/ -+# define ipsec_register_sysctl_table(a,b) register_sysctl_table(a) -+# define CTL_TABLE_PARENT -+#else -+# define ipsec_register_sysctl_table(a,b) register_sysctl_table(a,b) -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) -+/* -+ The eth_type_trans() function now sets the skb->dev field, consistent -+ with how similar functions for other link types operate. As a result, -+ many Ethernet drivers have been changed to remove the (now) redundant -+ assignment. -+ The header fields in the sk_buff structure have been renamed -+ and are no longer unions. Networking code and drivers can -+ now just use skb->transport_header, skb->network_header, and -+ skb->skb_mac_header. There are new functions for finding specific -+ headers within packets: tcp_hdr(), udp_hdr(), ipip_hdr(), and -+ ipipv6_hdr(). -+ The crypto API has a new set of functions for use with asynchronous -+ block ciphers. There is also a new cryptd kernel thread which can -+ run any synchronous cipher in an asynchronous mode. 
-+ A new macro has been added to make the creation of slab caches easier: -+ struct kmem_cache KMEM_CACHE(struct-type, flags); -+ The result is the creation of a cache holding objects of the given -+ struct_type, named after that type, and with the additional slab -+ flags (if any). -+*/ -+ -+/* need to include ip.h early, no longer pick it up in skbuff.h */ -+# include -+# define HAVE_KERNEL_TSTAMP -+/* type of sock.sk_stamp changed from timeval to ktime */ -+# define grab_socket_timeval(tv, sock) { (tv) = ktime_to_timeval((sock).sk_stamp); } -+#else -+# define grab_socket_timeval(tv, sock) { (tv) = (sock).sk_stamp; } -+/* internals of struct skbuff changed */ -+# define HAVE_DEV_NEXT -+# define ip_hdr(skb) ((skb)->nh.iph) -+# define skb_tail_pointer(skb) ((skb)->tail) -+# define skb_end_pointer(skb) ((skb)->end) -+# define skb_network_header(skb) ((skb)->nh.raw) -+# define skb_set_network_header(skb,off) ((skb)->nh.raw = (skb)->data + (off)) -+# define tcp_hdr(skb) ((skb)->h.th) -+# define udp_hdr(skb) ((skb)->h.uh) -+# define skb_transport_header(skb) ((skb)->h.raw) -+# define skb_set_transport_header(skb,off) ((skb)->h.raw = (skb)->data + (off)) -+# define skb_mac_header(skb) ((skb)->mac.raw) -+# define skb_set_mac_header(skb,off) ((skb)->mac.raw = (skb)->data + (off)) -+#endif -+/* turn a pointer into an offset for above macros */ -+#define ipsec_skb_offset(skb, ptr) (((unsigned char *)(ptr)) - (skb)->data) -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23) -+/* -+ * The macro got introduced in 2,6,22 but it does not work properly, and -+ * still uses the old number of arguments. -+ */ -+ /* -+ The destructor argument has been removed from kmem_cache_create(), as -+ destructors are no longer supported. 
All in-kernel callers have been -+ updated -+ */ -+# define HAVE_KMEM_CACHE_MACRO -+ -+/* Try using the new kernel encaps hook for nat-t, instead of udp.c */ -+# ifdef NOT_YET_FINISHED -+# define HAVE_UDP_ENCAP_CONVERT -+# endif -+ -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+/* -+ * We can switch on earlier kernels, but from here on we have no choice -+ * but to abandon the old style proc_net and use seq_file -+ * The hard_header() method has been removed from struct net_device; -+ it has been replaced by a per-protocol header_ops structure pointer. -+ -+ The prototype for slab constructor callbacks has changed to: -+ void (*ctor)(struct kmem_cache *cache, void *object); -+ The unused flags argument has been removed and the order of the other -+ two arguments has been reversed to match other slab functions. -+ */ -+# define HAVE_PROC_DIR_ENTRY -+# define PROC_NET init_net.proc_net -+ -+# define __ipsec_dev_get(x) __dev_get_by_name(&init_net, x) -+# define ipsec_dev_get(x) dev_get_by_name(&init_net, x) -+#else -+ -+# define PROC_NET proc_net -+ -+# define ipsec_dev_get(x) __dev_get_by_name(x) -+# define __ipsec_dev_get(x) __dev_get_by_name(x) -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25) -+# define ip_chk_addr(a) inet_addr_type(&init_net, a) -+ -+# define l_inet_addr_type(a) inet_addr_type(&init_net, a) -+ -+#else -+# define ip_chk_addr inet_addr_type -+ -+#define l_inet_addr_type inet_addr_type -+ -+#endif -+ -+#ifndef NETDEV_TX_BUSY -+# ifdef NETDEV_XMIT_CN -+# define NETDEV_TX_BUSY NETDEV_XMIT_CN -+# else -+# define NETDEV_TX_BUSY 1 -+# endif -+#endif -+ -+ -+#ifdef NET_21 -+# define ipsec_kfree_skb(a) kfree_skb(a) -+#else /* NET_21 */ -+# define ipsec_kfree_skb(a) kfree_skb(a, FREE_WRITE) -+#endif /* NET_21 */ -+ -+#ifdef NETDEV_23 -+ -+#ifndef SPINLOCK -+# include -+ /* simulate spin locks and read/write locks */ -+ typedef struct { -+ volatile char lock; -+ } spinlock_t; -+ -+ typedef struct { -+ volatile unsigned int 
lock; -+ } rwlock_t; -+ -+# define spin_lock_init(x) { (x)->lock = 0;} -+# define rw_lock_init(x) { (x)->lock = 0; } -+ -+# define spin_lock(x) { while ((x)->lock) barrier(); (x)->lock=1;} -+# define spin_lock_irq(x) { cli(); spin_lock(x);} -+# define spin_lock_irqsave(x,flags) { save_flags(flags); spin_lock_irq(x);} -+ -+# define spin_unlock(x) { (x)->lock=0;} -+# define spin_unlock_irq(x) { spin_unlock(x); sti();} -+# define spin_unlock_irqrestore(x,flags) { spin_unlock(x); restore_flags(flags);} -+ -+# define read_lock(x) spin_lock(x) -+# define read_lock_irq(x) spin_lock_irq(x) -+# define read_lock_irqsave(x,flags) spin_lock_irqsave(x,flags) -+ -+# define read_unlock(x) spin_unlock(x) -+# define read_unlock_irq(x) spin_unlock_irq(x) -+# define read_unlock_irqrestore(x,flags) spin_unlock_irqrestore(x,flags) -+ -+# define write_lock(x) spin_lock(x) -+# define write_lock_irq(x) spin_lock_irq(x) -+# define write_lock_irqsave(x,flags) spin_lock_irqsave(x,flags) -+ -+# define write_unlock(x) spin_unlock(x) -+# define write_unlock_irq(x) spin_unlock_irq(x) -+# define write_unlock_irqrestore(x,flags) spin_unlock_irqrestore(x,flags) -+#endif /* !SPINLOCK */ -+ -+#ifndef SPINLOCK_23 -+# define spin_lock_bh(x) spin_lock_irq(x) -+# define spin_unlock_bh(x) spin_unlock_irq(x) -+ -+# define read_lock_bh(x) read_lock_irq(x) -+# define read_unlock_bh(x) read_unlock_irq(x) -+ -+# define write_lock_bh(x) write_lock_irq(x) -+# define write_unlock_bh(x) write_unlock_irq(x) -+#endif /* !SPINLOCK_23 */ -+ -+#ifndef HAVE_NETDEV_PRINTK -+#define netdev_printk(sevlevel, netdev, msglevel, format, arg...) 
\ -+ printk(sevlevel "%s: " format , netdev->name , ## arg) -+#endif -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+#define PROC_NET init_net.proc_net -+#define PROC_EOF_DATA -+#else -+#define PROC_NET proc_net -+#endif -+ -+#ifdef NET_21 -+# include -+#else -+ /* old kernel in.h has some IPv6 stuff, but not quite enough */ -+# define s6_addr16 s6_addr -+# define AF_INET6 10 -+# define uint8_t __u8 -+# define uint16_t __u16 -+# define uint32_t __u32 -+# define uint64_t __u64 -+#endif -+ -+#if __KERNEL__ -+# if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,0) -+# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) -+# include "openswan/ipsec_kern24.h" -+# else -+# error "kernels before 2.4 are not supported at this time" -+# endif -+# endif -+#endif -+ -+#endif /* _OPENSWAN_KVERSIONS_H */ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_life.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,112 @@ -+/* -+ * Definitions relevant to IPSEC lifetimes -+ * Copyright (C) 2001 Richard Guy Briggs -+ * and Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: ipsec_life.h,v 1.4 2004/04/05 19:55:05 mcr Exp $ -+ * -+ * This file derived from ipsec_xform.h on 2001/9/18 by mcr. -+ * -+ */ -+ -+/* -+ * This file describes the book keeping fields for the -+ * IPsec Security Association Structure. 
("ipsec_sa") -+ * -+ * This structure is never allocated directly by kernel code, -+ * (it is always a static/auto or is part of a structure) -+ * so it does not have a reference count. -+ * -+ */ -+ -+#ifndef _IPSEC_LIFE_H_ -+ -+/* -+ * _count is total count. -+ * _hard is hard limit (kill SA after this number) -+ * _soft is soft limit (try to renew SA after this number) -+ * _last is used in some special cases. -+ * -+ */ -+ -+struct ipsec_lifetime64 -+{ -+ __u64 ipl_count; -+ __u64 ipl_soft; -+ __u64 ipl_hard; -+ __u64 ipl_last; -+}; -+ -+struct ipsec_lifetimes -+{ -+ /* number of bytes processed */ -+ struct ipsec_lifetime64 ipl_bytes; -+ -+ /* number of packets processed */ -+ struct ipsec_lifetime64 ipl_packets; -+ -+ /* time since SA was added */ -+ struct ipsec_lifetime64 ipl_addtime; -+ -+ /* time since SA was first used */ -+ struct ipsec_lifetime64 ipl_usetime; -+ -+ /* from rfc2367: -+ * For CURRENT, the number of different connections, -+ * endpoints, or flows that the association has been -+ * allocated towards. For HARD and SOFT, the number of -+ * these the association may be allocated towards -+ * before it expires. The concept of a connection, -+ * flow, or endpoint is system specific. -+ * -+ * mcr(2001-9-18) it is unclear what purpose these serve for FreeSWAN. -+ * They are maintained for PF_KEY compatibility. 
-+ */ -+ struct ipsec_lifetime64 ipl_allocations; -+}; -+ -+enum ipsec_life_alive { -+ ipsec_life_harddied = -1, -+ ipsec_life_softdied = 0, -+ ipsec_life_okay = 1 -+}; -+ -+enum ipsec_life_type { -+ ipsec_life_timebased = 1, -+ ipsec_life_countbased= 0 -+}; -+ -+#define _IPSEC_LIFE_H_ -+#endif /* _IPSEC_LIFE_H_ */ -+ -+ -+/* -+ * $Log: ipsec_life.h,v $ -+ * Revision 1.4 2004/04/05 19:55:05 mcr -+ * Moved from linux/include/freeswan/ipsec_life.h,v -+ * -+ * Revision 1.3 2002/04/24 07:36:46 mcr -+ * Moved from ./klips/net/ipsec/ipsec_life.h,v -+ * -+ * Revision 1.2 2001/11/26 09:16:14 rgb -+ * Merge MCR's ipsec_sa, eroute, proc and struct lifetime changes. -+ * -+ * Revision 1.1.2.1 2001/09/25 02:25:58 mcr -+ * lifetime structure created and common functions created. -+ * -+ * -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_mast.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,59 @@ -+#ifndef _IPSEC_MAST_H -+#define _IPSEC_MAST_H -+ -+#ifdef CONFIG_KLIPS_DEBUG -+#define DB_MAST_INIT 0x0001 -+#define DB_MAST_PROCFS 0x0002 -+#define DB_MAST_XMIT 0x0010 -+#define DB_MAST_OHDR 0x0020 -+#define DB_MAST_CROUT 0x0040 -+#define DB_MAST_OXFS 0x0080 -+#define DB_MAST_REVEC 0x0100 -+#define DB_MAST_ENCAP 0x0200 -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+struct ipsecmastconf { -+ __u32 cf_cmd; -+ union -+ { -+ char cfu_name[12]; -+ } cf_u; -+#define cf_name cf_u.cfu_name -+}; -+ -+struct mastpriv -+{ -+ struct sk_buff_head sendq; -+ struct wait_queue *wait_queue; -+ int (*hard_header) (struct sk_buff *skb, -+ struct net_device *dev, -+ unsigned short type, -+ void *daddr, -+ void *saddr, -+ unsigned len); -+#if 0 -+ char locked; -+ int (*hard_start_xmit) (struct sk_buff *skb, -+ struct net_device *dev); -+ int (*rebuild_header)(struct sk_buff *skb); -+ int (*set_mac_address)(struct net_device *dev, void *addr); -+ void (*header_cache_bind)(struct hh_cache **hhp, struct net_device *dev, -+ unsigned 
short htype, __u32 daddr); -+ void (*header_cache_update)(struct hh_cache *hh, -+ struct net_device *dev, -+ unsigned char * haddr); -+ struct net_device_stats *(*get_stats)(struct net_device *dev); -+#endif -+ struct net_device_stats mystats; -+ int mtu; /* What is the desired MTU? */ -+}; -+ -+extern int ipsec_mast_init_devices(void); -+extern int ipsec_mast_deletenum(int vifnum); -+extern int ipsec_mast_createnum(int vifnum); -+extern struct net_device *ipsec_mast_get_device(int vifnum); -+extern unsigned int ipsec_mast_is_transport(int vifnum); -+ -+ -+ -+#endif ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_md5h.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,143 @@ -+/* -+ * RCSID $Id: ipsec_md5h.h,v 1.10 2004/09/08 17:21:35 ken Exp $ -+ */ -+ -+/* -+ * The rest of this file is Copyright RSA DSI. See the following comments -+ * for the full Copyright notice. -+ */ -+ -+#ifndef _IPSEC_MD5H_H_ -+#define _IPSEC_MD5H_H_ -+ -+/* GLOBAL.H - RSAREF types and constants -+ */ -+ -+/* PROTOTYPES should be set to one if and only if the compiler supports -+ function argument prototyping. -+ The following makes PROTOTYPES default to 0 if it has not already -+ been defined with C compiler flags. -+ */ -+#ifndef PROTOTYPES -+#define PROTOTYPES 1 -+#endif /* !PROTOTYPES */ -+ -+/* POINTER defines a generic pointer type */ -+typedef __u8 *POINTER; -+ -+/* UINT2 defines a two byte word */ -+typedef __u16 UINT2; -+ -+/* UINT4 defines a four byte word */ -+typedef __u32 UINT4; -+ -+/* PROTO_LIST is defined depending on how PROTOTYPES is defined above. -+ If using PROTOTYPES, then PROTO_LIST returns the list, otherwise it -+ returns an empty list. -+ */ -+ -+#if PROTOTYPES -+#define PROTO_LIST(list) list -+#else /* PROTOTYPES */ -+#define PROTO_LIST(list) () -+#endif /* PROTOTYPES */ -+ -+ -+/* MD5.H - header file for MD5C.C -+ */ -+ -+/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All -+rights reserved. 
-+ -+License to copy and use this software is granted provided that it -+is identified as the "RSA Data Security, Inc. MD5 Message-Digest -+Algorithm" in all material mentioning or referencing this software -+or this function. -+ -+License is also granted to make and use derivative works provided -+that such works are identified as "derived from the RSA Data -+Security, Inc. MD5 Message-Digest Algorithm" in all material -+mentioning or referencing the derived work. -+ -+RSA Data Security, Inc. makes no representations concerning either -+the merchantability of this software or the suitability of this -+software for any particular purpose. It is provided "as is" -+without express or implied warranty of any kind. -+ -+These notices must be retained in any copies of any part of this -+documentation and/or software. -+ */ -+ -+/* MD5 context. */ -+typedef struct { -+ UINT4 state[4]; /* state (ABCD) */ -+ UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */ -+ unsigned char buffer[64]; /* input buffer */ -+} MD5_CTX; -+ -+void osMD5Init PROTO_LIST ((void *)); -+void osMD5Update PROTO_LIST -+ ((void *, unsigned char *, __u32)); -+void osMD5Final PROTO_LIST ((unsigned char [16], void *)); -+ -+#endif /* _IPSEC_MD5H_H_ */ -+ -+/* -+ * $Log: ipsec_md5h.h,v $ -+ * Revision 1.10 2004/09/08 17:21:35 ken -+ * Rename MD5* -> osMD5 functions to prevent clashes with other symbols exported by kernel modules (CIFS in 2.6 initiated this) -+ * -+ * Revision 1.9 2004/04/05 19:55:05 mcr -+ * Moved from linux/include/freeswan/ipsec_md5h.h,v -+ * -+ * Revision 1.8 2002/09/10 01:45:09 mcr -+ * changed type of MD5_CTX and SHA1_CTX to void * so that -+ * the function prototypes would match, and could be placed -+ * into a pointer to a function. -+ * -+ * Revision 1.7 2002/04/24 07:36:46 mcr -+ * Moved from ./klips/net/ipsec/ipsec_md5h.h,v -+ * -+ * Revision 1.6 1999/12/13 13:59:13 rgb -+ * Quick fix to argument size to Update bugs. 
-+ * -+ * Revision 1.5 1999/12/07 18:16:23 rgb -+ * Fixed comments at end of #endif lines. -+ * -+ * Revision 1.4 1999/04/06 04:54:26 rgb -+ * Fix/Add RCSID Id: and Log: bits to make PHMDs happy. This includes -+ * patch shell fixes. -+ * -+ * Revision 1.3 1999/01/22 06:19:58 rgb -+ * 64-bit clean-up. -+ * -+ * Revision 1.2 1998/11/30 13:22:54 rgb -+ * Rationalised all the klips kernel file headers. They are much shorter -+ * now and won't conflict under RH5.2. -+ * -+ * Revision 1.1 1998/06/18 21:27:48 henry -+ * move sources from klips/src to klips/net/ipsec, to keep stupid -+ * kernel-build scripts happier in the presence of symlinks -+ * -+ * Revision 1.2 1998/04/23 20:54:03 rgb -+ * Fixed md5 and sha1 include file nesting issues, to be cleaned up when -+ * verified. -+ * -+ * Revision 1.1 1998/04/09 03:04:21 henry -+ * sources moved up from linux/net/ipsec -+ * these two include files modified not to include others except in kernel -+ * -+ * Revision 1.1.1.1 1998/04/08 05:35:03 henry -+ * RGB's ipsec-0.8pre2.tar.gz ipsec-0.8 -+ * -+ * Revision 0.4 1997/01/15 01:28:15 ji -+ * No changes. -+ * -+ * Revision 0.3 1996/11/20 14:48:53 ji -+ * Release update only. -+ * -+ * Revision 0.2 1996/11/02 00:18:33 ji -+ * First limited release. -+ * -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_param.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,268 @@ -+/* -+ * @(#) Openswan tunable paramaters -+ * -+ * Copyright (C) 2001 Richard Guy Briggs -+ * and Michael Richardson -+ * Copyright (C) 2004 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . 
-+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * -+ */ -+ -+/* -+ * This file provides a set of #define's which may be tuned by various -+ * people/configurations. It keeps all compile-time tunables in one place. -+ * -+ * This file should be included before all other IPsec kernel-only files. -+ * -+ */ -+ -+#ifndef _IPSEC_PARAM_H_ -+ -+#ifdef __KERNEL__ -+ -+#include "openswan/ipsec_kversion.h" -+ -+/* Set number of ipsecX virtual devices here. */ -+/* This must be < exp(field width of IPSEC_DEV_FORMAT) */ -+/* It must also be reasonable so as not to overload the memory and CPU */ -+/* constraints of the host. */ -+#ifdef CONFIG_KLIPS_IF_MAX -+#define IPSEC_NUM_IFMAX CONFIG_KLIPS_IF_MAX -+#endif -+#ifndef IPSEC_NUM_IFMAX -+#define IPSEC_NUM_IFMAX 64 -+#endif -+ -+/* default number of ipsecX devices to create */ -+#define IPSEC_NUM_IF 2 -+ -+/* The field width must be < IF_NAM_SIZ - strlen("ipsec") - 1. */ -+/* With "ipsec" being 5 characters, that means 10 is the max field width */ -+/* but machine memory and CPU constraints are not likely to tollerate */ -+/* more than 3 digits. The default is one digit. */ -+/* Update: userland scripts get upset if they can't find "ipsec0", so */ -+/* for now, no "0"-padding should be used (which would have been helpful */ -+/* to make text-searches work */ -+#define IPSEC_DEV_FORMAT "ipsec%d" -+#define MAST_DEV_FORMAT "mast%d" -+ -+/* For, say, 500 virtual ipsec devices, I would recommend: */ -+/* #define IPSEC_NUM_IF 500 */ -+/* #define IPSEC_DEV_FORMAT "ipsec%03d" */ -+/* Note that the "interfaces=" line in /etc/ipsec.conf would be, um, challenging. 
*/ -+ -+/* use dynamic ipsecX device allocation */ -+#ifndef CONFIG_KLIPS_DYNDEV -+#define CONFIG_KLIPS_DYNDEV 1 -+#endif /* CONFIG_KLIPS_DYNDEV */ -+ -+ -+#ifdef CONFIG_KLIPS_BIGGATE -+# define SADB_HASHMOD 8069 -+#else /* CONFIG_KLIPS_BIGGATE */ -+# define SADB_HASHMOD 257 -+#endif /* CONFIG_KLIPS_BIGGATE */ -+ -+#endif /* __KERNEL__ */ -+ -+/* -+ * This is for the SA reference table. This number is related to the -+ * maximum number of SAs that KLIPS can concurrently deal with, plus enough -+ * space for keeping expired SAs around. -+ * -+ * TABLE_IDX_WIDTH is the number of bits that we will use. -+ * MAIN_TABLE_WIDTH is the number of bits used for the primary index table. -+ * -+ */ -+#ifndef IPSEC_SA_REF_MAINTABLE_IDX_WIDTH -+# define IPSEC_SA_REF_MAINTABLE_IDX_WIDTH 4 -+#endif -+ -+#ifndef IPSEC_SA_REF_FREELIST_NUM_ENTRIES -+# define IPSEC_SA_REF_FREELIST_NUM_ENTRIES 256 -+#endif -+ -+#ifndef IPSEC_SA_REF_CODE -+# define IPSEC_SA_REF_CODE 1 -+#endif -+ -+#ifdef __KERNEL__ -+/* This is defined for 2.4, but not 2.2.... */ -+#ifndef ARPHRD_VOID -+# define ARPHRD_VOID 0xFFFF -+#endif -+ -+/* always turn on IPIP mode */ -+#ifndef CONFIG_KLIPS_IPIP -+#define CONFIG_KLIPS_IPIP 1 -+#endif -+ -+/* -+ * Worry about PROC_FS stuff -+ */ -+#if defined(PROC_FS_2325) -+/* kernel 2.4 */ -+# define IPSEC_PROC_LAST_ARG ,int *eof,void *data -+# define IPSEC_PROCFS_DEBUG_NO_STATIC -+# define IPSEC_PROC_SUBDIRS -+#else -+/* kernel <2.4 */ -+# define IPSEC_PROCFS_DEBUG_NO_STATIC DEBUG_NO_STATIC -+ -+# ifndef PROC_NO_DUMMY -+# define IPSEC_PROC_LAST_ARG , int dummy -+# else -+# define IPSEC_PROC_LAST_ARG -+# endif /* !PROC_NO_DUMMY */ -+#endif /* PROC_FS_2325 */ -+ -+#if !defined(LINUX_KERNEL_HAS_SNPRINTF) -+/* GNU CPP specific! */ -+# define snprintf(buf, len, fmt...) 
sprintf(buf, ##fmt) -+#endif /* !LINUX_KERNEL_HAS_SNPRINTF */ -+ -+#ifdef SPINLOCK -+# ifdef SPINLOCK_23 -+# include /* *lock* */ -+# else /* SPINLOCK_23 */ -+# include /* *lock* */ -+# endif /* SPINLOCK_23 */ -+#endif /* SPINLOCK */ -+ -+#ifndef KLIPS_FIXES_DES_PARITY -+# define KLIPS_FIXES_DES_PARITY 1 -+#endif /* !KLIPS_FIXES_DES_PARITY */ -+ -+/* we don't really want to print these unless there are really big problems */ -+#ifndef KLIPS_DIVULGE_CYPHER_KEY -+# define KLIPS_DIVULGE_CYPHER_KEY 0 -+#endif /* !KLIPS_DIVULGE_CYPHER_KEY */ -+ -+#ifndef KLIPS_DIVULGE_HMAC_KEY -+# define KLIPS_DIVULGE_HMAC_KEY 0 -+#endif /* !KLIPS_DIVULGE_HMAC_KEY */ -+ -+#ifndef IPSEC_DISALLOW_IPOPTIONS -+# define IPSEC_DISALLOW_IPOPTIONS 1 -+#endif /* !KLIPS_DIVULGE_HMAC_KEY */ -+ -+/* extra toggles for regression testing */ -+#ifdef CONFIG_KLIPS_REGRESS -+ -+/* -+ * should pfkey_acquire() become 100% lossy? -+ * -+ */ -+extern int sysctl_ipsec_regress_pfkey_lossage; -+#ifndef KLIPS_PFKEY_ACQUIRE_LOSSAGE -+# ifdef CONFIG_KLIPS_PFKEY_ACQUIRE_LOSSAGE -+# define KLIPS_PFKEY_ACQUIRE_LOSSAGE 100 -+# else /* CONFIG_KLIPS_PFKEY_ACQUIRE_LOSSAGE */ -+/* not by default! */ -+# define KLIPS_PFKEY_ACQUIRE_LOSSAGE 0 -+# endif /* CONFIG_KLIPS_PFKEY_ACQUIRE_LOSSAGE */ -+#endif /* KLIPS_PFKEY_ACQUIRE_LOSSAGE */ -+ -+#endif /* CONFIG_KLIPS_REGRESS */ -+ -+ -+/* -+ * debugging routines. -+ */ -+#ifdef CONFIG_KLIPS_DEBUG -+ #define KLIPS_ERROR(flag, format, args...) if(printk_ratelimit() || flag) printk(KERN_ERR "KLIPS " format, ## args) -+ #define KLIPS_PRINT(flag, format, args...) \ -+ ((flag) ? printk(KERN_INFO format , ## args) : 0) -+ #define KLIPS_PRINTMORE(flag, format, args...) \ -+ ((flag) ? printk(format , ## args) : 0) -+ #define KLIPS_IP_PRINT(flag, ip) \ -+ ((flag) ? ipsec_print_ip(ip) : 0) -+ #define KLIPS_SATOT(flag, sa, format, dst, dstlen) \ -+ ((flag) ? satot(sa, format, dst, dstlen) : 0) -+#else /* CONFIG_KLIPS_DEBUG */ -+ #define KLIPS_ERROR(flag, format, args...) 
if(printk_ratelimit()) printk(KERN_ERR "KLIPS " format, ## args) -+ #define KLIPS_PRINT(flag, format, args...) do ; while(0) -+ #define KLIPS_PRINTMORE(flag, format, args...) do ; while(0) -+ #define KLIPS_IP_PRINT(flag, ip) do ; while(0) -+ #define KLIPS_SATOT(flag, sa, format, dst, dstlen) (0) -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ -+/* -+ * Stupid kernel API differences in APIs. Not only do some -+ * kernels not have ip_select_ident, but some have differing APIs, -+ * and SuSE has one with one parameter, but no way of checking to -+ * see what is really what. -+ */ -+ -+#ifdef SUSE_LINUX_2_4_19_IS_STUPID -+#define KLIPS_IP_SELECT_IDENT(iph, skb) ip_select_ident(iph) -+#else -+ -+/* simplest case, nothing */ -+#if !defined(IP_SELECT_IDENT) -+#define KLIPS_IP_SELECT_IDENT(iph, skb) do { iph->id = htons(ip_id_count++); } while(0) -+#endif -+ -+/* kernels > 2.3.37-ish */ -+#if defined(IP_SELECT_IDENT) && !defined(IP_SELECT_IDENT_NEW) -+#define KLIPS_IP_SELECT_IDENT(iph, skb) ip_select_ident(iph, skb->dst) -+#endif -+ -+/* kernels > 2.4.2 */ -+#if defined(IP_SELECT_IDENT) && defined(IP_SELECT_IDENT_NEW) -+#define KLIPS_IP_SELECT_IDENT(iph, skb) ip_select_ident(iph, skb->dst, NULL) -+#endif -+ -+#endif /* SUSE_LINUX_2_4_19_IS_STUPID */ -+ -+/* -+ * make klips fail test:east-espiv-01. 
-+ * exploit is at testing/attacks/espiv -+ * -+ */ -+#define KLIPS_IMPAIRMENT_ESPIV_CBC_ATTACK 0 -+ -+ -+/* IP_FRAGMENT_LINEARIZE is set in freeswan.h if Kernel > 2.4.4 */ -+#ifndef IP_FRAGMENT_LINEARIZE -+# define IP_FRAGMENT_LINEARIZE 0 -+#endif /* IP_FRAGMENT_LINEARIZE */ -+#endif /* __KERNEL__ */ -+ -+#ifdef NEED_INET_PROTOCOL -+#define inet_protocol net_protocol -+#endif -+ -+#if defined(CONFIG_IPSEC_NAT_TRAVERSAL) && CONFIG_IPSEC_NAT_TRAVERSAL -+#define NAT_TRAVERSAL 1 -+#else -+/* let people either #undef, or #define = 0 it */ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+#undef CONFIG_IPSEC_NAT_TRAVERSAL -+#endif -+#endif -+ -+#ifndef IPSEC_DEFAULT_TTL -+#define IPSEC_DEFAULT_TTL 64 -+#endif -+ -+#define _IPSEC_PARAM_H_ -+#endif /* _IPSEC_PARAM_H_ */ -+ -+/* -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_policy.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,239 @@ -+#ifndef _IPSEC_POLICY_H -+/* -+ * policy interface file between pluto and applications -+ * Copyright (C) 2003 Michael Richardson -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: ipsec_policy.h,v 1.8 2005/07/26 01:12:38 mcr Exp $ -+ */ -+#define _IPSEC_POLICY_H /* seen it, no need to see it again */ -+ -+ -+/* -+ * this file defines an interface between an application (or rather an -+ * application library) and a key/policy daemon. 
It provides for inquiries -+ * as to the current state of a connected socket, as well as for general -+ * questions. -+ * -+ * In general, the interface is defined as a series of functional interfaces, -+ * and the policy messages should be internal. However, because this is in -+ * fact an ABI between pieces of the system that may get compiled and revised -+ * seperately, this ABI must be public and revision controlled. -+ * -+ * It is expected that the daemon will always support previous versions. -+ */ -+ -+#define IPSEC_POLICY_MSG_REVISION (unsigned)200305061 -+ -+enum ipsec_policy_command { -+ IPSEC_CMD_QUERY_FD = 1, -+ IPSEC_CMD_QUERY_HOSTPAIR = 2, -+ IPSEC_CMD_QUERY_DSTONLY = 3, -+}; -+ -+struct ipsec_policy_msg_head { -+ u_int32_t ipm_version; -+ u_int32_t ipm_msg_len; -+ u_int32_t ipm_msg_type; -+ u_int32_t ipm_msg_seq; -+}; -+ -+enum ipsec_privacy_quality { -+ IPSEC_PRIVACY_NONE = 0, -+ IPSEC_PRIVACY_INTEGRAL = 4, /* not private at all. AH-like */ -+ IPSEC_PRIVACY_UNKNOWN = 8, /* something is claimed, but details unavail */ -+ IPSEC_PRIVACY_ROT13 = 12, /* trivially breakable, i.e. 1DES */ -+ IPSEC_PRIVACY_GAK = 16, /* known eavesdroppers */ -+ IPSEC_PRIVACY_PRIVATE = 32, /* secure for at least a decade */ -+ IPSEC_PRIVACY_STRONG = 64, /* ridiculously secure */ -+ IPSEC_PRIVACY_TORTOISE = 192, /* even stronger, but very slow */ -+ IPSEC_PRIVACY_OTP = 224, /* some kind of *true* one time pad */ -+}; -+ -+enum ipsec_bandwidth_quality { -+ IPSEC_QOS_UNKNOWN = 0, /* unknown bandwidth */ -+ IPSEC_QOS_INTERACTIVE = 16, /* reasonably moderate jitter, moderate fast. -+ Good enough for telnet/ssh. 
*/ -+ IPSEC_QOS_VOIP = 32, /* faster crypto, predicable jitter */ -+ IPSEC_QOS_FTP = 64, /* higher throughput crypto, perhaps hardware -+ offloaded, but latency/jitter may be bad */ -+ IPSEC_QOS_WIRESPEED = 128, /* expect to be able to fill your pipe */ -+}; -+ -+/* moved from programs/pluto/constants.h */ -+/* IPsec AH transform values -+ * RFC2407 The Internet IP security Domain of Interpretation for ISAKMP 4.4.3 -+ * and in http://www.iana.org/assignments/isakmp-registry -+ */ -+enum ipsec_authentication_algo { -+ AH_NONE=0, -+ AH_MD5=2, -+ AH_SHA=3, -+ AH_DES=4, -+ AH_SHA2_256=5, -+ AH_SHA2_384=6, -+ AH_SHA2_512=7, -+ AH_RIPEMD=8, -+ AH__AES_XCBC_MAC=9, -+ AH_RSA=10 -+}; -+ -+/* IPsec ESP transform values -+ * RFC2407 The Internet IP security Domain of Interpretation for ISAKMP 4.4.4 -+ * and from http://www.iana.org/assignments/isakmp-registry -+ */ -+ -+enum ipsec_cipher_algo { -+ ESP_reserved=0, -+ ESP_DES_IV64=1, -+ ESP_DES=2, -+ ESP_3DES=3, -+ ESP_RC5=4, -+ ESP_IDEA=5, -+ ESP_CAST=6, -+ ESP_BLOWFISH=7, -+ ESP_3IDEA=8, -+ ESP_DES_IV32=9, -+ ESP_RC4=10, -+ ESP_NULL=11, -+ ESP_AES=12, /* 128 bit AES */ -+ ESP_AES_CTR=13, -+ ESP_AES_CCM_8=14, -+ ESP_AES_CCM_12=15, -+ ESP_AES_CCM_16=16, -+ /* unassigned=17 */ -+ ESP_AES_GCM_8=18, -+ ESP_AES_GCM_12=19, -+ ESP_AES_GCM_16=20, -+ ESP_SEED_CBC=21, -+ ESP_CAMELLIA=22, -+ /* 249-255 reserved for private use */ -+}; -+ -+/* IPCOMP transform values -+ * RFC2407 The Internet IP security Domain of Interpretation for ISAKMP 4.4.5 -+ */ -+ -+enum ipsec_comp_algo { -+ IPCOMP_OUI= 1, -+ IPCOMP_DEFLATE= 2, -+ IPCOMP_LZS= 3, -+ IPCOMP_V42BIS= 4 -+}; -+ -+/* Identification type values -+ * RFC 2407 The Internet IP security Domain of Interpretation for -+ * ISAKMP 4.6.2.1 -+ * -+ * Also for RFC4306. 
-+ * -+ * enum ident_names; -+ */ -+ -+enum ipsec_id_type { -+ ID_FROMCERT= (-3), /* taken from certificate */ -+ ID_IMPOSSIBLE= (-2), /* private to Pluto */ -+ ID_MYID= (-1), /* private to Pluto */ -+ ID_NONE= 0, /* private to Pluto */ -+ ID_IPV4_ADDR= 1, -+ ID_FQDN= 2, -+ ID_USER_FQDN= 3, -+ ID_RFC822_ADDR = ID_USER_FQDN, /* RFC4306 */ -+ ID_IPV4_ADDR_SUBNET= 4, -+ ID_IPV6_ADDR= 5, -+ ID_IPV6_ADDR_SUBNET= 6, -+ ID_IPV4_ADDR_RANGE= 7, -+ ID_IPV6_ADDR_RANGE= 8, -+ ID_DER_ASN1_DN= 9, -+ ID_DER_ASN1_GN= 10, -+ ID_KEY_ID= 11 -+}; -+ -+/* Certificate type values -+ * RFC 2408 ISAKMP, chapter 3.9 -+ */ -+enum ipsec_cert_type { -+ CERT_NONE= 0, /* none, or guess from file contents */ -+ CERT_PKCS7_WRAPPED_X509= 1, /* self-signed certificate from disk */ -+ CERT_PGP= 2, -+ CERT_DNS_SIGNED_KEY= 3, /* KEY RR from DNS */ -+ CERT_X509_SIGNATURE= 4, -+ CERT_X509_KEY_EXCHANGE= 5, -+ CERT_KERBEROS_TOKENS= 6, -+ CERT_CRL= 7, -+ CERT_ARL= 8, -+ CERT_SPKI= 9, -+ CERT_X509_ATTRIBUTE= 10, -+ CERT_RAW_RSA= 11, /* raw RSA from config file */ -+}; -+ -+/* a SIG record in ASCII */ -+struct ipsec_dns_sig { -+ char fqdn[256]; -+ char dns_sig[768]; /* empty string if not signed */ -+}; -+ -+struct ipsec_raw_key { -+ char id_name[256]; -+ char fs_keyid[8]; -+}; -+ -+struct ipsec_identity { -+ enum ipsec_id_type ii_type; -+ enum ipsec_cert_type ii_format; -+ union { -+ struct ipsec_dns_sig ipsec_dns_signed; -+ /* some thing for PGP */ -+ /* some thing for PKIX */ -+ struct ipsec_raw_key ipsec_raw_key; -+ } ii_credential; -+}; -+ -+#define IPSEC_MAX_CREDENTIALS 32 -+ -+struct ipsec_policy_cmd_query { -+ struct ipsec_policy_msg_head head; -+ -+ /* Query section */ -+ ip_address query_local; /* us */ -+ ip_address query_remote; /* them */ -+ u_int8_t proto; /* TCP, ICMP, etc. 
*/ -+ u_short src_port, dst_port; -+ -+ /* Answer section */ -+ enum ipsec_privacy_quality strength; -+ enum ipsec_bandwidth_quality bandwidth; -+ enum ipsec_authentication_algo auth_detail; -+ enum ipsec_cipher_algo esp_detail; -+ enum ipsec_comp_algo comp_detail; -+ -+ int credential_count; -+ -+ struct ipsec_identity credentials[IPSEC_MAX_CREDENTIALS]; -+}; -+ -+#define IPSEC_POLICY_SOCKET "/var/run/pluto/pluto.info" -+ -+/* prototypes */ -+extern err_t ipsec_policy_lookup(int fd, struct ipsec_policy_cmd_query *result); -+extern err_t ipsec_policy_init(void); -+extern err_t ipsec_policy_final(void); -+extern err_t ipsec_policy_readmsg(int policysock, -+ unsigned char *buf, size_t buflen); -+extern err_t ipsec_policy_sendrecv(unsigned char *buf, size_t buflen); -+extern err_t ipsec_policy_cgilookup(struct ipsec_policy_cmd_query *result); -+ -+ -+extern const char *ipsec_policy_version_code(void); -+extern const char *ipsec_policy_version_string(void); -+ -+#endif /* _IPSEC_POLICY_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_proto.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,195 @@ -+/* -+ * @(#) prototypes for FreeSWAN functions -+ * -+ * Copyright (C) 2001 Richard Guy Briggs -+ * and Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ * RCSID $Id: ipsec_proto.h,v 1.14 2005/04/29 04:50:03 mcr Exp $ -+ * -+ */ -+ -+#ifndef _IPSEC_PROTO_H_ -+ -+#include "ipsec_param.h" -+ -+/* -+ * This file is a kernel only file that declares prototypes for -+ * all intra-module function calls and global data structures. -+ * -+ * Include this file last. -+ * -+ */ -+ -+/* forward references */ -+enum ipsec_direction; -+enum ipsec_life_type; -+struct ipsec_lifetime64; -+struct ident; -+struct sockaddr_encap; -+struct ipsec_sa; -+ -+/* ipsec_init.c */ -+extern struct prng ipsec_prng; -+ -+/* ipsec_sa.c */ -+extern struct ipsec_sa *ipsec_sadb_hash[SADB_HASHMOD]; -+extern spinlock_t tdb_lock; -+extern int ipsec_sadb_init(void); -+extern int ipsec_sadb_cleanup(__u8); -+ -+extern struct ipsec_sa *ipsec_sa_alloc(int*error); -+ -+ -+extern struct ipsec_sa *ipsec_sa_getbyid(ip_said *); -+extern /* void */ int ipsec_sa_add(struct ipsec_sa *); -+ -+extern int ipsec_sa_init(struct ipsec_sa *ipsp); -+ -+/* debug declarations */ -+ -+/* ipsec_proc.c */ -+extern int ipsec_proc_init(void); -+extern void ipsec_proc_cleanup(void); -+ -+/* ipsec_rcv.c */ -+extern int ipsec_rcv(struct sk_buff *skb); -+extern int klips26_rcv_encap(struct sk_buff *skb, __u16 encap_type); -+ -+/* ipsec_xmit.c */ -+struct ipsec_xmit_state; -+extern enum ipsec_xmit_value ipsec_xmit_sanity_check_dev(struct ipsec_xmit_state *ixs); -+extern enum ipsec_xmit_value ipsec_xmit_sanity_check_skb(struct ipsec_xmit_state *ixs); -+extern void ipsec_print_ip(struct iphdr *ip); -+ -+ -+ -+/* ipsec_radij.c */ -+extern int ipsec_makeroute(struct sockaddr_encap *ea, -+ struct sockaddr_encap *em, -+ ip_said said, -+ uint32_t pid, -+ struct sk_buff *skb, -+ struct ident *ident_s, -+ struct ident *ident_d); -+ -+extern int ipsec_breakroute(struct sockaddr_encap *ea, -+ struct sockaddr_encap *em, -+ struct sk_buff **first, -+ struct sk_buff **last); -+ -+int ipsec_radijinit(void); -+int ipsec_cleareroutes(void); -+int ipsec_radijcleanup(void); -+ -+/* ipsec_life.c */ 
-+extern enum ipsec_life_alive ipsec_lifetime_check(struct ipsec_lifetime64 *il64, -+ const char *lifename, -+ const char *saname, -+ enum ipsec_life_type ilt, -+ enum ipsec_direction idir, -+ struct ipsec_sa *ips); -+ -+ -+extern int ipsec_lifetime_format(char *buffer, -+ int buflen, -+ char *lifename, -+ enum ipsec_life_type timebaselife, -+ struct ipsec_lifetime64 *lifetime); -+ -+extern void ipsec_lifetime_update_hard(struct ipsec_lifetime64 *lifetime, -+ __u64 newvalue); -+ -+extern void ipsec_lifetime_update_soft(struct ipsec_lifetime64 *lifetime, -+ __u64 newvalue); -+ -+/* ipsec_snprintf.c */ -+extern int ipsec_snprintf(char * buf, ssize_t size, const char *fmt, ...); -+extern void ipsec_dmp_block(char *s, caddr_t bb, int len); -+ -+ -+/* ipsec_alg.c */ -+extern int ipsec_alg_init(void); -+ -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ -+extern int debug_xform; -+extern int debug_eroute; -+extern int debug_spi; -+extern int debug_netlink; -+ -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ -+ -+ -+#define _IPSEC_PROTO_H -+#endif /* _IPSEC_PROTO_H_ */ -+ -+/* -+ * $Log: ipsec_proto.h,v $ -+ * Revision 1.14 2005/04/29 04:50:03 mcr -+ * prototypes for xmit and alg code. -+ * -+ * Revision 1.13 2005/04/17 03:46:07 mcr -+ * added prototypes for ipsec_rcv() routines. -+ * -+ * Revision 1.12 2005/04/14 20:28:37 mcr -+ * added additional prototypes. -+ * -+ * Revision 1.11 2005/04/14 01:16:28 mcr -+ * add prototypes for snprintf. -+ * -+ * Revision 1.10 2005/04/13 22:47:28 mcr -+ * make sure that forward references are available. -+ * -+ * Revision 1.9 2004/07/10 19:08:41 mcr -+ * CONFIG_IPSEC -> CONFIG_KLIPS. -+ * -+ * Revision 1.8 2004/04/05 19:55:06 mcr -+ * Moved from linux/include/freeswan/ipsec_proto.h,v -+ * -+ * Revision 1.7 2003/10/31 02:27:05 mcr -+ * pulled up port-selector patches and sa_id elimination. 
-+ * -+ * Revision 1.6.30.1 2003/10/29 01:10:19 mcr -+ * elimited "struct sa_id" -+ * -+ * Revision 1.6 2002/05/23 07:13:48 rgb -+ * Added ipsec_sa_put() for releasing an ipsec_sa refcount. -+ * -+ * Revision 1.5 2002/05/14 02:36:40 rgb -+ * Converted reference from ipsec_sa_put to ipsec_sa_add to avoid confusion -+ * with "put" usage in the kernel. -+ * -+ * Revision 1.4 2002/04/24 07:36:47 mcr -+ * Moved from ./klips/net/ipsec/ipsec_proto.h,v -+ * -+ * Revision 1.3 2002/04/20 00:12:25 rgb -+ * Added esp IV CBC attack fix, disabled. -+ * -+ * Revision 1.2 2001/11/26 09:16:15 rgb -+ * Merge MCR's ipsec_sa, eroute, proc and struct lifetime changes. -+ * -+ * Revision 1.1.2.1 2001/09/25 02:21:01 mcr -+ * ipsec_proto.h created to keep prototypes rather than deal with -+ * cyclic dependancies of structures and prototypes in .h files. -+ * -+ * -+ * -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_radij.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,179 @@ -+/* -+ * @(#) Definitions relevant to the IPSEC <> radij tree interfacing -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ * RCSID $Id: ipsec_radij.h,v 1.22 2004/07/10 19:08:41 mcr Exp $ -+ */ -+ -+#ifndef _IPSEC_RADIJ_H -+ -+#include -+ -+int ipsec_walk(char *); -+ -+int ipsec_rj_walker_procprint(struct radij_node *, void *); -+int ipsec_rj_walker_delete(struct radij_node *, void *); -+ -+/* This structure is used to pass information between -+ * ipsec_eroute_get_info and ipsec_rj_walker_procprint -+ * (through rj_walktree) and between calls of ipsec_rj_walker_procprint. -+ */ -+struct wsbuf -+{ -+ /* from caller of ipsec_eroute_get_info: */ -+ char *const buffer; /* start of buffer provided */ -+ const int length; /* length of buffer provided */ -+ const off_t offset; /* file position of first character of interest */ -+ /* accumulated by ipsec_rj_walker_procprint: */ -+ int len; /* number of character filled into buffer */ -+ off_t begin; /* file position contained in buffer[0] (<=offset) */ -+}; -+ -+extern struct radij_node_head *rnh; -+extern spinlock_t eroute_lock; -+ -+struct eroute * ipsec_findroute(struct sockaddr_encap *); -+ -+#define O1(x) (int)(((x)>>24)&0xff) -+#define O2(x) (int)(((x)>>16)&0xff) -+#define O3(x) (int)(((x)>>8)&0xff) -+#define O4(x) (int)(((x))&0xff) -+ -+#ifdef CONFIG_KLIPS_DEBUG -+extern int debug_radij; -+void rj_dumptrees(void); -+ -+#define DB_RJ_DUMPTREES 0x0001 -+#define DB_RJ_FINDROUTE 0x0002 -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+#define _IPSEC_RADIJ_H -+#endif -+ -+/* -+ * $Log: ipsec_radij.h,v $ -+ * Revision 1.22 2004/07/10 19:08:41 mcr -+ * CONFIG_IPSEC -> CONFIG_KLIPS. -+ * -+ * Revision 1.21 2004/04/29 11:06:42 ken -+ * Last bits from 2.06 procfs updates -+ * -+ * Revision 1.20 2004/04/06 02:49:08 mcr -+ * pullup of algo code from alg-branch. 
-+ * -+ * Revision 1.19 2004/04/05 19:55:06 mcr -+ * Moved from linux/include/freeswan/ipsec_radij.h,v -+ * -+ * Revision 1.18 2002/04/24 07:36:47 mcr -+ * Moved from ./klips/net/ipsec/ipsec_radij.h,v -+ * -+ * Revision 1.17 2001/11/26 09:23:49 rgb -+ * Merge MCR's ipsec_sa, eroute, proc and struct lifetime changes. -+ * -+ * Revision 1.16.2.1 2001/09/25 02:21:17 mcr -+ * ipsec_proto.h created to keep prototypes rather than deal with -+ * cyclic dependancies of structures and prototypes in .h files. -+ * -+ * Revision 1.16 2001/09/15 16:24:04 rgb -+ * Re-inject first and last HOLD packet when an eroute REPLACE is done. -+ * -+ * Revision 1.15 2001/09/14 16:58:37 rgb -+ * Added support for storing the first and last packets through a HOLD. -+ * -+ * Revision 1.14 2001/09/08 21:13:32 rgb -+ * Added pfkey ident extension support for ISAKMPd. (NetCelo) -+ * -+ * Revision 1.13 2001/06/14 19:35:09 rgb -+ * Update copyright date. -+ * -+ * Revision 1.12 2001/05/27 06:12:11 rgb -+ * Added structures for pid, packet count and last access time to eroute. -+ * Added packet count to beginning of /proc/net/ipsec_eroute. -+ * -+ * Revision 1.11 2000/09/08 19:12:56 rgb -+ * Change references from DEBUG_IPSEC to CONFIG_IPSEC_DEBUG. -+ * -+ * Revision 1.10 1999/11/17 15:53:39 rgb -+ * Changed all occurrences of #include "../../../lib/freeswan.h" -+ * to #include which works due to -Ilibfreeswan in the -+ * klips/net/ipsec/Makefile. -+ * -+ * Revision 1.9 1999/10/01 00:01:23 rgb -+ * Added eroute structure locking. -+ * -+ * Revision 1.8 1999/04/11 00:28:59 henry -+ * GPL boilerplate -+ * -+ * Revision 1.7 1999/04/06 04:54:26 rgb -+ * Fix/Add RCSID Id: and Log: bits to make PHMDs happy. This includes -+ * patch shell fixes. -+ * -+ * Revision 1.6 1999/01/22 06:23:26 rgb -+ * Cruft clean-out. -+ * -+ * Revision 1.5 1998/10/25 02:42:08 rgb -+ * Change return type on ipsec_breakroute and ipsec_makeroute and add an -+ * argument to be able to transmit more infomation about errors. 
-+ * -+ * Revision 1.4 1998/10/19 14:44:29 rgb -+ * Added inclusion of freeswan.h. -+ * sa_id structure implemented and used: now includes protocol. -+ * -+ * Revision 1.3 1998/07/28 00:03:31 rgb -+ * Comment out temporary inet_nto4u() kluge. -+ * -+ * Revision 1.2 1998/07/14 18:22:00 rgb -+ * Add function to clear the eroute table. -+ * -+ * Revision 1.1 1998/06/18 21:27:49 henry -+ * move sources from klips/src to klips/net/ipsec, to keep stupid -+ * kernel-build scripts happier in the presence of symlinks -+ * -+ * Revision 1.5 1998/05/25 20:30:38 rgb -+ * Remove temporary ipsec_walk, rj_deltree and rj_delnodes functions. -+ * -+ * Rename ipsec_rj_walker (ipsec_walk) to ipsec_rj_walker_procprint and -+ * add ipsec_rj_walker_delete. -+ * -+ * Revision 1.4 1998/05/21 13:02:56 rgb -+ * Imported definitions from ipsec_radij.c and radij.c to support /proc 3k -+ * limit fix. -+ * -+ * Revision 1.3 1998/04/21 21:29:09 rgb -+ * Rearrange debug switches to change on the fly debug output from user -+ * space. Only kernel changes checked in at this time. radij.c was also -+ * changed to temporarily remove buggy debugging code in rj_delete causing -+ * an OOPS and hence, netlink device open errors. -+ * -+ * Revision 1.2 1998/04/14 17:30:39 rgb -+ * Fix up compiling errors for radij tree memory reclamation. -+ * -+ * Revision 1.1 1998/04/09 03:06:10 henry -+ * sources moved up from linux/net/ipsec -+ * -+ * Revision 1.1.1.1 1998/04/08 05:35:04 henry -+ * RGB's ipsec-0.8pre2.tar.gz ipsec-0.8 -+ * -+ * Revision 0.4 1997/01/15 01:28:15 ji -+ * No changes. -+ * -+ * Revision 0.3 1996/11/20 14:39:04 ji -+ * Minor cleanups. -+ * Rationalized debugging code. -+ * -+ * Revision 0.2 1996/11/02 00:18:33 ji -+ * First limited release. -+ * -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_rcv.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,261 @@ -+/* -+ * -+ * Copyright (C) 1996, 1997 John Ioannidis. 
-+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: ipsec_rcv.h,v 1.28.2.1 2006/07/10 15:52:20 paul Exp $ -+ */ -+ -+#ifndef IPSEC_RCV_H -+#define IPSEC_RCV_H -+ -+#include "openswan/ipsec_auth.h" -+ -+#define DB_RX_PKTRX 0x0001 -+#define DB_RX_PKTRX2 0x0002 -+#define DB_RX_DMP 0x0004 -+#define DB_RX_IPSA 0x0010 -+#define DB_RX_XF 0x0020 -+#define DB_RX_IPAD 0x0040 -+#define DB_RX_INAU 0x0080 -+#define DB_RX_OINFO 0x0100 -+#define DB_RX_OINFO2 0x0200 -+#define DB_RX_OH 0x0400 -+#define DB_RX_REPLAY 0x0800 -+ -+#ifdef __KERNEL__ -+/* struct options; */ -+ -+#define __NO_VERSION__ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif /* for CONFIG_IP_FORWARD */ -+#ifdef CONFIG_MODULES -+#include -+#endif -+#include -+#include -+ -+#ifdef CONFIG_KLIPS_OCF -+#include -+#endif -+ -+#define IPSEC_BIRTH_TEMPLATE_MAXLEN 256 -+ -+struct ipsec_birth_reply { -+ int packet_template_len; -+ unsigned char packet_template[IPSEC_BIRTH_TEMPLATE_MAXLEN]; -+}; -+ -+extern struct ipsec_birth_reply ipsec_ipv4_birth_packet; -+extern struct ipsec_birth_reply ipsec_ipv6_birth_packet; -+ -+enum ipsec_rcv_value { -+ IPSEC_RCV_PENDING=2, -+ IPSEC_RCV_LASTPROTO=1, -+ IPSEC_RCV_OK=0, -+ IPSEC_RCV_BADPROTO=-1, -+ IPSEC_RCV_BADLEN=-2, -+ IPSEC_RCV_ESP_BADALG=-3, -+ IPSEC_RCV_3DES_BADBLOCKING=-4, -+ IPSEC_RCV_ESP_DECAPFAIL=-5, -+ IPSEC_RCV_DECAPFAIL=-6, -+ IPSEC_RCV_SAIDNOTFOUND=-7, -+ IPSEC_RCV_IPCOMPALONE=-8, -+ IPSEC_RCV_IPCOMPFAILED=-10, -+ 
IPSEC_RCV_SAIDNOTLIVE=-11, -+ IPSEC_RCV_FAILEDINBOUND=-12, -+ IPSEC_RCV_LIFETIMEFAILED=-13, -+ IPSEC_RCV_BADAUTH=-14, -+ IPSEC_RCV_REPLAYFAILED=-15, -+ IPSEC_RCV_AUTHFAILED=-16, -+ IPSEC_RCV_REPLAYROLLED=-17, -+ IPSEC_RCV_BAD_DECRYPT=-18, -+ IPSEC_RCV_REALLYBAD=-19 -+}; -+ -+/* -+ * state machine states -+ */ -+ -+#define IPSEC_RSM_INIT 0 /* make it easy, starting state is 0 */ -+#define IPSEC_RSM_DECAP_INIT 1 -+#define IPSEC_RSM_DECAP_LOOKUP 2 -+#define IPSEC_RSM_AUTH_INIT 3 -+#define IPSEC_RSM_AUTH_DECAP 4 -+#define IPSEC_RSM_AUTH_CALC 5 -+#define IPSEC_RSM_AUTH_CHK 6 -+#define IPSEC_RSM_DECRYPT 7 -+#define IPSEC_RSM_DECAP_CONT 8 /* do we restart at IPSEC_RSM_DECAP_INIT */ -+#define IPSEC_RSM_CLEANUP 9 -+#define IPSEC_RSM_IPCOMP 10 -+#define IPSEC_RSM_COMPLETE 11 -+#define IPSEC_RSM_DONE 100 -+ -+struct ipsec_rcv_state { -+ struct sk_buff *skb; -+ struct net_device_stats *stats; -+ struct iphdr *ipp; /* the IP header */ -+ struct ipsec_sa *ipsp; /* current SA being processed */ -+ struct ipsec_sa *lastipsp; /* last SA that was processed */ -+ int len; /* length of packet */ -+ int ilen; /* length of inner payload (-authlen) */ -+ int authlen; /* how big is the auth data at end */ -+ int hard_header_len; /* layer 2 size */ -+ int iphlen; /* how big is IP header */ -+ unsigned int transport_direct:1; -+ struct auth_alg *authfuncs; -+ ip_said said; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ __u8 next_header; -+ __u8 hash[AH_AMAX]; -+ char ipsaddr_txt[ADDRTOA_BUF]; -+ char ipdaddr_txt[ADDRTOA_BUF]; -+ __u8 *octx; -+ __u8 *ictx; -+ int ictx_len; -+ int octx_len; -+ union { -+ struct { -+ struct esphdr *espp; -+ } espstuff; -+ struct { -+ struct ahhdr *ahp; -+ } ahstuff; -+ struct { -+ struct ipcomphdr *compp; -+ } ipcompstuff; -+ } protostuff; -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ __u8 natt_type; -+ __u16 natt_sport; -+ __u16 natt_dport; -+ int natt_len; -+#endif -+ -+ /* -+ * rcv state machine use -+ */ -+ int state; -+ int next_state; -+ int auth_checked; -+ 
-+#ifdef CONFIG_KLIPS_OCF -+ struct work_struct workq; -+#ifdef DECLARE_TASKLET -+ struct tasklet_struct tasklet; -+#endif -+#endif -+#ifndef NET_21 -+ struct net_device *devp; -+ struct inet_protocol *protop; -+#endif -+ struct xform_functions *proto_funcs; -+ __u8 proto; -+ int replay; -+ unsigned char *authenticator; -+ int esphlen; -+#ifdef CONFIG_KLIPS_ALG -+ struct ipsec_alg_auth *ixt_a; -+#endif -+ __u8 ttl, tos; -+ __u16 frag_off, check; -+}; -+ -+extern void ipsec_rsm(struct ipsec_rcv_state *irs); -+#ifdef HAVE_KMEM_CACHE_T -+extern kmem_cache_t *ipsec_irs_cache; -+#else -+extern struct kmem_cache *ipsec_irs_cache; -+#endif -+extern int ipsec_irs_max; -+extern atomic_t ipsec_irs_cnt; -+ -+extern int -+#ifdef PROTO_HANDLER_SINGLE_PARM -+ipsec_rcv(struct sk_buff *skb); -+#else /* PROTO_HANDLER_SINGLE_PARM */ -+ipsec_rcv(struct sk_buff *skb, -+ unsigned short xlen); -+#endif /* PROTO_HANDLER_SINGLE_PARM */ -+ -+#ifdef CONFIG_KLIPS_DEBUG -+extern int debug_rcv; -+#define ipsec_rcv_dmp(_x,_y, _z) if (debug_rcv && sysctl_ipsec_debug_verbose) ipsec_dmp_block(_x,_y,_z) -+#else -+#define ipsec_rcv_dmp(_x,_y, _z) do {} while(0) -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+extern int sysctl_ipsec_inbound_policy_check; -+#endif /* __KERNEL__ */ -+ -+extern int klips26_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); -+extern int klips26_rcv_encap(struct sk_buff *skb, __u16 encap_type); -+ -+// manage ipsec rcv state objects -+extern int ipsec_rcv_state_cache_init (void); -+extern void ipsec_rcv_state_cache_cleanup (void); -+ -+#endif /* IPSEC_RCV_H */ -+ -+/* -+ * $Log: ipsec_rcv.h,v $ -+ * Revision 1.28.2.1 2006/07/10 15:52:20 paul -+ * Fix for bug #642 by Bart Trojanowski -+ * -+ * Revision 1.28 2005/05/11 00:59:45 mcr -+ * do not call debug routines if !defined KLIPS_DEBUG. -+ * -+ * Revision 1.27 2005/04/29 04:59:46 mcr -+ * use ipsec_dmp_block. -+ * -+ * Revision 1.26 2005/04/13 22:48:35 mcr -+ * added comments, and removed some log. -+ * removed Linux 2.0 support. 
-+ * -+ * Revision 1.25 2005/04/08 18:25:37 mcr -+ * prototype klips26 encap receive function -+ * -+ * Revision 1.24 2004/08/20 21:45:37 mcr -+ * CONFIG_KLIPS_NAT_TRAVERSAL is not used in an attempt to -+ * be 26sec compatible. But, some defines where changed. -+ * -+ * Revision 1.23 2004/08/03 18:17:40 mcr -+ * in 2.6, use "net_device" instead of #define device->net_device. -+ * this probably breaks 2.0 compiles. -+ * -+ * Revision 1.22 2004/07/10 19:08:41 mcr -+ * CONFIG_IPSEC -> CONFIG_KLIPS. -+ * -+ * Revision 1.21 2004/04/06 02:49:08 mcr -+ * pullup of algo code from alg-branch. -+ * -+ * Revision 1.20 2004/04/05 19:55:06 mcr -+ * Moved from linux/include/freeswan/ipsec_rcv.h,v -+ * -+ * Revision 1.19 2003/12/15 18:13:09 mcr -+ * when compiling with NAT traversal, don't assume that the -+ * kernel has been patched, unless CONFIG_IPSEC_NAT_NON_ESP -+ * is set. -+ * -+ * history elided 2005-04-12. -+ * -+ * Local Variables: -+ * c-basic-offset:8 -+ * c-style:linux -+ * End: -+ * -+ */ -+ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_sa.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,280 @@ -+/* -+ * @(#) Definitions of IPsec Security Association (ipsec_sa) -+ * -+ * Copyright (C) 2001, 2002, 2003 -+ * Richard Guy Briggs -+ * and Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: ipsec_sa.h,v 1.23.2.1 2007/09/05 02:31:15 paul Exp $ -+ * -+ * This file derived from ipsec_xform.h on 2001/9/18 by mcr. 
-+ * -+ */ -+ -+/* -+ * This file describes the IPsec Security Association Structure. -+ * -+ * This structure keeps track of a single transform that may be done -+ * to a set of packets. It can describe applying the transform or -+ * apply the reverse. (e.g. compression vs expansion). However, it -+ * only describes one at a time. To describe both, two structures would -+ * be used, but since the sides of the transform are performed -+ * on different machines typically it is usual to have only one side -+ * of each association. -+ * -+ */ -+ -+#ifndef _IPSEC_SA_H_ -+ -+#ifdef __KERNEL__ -+#include "openswan/ipsec_stats.h" -+#include "openswan/ipsec_life.h" -+#include "openswan/ipsec_eroute.h" -+#endif /* __KERNEL__ */ -+#include "openswan/ipsec_param.h" -+ -+#include "openswan/pfkeyv2.h" -+ -+ -+/* SAs are held in a table. -+ * Entries in this table are referenced by IPsecSAref_t values. -+ * IPsecSAref_t values are conceptually subscripts. Because -+ * we want to allocate the table piece-meal, the subscripting -+ * is implemented with two levels, a bit like paged virtual memory. -+ * This representation mechanism is known as an Iliffe Vector. -+ * -+ * The Main table (AKA the refTable) consists of 2^IPSEC_SA_REF_MAINTABLE_IDX_WIDTH -+ * pointers to subtables. -+ * Each subtable has 2^IPSEC_SA_REF_SUBTABLE_IDX_WIDTH entries, each of which -+ * is a pointer to an SA. -+ * -+ * An IPsecSAref_t contains either an exceptional value (signified by the -+ * high-order bit being on) or a reference to a table entry. A table entry -+ * reference has the subtable subscript in the low-order -+ * IPSEC_SA_REF_SUBTABLE_IDX_WIDTH bits and the Main table subscript -+ * in the next lowest IPSEC_SA_REF_MAINTABLE_IDX_WIDTH bits. -+ * -+ * The Maintable entry for an IPsecSAref_t x, a pointer to its subtable, is -+ * IPsecSAref2table(x). It is of type struct IPsecSArefSubTable *. -+ * -+ * The pointer to the SA for x is IPsecSAref2SA(x). It is of type -+ * struct ipsec_sa*. 
The macro definition clearly shows the two-level -+ * access needed to find the SA pointer. -+ * -+ * The Maintable is allocated when IPsec is initialized. -+ * Each subtable is allocated when needed, but the first is allocated -+ * when IPsec is initialized. -+ * -+ * IPsecSAref_t is designed to be smaller than an NFmark so that -+ * they can be stored in NFmarks and still leave a few bits for other -+ * purposes. The spare bits are in the low order of the NFmark -+ * but in the high order of the IPsecSAref_t, so conversion is required. -+ * We pick the upper bits of NFmark on the theory that they are less likely to -+ * interfere with more pedestrian uses of nfmark. -+ */ -+ -+ -+typedef unsigned short int IPsecRefTableUnusedCount; -+ -+#define IPSEC_SA_REF_TABLE_NUM_ENTRIES (1 << IPSEC_SA_REF_TABLE_IDX_WIDTH) -+ -+#ifdef __KERNEL__ -+#if ((IPSEC_SA_REF_TABLE_IDX_WIDTH - (1 + IPSEC_SA_REF_MAINTABLE_IDX_WIDTH)) < 0) -+#error "IPSEC_SA_REF_TABLE_IDX_WIDTH("IPSEC_SA_REF_TABLE_IDX_WIDTH") MUST be < 1 + IPSEC_SA_REF_MAINTABLE_IDX_WIDTH("IPSEC_SA_REF_MAINTABLE_IDX_WIDTH")" -+#endif -+ -+#define IPSEC_SA_REF_SUBTABLE_IDX_WIDTH (IPSEC_SA_REF_TABLE_IDX_WIDTH - IPSEC_SA_REF_MAINTABLE_IDX_WIDTH) -+ -+#define IPSEC_SA_REF_MAINTABLE_NUM_ENTRIES (1 << IPSEC_SA_REF_MAINTABLE_IDX_WIDTH) -+#define IPSEC_SA_REF_SUBTABLE_NUM_ENTRIES (1 << IPSEC_SA_REF_SUBTABLE_IDX_WIDTH) -+ -+#ifdef CONFIG_NETFILTER -+#define IPSEC_SA_REF_HOST_FIELD(x) ((struct sk_buff*)(x))->nfmark -+#define IPSEC_SA_REF_HOST_FIELD_TYPE typeof(IPSEC_SA_REF_HOST_FIELD(NULL)) -+#else /* CONFIG_NETFILTER */ -+/* just make it work for now, it doesn't matter, since there is no nfmark */ -+#define IPSEC_SA_REF_HOST_FIELD_TYPE unsigned long -+#endif /* CONFIG_NETFILTER */ -+#define IPSEC_SA_REF_HOST_FIELD_WIDTH (8 * sizeof(IPSEC_SA_REF_HOST_FIELD_TYPE)) -+#define IPSEC_SA_REF_FIELD_WIDTH (8 * sizeof(IPsecSAref_t)) -+ -+#define IPSEC_SA_REF_MAX (~IPSEC_SAREF_NULL) -+#define IPSEC_SAREF_FIRST 1 -+#define IPSEC_SA_REF_MASK 
(IPSEC_SA_REF_MAX >> (IPSEC_SA_REF_FIELD_WIDTH - IPSEC_SA_REF_TABLE_IDX_WIDTH)) -+#define IPSEC_SA_REF_TABLE_MASK ((IPSEC_SA_REF_MAX >> (IPSEC_SA_REF_FIELD_WIDTH - IPSEC_SA_REF_MAINTABLE_IDX_WIDTH)) << IPSEC_SA_REF_SUBTABLE_IDX_WIDTH) -+#define IPSEC_SA_REF_ENTRY_MASK (IPSEC_SA_REF_MAX >> (IPSEC_SA_REF_FIELD_WIDTH - IPSEC_SA_REF_SUBTABLE_IDX_WIDTH)) -+ -+#define IPsecSAref2table(x) (((x) & IPSEC_SA_REF_TABLE_MASK) >> IPSEC_SA_REF_SUBTABLE_IDX_WIDTH) -+#define IPsecSAref2entry(x) ((x) & IPSEC_SA_REF_ENTRY_MASK) -+#define IPsecSArefBuild(x,y) (((x) << IPSEC_SA_REF_SUBTABLE_IDX_WIDTH) + (y)) -+ -+#define IPsecSAref2SA(x) (ipsec_sadb.refTable[IPsecSAref2table(x)]->entry[IPsecSAref2entry(x)]) -+#define IPsecSA2SAref(x) ((x)->ips_ref) -+ -+#define EMT_INBOUND 0x01 /* SA direction, 1=inbound */ -+ -+/* 'struct ipsec_sa' should be 64bit aligned when allocated. */ -+struct ipsec_sa -+{ -+ atomic_t ips_refcount; /* reference count for this struct */ -+ int ips_marked_deleted; /* used with reference counting */ -+ IPsecSAref_t ips_ref; /* reference table entry number */ -+ IPsecSAref_t ips_refhim; /* ref of paired SA, if any */ -+ struct ipsec_sa *ips_next; /* pointer to next xform */ -+ -+ struct ipsec_sa *ips_hnext; /* next in hash chain */ -+ -+ struct ifnet *ips_rcvif; /* related rcv encap interface */ -+ -+ struct xform_functions *ips_xformfuncs; /* pointer to routines to process this SA */ -+ -+ struct net_device *ips_out; /* what interface to emerge on */ -+ __u8 ips_transport_direct; /* if true, punt directly to -+ * the protocol layer */ -+ struct socket *ips_sock; /* cache of transport socket */ -+ -+ ip_said ips_said; /* SA ID */ -+ -+ __u32 ips_seq; /* seq num of msg that initiated this SA */ -+ __u32 ips_pid; /* PID of process that initiated this SA */ -+ __u8 ips_authalg; /* auth algorithm for this SA */ -+ __u8 ips_encalg; /* enc algorithm for this SA */ -+ -+ struct ipsec_stats ips_errs; -+ -+ __u8 ips_replaywin; /* replay window size */ -+ enum sadb_sastate 
ips_state; /* state of SA */ -+ __u32 ips_replaywin_lastseq; /* last pkt sequence num */ -+ __u64 ips_replaywin_bitmap; /* bitmap of received pkts */ -+ __u32 ips_replaywin_maxdiff; /* max pkt sequence difference */ -+ -+ __u32 ips_flags; /* generic xform flags */ -+ -+ -+ struct ipsec_lifetimes ips_life; /* lifetime records */ -+ -+ /* selector information */ -+ __u8 ips_transport_protocol; /* protocol for this SA, if ports are involved */ -+ struct sockaddr*ips_addr_s; /* src sockaddr */ -+ struct sockaddr*ips_addr_d; /* dst sockaddr */ -+ struct sockaddr*ips_addr_p; /* proxy sockaddr */ -+ __u16 ips_addr_s_size; -+ __u16 ips_addr_d_size; -+ __u16 ips_addr_p_size; -+ ip_address ips_flow_s; -+ ip_address ips_flow_d; -+ ip_address ips_mask_s; -+ ip_address ips_mask_d; -+ -+ __u16 ips_key_bits_a; /* size of authkey in bits */ -+ __u16 ips_auth_bits; /* size of authenticator in bits */ -+ __u16 ips_key_bits_e; /* size of enckey in bits */ -+ __u16 ips_iv_bits; /* size of IV in bits */ -+ __u8 ips_iv_size; -+ __u16 ips_key_a_size; -+ __u16 ips_key_e_size; -+ -+ caddr_t ips_key_a; /* authentication key */ -+ caddr_t ips_key_e; /* encryption key */ -+ caddr_t ips_iv; /* Initialisation Vector */ -+ -+ struct ident ips_ident_s; /* identity src */ -+ struct ident ips_ident_d; /* identity dst */ -+ -+ /* these are included even if CONFIG_KLIPS_IPCOMP is off */ -+ __u16 ips_comp_adapt_tries; /* ipcomp self-adaption tries */ -+ __u16 ips_comp_adapt_skip; /* ipcomp self-adaption to-skip */ -+ __u64 ips_comp_ratio_cbytes; /* compressed bytes */ -+ __u64 ips_comp_ratio_dbytes; /* decompressed (or uncompressed) bytes */ -+ -+ /* these are included even if CONFIG_IPSEC_NAT_TRAVERSAL is off */ -+ __u8 ips_natt_type; -+ __u8 ips_natt_reserved[3]; -+ __u16 ips_natt_sport; -+ __u16 ips_natt_dport; -+ -+ struct sockaddr *ips_natt_oa; -+ __u16 ips_natt_oa_size; -+ __u16 ips_natt_reserved2; -+ -+#if 0 -+ __u32 ips_sens_dpd; -+ __u8 ips_sens_sens_level; -+ __u8 ips_sens_sens_len; -+ 
__u64* ips_sens_sens_bitmap; -+ __u8 ips_sens_integ_level; -+ __u8 ips_sens_integ_len; -+ __u64* ips_sens_integ_bitmap; -+#endif -+ struct ipsec_alg_enc *ips_alg_enc; -+ struct ipsec_alg_auth *ips_alg_auth; -+ -+ int ocf_in_use; -+ int64_t ocf_cryptoid; -+}; -+ -+struct IPsecSArefSubTable -+{ -+ struct ipsec_sa* entry[IPSEC_SA_REF_SUBTABLE_NUM_ENTRIES]; -+}; -+ -+struct ipsec_sadb { -+ struct IPsecSArefSubTable* refTable[IPSEC_SA_REF_MAINTABLE_NUM_ENTRIES]; -+ IPsecSAref_t refFreeList[IPSEC_SA_REF_FREELIST_NUM_ENTRIES]; -+ int refFreeListHead; -+ int refFreeListTail; -+ IPsecSAref_t refFreeListCont; -+ IPsecSAref_t said_hash[SADB_HASHMOD]; -+ spinlock_t sadb_lock; -+}; -+ -+extern struct ipsec_sadb ipsec_sadb; -+ -+extern int ipsec_SAref_recycle(void); -+extern int ipsec_SArefSubTable_alloc(unsigned table); -+extern int ipsec_saref_freelist_init(void); -+extern int ipsec_sadb_init(void); -+extern struct ipsec_sa *ipsec_sa_alloc(int*error); /* pass in error var by pointer */ -+extern IPsecSAref_t ipsec_SAref_alloc(int*erorr); /* pass in error var by pointer */ -+extern int ipsec_sa_free(struct ipsec_sa* ips); -+ -+#define ipsec_sa_get(ips) __ipsec_sa_get(ips, __FUNCTION__, __LINE__) -+extern struct ipsec_sa * __ipsec_sa_get(struct ipsec_sa *ips, const char *func, int line); -+ -+#define ipsec_sa_put(ips) __ipsec_sa_put(ips, __FUNCTION__, __LINE__) -+extern void __ipsec_sa_put(struct ipsec_sa *ips, const char *func, int line); -+extern int ipsec_sa_add(struct ipsec_sa *ips); -+extern void ipsec_sa_rm(struct ipsec_sa *ips); -+extern int ipsec_sadb_cleanup(__u8 proto); -+extern int ipsec_sadb_free(void); -+extern int ipsec_sa_wipe(struct ipsec_sa *ips); -+extern int ipsec_sa_intern(struct ipsec_sa *ips); -+extern struct ipsec_sa *ipsec_sa_getbyref(IPsecSAref_t ref); -+ -+extern void ipsec_sa_untern(struct ipsec_sa *ips); -+#endif /* __KERNEL__ */ -+ -+enum ipsec_direction { -+ ipsec_incoming = 1, -+ ipsec_outgoing = 2 -+}; -+ -+#define _IPSEC_SA_H_ -+#endif /* 
_IPSEC_SA_H_ */ -+ -+/* -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_sha1.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,79 @@ -+/* -+ * RCSID $Id: ipsec_sha1.h,v 1.8 2004/04/05 19:55:07 mcr Exp $ -+ */ -+ -+/* -+ * Here is the original comment from the distribution: -+ -+SHA-1 in C -+By Steve Reid -+100% Public Domain -+ -+ * Adapted for use by the IPSEC code by John Ioannidis -+ */ -+ -+ -+#ifndef _IPSEC_SHA1_H_ -+#define _IPSEC_SHA1_H_ -+ -+typedef struct -+{ -+ __u32 state[5]; -+ __u32 count[2]; -+ __u8 buffer[64]; -+} SHA1_CTX; -+ -+void SHA1Transform(__u32 state[5], __u8 buffer[64]); -+void SHA1Init(void *context); -+void SHA1Update(void *context, unsigned char *data, __u32 len); -+void SHA1Final(unsigned char digest[20], void *context); -+ -+ -+#endif /* _IPSEC_SHA1_H_ */ -+ -+/* -+ * $Log: ipsec_sha1.h,v $ -+ * Revision 1.8 2004/04/05 19:55:07 mcr -+ * Moved from linux/include/freeswan/ipsec_sha1.h,v -+ * -+ * Revision 1.7 2002/09/10 01:45:09 mcr -+ * changed type of MD5_CTX and SHA1_CTX to void * so that -+ * the function prototypes would match, and could be placed -+ * into a pointer to a function. -+ * -+ * Revision 1.6 2002/04/24 07:36:47 mcr -+ * Moved from ./klips/net/ipsec/ipsec_sha1.h,v -+ * -+ * Revision 1.5 1999/12/13 13:59:13 rgb -+ * Quick fix to argument size to Update bugs. -+ * -+ * Revision 1.4 1999/12/07 18:16:23 rgb -+ * Fixed comments at end of #endif lines. -+ * -+ * Revision 1.3 1999/04/06 04:54:27 rgb -+ * Fix/Add RCSID Id: and Log: bits to make PHMDs happy. This includes -+ * patch shell fixes. -+ * -+ * Revision 1.2 1998/11/30 13:22:54 rgb -+ * Rationalised all the klips kernel file headers. They are much shorter -+ * now and won't conflict under RH5.2. 
-+ * -+ * Revision 1.1 1998/06/18 21:27:50 henry -+ * move sources from klips/src to klips/net/ipsec, to keep stupid -+ * kernel-build scripts happier in the presence of symlinks -+ * -+ * Revision 1.2 1998/04/23 20:54:05 rgb -+ * Fixed md5 and sha1 include file nesting issues, to be cleaned up when -+ * verified. -+ * -+ * Revision 1.1 1998/04/09 03:04:21 henry -+ * sources moved up from linux/net/ipsec -+ * these two include files modified not to include others except in kernel -+ * -+ * Revision 1.1.1.1 1998/04/08 05:35:04 henry -+ * RGB's ipsec-0.8pre2.tar.gz ipsec-0.8 -+ * -+ * Revision 0.4 1997/01/15 01:28:15 ji -+ * New transform -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_stats.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,76 @@ -+/* -+ * @(#) definition of ipsec_stats structure -+ * -+ * Copyright (C) 2001 Richard Guy Briggs -+ * and Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: ipsec_stats.h,v 1.7 2005/04/14 01:17:45 mcr Exp $ -+ * -+ */ -+ -+/* -+ * This file describes the errors/statistics that FreeSWAN collects. 
-+ */ -+ -+#ifndef _IPSEC_STATS_H_ -+ -+struct ipsec_stats { -+ __u32 ips_alg_errs; /* number of algorithm errors */ -+ __u32 ips_auth_errs; /* # of authentication errors */ -+ __u32 ips_encsize_errs; /* # of encryption size errors*/ -+ __u32 ips_encpad_errs; /* # of encryption pad errors*/ -+ __u32 ips_replaywin_errs; /* # of pkt sequence errors */ -+}; -+ -+#define _IPSEC_STATS_H_ -+#endif /* _IPSEC_STATS_H_ */ -+ -+/* -+ * $Log: ipsec_stats.h,v $ -+ * Revision 1.7 2005/04/14 01:17:45 mcr -+ * add prototypes for snprintf. -+ * -+ * Revision 1.6 2004/04/05 19:55:07 mcr -+ * Moved from linux/include/freeswan/ipsec_stats.h,v -+ * -+ * Revision 1.5 2004/04/05 19:41:05 mcr -+ * merged alg-branch code. -+ * -+ * Revision 1.4 2004/03/28 20:27:19 paul -+ * Included tested and confirmed fixes mcr made and dhr verified for -+ * snprint statements. Changed one other snprintf to use ipsec_snprintf -+ * so it wouldnt break compatibility with 2.0/2.2 kernels. Verified with -+ * dhr. (thanks dhr!) -+ * -+ * Revision 1.4 2004/03/24 01:58:31 mcr -+ * sprintf->snprintf for formatting into proc buffer. -+ * -+ * Revision 1.3.34.1 2004/04/05 04:30:46 mcr -+ * patches for alg-branch to compile/work with 2.x openswan -+ * -+ * Revision 1.3 2002/04/24 07:36:47 mcr -+ * Moved from ./klips/net/ipsec/ipsec_stats.h,v -+ * -+ * Revision 1.2 2001/11/26 09:16:16 rgb -+ * Merge MCR's ipsec_sa, eroute, proc and struct lifetime changes. -+ * -+ * Revision 1.1.2.1 2001/09/25 02:27:00 mcr -+ * statistics moved to seperate structure. 
-+ * -+ * -+ * -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_sysctl.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,20 @@ -+#ifndef OPENSWAN_SYSCTL_H -+#define OPENSWAN_SYSCTL_H -+ -+extern int debug_ah; -+extern int debug_esp; -+extern int debug_xform; -+extern int debug_eroute; -+extern int debug_spi; -+extern int debug_netlink; -+extern int debug_radij; -+extern int debug_rcv; -+extern int debug_tunnel; -+extern int debug_xmit; -+extern int debug_mast; -+ -+extern int sysctl_ip_default_ttl; -+extern int sysctl_ipsec_inbound_policy_check; -+extern int sysctl_ipsec_debug_ipcomp; -+extern int sysctl_ipsec_debug_verbose; -+#endif ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_tunnel.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,286 @@ -+/* -+ * IPSEC tunneling code -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 Richard Guy Briggs. -+ * Copyright (C) 2006 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ */ -+ -+ -+# define DEV_QUEUE_XMIT(skb, device, pri) {\ -+ skb->dev = device; \ -+ neigh_compat_output(skb); \ -+ /* skb->dst->output(skb); */ \ -+ } -+# define ICMP_SEND(skb_in, type, code, info, dev) \ -+ icmp_send(skb_in, type, code, htonl(info)) -+# define IP_SEND(skb, dev) \ -+ ip_send(skb); -+ -+ -+#if defined(KLIPS) -+/* -+ * Heavily based on drivers/net/new_tunnel.c. 
Lots -+ * of ideas also taken from the 2.1.x version of drivers/net/shaper.c -+ */ -+ -+struct ipsectunnelconf -+{ -+ uint32_t cf_cmd; -+ union -+ { -+ char cfu_name[12]; -+ } cf_u; -+#define cf_name cf_u.cfu_name -+}; -+ -+#define IPSEC_SET_DEV (SIOCDEVPRIVATE) -+#define IPSEC_DEL_DEV (SIOCDEVPRIVATE + 1) -+#define IPSEC_CLR_DEV (SIOCDEVPRIVATE + 2) -+#define IPSEC_UDP_ENCAP_CONVERT (SIOCDEVPRIVATE + 3) -+#endif -+ -+#ifdef __KERNEL__ -+#include -+#ifndef KERNEL_VERSION -+# define KERNEL_VERSION(x,y,z) (((x)<<16)+((y)<<8)+(z)) -+#endif -+struct ipsecpriv -+{ -+ struct sk_buff_head sendq; -+ struct net_device *dev; -+ struct wait_queue *wait_queue; -+ int vifnum; -+ char locked; -+ int (*hard_start_xmit) (struct sk_buff *skb, -+ struct net_device *dev); -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+ const struct header_ops *header_ops; -+#else -+ -+ int (*hard_header) (struct sk_buff *skb, -+ struct net_device *dev, -+ unsigned short type, -+ void *daddr, -+ void *saddr, -+ unsigned len); -+#ifdef NET_21 -+ int (*rebuild_header)(struct sk_buff *skb); -+#else /* NET_21 */ -+ int (*rebuild_header)(void *buff, struct net_device *dev, -+ unsigned long raddr, struct sk_buff *skb); -+#endif /* NET_21 */ -+#ifndef NET_21 -+ void (*header_cache_bind)(struct hh_cache **hhp, struct net_device *dev, -+ unsigned short htype, __u32 daddr); -+#endif /* !NET_21 */ -+ void (*header_cache_update)(struct hh_cache *hh, struct net_device *dev, unsigned char * haddr); -+#endif -+ int (*set_mac_address)(struct net_device *dev, void *addr); -+ struct net_device_stats *(*get_stats)(struct net_device *dev); -+ struct net_device_stats mystats; -+ int mtu; /* What is the desired MTU? 
*/ -+}; -+ -+extern char ipsec_tunnel_c_version[]; -+ -+extern struct net_device *ipsecdevices[IPSEC_NUM_IFMAX]; -+extern int ipsecdevices_max; -+ -+int ipsec_tunnel_init_devices(void); -+ -+/* void */ int ipsec_tunnel_cleanup_devices(void); -+ -+extern /* void */ int ipsec_init(void); -+ -+extern int ipsec_tunnel_start_xmit(struct sk_buff *skb, struct net_device *dev); -+extern struct net_device *ipsec_get_device(int inst); -+ -+#ifdef CONFIG_KLIPS_DEBUG -+extern int debug_tunnel; -+extern int sysctl_ipsec_debug_verbose; -+#endif /* CONFIG_KLIPS_DEBUG */ -+#endif /* __KERNEL__ */ -+ -+#ifdef CONFIG_KLIPS_DEBUG -+#define DB_TN_INIT 0x0001 -+#define DB_TN_PROCFS 0x0002 -+#define DB_TN_XMIT 0x0010 -+#define DB_TN_OHDR 0x0020 -+#define DB_TN_CROUT 0x0040 -+#define DB_TN_OXFS 0x0080 -+#define DB_TN_REVEC 0x0100 -+#define DB_TN_ENCAP 0x0200 -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+extern int ipsec_tunnel_deletenum(int vifnum); -+extern int ipsec_tunnel_createnum(int vifnum); -+extern struct net_device *ipsec_tunnel_get_device(int vifnum); -+ -+ -+/* manage ipsec xmit state objects */ -+extern int ipsec_xmit_state_cache_init (void); -+extern void ipsec_xmit_state_cache_cleanup (void); -+struct ipsec_xmit_state *ipsec_xmit_state_new (void); -+void ipsec_xmit_state_delete (struct ipsec_xmit_state *ixs); -+ -+/* -+ * $Log: ipsec_tunnel.h,v $ -+ * Revision 1.33 2005/06/04 16:06:05 mcr -+ * better patch for nat-t rcv-device code. -+ * -+ * Revision 1.32 2005/05/21 03:18:35 mcr -+ * added additional debug flag tunnelling. -+ * -+ * Revision 1.31 2004/08/03 18:18:02 mcr -+ * in 2.6, use "net_device" instead of #define device->net_device. -+ * this probably breaks 2.0 compiles. -+ * -+ * Revision 1.30 2004/07/10 19:08:41 mcr -+ * CONFIG_IPSEC -> CONFIG_KLIPS. 
-+ * -+ * Revision 1.29 2004/04/05 19:55:07 mcr -+ * Moved from linux/include/freeswan/ipsec_tunnel.h,v -+ * -+ * Revision 1.28 2003/06/24 20:22:32 mcr -+ * added new global: ipsecdevices[] so that we can keep track of -+ * the ipsecX devices. They will be referenced with dev_hold(), -+ * so 2.2 may need this as well. -+ * -+ * Revision 1.27 2003/04/03 17:38:09 rgb -+ * Centralised ipsec_kfree_skb and ipsec_dev_{get,put}. -+ * -+ * Revision 1.26 2003/02/12 19:32:20 rgb -+ * Updated copyright year. -+ * -+ * Revision 1.25 2002/05/27 18:56:07 rgb -+ * Convert to dynamic ipsec device allocation. -+ * -+ * Revision 1.24 2002/04/24 07:36:48 mcr -+ * Moved from ./klips/net/ipsec/ipsec_tunnel.h,v -+ * -+ * Revision 1.23 2001/11/06 19:50:44 rgb -+ * Moved IP_SEND, ICMP_SEND, DEV_QUEUE_XMIT macros to ipsec_tunnel.h for -+ * use also by pfkey_v2_parser.c -+ * -+ * Revision 1.22 2001/09/15 16:24:05 rgb -+ * Re-inject first and last HOLD packet when an eroute REPLACE is done. -+ * -+ * Revision 1.21 2001/06/14 19:35:10 rgb -+ * Update copyright date. -+ * -+ * Revision 1.20 2000/09/15 11:37:02 rgb -+ * Merge in heavily modified Svenning Soerensen's -+ * IPCOMP zlib deflate code. -+ * -+ * Revision 1.19 2000/09/08 19:12:56 rgb -+ * Change references from DEBUG_IPSEC to CONFIG_IPSEC_DEBUG. -+ * -+ * Revision 1.18 2000/07/28 13:50:54 rgb -+ * Changed enet_statistics to net_device_stats and added back compatibility -+ * for pre-2.1.19. -+ * -+ * Revision 1.17 1999/11/19 01:12:15 rgb -+ * Purge unneeded proc_info prototypes, now that static linking uses -+ * dynamic proc_info registration. -+ * -+ * Revision 1.16 1999/11/18 18:51:00 rgb -+ * Changed all device registrations for static linking to -+ * dynamic to reduce the number and size of patches. -+ * -+ * Revision 1.15 1999/11/18 04:14:21 rgb -+ * Replaced all kernel version macros to shorter, readable form. -+ * Added CONFIG_PROC_FS compiler directives in case it is shut off. -+ * Added Marc Boucher's 2.3.25 proc patches. 
-+ * -+ * Revision 1.14 1999/05/25 02:50:10 rgb -+ * Fix kernel version macros for 2.0.x static linking. -+ * -+ * Revision 1.13 1999/05/25 02:41:06 rgb -+ * Add ipsec_klipsdebug support for static linking. -+ * -+ * Revision 1.12 1999/05/05 22:02:32 rgb -+ * Add a quick and dirty port to 2.2 kernels by Marc Boucher . -+ * -+ * Revision 1.11 1999/04/29 15:19:50 rgb -+ * Add return values to init and cleanup functions. -+ * -+ * Revision 1.10 1999/04/16 16:02:39 rgb -+ * Bump up macro to 4 ipsec I/Fs. -+ * -+ * Revision 1.9 1999/04/15 15:37:25 rgb -+ * Forward check changes from POST1_00 branch. -+ * -+ * Revision 1.5.2.1 1999/04/02 04:26:14 rgb -+ * Backcheck from HEAD, pre1.0. -+ * -+ * Revision 1.8 1999/04/11 00:29:01 henry -+ * GPL boilerplate -+ * -+ * Revision 1.7 1999/04/06 04:54:28 rgb -+ * Fix/Add RCSID Id: and Log: bits to make PHMDs happy. This includes -+ * patch shell fixes. -+ * -+ * Revision 1.6 1999/03/31 05:44:48 rgb -+ * Keep PMTU reduction private. -+ * -+ * Revision 1.5 1999/02/10 22:31:20 rgb -+ * Change rebuild_header member to reflect generality of link layer. -+ * -+ * Revision 1.4 1998/12/01 13:22:04 rgb -+ * Added support for debug printing of version info. -+ * -+ * Revision 1.3 1998/07/29 20:42:46 rgb -+ * Add a macro for clearing all tunnel devices. -+ * Rearrange structures and declarations for sharing with userspace. -+ * -+ * Revision 1.2 1998/06/25 20:01:45 rgb -+ * Make prototypes available for ipsec_init and ipsec proc_dir_entries -+ * for static linking. -+ * -+ * Revision 1.1 1998/06/18 21:27:50 henry -+ * move sources from klips/src to klips/net/ipsec, to keep stupid -+ * kernel-build scripts happier in the presence of symlinks -+ * -+ * Revision 1.3 1998/05/18 21:51:50 rgb -+ * Added macros for num of I/F's and a procfs debug switch. -+ * -+ * Revision 1.2 1998/04/21 21:29:09 rgb -+ * Rearrange debug switches to change on the fly debug output from user -+ * space. Only kernel changes checked in at this time. 
radij.c was also -+ * changed to temporarily remove buggy debugging code in rj_delete causing -+ * an OOPS and hence, netlink device open errors. -+ * -+ * Revision 1.1 1998/04/09 03:06:13 henry -+ * sources moved up from linux/net/ipsec -+ * -+ * Revision 1.1.1.1 1998/04/08 05:35:05 henry -+ * RGB's ipsec-0.8pre2.tar.gz ipsec-0.8 -+ * -+ * Revision 0.5 1997/06/03 04:24:48 ji -+ * Added transport mode. -+ * Changed the way routing is done. -+ * Lots of bug fixes. -+ * -+ * Revision 0.4 1997/01/15 01:28:15 ji -+ * No changes. -+ * -+ * Revision 0.3 1996/11/20 14:39:04 ji -+ * Minor cleanups. -+ * Rationalized debugging code. -+ * -+ * Revision 0.2 1996/11/02 00:18:33 ji -+ * First limited release. -+ * -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_xform.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,278 @@ -+/* -+ * Definitions relevant to IPSEC transformations -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs. -+ * COpyright (C) 2003 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ * RCSID $Id: ipsec_xform.h,v 1.42 2005/08/05 08:50:45 mcr Exp $ -+ */ -+ -+#ifndef _IPSEC_XFORM_H_ -+ -+#include -+ -+#define XF_NONE 0 /* No transform set */ -+#define XF_IP4 1 /* IPv4 inside IPv4 */ -+#define XF_AHMD5 2 /* AH MD5 */ -+#define XF_AHSHA 3 /* AH SHA */ -+#define XF_ESP3DES 5 /* ESP DES3-CBC */ -+#define XF_AHHMACMD5 6 /* AH-HMAC-MD5 with opt replay prot */ -+#define XF_AHHMACSHA1 7 /* AH-HMAC-SHA1 with opt replay prot */ -+#define XF_ESP3DESMD5 9 /* triple DES, HMAC-MD-5, 128-bits of authentication */ -+#define XF_ESP3DESMD596 10 /* triple DES, HMAC-MD-5, 96-bits of authentication */ -+#define XF_ESPNULLMD596 12 /* NULL, HMAC-MD-5 with 96-bits of authentication */ -+#define XF_ESPNULLSHA196 13 /* NULL, HMAC-SHA-1 with 96-bits of authentication */ -+#define XF_ESP3DESSHA196 14 /* triple DES, HMAC-SHA-1, 96-bits of authentication */ -+#define XF_IP6 15 /* IPv6 inside IPv6 */ -+#define XF_COMPDEFLATE 16 /* IPCOMP deflate */ -+ -+#define XF_CLR 126 /* Clear SA table */ -+#define XF_DEL 127 /* Delete SA */ -+ -+/* IPsec AH transform values -+ * RFC 2407 -+ * draft-ietf-ipsec-doi-tc-mib-02.txt -+ */ -+ -+/* why are these hardcoded here? 
See ipsec_policy.h for their enums -- Paul*/ -+/* ---------- These really need to go from here ------------------ */ -+#define AH_NONE 0 -+#define AH_MD5 2 -+#define AH_SHA 3 -+/* draft-ietf-ipsec-ciph-aes-cbc-03.txt */ -+#define AH_SHA2_256 5 -+#define AH_SHA2_384 6 -+#define AH_SHA2_512 7 -+#define AH_RIPEMD 8 -+#define AH_AES 9 -+#define AH_NULL 251 -+#define AH_MAX 251 -+ -+/* IPsec ESP transform values */ -+ -+#define ESP_NONE 0 -+#define ESP_DES 2 -+#define ESP_3DES 3 -+#define ESP_RC5 4 -+#define ESP_IDEA 5 -+#define ESP_CAST 6 -+#define ESP_BLOWFISH 7 -+#define ESP_3IDEA 8 -+#define ESP_RC4 10 -+#define ESP_NULL 11 -+#define ESP_AES 12 -+#define ESP_AES_CTR 13 -+#define ESP_AES_CCM_A 14 -+#define ESP_AES_CCM_B 15 -+#define ESP_AES_CCM_C 16 -+#define ESP_ID17 17 -+#define ESP_AES_GCM_A 18 -+#define ESP_AES_GCM_B 19 -+#define ESP_AES_GCM_C 20 -+#define ESP_SEED_CBC 21 -+#define ESP_CAMELLIA 22 -+ -+/* as draft-ietf-ipsec-ciph-aes-cbc-02.txt */ -+#define ESP_MARS 249 -+#define ESP_RC6 250 -+#define ESP_SERPENT 252 -+#define ESP_TWOFISH 253 -+ -+/* IPCOMP transform values */ -+ -+#define IPCOMP_NONE 0 -+#define IPCOMP_OUI 1 -+#define IPCOMP_DEFLAT 2 -+#define IPCOMP_LZS 3 -+#define IPCOMP_V42BIS 4 -+ -+#define XFT_AUTH 0x0001 -+#define XFT_CONF 0x0100 -+ -+/* available if CONFIG_KLIPS_DEBUG is defined */ -+#define DB_XF_INIT 0x0001 -+ -+#define PROTO2TXT(x) \ -+ (x) == IPPROTO_AH ? "AH" : \ -+ (x) == IPPROTO_ESP ? "ESP" : \ -+ (x) == IPPROTO_IPIP ? "IPIP" : \ -+ (x) == IPPROTO_COMP ? "COMP" : \ -+ "UNKNOWN_proto" -+static inline const char *enc_name_id (unsigned id) { -+ static char buf[16]; -+ snprintf(buf, sizeof(buf), "_ID%d", id); -+ return buf; -+} -+static inline const char *auth_name_id (unsigned id) { -+ static char buf[16]; -+ snprintf(buf, sizeof(buf), "_ID%d", id); -+ return buf; -+} -+#define IPS_XFORM_NAME(x) \ -+ PROTO2TXT((x)->ips_said.proto), \ -+ (x)->ips_said.proto == IPPROTO_COMP ? \ -+ ((x)->ips_encalg == SADB_X_CALG_DEFLATE ? 
\ -+ "_DEFLATE" : "_UNKNOWN_comp") : \ -+ (x)->ips_encalg == ESP_NONE ? "" : \ -+ (x)->ips_encalg == ESP_3DES ? "_3DES" : \ -+ (x)->ips_encalg == ESP_AES ? "_AES" : \ -+ (x)->ips_encalg == ESP_SERPENT ? "_SERPENT" : \ -+ (x)->ips_encalg == ESP_TWOFISH ? "_TWOFISH" : \ -+ enc_name_id(x->ips_encalg)/* "_UNKNOWN_encr" */, \ -+ (x)->ips_authalg == AH_NONE ? "" : \ -+ (x)->ips_authalg == AH_MD5 ? "_HMAC_MD5" : \ -+ (x)->ips_authalg == AH_SHA ? "_HMAC_SHA1" : \ -+ (x)->ips_authalg == AH_SHA2_256 ? "_HMAC_SHA2_256" : \ -+ (x)->ips_authalg == AH_SHA2_384 ? "_HMAC_SHA2_384" : \ -+ (x)->ips_authalg == AH_SHA2_512 ? "_HMAC_SHA2_512" : \ -+ auth_name_id(x->ips_authalg) /* "_UNKNOWN_auth" */ \ -+ -+#ifdef __KERNEL__ -+#include -+ -+struct ipsec_rcv_state; -+struct ipsec_xmit_state; -+ -+struct xform_functions { -+ u8 protocol; -+ enum ipsec_rcv_value (*rcv_checks)(struct ipsec_rcv_state *irs, -+ struct sk_buff *skb); -+ enum ipsec_rcv_value (*rcv_decrypt)(struct ipsec_rcv_state *irs); -+ -+ enum ipsec_rcv_value (*rcv_setup_auth)(struct ipsec_rcv_state *irs, -+ struct sk_buff *skb, -+ __u32 *replay, -+ unsigned char **authenticator); -+ enum ipsec_rcv_value (*rcv_calc_auth)(struct ipsec_rcv_state *irs, -+ struct sk_buff *skb); -+ -+ enum ipsec_xmit_value (*xmit_setup)(struct ipsec_xmit_state *ixs); -+ enum ipsec_xmit_value (*xmit_encrypt)(struct ipsec_xmit_state *ixs); -+ -+ enum ipsec_xmit_value (*xmit_setup_auth)(struct ipsec_xmit_state *ixs, -+ struct sk_buff *skb, -+ __u32 *replay, -+ unsigned char **authenticator); -+ enum ipsec_xmit_value (*xmit_calc_auth)(struct ipsec_xmit_state *ixs, -+ struct sk_buff *skb); -+ int xmit_headroom; -+ int xmit_needtailroom; -+}; -+ -+#endif /* __KERNEL__ */ -+ -+#ifdef CONFIG_KLIPS_DEBUG -+extern void ipsec_dmp(char *s, caddr_t bb, int len); -+#else /* CONFIG_KLIPS_DEBUG */ -+#define ipsec_dmp(_x, _y, _z) -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ -+#define _IPSEC_XFORM_H_ -+#endif /* _IPSEC_XFORM_H_ */ -+ -+/* -+ * $Log: ipsec_xform.h,v $ -+ 
* Revision 1.42 2005/08/05 08:50:45 mcr -+ * move #include of skbuff.h to a place where -+ * we know it will be kernel only code. -+ * -+ * Revision 1.41 2004/07/10 19:08:41 mcr -+ * CONFIG_IPSEC -> CONFIG_KLIPS. -+ * -+ * Revision 1.40 2004/04/06 02:49:08 mcr -+ * pullup of algo code from alg-branch. -+ * -+ * Revision 1.39 2004/04/05 19:55:07 mcr -+ * Moved from linux/include/freeswan/ipsec_xform.h,v -+ * -+ * Revision 1.38 2004/04/05 19:41:05 mcr -+ * merged alg-branch code. -+ * -+ * Revision 1.37 2003/12/13 19:10:16 mcr -+ * refactored rcv and xmit code - same as FS 2.05. -+ * -+ * Revision 1.36.34.1 2003/12/22 15:25:52 jjo -+ * Merged algo-0.8.1-rc11-test1 into alg-branch -+ * -+ * Revision 1.36 2002/04/24 07:36:48 mcr -+ * Moved from ./klips/net/ipsec/ipsec_xform.h,v -+ * -+ * Revision 1.35 2001/11/26 09:23:51 rgb -+ * Merge MCR's ipsec_sa, eroute, proc and struct lifetime changes. -+ * -+ * Revision 1.33.2.1 2001/09/25 02:24:58 mcr -+ * struct tdb -> struct ipsec_sa. -+ * sa(tdb) manipulation functions renamed and moved to ipsec_sa.c -+ * ipsec_xform.c removed. header file still contains useful things. -+ * -+ * Revision 1.34 2001/11/06 19:47:17 rgb -+ * Changed lifetime_packets to uint32 from uint64. -+ * -+ * Revision 1.33 2001/09/08 21:13:34 rgb -+ * Added pfkey ident extension support for ISAKMPd. (NetCelo) -+ * -+ * Revision 1.32 2001/07/06 07:40:01 rgb -+ * Reformatted for readability. -+ * Added inbound policy checking fields for use with IPIP SAs. -+ * -+ * Revision 1.31 2001/06/14 19:35:11 rgb -+ * Update copyright date. -+ * -+ * Revision 1.30 2001/05/30 08:14:03 rgb -+ * Removed vestiges of esp-null transforms. -+ * -+ * Revision 1.29 2001/01/30 23:42:47 rgb -+ * Allow pfkey msgs from pid other than user context required for ACQUIRE -+ * and subsequent ADD or UDATE. -+ * -+ * Revision 1.28 2000/11/06 04:30:40 rgb -+ * Add Svenning's adaptive content compression. 
-+ * -+ * Revision 1.27 2000/09/19 00:38:25 rgb -+ * Fixed algorithm name bugs introduced for ipcomp. -+ * -+ * Revision 1.26 2000/09/17 21:36:48 rgb -+ * Added proto2txt macro. -+ * -+ * Revision 1.25 2000/09/17 18:56:47 rgb -+ * Added IPCOMP support. -+ * -+ * Revision 1.24 2000/09/12 19:34:12 rgb -+ * Defined XF_IP6 from Gerhard for ipv6 tunnel support. -+ * -+ * Revision 1.23 2000/09/12 03:23:14 rgb -+ * Cleaned out now unused tdb_xform and tdb_xdata members of struct tdb. -+ * -+ * Revision 1.22 2000/09/08 19:12:56 rgb -+ * Change references from DEBUG_IPSEC to CONFIG_IPSEC_DEBUG. -+ * -+ * Revision 1.21 2000/09/01 18:32:43 rgb -+ * Added (disabled) sensitivity members to tdb struct. -+ * -+ * Revision 1.20 2000/08/30 05:31:01 rgb -+ * Removed all the rest of the references to tdb_spi, tdb_proto, tdb_dst. -+ * Kill remainder of tdb_xform, tdb_xdata, xformsw. -+ * -+ * Revision 1.19 2000/08/01 14:51:52 rgb -+ * Removed _all_ remaining traces of DES. -+ * -+ * Revision 1.18 2000/01/21 06:17:45 rgb -+ * Tidied up spacing. -+ * -+ * -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/ipsec_xmit.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,267 @@ -+/* -+ * IPSEC tunneling code -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ * RCSID $Id: ipsec_xmit.h,v 1.14 2005/05/11 01:00:26 mcr Exp $ -+ */ -+ -+#include "openswan/ipsec_sa.h" -+ -+#ifdef CONFIG_KLIPS_OCF -+#include -+#endif -+ -+enum ipsec_xmit_value -+{ -+ IPSEC_XMIT_STOLEN=2, -+ IPSEC_XMIT_PASS=1, -+ IPSEC_XMIT_OK=0, -+ IPSEC_XMIT_ERRMEMALLOC=-1, -+ IPSEC_XMIT_ESP_BADALG=-2, -+ IPSEC_XMIT_BADPROTO=-3, -+ IPSEC_XMIT_ESP_PUSHPULLERR=-4, -+ IPSEC_XMIT_BADLEN=-5, -+ IPSEC_XMIT_AH_BADALG=-6, -+ IPSEC_XMIT_SAIDNOTFOUND=-7, -+ IPSEC_XMIT_SAIDNOTLIVE=-8, -+ IPSEC_XMIT_REPLAYROLLED=-9, -+ IPSEC_XMIT_LIFETIMEFAILED=-10, -+ IPSEC_XMIT_CANNOTFRAG=-11, -+ IPSEC_XMIT_MSSERR=-12, -+ IPSEC_XMIT_ERRSKBALLOC=-13, -+ IPSEC_XMIT_ENCAPFAIL=-14, -+ IPSEC_XMIT_NODEV=-15, -+ IPSEC_XMIT_NOPRIVDEV=-16, -+ IPSEC_XMIT_NOPHYSDEV=-17, -+ IPSEC_XMIT_NOSKB=-18, -+ IPSEC_XMIT_NOIPV6=-19, -+ IPSEC_XMIT_NOIPOPTIONS=-20, -+ IPSEC_XMIT_TTLEXPIRED=-21, -+ IPSEC_XMIT_BADHHLEN=-22, -+ IPSEC_XMIT_PUSHPULLERR=-23, -+ IPSEC_XMIT_ROUTEERR=-24, -+ IPSEC_XMIT_RECURSDETECT=-25, -+ IPSEC_XMIT_IPSENDFAILURE=-26, -+ IPSEC_XMIT_ESPUDP=-27, -+ IPSEC_XMIT_ESPUDP_BADTYPE=-28, -+ IPSEC_XMIT_PENDING=-29, -+}; -+ -+ -+/* -+ * state machine states -+ */ -+ -+#define IPSEC_XSM_INIT1 0 /* make it easy, starting state is 0 */ -+#define IPSEC_XSM_INIT2 1 -+#define IPSEC_XSM_ENCAP_INIT 2 -+#define IPSEC_XSM_ENCAP_SELECT 3 -+#define IPSEC_XSM_ESP 4 -+#define IPSEC_XSM_ESP_AH 5 -+#define IPSEC_XSM_AH 6 -+#define IPSEC_XSM_IPIP 7 -+#define IPSEC_XSM_IPCOMP 8 -+#define IPSEC_XSM_CONT 9 -+#define IPSEC_XSM_DONE 100 -+ -+ -+struct ipsec_xmit_state -+{ -+ struct sk_buff *skb; /* working skb pointer */ -+ struct net_device *dev; /* working dev pointer */ -+ struct ipsecpriv *prv; /* Our device' private space */ -+ struct sk_buff *oskb; /* Original skb pointer */ -+ struct net_device_stats *stats; /* This device's statistics */ -+ struct iphdr *iph; /* Our new IP header */ -+ __u32 newdst; /* The other SG's IP address */ -+ __u32 orgdst; /* Original IP destination address */ -+ __u32 orgedst; 
/* 1st SG's IP address */ -+ __u32 newsrc; /* The new source SG's IP address */ -+ __u32 orgsrc; /* Original IP source address */ -+ __u32 innersrc; /* Innermost IP source address */ -+ int iphlen; /* IP header length */ -+ int pyldsz; /* upper protocol payload size */ -+ int headroom; -+ int tailroom; -+ int authlen; -+ int max_headroom; /* The extra header space needed */ -+ int max_tailroom; /* The extra stuffing needed */ -+ int ll_headroom; /* The extra link layer hard_header space needed */ -+ int tot_headroom; /* The total header space needed */ -+ int tot_tailroom; /* The totalstuffing needed */ -+ __u8 *saved_header; /* saved copy of the hard header */ -+ unsigned short sport, dport; -+ -+ struct sockaddr_encap matcher; /* eroute search key */ -+ struct eroute *eroute; -+ struct ipsec_sa *ipsp; /* ipsec_sa pointers */ -+ //struct ipsec_sa *ipsp_outer; /* last SA applied by encap_bundle */ -+ char sa_txt[SATOT_BUF]; -+ size_t sa_len; -+ int hard_header_stripped; /* has the hard header been removed yet? 
*/ -+ int hard_header_len; -+ struct net_device *physdev; -+/* struct device *virtdev; */ -+ short physmtu; -+ short cur_mtu; /* copy of prv->mtu, cause prv may == NULL */ -+ short mtudiff; -+#ifdef NET_21 -+ struct rtable *route; -+#endif /* NET_21 */ -+ ip_said outgoing_said; -+#ifdef NET_21 -+ int pass; -+#endif /* NET_21 */ -+ uint32_t eroute_pid; -+ struct ipsec_sa ips; -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ uint8_t natt_type; -+ uint8_t natt_head; -+ uint16_t natt_sport; -+ uint16_t natt_dport; -+#endif -+ -+ /* -+ * xmit state machine use -+ */ -+ void (*xsm_complete)(struct ipsec_xmit_state *ixs, -+ enum ipsec_xmit_value stat); -+ int state; -+ int next_state; -+#ifdef CONFIG_KLIPS_OCF -+ struct work_struct workq; -+#ifdef DECLARE_TASKLET -+ struct tasklet_struct tasklet; -+#endif -+#endif -+#ifdef CONFIG_KLIPS_ALG -+ struct ipsec_alg_auth *ixt_a; -+ struct ipsec_alg_enc *ixt_e; -+#endif -+#ifdef CONFIG_KLIPS_ESP -+ struct esphdr *espp; -+ unsigned char *idat; -+#endif /* !CONFIG_KLIPS_ESP */ -+ int blocksize; -+ int ilen, len; -+ unsigned char *dat; -+ __u8 frag_off, tos; -+ __u16 ttl, check; -+}; -+ -+enum ipsec_xmit_value -+ipsec_xmit_sanity_check_dev(struct ipsec_xmit_state *ixs); -+ -+enum ipsec_xmit_value -+ipsec_xmit_sanity_check_skb(struct ipsec_xmit_state *ixs); -+ -+enum ipsec_xmit_value -+ipsec_xmit_encap_bundle(struct ipsec_xmit_state *ixs); -+ -+extern void ipsec_xsm(struct ipsec_xmit_state *ixs); -+#ifdef HAVE_KMEM_CACHE_T -+extern kmem_cache_t *ipsec_ixs_cache; -+#else -+extern struct kmem_cache *ipsec_ixs_cache; -+#endif -+extern int ipsec_ixs_max; -+extern atomic_t ipsec_ixs_cnt; -+ -+extern void ipsec_extract_ports(struct iphdr * iph, struct sockaddr_encap * er); -+ -+extern enum ipsec_xmit_value -+ipsec_xmit_send(struct ipsec_xmit_state*ixs, struct flowi *fl); -+ -+extern enum ipsec_xmit_value -+ipsec_nat_encap(struct ipsec_xmit_state*ixs); -+ -+extern enum ipsec_xmit_value -+ipsec_tunnel_send(struct ipsec_xmit_state *ixs); -+ -+extern 
void ipsec_xmit_cleanup(struct ipsec_xmit_state*ixs); -+ -+ -+extern int ipsec_xmit_trap_count; -+extern int ipsec_xmit_trap_sendcount; -+ -+#ifdef CONFIG_KLIPS_DEBUG -+extern int debug_xmit; -+extern int debug_mast; -+ -+#define ipsec_xmit_dmp(_x,_y, _z) if (debug_xmit && sysctl_ipsec_debug_verbose) ipsec_dmp_block(_x,_y,_z) -+#else -+#define ipsec_xmit_dmp(_x,_y, _z) do {} while(0) -+ -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+extern int sysctl_ipsec_debug_verbose; -+extern int sysctl_ipsec_icmp; -+extern int sysctl_ipsec_tos; -+ -+ -+/* -+ * $Log: ipsec_xmit.h,v $ -+ * Revision 1.14 2005/05/11 01:00:26 mcr -+ * do not call debug routines if !defined KLIPS_DEBUG. -+ * -+ * Revision 1.13 2005/04/29 05:01:38 mcr -+ * use ipsec_dmp_block. -+ * added cur_mtu to ixs instead of using ixs->dev. -+ * -+ * Revision 1.12 2004/08/20 21:45:37 mcr -+ * CONFIG_KLIPS_NAT_TRAVERSAL is not used in an attempt to -+ * be 26sec compatible. But, some defines where changed. -+ * -+ * Revision 1.11 2004/08/03 18:18:21 mcr -+ * in 2.6, use "net_device" instead of #define device->net_device. -+ * this probably breaks 2.0 compiles. -+ * -+ * Revision 1.10 2004/07/10 19:08:41 mcr -+ * CONFIG_IPSEC -> CONFIG_KLIPS. -+ * -+ * Revision 1.9 2004/04/06 02:49:08 mcr -+ * pullup of algo code from alg-branch. -+ * -+ * Revision 1.8 2004/04/05 19:55:07 mcr -+ * Moved from linux/include/freeswan/ipsec_xmit.h,v -+ * -+ * Revision 1.7 2004/02/03 03:11:40 mcr -+ * new xmit type if the UDP encapsulation is wrong. -+ * -+ * Revision 1.6 2003/12/13 19:10:16 mcr -+ * refactored rcv and xmit code - same as FS 2.05. -+ * -+ * Revision 1.5 2003/12/10 01:20:06 mcr -+ * NAT-traversal patches to KLIPS. -+ * -+ * Revision 1.4 2003/12/06 16:37:04 mcr -+ * 1.4.7a X.509 patch applied. -+ * -+ * Revision 1.3 2003/10/31 02:27:05 mcr -+ * pulled up port-selector patches and sa_id elimination. 
-+ * -+ * Revision 1.2.4.2 2003/10/29 01:10:19 mcr -+ * elimited "struct sa_id" -+ * -+ * Revision 1.2.4.1 2003/09/21 13:59:38 mcr -+ * pre-liminary X.509 patch - does not yet pass tests. -+ * -+ * Revision 1.2 2003/06/20 01:42:13 mcr -+ * added counters to measure how many ACQUIREs we send to pluto, -+ * and how many are successfully sent. -+ * -+ * Revision 1.1 2003/02/12 19:31:03 rgb -+ * Refactored from ipsec_tunnel.c -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/mast.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,33 @@ -+struct mast_callbacks { -+ int (*packet_encap)(struct device *mast, void *context, -+ struct sk_buff *skb, int flowref); -+ int (*link_inquire)(struct device *mast, void *context); -+}; -+ -+ -+struct device *mast_init (int family, -+ struct mast_callbacks *callbacks, -+ unsigned int flags, -+ unsigned int desired_unit, -+ unsigned int max_flowref, -+ void *context); -+ -+int mast_destroy(struct device *mast); -+ -+int mast_recv(struct device *mast, struct sk_buff *skb, int flowref); -+ -+/* free this skb as being useless, increment failure count. */ -+int mast_toast(struct device *mast, struct sk_buff *skb, int flowref); -+ -+int mast_linkstat (struct device *mast, int flowref, -+ int status); -+ -+int mast_setreference (struct device *mast, -+ int defaultSA); -+ -+int mast_setneighbor (struct device *mast, -+ struct sockaddr *source, -+ struct sockaddr *destination, -+ int flowref); -+ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/passert.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,75 @@ -+/* -+ * sanitize a string into a printable format. -+ * -+ * Copyright (C) 1998-2002 D. Hugh Redelmeier. 
-+ * Copyright (C) 2003 Michael Richardson -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: passert.h,v 1.7.8.1 2007/09/05 02:32:24 paul Exp $ -+ */ -+ -+#include "openswan.h" -+ -+#ifndef _OPENSWAN_PASSERT_H -+#define _OPENSWAN_PASSERT_H -+/* our versions of assert: log result */ -+ -+#ifdef DEBUG -+ -+typedef void (*openswan_passert_fail_t)(const char *pred_str, -+ const char *file_str, -+ unsigned long line_no) NEVER_RETURNS; -+ -+extern openswan_passert_fail_t openswan_passert_fail; -+ -+extern void pexpect_log(const char *pred_str -+ , const char *file_str, unsigned long line_no); -+ -+# define impossible() do { \ -+ if(openswan_passert_fail) { \ -+ (*openswan_passert_fail)("impossible", __FILE__, __LINE__); \ -+ }} while(0) -+ -+extern void openswan_switch_fail(int n -+ , const char *file_str, unsigned long line_no) NEVER_RETURNS; -+ -+# define bad_case(n) openswan_switch_fail((int) n, __FILE__, __LINE__) -+ -+# define passert(pred) do { \ -+ if (!(pred)) \ -+ if(openswan_passert_fail) { \ -+ (*openswan_passert_fail)(#pred, __FILE__, __LINE__); \ -+ } \ -+ } while(0) -+ -+# define pexpect(pred) do { \ -+ if (!(pred)) \ -+ pexpect_log(#pred, __FILE__, __LINE__); \ -+ } while(0) -+ -+/* assert that an err_t is NULL; evaluate exactly once */ -+# define happy(x) { \ -+ err_t ugh = x; \ -+ if (ugh != NULL) \ -+ if(openswan_passert_fail) { (*openswan_passert_fail)(ugh, __FILE__, __LINE__); } \ -+ } -+ -+#else /*!DEBUG*/ -+ -+# define impossible() abort() -+# 
define bad_case(n) abort() -+# define passert(pred) { } /* do nothing */ -+# define happy(x) { (void) x; } /* evaluate non-judgementally */ -+ -+#endif /*!DEBUG*/ -+ -+#endif /* _OPENSWAN_PASSERT_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/pfkey.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,384 @@ -+/* -+ * Openswan specific PF_KEY headers -+ * Copyright (C) 1999, 2000, 2001 Richard Guy Briggs. -+ * Copyright (C) 2006-2007 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: pfkey.h,v 1.52 2005/11/09 00:30:37 mcr Exp $ -+ */ -+ -+#ifndef __NET_IPSEC_PF_KEY_H -+#define __NET_IPSEC_PF_KEY_H -+ -+#include "pfkeyv2.h" -+#ifdef __KERNEL__ -+extern struct proto_ops pfkey_proto_ops; -+typedef struct sock pfkey_sock; -+extern int debug_pfkey; -+ -+extern /* void */ int pfkey_init(void); -+extern /* void */ int pfkey_cleanup(void); -+ -+struct socket_list -+{ -+ struct socket *socketp; -+ struct socket_list *next; -+}; -+extern int pfkey_list_insert_socket(struct socket*, struct socket_list**); -+extern int pfkey_list_remove_socket(struct socket*, struct socket_list**); -+extern struct socket_list *pfkey_open_sockets; -+extern struct socket_list *pfkey_registered_sockets[]; -+ -+struct ipsec_alg_supported -+{ -+ uint16_t ias_exttype; -+ uint8_t ias_id; -+ uint8_t ias_ivlen; -+ uint16_t ias_keyminbits; -+ uint16_t ias_keymaxbits; -+ const char *ias_name; -+}; -+ -+extern struct supported_list *pfkey_supported_list[]; -+struct supported_list -+{ -+ 
struct ipsec_alg_supported *supportedp; -+ struct supported_list *next; -+}; -+extern int pfkey_list_insert_supported(struct ipsec_alg_supported*, struct supported_list**); -+extern int pfkey_list_remove_supported(struct ipsec_alg_supported*, struct supported_list**); -+ -+struct sockaddr_key -+{ -+ uint16_t key_family; /* PF_KEY */ -+ uint16_t key_pad; /* not used */ -+ uint32_t key_pid; /* process ID */ -+}; -+ -+struct pfkey_extracted_data -+{ -+ struct ipsec_sa* ips; -+ struct ipsec_sa* ips2; -+ struct eroute *eroute; -+ int outif; -+ IPsecSAref_t sarefme; -+ IPsecSAref_t sarefhim; -+}; -+ -+/* forward reference */ -+struct sadb_ext; -+struct sadb_msg; -+struct sockaddr; -+struct sadb_comb; -+struct sadb_sadb; -+struct sadb_alg; -+ -+extern int -+pfkey_alloc_eroute(struct eroute** eroute); -+ -+extern int -+pfkey_sa_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data* extr); -+ -+extern int -+pfkey_lifetime_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data* extr); -+ -+extern int -+pfkey_address_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data* extr); -+ -+extern int -+pfkey_key_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data* extr); -+ -+extern int -+pfkey_ident_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data* extr); -+ -+extern int -+pfkey_sens_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data* extr); -+ -+extern int -+pfkey_prop_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data* extr); -+ -+extern int -+pfkey_supported_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data* extr); -+ -+extern int -+pfkey_spirange_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data* extr); -+ -+extern int -+pfkey_x_kmprivate_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data* extr); -+ -+extern int -+pfkey_x_satype_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data* extr); -+ -+extern int 
-+pfkey_x_debug_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data* extr); -+ -+extern int pfkey_upmsg(struct socket *, struct sadb_msg *); -+extern int pfkey_upmsgsk(struct sock *, struct sadb_msg *); -+extern int pfkey_expire(struct ipsec_sa *, int); -+extern int pfkey_acquire(struct ipsec_sa *); -+#else /* ! __KERNEL__ */ -+ -+extern openswan_keying_debug_func_t pfkey_debug_func; -+extern openswan_keying_debug_func_t pfkey_error_func; -+extern void pfkey_print(struct sadb_msg *msg, FILE *out); -+ -+ -+#endif /* __KERNEL__ */ -+ -+extern uint8_t satype2proto(uint8_t satype); -+extern uint8_t proto2satype(uint8_t proto); -+extern char* satype2name(uint8_t satype); -+extern char* proto2name(uint8_t proto); -+ -+struct key_opt -+{ -+ uint32_t key_pid; /* process ID */ -+ struct sock *sk; -+}; -+ -+#define key_pid(sk) ((struct key_opt*)&((sk)->sk_protinfo))->key_pid -+ -+/* XXX-mcr this is not an alignment, this is because the count is in 64-bit -+ * words. -+ */ -+#define IPSEC_PFKEYv2_ALIGN (sizeof(uint64_t)/sizeof(uint8_t)) -+#define BITS_PER_OCTET 8 -+#define OCTETBITS 8 -+#define PFKEYBITS 64 -+#define DIVUP(x,y) ((x + y -1) / y) /* divide, rounding upwards */ -+#define ALIGN_N(x,y) (DIVUP(x,y) * y) /* align on y boundary */ -+ -+#define IPSEC_PFKEYv2_LEN(x) ((x) * IPSEC_PFKEYv2_ALIGN) -+#define IPSEC_PFKEYv2_WORDS(x) (DIVUP(x,IPSEC_PFKEYv2_ALIGN)) -+ -+ -+#define PFKEYv2_MAX_MSGSIZE 4096 -+ -+/* -+ * PF_KEYv2 permitted and required extensions in and out bitmaps -+ */ -+struct pf_key_ext_parsers_def { -+ int (*parser)(struct sadb_ext*); -+ char *parser_name; -+}; -+ -+enum pfkey_ext_required { -+ EXT_BITS_IN=0, -+ EXT_BITS_OUT=1 -+}; -+ -+enum pfkey_ext_perm { -+ EXT_BITS_PERM=0, -+ EXT_BITS_REQ=1 -+}; -+ -+ -+typedef uint64_t pfkey_ext_track; -+static inline void pfkey_mark_extension(enum sadb_extension_t exttype, -+ pfkey_ext_track *exten_track) -+{ -+ *exten_track |= (1 << exttype); -+} -+ -+extern int pfkey_extensions_missing(enum 
pfkey_ext_required inout, -+ enum sadb_msg_t sadb_operation, -+ pfkey_ext_track extensions_seen); -+extern int pfkey_required_extension(enum pfkey_ext_required inout, -+ enum sadb_msg_t sadb_operation, -+ enum sadb_extension_t exttype); -+extern int pfkey_permitted_extension(enum pfkey_ext_required inout, -+ enum sadb_msg_t sadb_operation, -+ enum sadb_extension_t exttype); -+ -+ -+extern void pfkey_extensions_init(struct sadb_ext *extensions[]); -+extern void pfkey_extensions_free(struct sadb_ext *extensions[]); -+extern void pfkey_msg_free(struct sadb_msg **pfkey_msg); -+ -+extern int pfkey_msg_parse(struct sadb_msg *pfkey_msg, -+ struct pf_key_ext_parsers_def *ext_parsers[], -+ struct sadb_ext **extensions, -+ int dir); -+ -+extern int pfkey_register_reply(int satype, struct sadb_msg *sadb_msg); -+ -+/* -+ * PF_KEYv2 build function prototypes -+ */ -+ -+int -+pfkey_msg_hdr_build(struct sadb_ext** pfkey_ext, -+ uint8_t msg_type, -+ uint8_t satype, -+ uint8_t msg_errno, -+ uint32_t seq, -+ uint32_t pid); -+ -+int -+pfkey_sa_ref_build(struct sadb_ext ** pfkey_ext, -+ uint16_t exttype, -+ uint32_t spi, /* in network order */ -+ uint8_t replay_window, -+ uint8_t sa_state, -+ uint8_t auth, -+ uint8_t encrypt, -+ uint32_t flags, -+ uint32_t/*IPsecSAref_t*/ ref); -+ -+int -+pfkey_sa_build(struct sadb_ext ** pfkey_ext, -+ uint16_t exttype, -+ uint32_t spi, /* in network order */ -+ uint8_t replay_window, -+ uint8_t sa_state, -+ uint8_t auth, -+ uint8_t encrypt, -+ uint32_t flags); -+ -+extern int -+pfkey_saref_build(struct sadb_ext **pfkey_ext, -+ IPsecSAref_t in, IPsecSAref_t out); -+ -+int -+pfkey_lifetime_build(struct sadb_ext ** pfkey_ext, -+ uint16_t exttype, -+ uint32_t allocations, -+ uint64_t bytes, -+ uint64_t addtime, -+ uint64_t usetime, -+ uint32_t packets); -+ -+int -+pfkey_address_build(struct sadb_ext** pfkey_ext, -+ uint16_t exttype, -+ uint8_t proto, -+ uint8_t prefixlen, -+ struct sockaddr* address); -+ -+int -+pfkey_key_build(struct sadb_ext** 
pfkey_ext, -+ uint16_t exttype, -+ uint16_t key_bits, -+ unsigned char *key); -+ -+int -+pfkey_ident_build(struct sadb_ext** pfkey_ext, -+ uint16_t exttype, -+ uint16_t ident_type, -+ uint64_t ident_id, -+ uint8_t ident_len, -+ char* ident_string); -+ -+#ifdef __KERNEL__ -+extern int pfkey_nat_t_new_mapping(struct ipsec_sa *, struct sockaddr *, __u16); -+extern int pfkey_x_nat_t_type_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr); -+extern int pfkey_x_nat_t_port_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr); -+#endif /* __KERNEL__ */ -+int -+pfkey_x_nat_t_type_build(struct sadb_ext** pfkey_ext, -+ uint8_t type); -+int -+pfkey_x_nat_t_port_build(struct sadb_ext** pfkey_ext, -+ uint16_t exttype, -+ uint16_t port); -+ -+int -+pfkey_sens_build(struct sadb_ext** pfkey_ext, -+ uint32_t dpd, -+ uint8_t sens_level, -+ uint8_t sens_len, -+ uint64_t* sens_bitmap, -+ uint8_t integ_level, -+ uint8_t integ_len, -+ uint64_t* integ_bitmap); -+ -+int pfkey_x_protocol_build(struct sadb_ext **, uint8_t); -+ -+ -+int -+pfkey_prop_build(struct sadb_ext** pfkey_ext, -+ uint8_t replay, -+ unsigned int comb_num, -+ struct sadb_comb* comb); -+ -+int -+pfkey_supported_build(struct sadb_ext** pfkey_ext, -+ uint16_t exttype, -+ unsigned int alg_num, -+ struct sadb_alg* alg); -+ -+int -+pfkey_spirange_build(struct sadb_ext** pfkey_ext, -+ uint16_t exttype, -+ uint32_t min, -+ uint32_t max); -+ -+int -+pfkey_x_kmprivate_build(struct sadb_ext** pfkey_ext); -+ -+int -+pfkey_x_satype_build(struct sadb_ext** pfkey_ext, -+ uint8_t satype); -+ -+int -+pfkey_x_debug_build(struct sadb_ext** pfkey_ext, -+ uint32_t tunnel, -+ uint32_t netlink, -+ uint32_t xform, -+ uint32_t eroute, -+ uint32_t spi, -+ uint32_t radij, -+ uint32_t esp, -+ uint32_t ah, -+ uint32_t rcv, -+ uint32_t pfkey, -+ uint32_t ipcomp, -+ uint32_t verbose); -+ -+int -+pfkey_msg_build(struct sadb_msg** pfkey_msg, -+ struct sadb_ext* extensions[], -+ int dir); -+ -+/* in pfkey_v2_debug.c 
- routines to decode numbers -> strings */ -+const char * -+pfkey_v2_sadb_ext_string(int extnum); -+ -+const char * -+pfkey_v2_sadb_type_string(int sadb_type); -+ -+struct sadb_builds { -+ struct k_sadb_sa sa_base; -+}; -+ -+int -+pfkey_sa_builds(struct sadb_ext **pfkey_ext, -+ struct sadb_builds sab); -+ -+extern int -+pfkey_outif_build(struct sadb_ext **pfkey_ext, -+ uint16_t outif); -+ -+#endif /* __NET_IPSEC_PF_KEY_H */ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/pfkey_debug.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,54 @@ -+/* -+ * sanitize a string into a printable format. -+ * -+ * Copyright (C) 1998-2002 D. Hugh Redelmeier. -+ * Copyright (C) 2003 Michael Richardson -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: pfkey_debug.h,v 1.3 2004/04/05 19:55:07 mcr Exp $ -+ */ -+ -+#ifndef _FREESWAN_PFKEY_DEBUG_H -+#define _FREESWAN_PFKEY_DEBUG_H -+ -+#ifdef __KERNEL__ -+ -+/* note, kernel version ignores pfkey levels */ -+# define DEBUGGING(level,args...) \ -+ KLIPS_PRINT(debug_pfkey, "klips_debug:" args) -+ -+# define ERROR(args...) printk(KERN_ERR "klips:" args) -+ -+#else -+ -+extern unsigned int pfkey_lib_debug; -+ -+extern int (*pfkey_debug_func)(const char *message, ...) PRINTF_LIKE(1); -+extern int (*pfkey_error_func)(const char *message, ...) PRINTF_LIKE(1); -+ -+#define DEBUGGING(level,args...) 
if(pfkey_lib_debug & level) { \ -+ if(pfkey_debug_func != NULL) { \ -+ (*pfkey_debug_func)("pfkey_lib_debug:" args); \ -+ } else { \ -+ printf("pfkey_lib_debug:" args); \ -+ } } -+ -+#define ERROR(args...) if(pfkey_error_func != NULL) { \ -+ (*pfkey_error_func)("pfkey_lib_debug:" args); \ -+ } -+ -+# define MALLOC(size) malloc(size) -+# define FREE(obj) free(obj) -+ -+#endif -+ -+#endif ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/openswan/pfkeyv2.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,347 @@ -+/* -+ * RCSID $Id: pfkeyv2.h,v 1.31 2005/04/14 01:14:54 mcr Exp $ -+ */ -+ -+/* -+RFC 2367 PF_KEY Key Management API July 1998 -+ -+ -+Appendix D: Sample Header File -+ -+This file defines structures and symbols for the PF_KEY Version 2 -+key management interface. It was written at the U.S. Naval Research -+Laboratory. This file is in the public domain. The authors ask that -+you leave this credit intact on any copies of this file. -+*/ -+ -+#ifndef __PFKEY_V2_H -+#define __PFKEY_V2_H 1 -+ -+#include -+ -+#define PF_KEY_V2 2 -+#define PFKEYV2_REVISION 199806L -+ -+enum sadb_msg_t { -+ K_SADB_RESERVED=SADB_RESERVED, -+ K_SADB_GETSPI=SADB_GETSPI, -+ K_SADB_UPDATE=SADB_UPDATE, -+ K_SADB_ADD=SADB_ADD, -+ K_SADB_DELETE=SADB_DELETE, -+ K_SADB_GET=SADB_GET, -+ K_SADB_ACQUIRE=SADB_ACQUIRE, -+ K_SADB_REGISTER=SADB_REGISTER, -+ K_SADB_EXPIRE=SADB_EXPIRE, -+ K_SADB_FLUSH=SADB_FLUSH, -+ K_SADB_DUMP=SADB_DUMP, -+ K_SADB_X_PROMISC=SADB_X_PROMISC, -+ K_SADB_X_PCHANGE=SADB_X_PCHANGE, -+ K_SADB_X_GRPSA=13, -+ K_SADB_X_ADDFLOW=14, -+ K_SADB_X_DELFLOW=15, -+ K_SADB_X_DEBUG=16, -+ K_SADB_X_NAT_T_NEW_MAPPING=17, -+ K_SADB_X_PLUMBIF=18, -+ K_SADB_X_UNPLUMBIF=19, -+ K_SADB_MAX=19 -+}; -+ -+#define SADB_X_GRPSA K_SADB_X_GRPSA -+#define SADB_X_ADDFLOW K_SADB_X_ADDFLOW -+#define SADB_X_DELFLOW K_SADB_X_DELFLOW -+#define SADB_X_DEBUG K_SADB_X_DEBUG -+#define SADB_X_PLUMBIF K_SADB_X_PLUMBIF -+#define SADB_X_UNPLUMBIF K_SADB_X_UNPLUMBIF -+ -+struct k_sadb_sa { -+ uint16_t sadb_sa_len; -+ 
uint16_t sadb_sa_exttype; -+ uint32_t sadb_sa_spi; -+ uint8_t sadb_sa_replay; -+ uint8_t sadb_sa_state; -+ uint8_t sadb_sa_auth; -+ uint8_t sadb_sa_encrypt; -+ uint32_t sadb_sa_flags; -+ uint32_t /*IPsecSAref_t*/ sadb_x_sa_ref; /* 32 bits */ -+ uint8_t sadb_x_reserved[4]; -+} __attribute__((packed)); -+ -+struct sadb_sa_v1 { -+ uint16_t sadb_sa_len; -+ uint16_t sadb_sa_exttype; -+ uint32_t sadb_sa_spi; -+ uint8_t sadb_sa_replay; -+ uint8_t sadb_sa_state; -+ uint8_t sadb_sa_auth; -+ uint8_t sadb_sa_encrypt; -+ uint32_t sadb_sa_flags; -+} __attribute__((packed)); -+ -+struct sadb_x_satype { -+ uint16_t sadb_x_satype_len; -+ uint16_t sadb_x_satype_exttype; -+ uint8_t sadb_x_satype_satype; -+ uint8_t sadb_x_satype_reserved[3]; -+} __attribute__((packed)); -+ -+struct sadb_x_debug { -+ uint16_t sadb_x_debug_len; -+ uint16_t sadb_x_debug_exttype; -+ uint32_t sadb_x_debug_tunnel; -+ uint32_t sadb_x_debug_netlink; -+ uint32_t sadb_x_debug_xform; -+ uint32_t sadb_x_debug_eroute; -+ uint32_t sadb_x_debug_spi; -+ uint32_t sadb_x_debug_radij; -+ uint32_t sadb_x_debug_esp; -+ uint32_t sadb_x_debug_ah; -+ uint32_t sadb_x_debug_rcv; -+ uint32_t sadb_x_debug_pfkey; -+ uint32_t sadb_x_debug_ipcomp; -+ uint32_t sadb_x_debug_verbose; -+ uint8_t sadb_x_debug_reserved[4]; -+} __attribute__((packed)); -+ -+/* -+ * a plumbif extension can appear in -+ * - a plumbif message to create the interface. -+ * - a unplumbif message to delete the interface. -+ * - a sadb add/replace to indicate which interface -+ * a decrypted packet should emerge on. -+ * -+ * the create/delete part could/should be replaced with netlink equivalents, -+ * or better yet, FORCES versions of same. -+ * -+ */ -+struct sadb_x_plumbif { -+ uint16_t sadb_x_outif_len; -+ uint16_t sadb_x_outif_exttype; -+ uint16_t sadb_x_outif_ifnum; -+} __attribute__((packed)); -+ -+/* -+ * the ifnum describes a device that you wish to create refer to. -+ * -+ * devices 0-40959 are mastXXX devices. 
-+ * devices 40960-49141 are mastXXX devices with transport set. -+ * devices 49152-65536 are deprecated ipsecXXX devices. -+ */ -+#define IPSECDEV_OFFSET (48*1024) -+#define MASTTRANSPORT_OFFSET (40*1024) -+ -+/* -+ * an saref extension sets the SA's reference number, and -+ * may also set the paired SA's reference number. -+ * -+ */ -+struct sadb_x_saref { -+ uint16_t sadb_x_saref_len; -+ uint16_t sadb_x_saref_exttype; -+ uint32_t sadb_x_saref_me; -+ uint32_t sadb_x_saref_him; -+} __attribute__((packed)); -+ -+/* -+ * A protocol structure for passing through the transport level -+ * protocol. It contains more fields than are actually used/needed -+ * but it is this way to be compatible with the structure used in -+ * OpenBSD (http://www.openbsd.org/cgi-bin/cvsweb/src/sys/net/pfkeyv2.h) -+ */ -+struct sadb_protocol { -+ uint16_t sadb_protocol_len; -+ uint16_t sadb_protocol_exttype; -+ uint8_t sadb_protocol_proto; -+ uint8_t sadb_protocol_direction; -+ uint8_t sadb_protocol_flags; -+ uint8_t sadb_protocol_reserved2; -+} __attribute__((packed)); -+ -+/* -+ * NOTE that there is a limit of 31 extensions due to current implementation -+ * in pfkeyv2_ext_bits.c -+ */ -+enum sadb_extension_t { -+ K_SADB_EXT_RESERVED=SADB_RESERVED, -+ K_SADB_EXT_SA= SADB_EXT_SA, -+ K_SADB_EXT_LIFETIME_CURRENT=SADB_EXT_LIFETIME_CURRENT, -+ K_SADB_EXT_LIFETIME_HARD= SADB_EXT_LIFETIME_HARD, -+ K_SADB_EXT_LIFETIME_SOFT= SADB_EXT_LIFETIME_SOFT, -+ K_SADB_EXT_ADDRESS_SRC= SADB_EXT_ADDRESS_SRC, -+ K_SADB_EXT_ADDRESS_DST= SADB_EXT_ADDRESS_DST, -+ K_SADB_EXT_ADDRESS_PROXY= SADB_EXT_ADDRESS_PROXY, -+ K_SADB_EXT_KEY_AUTH= SADB_EXT_KEY_AUTH, -+ K_SADB_EXT_KEY_ENCRYPT= SADB_EXT_KEY_ENCRYPT, -+ K_SADB_EXT_IDENTITY_SRC= SADB_EXT_IDENTITY_SRC, -+ K_SADB_EXT_IDENTITY_DST= SADB_EXT_IDENTITY_DST, -+ K_SADB_EXT_SENSITIVITY= SADB_EXT_SENSITIVITY, -+ K_SADB_EXT_PROPOSAL= SADB_EXT_PROPOSAL, -+ K_SADB_EXT_SUPPORTED_AUTH= SADB_EXT_SUPPORTED_AUTH, -+ K_SADB_EXT_SUPPORTED_ENCRYPT=SADB_EXT_SUPPORTED_ENCRYPT, -+ 
K_SADB_EXT_SPIRANGE= SADB_EXT_SPIRANGE, -+ K_SADB_X_EXT_KMPRIVATE= SADB_X_EXT_KMPRIVATE, -+ K_SADB_X_EXT_SATYPE2= 18, -+ K_SADB_X_EXT_POLICY= SADB_X_EXT_POLICY, -+ K_SADB_X_EXT_SA2= SADB_X_EXT_SA2, -+ K_SADB_X_EXT_ADDRESS_DST2= 20, -+ K_SADB_X_EXT_ADDRESS_SRC_FLOW=21, -+ K_SADB_X_EXT_ADDRESS_DST_FLOW=22, -+ K_SADB_X_EXT_ADDRESS_SRC_MASK=23, -+ K_SADB_X_EXT_ADDRESS_DST_MASK=24, -+ K_SADB_X_EXT_DEBUG= 25, -+ K_SADB_X_EXT_PROTOCOL= 26, -+ K_SADB_X_EXT_NAT_T_TYPE= 27, -+ K_SADB_X_EXT_NAT_T_SPORT= 28, -+ K_SADB_X_EXT_NAT_T_DPORT= 29, -+ K_SADB_X_EXT_NAT_T_OA= 30, -+ K_SADB_X_EXT_PLUMBIF= 31, -+ K_SADB_X_EXT_SAREF= 32, -+ K_SADB_EXT_MAX= 32, -+}; -+ -+ -+#define SADB_X_EXT_SATYPE2 K_SADB_X_EXT_SATYPE2 -+#define SADB_X_EXT_ADDRESS_DST2 K_SADB_X_EXT_ADDRESS_DST2 -+#define SADB_X_EXT_ADDRESS_SRC_FLOW K_SADB_X_EXT_ADDRESS_SRC_FLOW -+#define SADB_X_EXT_ADDRESS_DST_FLOW K_SADB_X_EXT_ADDRESS_DST_FLOW -+#define SADB_X_EXT_ADDRESS_SRC_MASK K_SADB_X_EXT_ADDRESS_SRC_MASK -+#define SADB_X_EXT_ADDRESS_DST_MASK K_SADB_X_EXT_ADDRESS_DST_MASK -+#define SADB_X_EXT_DEBUG K_SADB_X_EXT_DEBUG -+#define SADB_X_EXT_PROTOCOL K_SADB_X_EXT_PROTOCOL -+ -+#undef SADB_X_EXT_NAT_T_TYPE -+#undef SADB_X_EXT_NAT_T_SPORT -+#undef SADB_X_EXT_NAT_T_DPORT -+#undef SADB_X_EXT_NAT_T_OA -+#define SADB_X_EXT_PLUMBIF K_SADB_X_EXT_PLUMBIF -+ -+ -+ -+/* K_SADB_X_DELFLOW required over and above K_SADB_X_SAFLAGS_CLEARFLOW */ -+#define K_SADB_X_EXT_ADDRESS_DELFLOW \ -+ ( (1<rm_mklist; \ -+ } else \ -+ R_Malloc(m, struct radij_mask *, sizeof (*(m))); }\ -+ -+#define MKFree(m) { (m)->rm_mklist = rj_mkfreelist; rj_mkfreelist = (m);} -+ -+struct radij_node_head { -+ struct radij_node *rnh_treetop; -+ int rnh_addrsize; /* permit, but not require fixed keys */ -+ int rnh_pktsize; /* permit, but not require fixed keys */ -+#if 0 -+ struct radij_node *(*rnh_addaddr) /* add based on sockaddr */ -+ __P((void *v, void *mask, -+ struct radij_node_head *head, struct radij_node nodes[])); -+#endif -+ int (*rnh_addaddr) /* add 
based on sockaddr */ -+ __P((void *v, void *mask, -+ struct radij_node_head *head, struct radij_node nodes[])); -+ struct radij_node *(*rnh_addpkt) /* add based on packet hdr */ -+ __P((void *v, void *mask, -+ struct radij_node_head *head, struct radij_node nodes[])); -+#if 0 -+ struct radij_node *(*rnh_deladdr) /* remove based on sockaddr */ -+ __P((void *v, void *mask, struct radij_node_head *head)); -+#endif -+ int (*rnh_deladdr) /* remove based on sockaddr */ -+ __P((void *v, void *mask, struct radij_node_head *head, struct radij_node **node)); -+ struct radij_node *(*rnh_delpkt) /* remove based on packet hdr */ -+ __P((void *v, void *mask, struct radij_node_head *head)); -+ struct radij_node *(*rnh_matchaddr) /* locate based on sockaddr */ -+ __P((void *v, struct radij_node_head *head)); -+ struct radij_node *(*rnh_matchpkt) /* locate based on packet hdr */ -+ __P((void *v, struct radij_node_head *head)); -+ int (*rnh_walktree) /* traverse tree */ -+ __P((struct radij_node_head *head, int (*f)(struct radij_node *rn, void *w), void *w)); -+ struct radij_node rnh_nodes[3]; /* empty tree for common case */ -+}; -+ -+ -+#define Bcmp(a, b, n) memcmp(((caddr_t)(b)), ((caddr_t)(a)), (unsigned)(n)) -+#define Bcopy(a, b, n) memmove(((caddr_t)(b)), ((caddr_t)(a)), (unsigned)(n)) -+#define Bzero(p, n) memset((caddr_t)(p), 0, (unsigned)(n)) -+#define R_Malloc(p, t, n) ((p = (t) kmalloc((size_t)(n), GFP_ATOMIC)), Bzero((p),(n))) -+#define Free(p) kfree((caddr_t)p); -+ -+void rj_init __P((void)); -+int rj_inithead __P((void **, int)); -+int rj_refines __P((void *, void *)); -+int rj_walktree __P((struct radij_node_head *head, int (*f)(struct radij_node *rn, void *w), void *w)); -+struct radij_node -+ *rj_addmask __P((void *, int, int)) /* , rgb */ ; -+int /* * */ rj_addroute __P((void *, void *, struct radij_node_head *, -+ struct radij_node [2])) /* , rgb */ ; -+int /* * */ rj_delete __P((void *, void *, struct radij_node_head *, struct radij_node **)) /* , rgb */ ; 
-+struct radij_node /* rgb */ -+ *rj_insert __P((void *, struct radij_node_head *, int *, -+ struct radij_node [2])), -+ *rj_match __P((void *, struct radij_node_head *)), -+ *rj_newpair __P((void *, int, struct radij_node[2])), -+ *rj_search __P((void *, struct radij_node *)), -+ *rj_search_m __P((void *, struct radij_node *, void *)); -+ -+void rj_deltree(struct radij_node_head *); -+void rj_delnodes(struct radij_node *); -+void rj_free_mkfreelist(void); -+int radijcleartree(void); -+int radijcleanup(void); -+ -+extern struct radij_node_head *mask_rjhead; -+extern int maj_keylen; -+#endif /* __KERNEL__ */ -+ -+#endif /* _RADIJ_H_ */ -+ -+ -+/* -+ * $Log: radij.h,v $ -+ * Revision 1.13 2004/04/05 19:55:08 mcr -+ * Moved from linux/include/freeswan/radij.h,v -+ * -+ * Revision 1.12 2002/04/24 07:36:48 mcr -+ * Moved from ./klips/net/ipsec/radij.h,v -+ * -+ * Revision 1.11 2001/09/20 15:33:00 rgb -+ * Min/max cleanup. -+ * -+ * Revision 1.10 1999/11/18 04:09:20 rgb -+ * Replaced all kernel version macros to shorter, readable form. -+ * -+ * Revision 1.9 1999/05/05 22:02:33 rgb -+ * Add a quick and dirty port to 2.2 kernels by Marc Boucher . -+ * -+ * Revision 1.8 1999/04/29 15:24:58 rgb -+ * Add check for existence of macros min/max. -+ * -+ * Revision 1.7 1999/04/11 00:29:02 henry -+ * GPL boilerplate -+ * -+ * Revision 1.6 1999/04/06 04:54:29 rgb -+ * Fix/Add RCSID Id: and Log: bits to make PHMDs happy. This includes -+ * patch shell fixes. -+ * -+ * Revision 1.5 1999/01/22 06:30:32 rgb -+ * 64-bit clean-up. -+ * -+ * Revision 1.4 1998/11/30 13:22:55 rgb -+ * Rationalised all the klips kernel file headers. They are much shorter -+ * now and won't conflict under RH5.2. -+ * -+ * Revision 1.3 1998/10/25 02:43:27 rgb -+ * Change return type on rj_addroute and rj_delete and add and argument -+ * to the latter to be able to transmit more infomation about errors. -+ * -+ * Revision 1.2 1998/07/14 18:09:51 rgb -+ * Add a routine to clear eroute table. 
-+ * Added #ifdef __KERNEL__ directives to restrict scope of header. -+ * -+ * Revision 1.1 1998/06/18 21:30:22 henry -+ * move sources from klips/src to klips/net/ipsec to keep stupid kernel -+ * build scripts happier about symlinks -+ * -+ * Revision 1.4 1998/05/25 20:34:16 rgb -+ * Remove temporary ipsec_walk, rj_deltree and rj_delnodes functions. -+ * -+ * Rename ipsec_rj_walker (ipsec_walk) to ipsec_rj_walker_procprint and -+ * add ipsec_rj_walker_delete. -+ * -+ * Recover memory for eroute table on unload of module. -+ * -+ * Revision 1.3 1998/04/22 16:51:37 rgb -+ * Tidy up radij debug code from recent rash of modifications to debug code. -+ * -+ * Revision 1.2 1998/04/14 17:30:38 rgb -+ * Fix up compiling errors for radij tree memory reclamation. -+ * -+ * Revision 1.1 1998/04/09 03:06:16 henry -+ * sources moved up from linux/net/ipsec -+ * -+ * Revision 1.1.1.1 1998/04/08 05:35:04 henry -+ * RGB's ipsec-0.8pre2.tar.gz ipsec-0.8 -+ * -+ * Revision 0.4 1997/01/15 01:28:15 ji -+ * No changes. -+ * -+ * Revision 0.3 1996/11/20 14:44:45 ji -+ * Release update only. -+ * -+ * Revision 0.2 1996/11/02 00:18:33 ji -+ * First limited release. -+ * -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/zlib/zconf.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,309 @@ -+/* zconf.h -- configuration of the zlib compression library -+ * Copyright (C) 1995-2002 Jean-loup Gailly. -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* @(#) $Id: zconf.h,v 1.4 2004/07/10 07:48:40 mcr Exp $ */ -+ -+#ifndef _ZCONF_H -+#define _ZCONF_H -+ -+/* -+ * If you *really* need a unique prefix for all types and library functions, -+ * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. 
-+ */ -+#ifdef IPCOMP_PREFIX -+# define deflateInit_ ipcomp_deflateInit_ -+# define deflate ipcomp_deflate -+# define deflateEnd ipcomp_deflateEnd -+# define inflateInit_ ipcomp_inflateInit_ -+# define inflate ipcomp_inflate -+# define inflateEnd ipcomp_inflateEnd -+# define deflateInit2_ ipcomp_deflateInit2_ -+# define deflateSetDictionary ipcomp_deflateSetDictionary -+# define deflateCopy ipcomp_deflateCopy -+# define deflateReset ipcomp_deflateReset -+# define deflateParams ipcomp_deflateParams -+# define inflateInit2_ ipcomp_inflateInit2_ -+# define inflateSetDictionary ipcomp_inflateSetDictionary -+# define inflateSync ipcomp_inflateSync -+# define inflateSyncPoint ipcomp_inflateSyncPoint -+# define inflateReset ipcomp_inflateReset -+# define compress ipcomp_compress -+# define compress2 ipcomp_compress2 -+# define uncompress ipcomp_uncompress -+# define adler32 ipcomp_adler32 -+# define crc32 ipcomp_crc32 -+# define get_crc_table ipcomp_get_crc_table -+/* SSS: these also need to be prefixed to avoid clash with ppp_deflate and ext2compression */ -+# define inflate_blocks ipcomp_deflate_blocks -+# define inflate_blocks_free ipcomp_deflate_blocks_free -+# define inflate_blocks_new ipcomp_inflate_blocks_new -+# define inflate_blocks_reset ipcomp_inflate_blocks_reset -+# define inflate_blocks_sync_point ipcomp_inflate_blocks_sync_point -+# define inflate_set_dictionary ipcomp_inflate_set_dictionary -+# define inflate_codes ipcomp_inflate_codes -+# define inflate_codes_free ipcomp_inflate_codes_free -+# define inflate_codes_new ipcomp_inflate_codes_new -+# define inflate_fast ipcomp_inflate_fast -+# define inflate_trees_bits ipcomp_inflate_trees_bits -+# define inflate_trees_dynamic ipcomp_inflate_trees_dynamic -+# define inflate_trees_fixed ipcomp_inflate_trees_fixed -+# define inflate_flush ipcomp_inflate_flush -+# define inflate_mask ipcomp_inflate_mask -+# define _dist_code _ipcomp_dist_code -+# define _length_code _ipcomp_length_code -+# define _tr_align 
_ipcomp_tr_align -+# define _tr_flush_block _ipcomp_tr_flush_block -+# define _tr_init _ipcomp_tr_init -+# define _tr_stored_block _ipcomp_tr_stored_block -+# define _tr_tally _ipcomp_tr_tally -+# define zError ipcomp_zError -+# define z_errmsg ipcomp_z_errmsg -+# define zlibVersion ipcomp_zlibVersion -+# define match_init ipcomp_match_init -+# define longest_match ipcomp_longest_match -+#endif -+ -+#ifdef Z_PREFIX -+# define Byte z_Byte -+# define uInt z_uInt -+# define uLong z_uLong -+# define Bytef z_Bytef -+# define charf z_charf -+# define intf z_intf -+# define uIntf z_uIntf -+# define uLongf z_uLongf -+# define voidpf z_voidpf -+# define voidp z_voidp -+#endif -+ -+#if (defined(_WIN32) || defined(__WIN32__)) && !defined(WIN32) -+# define WIN32 -+#endif -+#if defined(__GNUC__) || defined(WIN32) || defined(__386__) || defined(i386) -+# ifndef __32BIT__ -+# define __32BIT__ -+# endif -+#endif -+#if defined(__MSDOS__) && !defined(MSDOS) -+# define MSDOS -+#endif -+ -+/* -+ * Compile with -DMAXSEG_64K if the alloc function cannot allocate more -+ * than 64k bytes at a time (needed on systems with 16-bit int). 
-+ */ -+#if defined(MSDOS) && !defined(__32BIT__) -+# define MAXSEG_64K -+#endif -+#ifdef MSDOS -+# define UNALIGNED_OK -+#endif -+ -+#if (defined(MSDOS) || defined(_WINDOWS) || defined(WIN32)) && !defined(STDC) -+# define STDC -+#endif -+#if defined(__STDC__) || defined(__cplusplus) || defined(__OS2__) -+# ifndef STDC -+# define STDC -+# endif -+#endif -+ -+#ifndef STDC -+# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ -+# define const -+# endif -+#endif -+ -+/* Some Mac compilers merge all .h files incorrectly: */ -+#if defined(__MWERKS__) || defined(applec) ||defined(THINK_C) ||defined(__SC__) -+# define NO_DUMMY_DECL -+#endif -+ -+/* Old Borland C incorrectly complains about missing returns: */ -+#if defined(__BORLANDC__) && (__BORLANDC__ < 0x500) -+# define NEED_DUMMY_RETURN -+#endif -+ -+ -+/* Maximum value for memLevel in deflateInit2 */ -+#ifndef MAX_MEM_LEVEL -+# ifdef MAXSEG_64K -+# define MAX_MEM_LEVEL 8 -+# else -+# define MAX_MEM_LEVEL 9 -+# endif -+#endif -+ -+/* Maximum value for windowBits in deflateInit2 and inflateInit2. -+ * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files -+ * created by gzip. (Files created by minigzip can still be extracted by -+ * gzip.) -+ */ -+#ifndef MAX_WBITS -+# define MAX_WBITS 15 /* 32K LZ77 window */ -+#endif -+ -+/* The memory requirements for deflate are (in bytes): -+ (1 << (windowBits+2)) + (1 << (memLevel+9)) -+ that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) -+ plus a few kilobytes for small objects. For example, if you want to reduce -+ the default memory requirements from 256K to 128K, compile with -+ make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" -+ Of course this will generally degrade compression (there's no free lunch). -+ -+ The memory requirements for inflate are (in bytes) 1 << windowBits -+ that is, 32K for windowBits=15 (default value) plus a few kilobytes -+ for small objects. 
-+*/ -+ -+ /* Type declarations */ -+ -+#ifndef OF /* function prototypes */ -+# ifdef STDC -+# define OF(args) args -+# else -+# define OF(args) () -+# endif -+#endif -+ -+/* The following definitions for FAR are needed only for MSDOS mixed -+ * model programming (small or medium model with some far allocations). -+ * This was tested only with MSC; for other MSDOS compilers you may have -+ * to define NO_MEMCPY in zutil.h. If you don't need the mixed model, -+ * just define FAR to be empty. -+ */ -+#if (defined(M_I86SM) || defined(M_I86MM)) && !defined(__32BIT__) -+ /* MSC small or medium model */ -+# define SMALL_MEDIUM -+# ifdef _MSC_VER -+# define FAR _far -+# else -+# define FAR far -+# endif -+#endif -+#if defined(__BORLANDC__) && (defined(__SMALL__) || defined(__MEDIUM__)) -+# ifndef __32BIT__ -+# define SMALL_MEDIUM -+# define FAR _far -+# endif -+#endif -+ -+/* Compile with -DZLIB_DLL for Windows DLL support */ -+#if defined(ZLIB_DLL) -+# if defined(_WINDOWS) || defined(WINDOWS) -+# ifdef FAR -+# undef FAR -+# endif -+# include -+# define ZEXPORT WINAPI -+# ifdef WIN32 -+# define ZEXPORTVA WINAPIV -+# else -+# define ZEXPORTVA FAR _cdecl _export -+# endif -+# endif -+# if defined (__BORLANDC__) -+# if (__BORLANDC__ >= 0x0500) && defined (WIN32) -+# include -+# define ZEXPORT __declspec(dllexport) WINAPI -+# define ZEXPORTRVA __declspec(dllexport) WINAPIV -+# else -+# if defined (_Windows) && defined (__DLL__) -+# define ZEXPORT _export -+# define ZEXPORTVA _export -+# endif -+# endif -+# endif -+#endif -+ -+#if defined (__BEOS__) -+# if defined (ZLIB_DLL) -+# define ZEXTERN extern __declspec(dllexport) -+# else -+# define ZEXTERN extern __declspec(dllimport) -+# endif -+#endif -+ -+#ifndef ZEXPORT -+# define ZEXPORT -+#endif -+#ifndef ZEXPORTVA -+# define ZEXPORTVA -+#endif -+#ifndef ZEXTERN -+# define ZEXTERN extern -+#endif -+ -+#ifndef FAR -+# define FAR -+#endif -+ -+#if !defined(MACOS) && !defined(TARGET_OS_MAC) -+typedef unsigned char Byte; /* 8 bits 
*/ -+#endif -+typedef unsigned int uInt; /* 16 bits or more */ -+typedef unsigned long uLong; /* 32 bits or more */ -+ -+#ifdef SMALL_MEDIUM -+ /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */ -+# define Bytef Byte FAR -+#else -+ typedef Byte FAR Bytef; -+#endif -+typedef char FAR charf; -+typedef int FAR intf; -+typedef uInt FAR uIntf; -+typedef uLong FAR uLongf; -+ -+#ifdef STDC -+ typedef void FAR *voidpf; -+ typedef void *voidp; -+#else -+ typedef Byte FAR *voidpf; -+ typedef Byte *voidp; -+#endif -+ -+#ifdef HAVE_UNISTD_H -+# include /* for off_t */ -+# include /* for SEEK_* and off_t */ -+# define z_off_t off_t -+#endif -+#ifndef SEEK_SET -+# define SEEK_SET 0 /* Seek from beginning of file. */ -+# define SEEK_CUR 1 /* Seek from current position. */ -+# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ -+#endif -+#ifndef z_off_t -+# define z_off_t long -+#endif -+ -+/* MVS linker does not support external names larger than 8 bytes */ -+#if defined(__MVS__) -+# pragma map(deflateInit_,"DEIN") -+# pragma map(deflateInit2_,"DEIN2") -+# pragma map(deflateEnd,"DEEND") -+# pragma map(inflateInit_,"ININ") -+# pragma map(inflateInit2_,"ININ2") -+# pragma map(inflateEnd,"INEND") -+# pragma map(inflateSync,"INSY") -+# pragma map(inflateSetDictionary,"INSEDI") -+# pragma map(inflate_blocks,"INBL") -+# pragma map(inflate_blocks_new,"INBLNE") -+# pragma map(inflate_blocks_free,"INBLFR") -+# pragma map(inflate_blocks_reset,"INBLRE") -+# pragma map(inflate_codes_free,"INCOFR") -+# pragma map(inflate_codes,"INCO") -+# pragma map(inflate_fast,"INFA") -+# pragma map(inflate_flush,"INFLU") -+# pragma map(inflate_mask,"INMA") -+# pragma map(inflate_set_dictionary,"INSEDI2") -+# pragma map(ipcomp_inflate_copyright,"INCOPY") -+# pragma map(inflate_trees_bits,"INTRBI") -+# pragma map(inflate_trees_dynamic,"INTRDY") -+# pragma map(inflate_trees_fixed,"INTRFI") -+# pragma map(inflate_trees_free,"INTRFR") -+#endif -+ -+#endif /* _ZCONF_H */ ---- 
/dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/zlib/zlib.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,893 @@ -+/* zlib.h -- interface of the 'zlib' general purpose compression library -+ version 1.1.4, March 11th, 2002 -+ -+ Copyright (C) 1995-2002 Jean-loup Gailly and Mark Adler -+ -+ This software is provided 'as-is', without any express or implied -+ warranty. In no event will the authors be held liable for any damages -+ arising from the use of this software. -+ -+ Permission is granted to anyone to use this software for any purpose, -+ including commercial applications, and to alter it and redistribute it -+ freely, subject to the following restrictions: -+ -+ 1. The origin of this software must not be misrepresented; you must not -+ claim that you wrote the original software. If you use this software -+ in a product, an acknowledgment in the product documentation would be -+ appreciated but is not required. -+ 2. Altered source versions must be plainly marked as such, and must not be -+ misrepresented as being the original software. -+ 3. This notice may not be removed or altered from any source distribution. -+ -+ Jean-loup Gailly Mark Adler -+ jloup@gzip.org madler@alumni.caltech.edu -+ -+ -+ The data format used by the zlib library is described by RFCs (Request for -+ Comments) 1950 to 1952 in the files ftp://ds.internic.net/rfc/rfc1950.txt -+ (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). -+*/ -+ -+#ifndef _ZLIB_H -+#define _ZLIB_H -+ -+#include "zconf.h" -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+#define ZLIB_VERSION "1.1.4" -+ -+/* -+ The 'zlib' compression library provides in-memory compression and -+ decompression functions, including integrity checks of the uncompressed -+ data. This version of the library supports only one compression method -+ (deflation) but other algorithms will be added later and will have the same -+ stream interface. 
-+ -+ Compression can be done in a single step if the buffers are large -+ enough (for example if an input file is mmap'ed), or can be done by -+ repeated calls of the compression function. In the latter case, the -+ application must provide more input and/or consume the output -+ (providing more output space) before each call. -+ -+ The library also supports reading and writing files in gzip (.gz) format -+ with an interface similar to that of stdio. -+ -+ The library does not install any signal handler. The decoder checks -+ the consistency of the compressed data, so the library should never -+ crash even in case of corrupted input. -+*/ -+ -+typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); -+typedef void (*free_func) OF((voidpf opaque, voidpf address)); -+ -+struct internal_state; -+ -+typedef struct z_stream_s { -+ Bytef *next_in; /* next input byte */ -+ uInt avail_in; /* number of bytes available at next_in */ -+ uLong total_in; /* total nb of input bytes read so far */ -+ -+ Bytef *next_out; /* next output byte should be put there */ -+ uInt avail_out; /* remaining free space at next_out */ -+ uLong total_out; /* total nb of bytes output so far */ -+ -+ const char *msg; /* last error message, NULL if no error */ -+ struct internal_state FAR *state; /* not visible by applications */ -+ -+ alloc_func zalloc; /* used to allocate the internal state */ -+ free_func zfree; /* used to free the internal state */ -+ voidpf opaque; /* private data object passed to zalloc and zfree */ -+ -+ int data_type; /* best guess about the data type: ascii or binary */ -+ uLong adler; /* adler32 value of the uncompressed data */ -+ uLong reserved; /* reserved for future use */ -+} z_stream; -+ -+typedef z_stream FAR *z_streamp; -+ -+/* -+ The application must update next_in and avail_in when avail_in has -+ dropped to zero. It must update next_out and avail_out when avail_out -+ has dropped to zero. 
The application must initialize zalloc, zfree and -+ opaque before calling the init function. All other fields are set by the -+ compression library and must not be updated by the application. -+ -+ The opaque value provided by the application will be passed as the first -+ parameter for calls of zalloc and zfree. This can be useful for custom -+ memory management. The compression library attaches no meaning to the -+ opaque value. -+ -+ zalloc must return Z_NULL if there is not enough memory for the object. -+ If zlib is used in a multi-threaded application, zalloc and zfree must be -+ thread safe. -+ -+ On 16-bit systems, the functions zalloc and zfree must be able to allocate -+ exactly 65536 bytes, but will not be required to allocate more than this -+ if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS, -+ pointers returned by zalloc for objects of exactly 65536 bytes *must* -+ have their offset normalized to zero. The default allocation function -+ provided by this library ensures this (see zutil.c). To reduce memory -+ requirements and avoid any allocation of 64K objects, at the expense of -+ compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h). -+ -+ The fields total_in and total_out can be used for statistics or -+ progress reports. After compression, total_in holds the total size of -+ the uncompressed data and may be saved for use in the decompressor -+ (particularly if the decompressor wants to decompress everything in -+ a single step). 
-+*/ -+ -+ /* constants */ -+ -+#define Z_NO_FLUSH 0 -+#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */ -+#define Z_SYNC_FLUSH 2 -+#define Z_FULL_FLUSH 3 -+#define Z_FINISH 4 -+/* Allowed flush values; see deflate() below for details */ -+ -+#define Z_OK 0 -+#define Z_STREAM_END 1 -+#define Z_NEED_DICT 2 -+#define Z_ERRNO (-1) -+#define Z_STREAM_ERROR (-2) -+#define Z_DATA_ERROR (-3) -+#define Z_MEM_ERROR (-4) -+#define Z_BUF_ERROR (-5) -+#define Z_VERSION_ERROR (-6) -+/* Return codes for the compression/decompression functions. Negative -+ * values are errors, positive values are used for special but normal events. -+ */ -+ -+#define Z_NO_COMPRESSION 0 -+#define Z_BEST_SPEED 1 -+#define Z_BEST_COMPRESSION 9 -+#define Z_DEFAULT_COMPRESSION (-1) -+/* compression levels */ -+ -+#define Z_FILTERED 1 -+#define Z_HUFFMAN_ONLY 2 -+#define Z_DEFAULT_STRATEGY 0 -+/* compression strategy; see deflateInit2() below for details */ -+ -+#define Z_BINARY 0 -+#define Z_ASCII 1 -+#define Z_UNKNOWN 2 -+/* Possible values of the data_type field */ -+ -+#define Z_DEFLATED 8 -+/* The deflate compression method (the only one supported in this version) */ -+ -+#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ -+ -+#define zlib_version zlibVersion() -+/* for compatibility with versions < 1.0.2 */ -+ -+ /* basic functions */ -+ -+ZEXTERN const char * ZEXPORT zlibVersion OF((void)); -+/* The application can compare zlibVersion and ZLIB_VERSION for consistency. -+ If the first character differs, the library code actually used is -+ not compatible with the zlib.h header file used by the application. -+ This check is automatically made by deflateInit and inflateInit. -+ */ -+ -+/* -+ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); -+ -+ Initializes the internal stream state for compression. The fields -+ zalloc, zfree and opaque must be initialized before by the caller. 
-+ If zalloc and zfree are set to Z_NULL, deflateInit updates them to -+ use default allocation functions. -+ -+ The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: -+ 1 gives best speed, 9 gives best compression, 0 gives no compression at -+ all (the input data is simply copied a block at a time). -+ Z_DEFAULT_COMPRESSION requests a default compromise between speed and -+ compression (currently equivalent to level 6). -+ -+ deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not -+ enough memory, Z_STREAM_ERROR if level is not a valid compression level, -+ Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible -+ with the version assumed by the caller (ZLIB_VERSION). -+ msg is set to null if there is no error message. deflateInit does not -+ perform any compression: this will be done by deflate(). -+*/ -+ -+ -+ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush)); -+/* -+ deflate compresses as much data as possible, and stops when the input -+ buffer becomes empty or the output buffer becomes full. It may introduce some -+ output latency (reading input without producing any output) except when -+ forced to flush. -+ -+ The detailed semantics are as follows. deflate performs one or both of the -+ following actions: -+ -+ - Compress more input starting at next_in and update next_in and avail_in -+ accordingly. If not all input can be processed (because there is not -+ enough room in the output buffer), next_in and avail_in are updated and -+ processing will resume at this point for the next call of deflate(). -+ -+ - Provide more output starting at next_out and update next_out and avail_out -+ accordingly. This action is forced if the parameter flush is non zero. -+ Forcing flush frequently degrades the compression ratio, so this parameter -+ should be set only when necessary (in interactive applications). -+ Some output may be provided even if flush is not set. 
-+ -+ Before the call of deflate(), the application should ensure that at least -+ one of the actions is possible, by providing more input and/or consuming -+ more output, and updating avail_in or avail_out accordingly; avail_out -+ should never be zero before the call. The application can consume the -+ compressed output when it wants, for example when the output buffer is full -+ (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK -+ and with zero avail_out, it must be called again after making room in the -+ output buffer because there might be more output pending. -+ -+ If the parameter flush is set to Z_SYNC_FLUSH, all pending output is -+ flushed to the output buffer and the output is aligned on a byte boundary, so -+ that the decompressor can get all input data available so far. (In particular -+ avail_in is zero after the call if enough output space has been provided -+ before the call.) Flushing may degrade compression for some compression -+ algorithms and so it should be used only when necessary. -+ -+ If flush is set to Z_FULL_FLUSH, all output is flushed as with -+ Z_SYNC_FLUSH, and the compression state is reset so that decompression can -+ restart from this point if previous compressed data has been damaged or if -+ random access is desired. Using Z_FULL_FLUSH too often can seriously degrade -+ the compression. -+ -+ If deflate returns with avail_out == 0, this function must be called again -+ with the same value of the flush parameter and more output space (updated -+ avail_out), until the flush is complete (deflate returns with non-zero -+ avail_out). -+ -+ If the parameter flush is set to Z_FINISH, pending input is processed, -+ pending output is flushed and deflate returns with Z_STREAM_END if there -+ was enough output space; if deflate returns with Z_OK, this function must be -+ called again with Z_FINISH and more output space (updated avail_out) but no -+ more input data, until it returns with Z_STREAM_END or an error. 
After -+ deflate has returned Z_STREAM_END, the only possible operations on the -+ stream are deflateReset or deflateEnd. -+ -+ Z_FINISH can be used immediately after deflateInit if all the compression -+ is to be done in a single step. In this case, avail_out must be at least -+ 0.1% larger than avail_in plus 12 bytes. If deflate does not return -+ Z_STREAM_END, then it must be called again as described above. -+ -+ deflate() sets strm->adler to the adler32 checksum of all input read -+ so far (that is, total_in bytes). -+ -+ deflate() may update data_type if it can make a good guess about -+ the input data type (Z_ASCII or Z_BINARY). In doubt, the data is considered -+ binary. This field is only for information purposes and does not affect -+ the compression algorithm in any manner. -+ -+ deflate() returns Z_OK if some progress has been made (more input -+ processed or more output produced), Z_STREAM_END if all input has been -+ consumed and all output has been produced (only when flush is set to -+ Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example -+ if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible -+ (for example avail_in or avail_out was zero). -+*/ -+ -+ -+ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm)); -+/* -+ All dynamically allocated data structures for this stream are freed. -+ This function discards any unprocessed input and does not flush any -+ pending output. -+ -+ deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the -+ stream state was inconsistent, Z_DATA_ERROR if the stream was freed -+ prematurely (some input or output was discarded). In the error case, -+ msg may be set but then points to a static string (which must not be -+ deallocated). -+*/ -+ -+ -+/* -+ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); -+ -+ Initializes the internal stream state for decompression. The fields -+ next_in, avail_in, zalloc, zfree and opaque must be initialized before by -+ the caller. 
If next_in is not Z_NULL and avail_in is large enough (the exact -+ value depends on the compression method), inflateInit determines the -+ compression method from the zlib header and allocates all data structures -+ accordingly; otherwise the allocation will be deferred to the first call of -+ inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to -+ use default allocation functions. -+ -+ inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough -+ memory, Z_VERSION_ERROR if the zlib library version is incompatible with the -+ version assumed by the caller. msg is set to null if there is no error -+ message. inflateInit does not perform any decompression apart from reading -+ the zlib header if present: this will be done by inflate(). (So next_in and -+ avail_in may be modified, but next_out and avail_out are unchanged.) -+*/ -+ -+ -+ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush)); -+/* -+ inflate decompresses as much data as possible, and stops when the input -+ buffer becomes empty or the output buffer becomes full. It may -+ introduce some output latency (reading input without producing any output) -+ except when forced to flush. -+ -+ The detailed semantics are as follows. inflate performs one or both of the -+ following actions: -+ -+ - Decompress more input starting at next_in and update next_in and avail_in -+ accordingly. If not all input can be processed (because there is not -+ enough room in the output buffer), next_in is updated and processing -+ will resume at this point for the next call of inflate(). -+ -+ - Provide more output starting at next_out and update next_out and avail_out -+ accordingly. inflate() provides as much output as possible, until there -+ is no more input data or no more space in the output buffer (see below -+ about the flush parameter). 
-+ -+ Before the call of inflate(), the application should ensure that at least -+ one of the actions is possible, by providing more input and/or consuming -+ more output, and updating the next_* and avail_* values accordingly. -+ The application can consume the uncompressed output when it wants, for -+ example when the output buffer is full (avail_out == 0), or after each -+ call of inflate(). If inflate returns Z_OK and with zero avail_out, it -+ must be called again after making room in the output buffer because there -+ might be more output pending. -+ -+ If the parameter flush is set to Z_SYNC_FLUSH, inflate flushes as much -+ output as possible to the output buffer. The flushing behavior of inflate is -+ not specified for values of the flush parameter other than Z_SYNC_FLUSH -+ and Z_FINISH, but the current implementation actually flushes as much output -+ as possible anyway. -+ -+ inflate() should normally be called until it returns Z_STREAM_END or an -+ error. However if all decompression is to be performed in a single step -+ (a single call of inflate), the parameter flush should be set to -+ Z_FINISH. In this case all pending input is processed and all pending -+ output is flushed; avail_out must be large enough to hold all the -+ uncompressed data. (The size of the uncompressed data may have been saved -+ by the compressor for this purpose.) The next operation on this stream must -+ be inflateEnd to deallocate the decompression state. The use of Z_FINISH -+ is never required, but can be used to inform inflate that a faster routine -+ may be used for the single inflate() call. 
-+ -+ If a preset dictionary is needed at this point (see inflateSetDictionary -+ below), inflate sets strm->adler to the adler32 checksum of the -+ dictionary chosen by the compressor and returns Z_NEED_DICT; otherwise -+ it sets strm->adler to the adler32 checksum of all output produced -+ so far (that is, total_out bytes) and returns Z_OK, Z_STREAM_END or -+ an error code as described below. At the end of the stream, inflate() -+ checks that its computed adler32 checksum is equal to that saved by the -+ compressor and returns Z_STREAM_END only if the checksum is correct. -+ -+ inflate() returns Z_OK if some progress has been made (more input processed -+ or more output produced), Z_STREAM_END if the end of the compressed data has -+ been reached and all uncompressed output has been produced, Z_NEED_DICT if a -+ preset dictionary is needed at this point, Z_DATA_ERROR if the input data was -+ corrupted (input stream not conforming to the zlib format or incorrect -+ adler32 checksum), Z_STREAM_ERROR if the stream structure was inconsistent -+ (for example if next_in or next_out was NULL), Z_MEM_ERROR if there was not -+ enough memory, Z_BUF_ERROR if no progress is possible or if there was not -+ enough room in the output buffer when Z_FINISH is used. In the Z_DATA_ERROR -+ case, the application may then call inflateSync to look for a good -+ compression block. -+*/ -+ -+ -+ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm)); -+/* -+ All dynamically allocated data structures for this stream are freed. -+ This function discards any unprocessed input and does not flush any -+ pending output. -+ -+ inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state -+ was inconsistent. In the error case, msg may be set but then points to a -+ static string (which must not be deallocated). -+*/ -+ -+ /* Advanced functions */ -+ -+/* -+ The following functions are needed only in some special applications. 
-+*/ -+ -+/* -+ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm, -+ int level, -+ int method, -+ int windowBits, -+ int memLevel, -+ int strategy)); -+ -+ This is another version of deflateInit with more compression options. The -+ fields next_in, zalloc, zfree and opaque must be initialized before by -+ the caller. -+ -+ The method parameter is the compression method. It must be Z_DEFLATED in -+ this version of the library. -+ -+ The windowBits parameter is the base two logarithm of the window size -+ (the size of the history buffer). It should be in the range 8..15 for this -+ version of the library. Larger values of this parameter result in better -+ compression at the expense of memory usage. The default value is 15 if -+ deflateInit is used instead. -+ -+ The memLevel parameter specifies how much memory should be allocated -+ for the internal compression state. memLevel=1 uses minimum memory but -+ is slow and reduces compression ratio; memLevel=9 uses maximum memory -+ for optimal speed. The default value is 8. See zconf.h for total memory -+ usage as a function of windowBits and memLevel. -+ -+ The strategy parameter is used to tune the compression algorithm. Use the -+ value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a -+ filter (or predictor), or Z_HUFFMAN_ONLY to force Huffman encoding only (no -+ string match). Filtered data consists mostly of small values with a -+ somewhat random distribution. In this case, the compression algorithm is -+ tuned to compress them better. The effect of Z_FILTERED is to force more -+ Huffman coding and less string matching; it is somewhat intermediate -+ between Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. The strategy parameter only affects -+ the compression ratio but not the correctness of the compressed output even -+ if it is not set appropriately. 
-+ -+ deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough -+ memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid -+ method). msg is set to null if there is no error message. deflateInit2 does -+ not perform any compression: this will be done by deflate(). -+*/ -+ -+ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm, -+ const Bytef *dictionary, -+ uInt dictLength)); -+/* -+ Initializes the compression dictionary from the given byte sequence -+ without producing any compressed output. This function must be called -+ immediately after deflateInit, deflateInit2 or deflateReset, before any -+ call of deflate. The compressor and decompressor must use exactly the same -+ dictionary (see inflateSetDictionary). -+ -+ The dictionary should consist of strings (byte sequences) that are likely -+ to be encountered later in the data to be compressed, with the most commonly -+ used strings preferably put towards the end of the dictionary. Using a -+ dictionary is most useful when the data to be compressed is short and can be -+ predicted with good accuracy; the data can then be compressed better than -+ with the default empty dictionary. -+ -+ Depending on the size of the compression data structures selected by -+ deflateInit or deflateInit2, a part of the dictionary may in effect be -+ discarded, for example if the dictionary is larger than the window size in -+ deflateInit or deflateInit2. Thus the strings most likely to be useful should be -+ put at the end of the dictionary, not at the front. -+ -+ Upon return of this function, strm->adler is set to the Adler32 value -+ of the dictionary; the decompressor may later use this value to determine -+ which dictionary has been used by the compressor. (The Adler32 value -+ applies to the whole dictionary even if only a subset of the dictionary is -+ actually used by the compressor.) 
-+ -+ deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a -+ parameter is invalid (such as NULL dictionary) or the stream state is -+ inconsistent (for example if deflate has already been called for this stream -+ or if the compression method is bsort). deflateSetDictionary does not -+ perform any compression: this will be done by deflate(). -+*/ -+ -+ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest, -+ z_streamp source)); -+/* -+ Sets the destination stream as a complete copy of the source stream. -+ -+ This function can be useful when several compression strategies will be -+ tried, for example when there are several ways of pre-processing the input -+ data with a filter. The streams that will be discarded should then be freed -+ by calling deflateEnd. Note that deflateCopy duplicates the internal -+ compression state which can be quite large, so this strategy is slow and -+ can consume lots of memory. -+ -+ deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not -+ enough memory, Z_STREAM_ERROR if the source stream state was inconsistent -+ (such as zalloc being NULL). msg is left unchanged in both source and -+ destination. -+*/ -+ -+ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm)); -+/* -+ This function is equivalent to deflateEnd followed by deflateInit, -+ but does not free and reallocate all the internal compression state. -+ The stream will keep the same compression level and any other attributes -+ that may have been set by deflateInit2. -+ -+ deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source -+ stream state was inconsistent (such as zalloc or state being NULL). -+*/ -+ -+ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm, -+ int level, -+ int strategy)); -+/* -+ Dynamically update the compression level and compression strategy. The -+ interpretation of level and strategy is as in deflateInit2. 
This can be -+ used to switch between compression and straight copy of the input data, or -+ to switch to a different kind of input data requiring a different -+ strategy. If the compression level is changed, the input available so far -+ is compressed with the old level (and may be flushed); the new level will -+ take effect only at the next call of deflate(). -+ -+ Before the call of deflateParams, the stream state must be set as for -+ a call of deflate(), since the currently available input may have to -+ be compressed and flushed. In particular, strm->avail_out must be non-zero. -+ -+ deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source -+ stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR -+ if strm->avail_out was zero. -+*/ -+ -+/* -+ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm, -+ int windowBits)); -+ -+ This is another version of inflateInit with an extra parameter. The -+ fields next_in, avail_in, zalloc, zfree and opaque must be initialized -+ before by the caller. -+ -+ The windowBits parameter is the base two logarithm of the maximum window -+ size (the size of the history buffer). It should be in the range 8..15 for -+ this version of the library. The default value is 15 if inflateInit is used -+ instead. If a compressed stream with a larger window size is given as -+ input, inflate() will return with the error code Z_DATA_ERROR instead of -+ trying to allocate a larger window. -+ -+ inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough -+ memory, Z_STREAM_ERROR if a parameter is invalid (such as a negative -+ memLevel). msg is set to null if there is no error message. inflateInit2 -+ does not perform any decompression apart from reading the zlib header if -+ present: this will be done by inflate(). (So next_in and avail_in may be -+ modified, but next_out and avail_out are unchanged.) 
-+*/ -+ -+ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm, -+ const Bytef *dictionary, -+ uInt dictLength)); -+/* -+ Initializes the decompression dictionary from the given uncompressed byte -+ sequence. This function must be called immediately after a call of inflate -+ if this call returned Z_NEED_DICT. The dictionary chosen by the compressor -+ can be determined from the Adler32 value returned by this call of -+ inflate. The compressor and decompressor must use exactly the same -+ dictionary (see deflateSetDictionary). -+ -+ inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a -+ parameter is invalid (such as NULL dictionary) or the stream state is -+ inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the -+ expected one (incorrect Adler32 value). inflateSetDictionary does not -+ perform any decompression: this will be done by subsequent calls of -+ inflate(). -+*/ -+ -+ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm)); -+/* -+ Skips invalid compressed data until a full flush point (see above the -+ description of deflate with Z_FULL_FLUSH) can be found, or until all -+ available input is skipped. No output is provided. -+ -+ inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR -+ if no more input was provided, Z_DATA_ERROR if no flush point has been found, -+ or Z_STREAM_ERROR if the stream structure was inconsistent. In the success -+ case, the application may save the current value of total_in which -+ indicates where valid compressed data was found. In the error case, the -+ application may repeatedly call inflateSync, providing more input each time, -+ until success or end of the input data. -+*/ -+ -+ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm)); -+/* -+ This function is equivalent to inflateEnd followed by inflateInit, -+ but does not free and reallocate all the internal decompression state. -+ The stream will keep attributes that may have been set by inflateInit2. 
-+ -+ inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source -+ stream state was inconsistent (such as zalloc or state being NULL). -+*/ -+ -+ -+ /* utility functions */ -+ -+/* -+ The following utility functions are implemented on top of the -+ basic stream-oriented functions. To simplify the interface, some -+ default options are assumed (compression level and memory usage, -+ standard memory allocation functions). The source code of these -+ utility functions can easily be modified if you need special options. -+*/ -+ -+ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen, -+ const Bytef *source, uLong sourceLen)); -+/* -+ Compresses the source buffer into the destination buffer. sourceLen is -+ the byte length of the source buffer. Upon entry, destLen is the total -+ size of the destination buffer, which must be at least 0.1% larger than -+ sourceLen plus 12 bytes. Upon exit, destLen is the actual size of the -+ compressed buffer. -+ This function can be used to compress a whole file at once if the -+ input file is mmap'ed. -+ compress returns Z_OK if success, Z_MEM_ERROR if there was not -+ enough memory, Z_BUF_ERROR if there was not enough room in the output -+ buffer. -+*/ -+ -+ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen, -+ const Bytef *source, uLong sourceLen, -+ int level)); -+/* -+ Compresses the source buffer into the destination buffer. The level -+ parameter has the same meaning as in deflateInit. sourceLen is the byte -+ length of the source buffer. Upon entry, destLen is the total size of the -+ destination buffer, which must be at least 0.1% larger than sourceLen plus -+ 12 bytes. Upon exit, destLen is the actual size of the compressed buffer. -+ -+ compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough -+ memory, Z_BUF_ERROR if there was not enough room in the output buffer, -+ Z_STREAM_ERROR if the level parameter is invalid. 
-+*/ -+ -+ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen, -+ const Bytef *source, uLong sourceLen)); -+/* -+ Decompresses the source buffer into the destination buffer. sourceLen is -+ the byte length of the source buffer. Upon entry, destLen is the total -+ size of the destination buffer, which must be large enough to hold the -+ entire uncompressed data. (The size of the uncompressed data must have -+ been saved previously by the compressor and transmitted to the decompressor -+ by some mechanism outside the scope of this compression library.) -+ Upon exit, destLen is the actual size of the uncompressed data. -+ This function can be used to decompress a whole file at once if the -+ input file is mmap'ed. -+ -+ uncompress returns Z_OK if success, Z_MEM_ERROR if there was not -+ enough memory, Z_BUF_ERROR if there was not enough room in the output -+ buffer, or Z_DATA_ERROR if the input data was corrupted. -+*/ -+ -+ -+typedef voidp gzFile; -+ -+ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode)); -+/* -+ Opens a gzip (.gz) file for reading or writing. The mode parameter -+ is as in fopen ("rb" or "wb") but can also include a compression level -+ ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for -+ Huffman only compression as in "wb1h". (See the description -+ of deflateInit2 for more information about the strategy parameter.) -+ -+ gzopen can be used to read a file which is not in gzip format; in this -+ case gzread will directly read from the file without decompression. -+ -+ gzopen returns NULL if the file could not be opened or if there was -+ insufficient memory to allocate the (de)compression state; errno -+ can be checked to distinguish the two cases (if errno is zero, the -+ zlib error is Z_MEM_ERROR). */ -+ -+ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode)); -+/* -+ gzdopen() associates a gzFile with the file descriptor fd. 
File -+ descriptors are obtained from calls like open, dup, creat, pipe or -+ fileno (if the file has been previously opened with fopen). -+ The mode parameter is as in gzopen. -+ The next call of gzclose on the returned gzFile will also close the -+ file descriptor fd, just like fclose(fdopen(fd, mode)) closes the file -+ descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode). -+ gzdopen returns NULL if there was insufficient memory to allocate -+ the (de)compression state. -+*/ -+ -+ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy)); -+/* -+ Dynamically update the compression level or strategy. See the description -+ of deflateInit2 for the meaning of these parameters. -+ gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not -+ opened for writing. -+*/ -+ -+ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len)); -+/* -+ Reads the given number of uncompressed bytes from the compressed file. -+ If the input file was not in gzip format, gzread copies the given number -+ of bytes into the buffer. -+ gzread returns the number of uncompressed bytes actually read (0 for -+ end of file, -1 for error). */ -+ -+ZEXTERN int ZEXPORT gzwrite OF((gzFile file, -+ const voidp buf, unsigned len)); -+/* -+ Writes the given number of uncompressed bytes into the compressed file. -+ gzwrite returns the number of uncompressed bytes actually written -+ (0 in case of error). -+*/ -+ -+ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...)); -+/* -+ Converts, formats, and writes the args to the compressed file under -+ control of the format string, as in fprintf. gzprintf returns the number of -+ uncompressed bytes actually written (0 in case of error). -+*/ -+ -+ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s)); -+/* -+ Writes the given null-terminated string to the compressed file, excluding -+ the terminating null character.
-+ gzputs returns the number of characters written, or -1 in case of error. -+*/ -+ -+ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len)); -+/* -+ Reads bytes from the compressed file until len-1 characters are read, or -+ a newline character is read and transferred to buf, or an end-of-file -+ condition is encountered. The string is then terminated with a null -+ character. -+ gzgets returns buf, or Z_NULL in case of error. -+*/ -+ -+ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c)); -+/* -+ Writes c, converted to an unsigned char, into the compressed file. -+ gzputc returns the value that was written, or -1 in case of error. -+*/ -+ -+ZEXTERN int ZEXPORT gzgetc OF((gzFile file)); -+/* -+ Reads one byte from the compressed file. gzgetc returns this byte -+ or -1 in case of end of file or error. -+*/ -+ -+ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush)); -+/* -+ Flushes all pending output into the compressed file. The parameter -+ flush is as in the deflate() function. The return value is the zlib -+ error number (see function gzerror below). gzflush returns Z_OK if -+ the flush parameter is Z_FINISH and all output could be flushed. -+ gzflush should be called only when strictly necessary because it can -+ degrade compression. -+*/ -+ -+ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file, -+ z_off_t offset, int whence)); -+/* -+ Sets the starting position for the next gzread or gzwrite on the -+ given compressed file. The offset represents a number of bytes in the -+ uncompressed data stream. The whence parameter is defined as in lseek(2); -+ the value SEEK_END is not supported. -+ If the file is opened for reading, this function is emulated but can be -+ extremely slow. If the file is opened for writing, only forward seeks are -+ supported; gzseek then compresses a sequence of zeroes up to the new -+ starting position. 
-+ -+ gzseek returns the resulting offset location as measured in bytes from -+ the beginning of the uncompressed stream, or -1 in case of error, in -+ particular if the file is opened for writing and the new starting position -+ would be before the current position. -+*/ -+ -+ZEXTERN int ZEXPORT gzrewind OF((gzFile file)); -+/* -+ Rewinds the given file. This function is supported only for reading. -+ -+ gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET) -+*/ -+ -+ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file)); -+/* -+ Returns the starting position for the next gzread or gzwrite on the -+ given compressed file. This position represents a number of bytes in the -+ uncompressed data stream. -+ -+ gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR) -+*/ -+ -+ZEXTERN int ZEXPORT gzeof OF((gzFile file)); -+/* -+ Returns 1 when EOF has previously been detected reading the given -+ input stream, otherwise zero. -+*/ -+ -+ZEXTERN int ZEXPORT gzclose OF((gzFile file)); -+/* -+ Flushes all pending output if necessary, closes the compressed file -+ and deallocates all the (de)compression state. The return value is the zlib -+ error number (see function gzerror below). -+*/ -+ -+ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum)); -+/* -+ Returns the error message for the last error which occurred on the -+ given compressed file. errnum is set to zlib error number. If an -+ error occurred in the file system and not in the compression library, -+ errnum is set to Z_ERRNO and the application may consult errno -+ to get the exact error code. -+*/ -+ -+ /* checksum functions */ -+ -+/* -+ These functions are not related to compression but are exported -+ anyway because they might be useful in applications using the -+ compression library. -+*/ -+ -+ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len)); -+ -+/* -+ Update a running Adler-32 checksum with the bytes buf[0..len-1] and -+ return the updated checksum. 
If buf is NULL, this function returns -+ the required initial value for the checksum. -+ An Adler-32 checksum is almost as reliable as a CRC32 but can be computed -+ much faster. Usage example: -+ -+ uLong adler = adler32(0L, Z_NULL, 0); -+ -+ while (read_buffer(buffer, length) != EOF) { -+ adler = adler32(adler, buffer, length); -+ } -+ if (adler != original_adler) error(); -+*/ -+ -+ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)); -+/* -+ Update a running crc with the bytes buf[0..len-1] and return the updated -+ crc. If buf is NULL, this function returns the required initial value -+ for the crc. Pre- and post-conditioning (one's complement) is performed -+ within this function so it shouldn't be done by the application. -+ Usage example: -+ -+ uLong crc = crc32(0L, Z_NULL, 0); -+ -+ while (read_buffer(buffer, length) != EOF) { -+ crc = crc32(crc, buffer, length); -+ } -+ if (crc != original_crc) error(); -+*/ -+ -+ -+ /* various hacks, don't look :) */ -+ -+/* deflateInit and inflateInit are macros to allow checking the zlib version -+ * and the compiler's view of z_stream: -+ */ -+ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level, -+ const char *version, int stream_size)); -+ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm, -+ const char *version, int stream_size)); -+ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method, -+ int windowBits, int memLevel, -+ int strategy, const char *version, -+ int stream_size)); -+ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits, -+ const char *version, int stream_size)); -+#define deflateInit(strm, level) \ -+ deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream)) -+#define inflateInit(strm) \ -+ inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream)) -+#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ -+ deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ -+ (strategy), ZLIB_VERSION, sizeof(z_stream)) 
-+#define inflateInit2(strm, windowBits) \ -+ inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream)) -+ -+ -+#if !defined(_Z_UTIL_H) && !defined(NO_DUMMY_DECL) -+ struct internal_state {int dummy;}; /* hack for buggy compilers */ -+#endif -+ -+ZEXTERN const char * ZEXPORT zError OF((int err)); -+ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z)); -+ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void)); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif /* _ZLIB_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/include/zlib/zutil.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,225 @@ -+/* zutil.h -- internal interface and configuration of the compression library -+ * Copyright (C) 1995-2002 Jean-loup Gailly. -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* WARNING: this file should *not* be used by applications. It is -+ part of the implementation of the compression library and is -+ subject to change. Applications should only use zlib.h. 
-+ */ -+ -+/* @(#) $Id: zutil.h,v 1.4 2002/04/24 07:36:48 mcr Exp $ */ -+ -+#ifndef _Z_UTIL_H -+#define _Z_UTIL_H -+ -+#include "zlib.h" -+ -+#include -+#define HAVE_MEMCPY -+ -+#if 0 // #ifdef STDC -+# include -+# include -+# include -+#endif -+#ifndef __KERNEL__ -+#ifdef NO_ERRNO_H -+ extern int errno; -+#else -+# include -+#endif -+#endif -+ -+#ifndef local -+# define local static -+#endif -+/* compile with -Dlocal if your debugger can't find static symbols */ -+ -+typedef unsigned char uch; -+typedef uch FAR uchf; -+typedef unsigned short ush; -+typedef ush FAR ushf; -+typedef unsigned long ulg; -+ -+extern const char *z_errmsg[10]; /* indexed by 2-zlib_error */ -+/* (size given to avoid silly warnings with Visual C++) */ -+ -+#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)] -+ -+#define ERR_RETURN(strm,err) \ -+ return (strm->msg = ERR_MSG(err), (err)) -+/* To be used only when the state is known to be valid */ -+ -+ /* common constants */ -+ -+#ifndef DEF_WBITS -+# define DEF_WBITS MAX_WBITS -+#endif -+/* default windowBits for decompression. 
MAX_WBITS is for compression only */ -+ -+#if MAX_MEM_LEVEL >= 8 -+# define DEF_MEM_LEVEL 8 -+#else -+# define DEF_MEM_LEVEL MAX_MEM_LEVEL -+#endif -+/* default memLevel */ -+ -+#define STORED_BLOCK 0 -+#define STATIC_TREES 1 -+#define DYN_TREES 2 -+/* The three kinds of block type */ -+ -+#define MIN_MATCH 3 -+#define MAX_MATCH 258 -+/* The minimum and maximum match lengths */ -+ -+#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */ -+ -+ /* target dependencies */ -+ -+#ifdef MSDOS -+# define OS_CODE 0x00 -+# if defined(__TURBOC__) || defined(__BORLANDC__) -+# if(__STDC__ == 1) && (defined(__LARGE__) || defined(__COMPACT__)) -+ /* Allow compilation with ANSI keywords only enabled */ -+ void _Cdecl farfree( void *block ); -+ void *_Cdecl farmalloc( unsigned long nbytes ); -+# else -+# include -+# endif -+# else /* MSC or DJGPP */ -+# include -+# endif -+#endif -+ -+#ifdef OS2 -+# define OS_CODE 0x06 -+#endif -+ -+#ifdef WIN32 /* Window 95 & Windows NT */ -+# define OS_CODE 0x0b -+#endif -+ -+#if defined(VAXC) || defined(VMS) -+# define OS_CODE 0x02 -+# define F_OPEN(name, mode) \ -+ fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512") -+#endif -+ -+#ifdef AMIGA -+# define OS_CODE 0x01 -+#endif -+ -+#if defined(ATARI) || defined(atarist) -+# define OS_CODE 0x05 -+#endif -+ -+#if defined(MACOS) || defined(TARGET_OS_MAC) -+# define OS_CODE 0x07 -+# if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os -+# include /* for fdopen */ -+# else -+# ifndef fdopen -+# define fdopen(fd,mode) NULL /* No fdopen() */ -+# endif -+# endif -+#endif -+ -+#ifdef __50SERIES /* Prime/PRIMOS */ -+# define OS_CODE 0x0F -+#endif -+ -+#ifdef TOPS20 -+# define OS_CODE 0x0a -+#endif -+ -+#if defined(_BEOS_) || defined(RISCOS) -+# define fdopen(fd,mode) NULL /* No fdopen() */ -+#endif -+ -+#if (defined(_MSC_VER) && (_MSC_VER > 600)) -+# define fdopen(fd,type) _fdopen(fd,type) -+#endif -+ -+ -+ /* Common defaults */ -+ -+#ifndef OS_CODE -+# 
define OS_CODE 0x03 /* assume Unix */ -+#endif -+ -+#ifndef F_OPEN -+# define F_OPEN(name, mode) fopen((name), (mode)) -+#endif -+ -+ /* functions */ -+ -+#ifdef HAVE_STRERROR -+ extern char *strerror OF((int)); -+# define zstrerror(errnum) strerror(errnum) -+#else -+# define zstrerror(errnum) "" -+#endif -+ -+#if defined(pyr) -+# define NO_MEMCPY -+#endif -+#if defined(SMALL_MEDIUM) && !defined(_MSC_VER) && !defined(__SC__) -+ /* Use our own functions for small and medium model with MSC <= 5.0. -+ * You may have to use the same strategy for Borland C (untested). -+ * The __SC__ check is for Symantec. -+ */ -+# define NO_MEMCPY -+#endif -+#if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY) -+# define HAVE_MEMCPY -+#endif -+#ifdef HAVE_MEMCPY -+# ifdef SMALL_MEDIUM /* MSDOS small or medium model */ -+# define zmemcpy _fmemcpy -+# define zmemcmp _fmemcmp -+# define zmemzero(dest, len) _fmemset(dest, 0, len) -+# else -+# define zmemcpy memcpy -+# define zmemcmp memcmp -+# define zmemzero(dest, len) memset(dest, 0, len) -+# endif -+#else -+ extern void zmemcpy OF((Bytef* dest, const Bytef* source, uInt len)); -+ extern int zmemcmp OF((const Bytef* s1, const Bytef* s2, uInt len)); -+ extern void zmemzero OF((Bytef* dest, uInt len)); -+#endif -+ -+/* Diagnostic functions */ -+#ifdef DEBUG -+# include -+ extern int z_verbose; -+ extern void z_error OF((char *m)); -+# define Assert(cond,msg) {if(!(cond)) z_error(msg);} -+# define Trace(x) {if (z_verbose>=0) fprintf x ;} -+# define Tracev(x) {if (z_verbose>0) fprintf x ;} -+# define Tracevv(x) {if (z_verbose>1) fprintf x ;} -+# define Tracec(c,x) {if (z_verbose>0 && (c)) fprintf x ;} -+# define Tracecv(c,x) {if (z_verbose>1 && (c)) fprintf x ;} -+#else -+# define Assert(cond,msg) -+# define Trace(x) -+# define Tracev(x) -+# define Tracevv(x) -+# define Tracec(c,x) -+# define Tracecv(c,x) -+#endif -+ -+ -+typedef uLong (ZEXPORT *check_func) OF((uLong check, const Bytef *buf, -+ uInt len)); -+voidpf zcalloc 
OF((voidpf opaque, unsigned items, unsigned size)); -+void zcfree OF((voidpf opaque, voidpf ptr)); -+ -+#define ZALLOC(strm, items, size) \ -+ (*((strm)->zalloc))((strm)->opaque, (items), (size)) -+#define ZFREE(strm, addr) (*((strm)->zfree))((strm)->opaque, (voidpf)(addr)) -+#define TRY_FREE(s, p) {if (p) ZFREE(s, p);} -+ -+#endif /* _Z_UTIL_H */ ---- swan26/net/Kconfig.preipsec 2005-09-01 18:15:19.000000000 -0400 -+++ swan26/net/Kconfig 2005-09-03 16:51:17.000000000 -0400 -@@ -215,2 +215,6 @@ - -+if INET -+source "net/ipsec/Kconfig" -+endif # if INET -+ - endif # if NET ---- /distros/kernel/linux-2.6.3-rc4/net/Makefile Mon Feb 16 21:22:12 2004 -+++ ref26/net/Makefile Thu Feb 19 21:02:25 2004 -@@ -42,3 +42,6 @@ - ifeq ($(CONFIG_NET),y) - obj-$(CONFIG_SYSCTL) += sysctl_net.o - endif -+ -+obj-$(CONFIG_KLIPS) += ipsec/ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/Kconfig Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,143 @@ -+# -+# IPSEC configuration -+# Copyright (C) 2004 Michael Richardson -+# -+# This program is free software; you can redistribute it and/or modify it -+# under the terms of the GNU General Public License as published by the -+# Free Software Foundation; either version 2 of the License, or (at your -+# option) any later version. See . -+# -+# This program is distributed in the hope that it will be useful, but -+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+# for more details. -+# -+ -+config KLIPS -+ tristate "Openswan IPsec (KLIPS)" -+ default n -+# depends on NF_CONNTRACK && NETFILTER -+ help -+ KLIPS is the Openswan (www.openswan.org) Kernel Level IP Security -+ system. It is extensively tested, and has interoperated with -+ many other systems. -+ It provides "ipsecX" devices on which one can do firewalling. 
-+ The Openswan userland, is compatible with both KLIPS and NETKEY -+ You cannot build KLIPS and NETKEY inline into the kernel. -+ -+menu "KLIPS options" -+ depends on KLIPS -+ -+config KLIPS_ESP -+ bool 'Encapsulating Security Payload - ESP ("VPN")' -+ default y -+ help -+ This option provides support for the IPSEC Encapsulation Security -+ Payload (IP protocol 50) which provides packet layer content -+ hiding, and content authentication. -+ It is recommended to enable this. RFC2406 -+ -+config KLIPS_AH -+ bool 'Authentication Header - AH' -+ default n -+ help -+ This option provides support for the IPSEC Authentication Header -+ (IP protocol 51) which provides packet layer sender and content -+ authentication. It does not provide for confidentiality. -+ It is not recommended to enable this. RFC2402 -+ -+config KLIPS_AUTH_HMAC_MD5 -+ bool 'HMAC-MD5 authentication algorithm' -+ default y -+ help -+ The HMAC-MD5 algorithm is used by ESP (and AH) to guarantee packet -+ integrity. There is little reason not to include it. -+ -+config KLIPS_AUTH_HMAC_SHA1 -+ bool 'HMAC-SHA1 authentication algorithm' -+ default y -+ help -+ The HMAC-SHA1 algorithm is used by ESP (and AH) to guarantee packet -+ integrity. SHA1 is a little slower than MD5, but is said to be -+ a bit more secure. There is little reason not to include it. -+ -+config KLIPS_ALG -+ bool 'KLIPS_ALG software encryption' -+ default y -+ help -+ This option provides support for loading new algorithms into the -+ kernel for crypto use. You may disable this if using the -+ CONFIG_KLIPS_OCF option for hardware offload. -+ -+config KLIPS_ENC_CRYPTOAPI -+ bool 'CryptoAPI algorithm interface' -+ default n -+ depends on KLIPS_ALG -+ help -+ Enable the algorithm interface to make all CryptoAPI 1.0 algorithms -+ available to KLIPS. 
-+ -+config KLIPS_ENC_1DES -+ bool 'Include 1DES with CryptoAPI' -+ default n -+ depends on KLIPS_ENC_CRYPTOAPI -+ help -+ The CryptoAPI interface does not include support for every algorithm -+ yet, and one that it doesn't support by default is the VERY WEAK -+ 1DES. Select this if you are terminally stupid. -+ -+config KLIPS_ENC_3DES -+ bool '3DES encryption algorithm' -+ default y -+ help -+ The 3DES algorithm is used by ESP to provide for packet privacy. -+ 3DES is 3-repeats of the DES algorithm. 3DES is widely supported, -+ and analyzed and is considered very secure. 1DES is not supported. -+ -+config KLIPS_ENC_AES -+ bool 'AES encryption algorithm' -+ default y -+ depends on KLIPS_ALG -+ help -+ The AES algorithm is used by ESP to provide for packet privacy. -+ AES the NIST replacement for DES. AES is being widely analyzed, -+ and is very fast. -+ -+config KLIPS_IPCOMP -+ bool 'IP compression' -+ default y -+ help -+ The IPcomp protocol is used prior to ESP to make the packet -+ smaller. Once encrypted, compression will fail, so any link -+ layer efforts (e.g. PPP) will not work. -+ -+config KLIPS_OCF -+ bool 'IPsec OCF Acceleration Support' -+ default n -+ help -+ OCF provides Asynchronous crypto acceleration for kernel and -+ user applications. It supports various HW accelerators. -+ If you have OCF support enabled and wish IPsec to utilise -+ the hardware managed by OCF, then enable this option. -+ OCF is a kernel patch, see http://ocf-linux.sourceforge.net/ -+ -+config KLIPS_DEBUG -+ bool 'IPsec debugging' -+ default y -+ help -+ KLIPS includes a lot of debugging code. Unless there is a real -+ tangible benefit to removing this code, it should be left in place. -+ Debugging connections without access to kernel level debugging is -+ essentially impossible. Leave this on. -+ -+config KLIPS_IF_MAX -+ int 'Maximum number of virtual interfaces' -+ default 64 -+ range 4 256 -+ help -+ KLIPS creates virtual interfaces for tunnel purposes. 
At present -+ it keeps track of certain items in an array (FIX ME), and needs -+ to preallocate this array. Only a pointer is used per item. -+ -+endmenu -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/Makefile Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,192 @@ -+# Makefile for KLIPS kernel code as a module for 2.6 kernels -+# -+# Makefile for KLIPS kernel code as a module -+# Copyright (C) 1998, 1999, 2000,2001 Richard Guy Briggs. -+# Copyright (C) 2002-2004 Michael Richardson -+# -+# This program is free software; you can redistribute it and/or modify it -+# under the terms of the GNU General Public License as published by the -+# Free Software Foundation; either version 2 of the License, or (at your -+# option) any later version. See . -+# -+# This program is distributed in the hope that it will be useful, but -+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+# for more details. -+# -+# RCSID $Id: Makefile.fs2_6,v 1.8.2.1 2006/04/20 16:33:06 mcr Exp $ -+# -+# Note! Dependencies are done automagically by 'make dep', which also -+# removes any old dependencies. DON'T put your own dependencies here -+# unless it's something special (ie not a .c file). -+# -+ -+OPENSWANSRCDIR?=. -+KLIPS_TOP?=. 
-+ -+-include ${OPENSWANSRCDIR}/Makefile.ver -+ -+base-klips-objs := -+ -+base-klips-objs+= ipsec_init.o ipsec_sa.o ipsec_radij.o radij.o -+base-klips-objs+= ipsec_life.o ipsec_proc.o -+base-klips-objs+= ipsec_tunnel.o ipsec_xmit.o ipsec_rcv.o ipsec_ipip.o -+base-klips-objs+= ipsec_snprintf.o -+base-klips-objs+= ipsec_mast.o -+base-klips-objs+= sysctl_net_ipsec.o -+base-klips-objs+= pfkey_v2.o pfkey_v2_parser.o pfkey_v2_ext_process.o -+base-klips-objs+= version.o -+ -+base-klips-objs+= satot.o -+base-klips-objs+= addrtot.o -+base-klips-objs+= ultot.o -+base-klips-objs+= addrtypeof.o -+base-klips-objs+= anyaddr.o -+base-klips-objs+= initaddr.o -+base-klips-objs+= ultoa.o -+base-klips-objs+= addrtoa.o -+base-klips-objs+= subnettoa.o -+base-klips-objs+= subnetof.o -+base-klips-objs+= goodmask.o -+base-klips-objs+= datatot.o -+base-klips-objs+= rangetoa.o -+base-klips-objs+= prng.o -+base-klips-objs+= pfkey_v2_parse.o -+base-klips-objs+= pfkey_v2_build.o -+base-klips-objs+= pfkey_v2_debug.o -+base-klips-objs+= pfkey_v2_ext_bits.o -+base-klips-objs+= version.o -+ -+obj-${CONFIG_KLIPS} += ipsec.o -+ -+ipsec-objs += ${base-klips-objs} -+ -+ipsec-$(CONFIG_KLIPS_ESP) += ipsec_esp.o -+ipsec-$(CONFIG_KLIPS_OCF) += ipsec_ocf.o -+ipsec-$(CONFIG_KLIPS_IPCOMP) += ipsec_ipcomp.o -+ipsec-$(CONFIG_KLIPS_AUTH_HMAC_MD5) += ipsec_md5c.o -+ipsec-$(CONFIG_KLIPS_AUTH_HMAC_SHA1) += ipsec_sha1.o -+ -+# AH, if you really think you need it. -+ipsec-$(CONFIG_KLIPS_AH) += ipsec_ah.o -+ -+ipsec-$(CONFIG_KLIPS_ALG) += ipsec_alg.o -+ -+# include code from DES subdir -+crypto-$(CONFIG_KLIPS_ENC_3DES) += des/ipsec_alg_3des.o -+crypto-$(CONFIG_KLIPS_ENC_3DES) += des/cbc_enc.o -+crypto-$(CONFIG_KLIPS_ENC_3DES) += des/ecb_enc.o -+crypto-$(CONFIG_KLIPS_ENC_3DES) += des/set_key.o -+ -+ifeq ($(strip ${SUBARCH}),) -+SUBARCH:=${ARCH} -+endif -+ -+# the assembly version expects frame pointers, which are -+# optional in many kernel builds. If you want speed, you should -+# probably use cryptoapi code instead. 
-+USEASSEMBLY=${SUBARCH}${CONFIG_FRAME_POINTER} -+ifeq (${USEASSEMBLY},i386y) -+crypto-$(CONFIG_KLIPS_ENC_3DES) += des/dx86unix.o -+else -+crypto-$(CONFIG_KLIPS_ENC_3DES) += des/des_enc.o -+endif -+ -+# include code from AES subdir -+crypto-$(CONFIG_KLIPS_ENC_AES) += aes/ipsec_alg_aes.o -+crypto-$(CONFIG_KLIPS_ENC_AES) += aes/aes_xcbc_mac.o -+crypto-$(CONFIG_KLIPS_ENC_AES) += aes/aes_cbc.o -+ -+ifeq ($(strip ${SUBARCH}),) -+SUBARCH:=${ARCH} -+endif -+ -+USEASSEMBLY=${SUBARCH}${CONFIG_FRAME_POINTER} -+ifeq (${USEASSEMBLY},i386y) -+crypto-$(CONFIG_KLIPS_ENC_AES) += aes/aes-i586.o -+else -+crypto-$(CONFIG_KLIPS_ENC_AES) += aes/aes.o -+endif -+ -+ipsec-y += ${crypto-y} -+ -+ipsec-$(CONFIG_KLIPS_ENC_CRYPTOAPI) += ipsec_alg_cryptoapi.o -+ -+# IPcomp stuff -+base-ipcomp-objs := ipcomp.o -+base-ipcomp-objs += adler32.o -+base-ipcomp-objs += deflate.o -+base-ipcomp-objs += infblock.o -+base-ipcomp-objs += infcodes.o -+base-ipcomp-objs += inffast.o -+base-ipcomp-objs += inflate.o -+base-ipcomp-objs += inftrees.o -+base-ipcomp-objs += infutil.o -+base-ipcomp-objs += trees.o -+base-ipcomp-objs += zutil.o -+asm-ipcomp-obj-$(CONFIG_M586) += match586.o -+asm-ipcomp-obj-$(CONFIG_M586TSC) += match586.o -+asm-ipcomp-obj-$(CONFIG_M586MMX) += match586.o -+asm-ipcomp-obj-$(CONFIG_M686) += match686.o -+asm-ipcomp-obj-$(CONFIG_MPENTIUMIII) += match686.o -+asm-ipcomp-obj-$(CONFIG_MPENTIUM4) += match686.o -+asm-ipcomp-obj-$(CONFIG_MK6) += match586.o -+asm-ipcomp-obj-$(CONFIG_MK7) += match686.o -+asm-ipcomp-obj-$(CONFIG_MCRUSOE) += match586.o -+asm-ipcomp-obj-$(CONFIG_MWINCHIPC6) += match586.o -+asm-ipcomp-obj-$(CONFIG_MWINCHIP2) += match686.o -+asm-ipcomp-obj-$(CONFIG_MWINCHIP3D) += match686.o -+base-ipcomp-objs += ${asm-ipcomp-obj-y} -+ -+ipsec-$(CONFIG_KLIPS_IPCOMP) += ${base-ipcomp-objs} -+ -+EXTRA_CFLAGS += -DIPCOMP_PREFIX -DKLIPS -+EXTRA_CFLAGS += -Icrypto/ocf -+ -+# -+# $Log: Makefile.fs2_6,v $ -+# Revision 1.8.2.1 2006/04/20 16:33:06 mcr -+# remove all of CONFIG_KLIPS_ALG --- one 
can no longer build without it. -+# Fix in-kernel module compilation. Sub-makefiles do not work. -+# -+# Revision 1.8 2005/05/11 03:15:42 mcr -+# adjusted makefiles to sanely build modules properly. -+# -+# Revision 1.7 2005/04/13 22:52:12 mcr -+# moved KLIPS specific snprintf() wrapper to seperate file. -+# -+# Revision 1.6 2004/08/22 05:02:03 mcr -+# organized symbols such that it is easier to build modules. -+# -+# Revision 1.5 2004/08/18 01:43:56 mcr -+# adjusted makefile enumation so that it can be used by module -+# wrapper. -+# -+# Revision 1.4 2004/08/17 03:27:23 mcr -+# klips 2.6 edits. -+# -+# Revision 1.3 2004/08/04 16:50:13 mcr -+# removed duplicate definition of dx86unix.o -+# -+# Revision 1.2 2004/08/03 18:21:09 mcr -+# only set KLIPS_TOP and OPENSWANSRCDIR if not already set. -+# -+# Revision 1.1 2004/07/26 15:02:22 mcr -+# makefile for KLIPS module for 2.6. -+# -+# Revision 1.3 2004/02/24 17:17:04 mcr -+# s/CONFIG_IPSEC/CONFIG_KLIPS/ as 26sec uses "CONFIG_IPSEC" to -+# turn it on/off as well. -+# -+# Revision 1.2 2004/02/22 06:50:42 mcr -+# kernel 2.6 port - merged with 2.4 code. -+# -+# Revision 1.1.2.1 2004/02/20 02:07:53 mcr -+# module configuration for KLIPS 2.6 -+# -+# -+# Local Variables: -+# compile-command: "(cd ../../.. && source umlsetup.sh && make -C ${POOLSPACE} module/ipsec.o)" -+# End Variables: -+# -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/README-zlib Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,147 @@ -+zlib 1.1.4 is a general purpose data compression library. All the code -+is thread safe. The data format used by the zlib library -+is described by RFCs (Request for Comments) 1950 to 1952 in the files -+http://www.ietf.org/rfc/rfc1950.txt (zlib format), rfc1951.txt (deflate -+format) and rfc1952.txt (gzip format). 
These documents are also available in -+other formats from ftp://ftp.uu.net/graphics/png/documents/zlib/zdoc-index.html -+ -+All functions of the compression library are documented in the file zlib.h -+(volunteer to write man pages welcome, contact jloup@gzip.org). A usage -+example of the library is given in the file example.c which also tests that -+the library is working correctly. Another example is given in the file -+minigzip.c. The compression library itself is composed of all source files -+except example.c and minigzip.c. -+ -+To compile all files and run the test program, follow the instructions -+given at the top of Makefile. In short "make test; make install" -+should work for most machines. For Unix: "./configure; make test; make install" -+For MSDOS, use one of the special makefiles such as Makefile.msc. -+For VMS, use Make_vms.com or descrip.mms. -+ -+Questions about zlib should be sent to , or to -+Gilles Vollant for the Windows DLL version. -+The zlib home page is http://www.zlib.org or http://www.gzip.org/zlib/ -+Before reporting a problem, please check this site to verify that -+you have the latest version of zlib; otherwise get the latest version and -+check whether the problem still exists or not. -+ -+PLEASE read the zlib FAQ http://www.gzip.org/zlib/zlib_faq.html -+before asking for help. -+ -+Mark Nelson wrote an article about zlib for the Jan. 1997 -+issue of Dr. Dobb's Journal; a copy of the article is available in -+http://dogma.net/markn/articles/zlibtool/zlibtool.htm -+ -+The changes made in version 1.1.4 are documented in the file ChangeLog. -+The only changes made since 1.1.3 are bug corrections: -+ -+- ZFREE was repeated on same allocation on some error conditions. -+ This creates a security problem described in -+ http://www.zlib.org/advisory-2002-03-11.txt -+- Returned incorrect error (Z_MEM_ERROR) on some invalid data -+- Avoid accesses before window for invalid distances with inflate window -+ less than 32K. 
-+- force windowBits > 8 to avoid a bug in the encoder for a window size -+ of 256 bytes. (A complete fix will be available in 1.1.5). -+ -+The beta version 1.1.5beta includes many more changes. A new official -+version 1.1.5 will be released as soon as extensive testing has been -+completed on it. -+ -+ -+Unsupported third party contributions are provided in directory "contrib". -+ -+A Java implementation of zlib is available in the Java Development Kit -+http://www.javasoft.com/products/JDK/1.1/docs/api/Package-java.util.zip.html -+See the zlib home page http://www.zlib.org for details. -+ -+A Perl interface to zlib written by Paul Marquess -+is in the CPAN (Comprehensive Perl Archive Network) sites -+http://www.cpan.org/modules/by-module/Compress/ -+ -+A Python interface to zlib written by A.M. Kuchling -+is available in Python 1.5 and later versions, see -+http://www.python.org/doc/lib/module-zlib.html -+ -+A zlib binding for TCL written by Andreas Kupries -+is available at http://www.westend.com/~kupries/doc/trf/man/man.html -+ -+An experimental package to read and write files in .zip format, -+written on top of zlib by Gilles Vollant , is -+available at http://www.winimage.com/zLibDll/unzip.html -+and also in the contrib/minizip directory of zlib. -+ -+ -+Notes for some targets: -+ -+- To build a Windows DLL version, include in a DLL project zlib.def, zlib.rc -+ and all .c files except example.c and minigzip.c; compile with -DZLIB_DLL -+ The zlib DLL support was initially done by Alessandro Iacopetti and is -+ now maintained by Gilles Vollant . Check the zlib DLL -+ home page at http://www.winimage.com/zLibDll -+ -+ From Visual Basic, you can call the DLL functions which do not take -+ a structure as argument: compress, uncompress and all gz* functions. -+ See contrib/visual-basic.txt for more information, or get -+ http://www.tcfb.com/dowseware/cmp-z-it.zip -+ -+- For 64-bit Irix, deflate.c must be compiled without any optimization.
-+ With -O, one libpng test fails. The test works in 32 bit mode (with -+ the -n32 compiler flag). The compiler bug has been reported to SGI. -+ -+- zlib doesn't work with gcc 2.6.3 on a DEC 3000/300LX under OSF/1 2.1 -+ it works when compiled with cc. -+ -+- on Digital Unix 4.0D (formerly OSF/1) on AlphaServer, the cc option -std1 -+ is necessary to get gzprintf working correctly. This is done by configure. -+ -+- zlib doesn't work on HP-UX 9.05 with some versions of /bin/cc. It works -+ with other compilers. Use "make test" to check your compiler. -+ -+- gzdopen is not supported on RISCOS, BEOS and by some Mac compilers. -+ -+- For Turbo C the small model is supported only with reduced performance to -+ avoid any far allocation; it was tested with -DMAX_WBITS=11 -DMAX_MEM_LEVEL=3 -+ -+- For PalmOs, see http://www.cs.uit.no/~perm/PASTA/pilot/software.html -+ Per Harald Myrvang -+ -+ -+Acknowledgments: -+ -+ The deflate format used by zlib was defined by Phil Katz. The deflate -+ and zlib specifications were written by L. Peter Deutsch. Thanks to all the -+ people who reported problems and suggested various improvements in zlib; -+ they are too numerous to cite here. -+ -+Copyright notice: -+ -+ (C) 1995-2002 Jean-loup Gailly and Mark Adler -+ -+ This software is provided 'as-is', without any express or implied -+ warranty. In no event will the authors be held liable for any damages -+ arising from the use of this software. -+ -+ Permission is granted to anyone to use this software for any purpose, -+ including commercial applications, and to alter it and redistribute it -+ freely, subject to the following restrictions: -+ -+ 1. The origin of this software must not be misrepresented; you must not -+ claim that you wrote the original software. If you use this software -+ in a product, an acknowledgment in the product documentation would be -+ appreciated but is not required. -+ 2.
Altered source versions must be plainly marked as such, and must not be -+ misrepresented as being the original software. -+ 3. This notice may not be removed or altered from any source distribution. -+ -+ Jean-loup Gailly Mark Adler -+ jloup@gzip.org madler@alumni.caltech.edu -+ -+If you use the zlib library in a product, we would appreciate *not* -+receiving lengthy legal documents to sign. The sources are provided -+for free but without warranty of any kind. The library has been -+entirely written by Jean-loup Gailly and Mark Adler; it does not -+include third-party code. -+ -+If you redistribute modified sources, we would appreciate that you include -+in the file ChangeLog history information documenting your changes. ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/README-zlib.freeswan Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,13 @@ -+The only changes made to these files for use in FreeS/WAN are: -+ -+ - In zconf.h, macros are defined to prefix global symbols with "ipcomp_" -+ (or "_ipcomp"), when compiled with -DIPCOMP_PREFIX. -+ - The copyright strings are defined local (static) -+ -+ The above changes are made to avoid name collisions with ppp_deflate -+ and ext2compr. -+ -+ - Files not needed for FreeS/WAN have been removed -+ -+ See the "README" file for information about where to obtain the complete -+ zlib package. ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/addrtoa.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,67 @@ -+/* -+ * addresses to ASCII -+ * Copyright (C) 1998, 1999 Henry Spencer. -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . 
-+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: addrtoa.c,v 1.10 2004/07/10 07:43:47 mcr Exp $ -+ */ -+#include "openswan.h" -+ -+#define NBYTES 4 /* bytes in an address */ -+#define PERBYTE 4 /* three digits plus a dot or NUL */ -+#define BUFLEN (NBYTES*PERBYTE) -+ -+#if BUFLEN != ADDRTOA_BUF -+#error "ADDRTOA_BUF in openswan.h inconsistent with addrtoa() code" -+#endif -+ -+/* -+ - addrtoa - convert binary address to ASCII dotted decimal -+ */ -+size_t /* space needed for full conversion */ -+addrtoa(addr, format, dst, dstlen) -+struct in_addr addr; -+int format; /* character */ -+char *dst; /* need not be valid if dstlen is 0 */ -+size_t dstlen; -+{ -+ unsigned long a = ntohl(addr.s_addr); -+ int i; -+ size_t n; -+ unsigned long byte; -+ char buf[BUFLEN]; -+ char *p; -+ -+ switch (format) { -+ case 0: -+ break; -+ default: -+ return 0; -+ break; -+ } -+ -+ p = buf; -+ for (i = NBYTES-1; i >= 0; i--) { -+ byte = (a >> (i*8)) & 0xff; -+ p += ultoa(byte, 10, p, PERBYTE); -+ if (i != 0) -+ *(p-1) = '.'; -+ } -+ n = p - buf; -+ -+ if (dstlen > 0) { -+ if (n > dstlen) -+ buf[dstlen - 1] = '\0'; -+ strcpy(dst, buf); -+ } -+ return n; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/addrtot.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,344 @@ -+/* -+ * addresses to text -+ * Copyright (C) 2000 Henry Spencer. -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . 
-+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ */ -+ -+#if defined(__KERNEL__) && defined(__HAVE_ARCH_STRSTR) -+#include -+#endif -+ -+#include "openswan.h" -+ -+#define IP4BYTES 4 /* bytes in an IPv4 address */ -+#define PERBYTE 4 /* three digits plus a dot or NUL */ -+#define IP6BYTES 16 /* bytes in an IPv6 address */ -+ -+/* forwards */ -+static size_t normal4(const unsigned char *s, size_t len, char *b, char **dp); -+static size_t normal6(const unsigned char *s, size_t len, char *b, char **dp, int squish); -+static size_t reverse4(const unsigned char *s, size_t len, char *b, char **dp); -+static size_t reverse6(const unsigned char *s, size_t len, char *b, char **dp); -+ -+#if defined(__KERNEL__) && !defined(__HAVE_ARCH_STRSTR) -+#define strstr ipsec_strstr -+/* -+ * Find the first occurrence of find in s. 
-+ * (from NetBSD 1.6's /src/lib/libc/string/strstr.c) -+ */ -+ -+static char * -+ipsec_strstr(s, find) -+ const char *s, *find; -+{ -+ char c, sc; -+ size_t len; -+ -+ if ((c = *find++) != 0) { -+ len = strlen(find); -+ do { -+ do { -+ if ((sc = *s++) == 0) -+ return (NULL); -+ } while (sc != c); -+ } while (strncmp(s, find, len) != 0); -+ s--; -+ } -+ /* LINTED interface specification */ -+ return ((char *)s); -+} -+#endif -+ -+/* -+ - addrtot - convert binary address to text (dotted decimal or IPv6 string) -+ */ -+size_t /* space needed for full conversion */ -+addrtot(src, format, dst, dstlen) -+const ip_address *src; -+int format; /* character */ -+char *dst; /* need not be valid if dstlen is 0 */ -+size_t dstlen; -+{ -+ const unsigned char *b; -+ size_t n; -+ char buf[1+ADDRTOT_BUF+1]; /* :address: */ -+ char *p; -+ int t = addrtypeof(src); -+# define TF(t, f) (((t)<<8) | (f)) -+ -+ n = addrbytesptr(src, &b); -+ if (n == 0) { -+ bad: -+ dst[0]='\0'; -+ strncat(dst, "", dstlen); -+ return sizeof(""); -+ } -+ -+ switch (TF(t, format)) { -+ case TF(AF_INET, 0): -+ n = normal4(b, n, buf, &p); -+ break; -+ case TF(AF_INET6, 0): -+ n = normal6(b, n, buf, &p, 1); -+ break; -+ case TF(AF_INET, 'Q'): -+ n = normal4(b, n, buf, &p); -+ break; -+ case TF(AF_INET6, 'Q'): -+ n = normal6(b, n, buf, &p, 0); -+ break; -+ case TF(AF_INET, 'r'): -+ n = reverse4(b, n, buf, &p); -+ break; -+ case TF(AF_INET6, 'r'): -+ n = reverse6(b, n, buf, &p); -+ break; -+ default: /* including (AF_INET, 'R') */ -+ goto bad; -+ break; -+ } -+ -+ if (dstlen > 0) { -+ if (dstlen < n) -+ p[dstlen - 1] = '\0'; -+ strcpy(dst, p); -+ } -+ return n; -+} -+ -+/* -+ - normal4 - normal IPv4 address-text conversion -+ */ -+static size_t /* size of text, including NUL */ -+normal4(srcp, srclen, buf, dstp) -+const unsigned char *srcp; -+size_t srclen; -+char *buf; /* guaranteed large enough */ -+char **dstp; /* where to put result pointer */ -+{ -+ int i; -+ char *p; -+ -+ if (srclen != IP4BYTES) /* "can't 
happen" */ -+ return 0; -+ p = buf; -+ for (i = 0; i < IP4BYTES; i++) { -+ p += ultot(srcp[i], 10, p, PERBYTE); -+ if (i != IP4BYTES - 1) -+ *(p-1) = '.'; /* overwrites the NUL */ -+ } -+ *dstp = buf; -+ return p - buf; -+} -+ -+/* -+ - normal6 - normal IPv6 address-text conversion -+ */ -+static size_t /* size of text, including NUL */ -+normal6(srcp, srclen, buf, dstp, squish) -+const unsigned char *srcp; -+size_t srclen; -+char *buf; /* guaranteed large enough, plus 2 */ -+char **dstp; /* where to put result pointer */ -+int squish; /* whether to squish out 0:0 */ -+{ -+ int i; -+ unsigned long piece; -+ char *p; -+ char *q; -+ -+ if (srclen != IP6BYTES) /* "can't happen" */ -+ return 0; -+ p = buf; -+ *p++ = ':'; -+ for (i = 0; i < IP6BYTES/2; i++) { -+ piece = (srcp[2*i] << 8) + srcp[2*i + 1]; -+ p += ultot(piece, 16, p, 5); /* 5 = abcd + NUL */ -+ *(p-1) = ':'; /* overwrites the NUL */ -+ } -+ *p = '\0'; -+ q = strstr(buf, ":0:0:"); -+ if (squish && q != NULL) { /* zero squishing is possible */ -+ p = q + 1; -+ while (*p == '0' && *(p+1) == ':') -+ p += 2; -+ q++; -+ *q++ = ':'; /* overwrite first 0 */ -+ while (*p != '\0') -+ *q++ = *p++; -+ *q = '\0'; -+ if (!(*(q-1) == ':' && *(q-2) == ':')) -+ *--q = '\0'; /* strip final : unless :: */ -+ p = buf; -+ if (!(*p == ':' && *(p+1) == ':')) -+ p++; /* skip initial : unless :: */ -+ } else { -+ q = p; -+ *--q = '\0'; /* strip final : */ -+ p = buf + 1; /* skip initial : */ -+ } -+ *dstp = p; -+ return q - p + 1; -+} -+ -+/* -+ - reverse4 - IPv4 reverse-lookup conversion -+ */ -+static size_t /* size of text, including NUL */ -+reverse4(srcp, srclen, buf, dstp) -+const unsigned char *srcp; -+size_t srclen; -+char *buf; /* guaranteed large enough */ -+char **dstp; /* where to put result pointer */ -+{ -+ int i; -+ char *p; -+ -+ if (srclen != IP4BYTES) /* "can't happen" */ -+ return 0; -+ p = buf; -+ for (i = IP4BYTES-1; i >= 0; i--) { -+ p += ultot(srcp[i], 10, p, PERBYTE); -+ *(p-1) = '.'; /* overwrites the NUL 
*/ -+ } -+ strcpy(p, "IN-ADDR.ARPA."); -+ *dstp = buf; -+ return strlen(buf) + 1; -+} -+ -+/* -+ - reverse6 - IPv6 reverse-lookup conversion (RFC 1886) -+ * A trifle inefficient, really shouldn't use ultot... -+ */ -+static size_t /* size of text, including NUL */ -+reverse6(srcp, srclen, buf, dstp) -+const unsigned char *srcp; -+size_t srclen; -+char *buf; /* guaranteed large enough */ -+char **dstp; /* where to put result pointer */ -+{ -+ int i; -+ unsigned long piece; -+ char *p; -+ -+ if (srclen != IP6BYTES) /* "can't happen" */ -+ return 0; -+ p = buf; -+ for (i = IP6BYTES-1; i >= 0; i--) { -+ piece = srcp[i]; -+ p += ultot(piece&0xf, 16, p, 2); -+ *(p-1) = '.'; -+ p += ultot(piece>>4, 16, p, 2); -+ *(p-1) = '.'; -+ } -+ strcpy(p, "IP6.ARPA."); -+ *dstp = buf; -+ return strlen(buf) + 1; -+} -+ -+/* -+ - reverse6 - modern IPv6 reverse-lookup conversion (RFC 2874) -+ * this version removed as it was obsoleted in the end. -+ */ -+ -+#ifdef ADDRTOT_MAIN -+ -+#include -+#include -+#include -+#include -+ -+void regress(void); -+ -+int -+main(int argc, char *argv[]) -+{ -+ if (argc < 2) { -+ fprintf(stderr, "Usage: %s {addr|net/mask|begin...end|-r}\n", -+ argv[0]); -+ exit(2); -+ } -+ -+ if (strcmp(argv[1], "-r") == 0) { -+ regress(); -+ fprintf(stderr, "regress() returned?!?\n"); -+ exit(1); -+ } -+ exit(0); -+} -+ -+struct rtab { -+ char *input; -+ char format; -+ char *output; /* NULL means error expected */ -+} rtab[] = { -+ {"1.2.3.0", 0, "1.2.3.0"}, -+ {"1:2::3:4", 0, "1:2::3:4"}, -+ {"1:2::3:4", 'Q', "1:2:0:0:0:0:3:4"}, -+ {"1:2:0:0:3:4:0:0", 0, "1:2::3:4:0:0"}, -+ {"1.2.3.4", 'r' , "4.3.2.1.IN-ADDR.ARPA."}, -+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -+ {"1:2::3:4", 'r', "4.0.0.0.3.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.2.0.0.0.1.0.0.0.IP6.ARPA."}, -+ {NULL, 0, NULL} -+}; -+ -+void -+regress() -+{ -+ struct rtab *r; -+ int status = 0; -+ ip_address a; -+ char in[100]; -+ char buf[100]; -+ const char *oops; -+ size_t n; -+ -+ for 
(r = rtab; r->input != NULL; r++) { -+ strcpy(in, r->input); -+ -+ /* convert it *to* internal format */ -+ oops = ttoaddr(in, strlen(in), 0, &a); -+ -+ /* now convert it back */ -+ -+ n = addrtot(&a, r->format, buf, sizeof(buf)); -+ -+ if (n == 0 && r->output == NULL) -+ {} /* okay, error expected */ -+ -+ else if (n == 0) { -+ printf("`%s' atoasr failed\n", r->input); -+ status = 1; -+ -+ } else if (r->output == NULL) { -+ printf("`%s' atoasr succeeded unexpectedly '%c'\n", -+ r->input, r->format); -+ status = 1; -+ } else { -+ if (strcasecmp(r->output, buf) != 0) { -+ printf("`%s' '%c' gave `%s', expected `%s'\n", -+ r->input, r->format, buf, r->output); -+ status = 1; -+ } -+ } -+ } -+ exit(status); -+} -+ -+#endif /* ADDRTOT_MAIN */ -+ -+/* -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/addrtypeof.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,123 @@ -+/* -+ * extract parts of an ip_address -+ * Copyright (C) 2000 Henry Spencer. -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. 
-+ * -+ * RCSID $Id: addrtypeof.c,v 1.10 2004/07/10 07:43:47 mcr Exp $ -+ */ -+#include "openswan.h" -+ -+/* -+ - addrtypeof - get the type of an ip_address -+ */ -+int -+addrtypeof(src) -+const ip_address *src; -+{ -+ return src->u.v4.sin_family; -+} -+ -+/* -+ - addrbytesptr - get pointer to the address bytes of an ip_address -+ */ -+size_t /* 0 for error */ -+addrbytesptr(src, dstp) -+const ip_address *src; -+const unsigned char **dstp; /* NULL means just a size query */ -+{ -+ const unsigned char *p; -+ size_t n; -+ -+ switch (src->u.v4.sin_family) { -+ case AF_INET: -+ p = (const unsigned char *)&src->u.v4.sin_addr.s_addr; -+ n = 4; -+ break; -+ case AF_INET6: -+ p = (const unsigned char *)&src->u.v6.sin6_addr; -+ n = 16; -+ break; -+ default: -+ return 0; -+ break; -+ } -+ -+ if (dstp != NULL) -+ *dstp = p; -+ return n; -+ -+} -+/* -+ - addrbytesptr_write - get writable pointer to the address bytes of an ip_address -+ */ -+size_t /* 0 for error */ -+addrbytesptr_write(src, dstp) -+ip_address *src; -+unsigned char **dstp; /* NULL means just a size query */ -+{ -+ unsigned char *p; -+ size_t n; -+ -+ switch (src->u.v4.sin_family) { -+ case AF_INET: -+ p = (unsigned char *)&src->u.v4.sin_addr.s_addr; -+ n = 4; -+ break; -+ case AF_INET6: -+ p = (unsigned char *)&src->u.v6.sin6_addr; -+ n = 16; -+ break; -+ default: -+ return 0; -+ break; -+ } -+ -+ if (dstp != NULL) -+ *dstp = p; -+ return n; -+} -+ -+/* -+ - addrlenof - get length of the address bytes of an ip_address -+ */ -+size_t /* 0 for error */ -+addrlenof(src) -+const ip_address *src; -+{ -+ return addrbytesptr(src, NULL); -+} -+ -+/* -+ - addrbytesof - get the address bytes of an ip_address -+ */ -+size_t /* 0 for error */ -+addrbytesof(src, dst, dstlen) -+const ip_address *src; -+unsigned char *dst; -+size_t dstlen; -+{ -+ const unsigned char *p; -+ size_t n; -+ size_t ncopy; -+ -+ n = addrbytesptr(src, &p); -+ if (n == 0) -+ return 0; -+ -+ if (dstlen > 0) { -+ ncopy = n; -+ if (ncopy > dstlen) -+ ncopy = dstlen; -+
memcpy(dst, p, ncopy); -+ } -+ return n; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/adler32.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,49 @@ -+/* adler32.c -- compute the Adler-32 checksum of a data stream -+ * Copyright (C) 1995-2002 Mark Adler -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* @(#) $Id: adler32.c,v 1.6 2004/07/10 19:11:18 mcr Exp $ */ -+ -+#include -+#include -+ -+#define BASE 65521L /* largest prime smaller than 65536 */ -+#define NMAX 5552 -+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ -+ -+#define DO1(buf,i) {s1 += buf[i]; s2 += s1;} -+#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1); -+#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2); -+#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); -+#define DO16(buf) DO8(buf,0); DO8(buf,8); -+ -+/* ========================================================================= */ -+uLong ZEXPORT adler32(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ unsigned long s1 = adler & 0xffff; -+ unsigned long s2 = (adler >> 16) & 0xffff; -+ int k; -+ -+ if (buf == Z_NULL) return 1L; -+ -+ while (len > 0) { -+ k = len < NMAX ? len : NMAX; -+ len -= k; -+ while (k >= 16) { -+ DO16(buf); -+ buf += 16; -+ k -= 16; -+ } -+ if (k != 0) do { -+ s1 += *buf++; -+ s2 += s1; -+ } while (--k); -+ s1 %= BASE; -+ s2 %= BASE; -+ } -+ return (s2 << 16) | s1; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/aes/Makefile Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,56 @@ -+# Makefile for KLIPS 3DES kernel code as a module for 2.6 kernels -+# -+# Makefile for KLIPS kernel code as a module -+# Copyright (C) 2002-2004 Michael Richardson -+# -+# This program is free software; you can redistribute it and/or modify it -+# under the terms of the GNU General Public License as published by the -+# Free Software Foundation; either version 2 of the License, or (at your -+# option) any later version. See . 
-+# -+# This program is distributed in the hope that it will be useful, but -+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+# for more details. -+# -+# RCSID $Id: Makefile.fs2_6,v 1.2 2005/08/12 14:13:58 mcr Exp $ -+# -+# Note! Dependencies are done automagically by 'make dep', which also -+# removes any old dependencies. DON'T put your own dependencies here -+# unless it's something special (ie not a .c file). -+# -+ -+obj-$(CONFIG_KLIPS_ENC_AES) += ipsec_alg_aes.o -+obj-$(CONFIG_KLIPS_ENC_AES) += aes_xcbc_mac.o -+obj-$(CONFIG_KLIPS_ENC_AES) += aes_cbc.o -+ -+ifeq ($(strip ${SUBARCH}),) -+SUBARCH:=${ARCH} -+endif -+ -+# the assembly version expects frame pointers, which are -+# optional in many kernel builds. If you want speed, you should -+# probably use cryptoapi code instead. -+USEASSEMBLY=${SUBARCH}${CONFIG_FRAME_POINTER} -+ifeq (${USEASSEMBLY},i386y) -+obj-$(CONFIG_KLIPS_ENC_AES) += aes-i586.o -+else -+obj-$(CONFIG_KLIPS_ENC_AES) += aes.o -+endif -+ -+ -+# -+# $Log: Makefile.fs2_6,v $ -+# Revision 1.2 2005/08/12 14:13:58 mcr -+# do not use assembly code with there are no frame pointers, -+# as it does not have the right linkages. -+# -+# Revision 1.1 2004/08/17 03:31:34 mcr -+# klips 2.6 edits. -+# -+# -+# Local Variables: -+# compile-command: "(cd ../../.. && source umlsetup.sh && make -C ${POOLSPACE} module/ipsec.o)" -+# End Variables: -+# -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/aes/aes-i586.S Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,892 @@ -+// -+// Copyright (c) 2001, Dr Brian Gladman , Worcester, UK. -+// All rights reserved. -+// -+// TERMS -+// -+// Redistribution and use in source and binary forms, with or without -+// modification, are permitted subject to the following conditions: -+// -+// 1. Redistributions of source code must retain the above copyright -+// notice, this list of conditions and the following disclaimer. 
-+// -+// 2. Redistributions in binary form must reproduce the above copyright -+// notice, this list of conditions and the following disclaimer in the -+// documentation and/or other materials provided with the distribution. -+// -+// 3. The copyright holder's name must not be used to endorse or promote -+// any products derived from this software without his specific prior -+// written permission. -+// -+// This software is provided 'as is' with no express or implied warranties -+// of correctness or fitness for purpose. -+ -+// Modified by Jari Ruusu, December 24 2001 -+// - Converted syntax to GNU CPP/assembler syntax -+// - C programming interface converted back to "old" API -+// - Minor portability cleanups and speed optimizations -+ -+// An AES (Rijndael) implementation for the Pentium. This version only -+// implements the standard AES block length (128 bits, 16 bytes). This code -+// does not preserve the eax, ecx or edx registers or the arithmetic status -+// flags. However, the ebx, esi, edi, and ebp registers are preserved across -+// calls.
-+ -+// void aes_set_key(aes_context *cx, const unsigned char key[], const int key_len, const int f) -+// void aes_encrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[]) -+// void aes_decrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[]) -+ -+#if defined(USE_UNDERLINE) -+# define aes_set_key _aes_set_key -+# define aes_encrypt _aes_encrypt -+# define aes_decrypt _aes_decrypt -+#endif -+#if !defined(ALIGN32BYTES) -+# define ALIGN32BYTES 32 -+#endif -+ -+ .file "aes-i586.S" -+ .globl aes_set_key -+ .globl aes_encrypt -+ .globl aes_decrypt -+ -+#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words) -+ -+// offsets to parameters with one register pushed onto stack -+ -+#define ctx 8 // AES context structure -+#define in_blk 12 // input byte array address parameter -+#define out_blk 16 // output byte array address parameter -+ -+// offsets in context structure -+ -+#define nkey 0 // key length, size 4 -+#define nrnd 4 // number of rounds, size 4 -+#define ekey 8 // encryption key schedule base address, size 256 -+#define dkey 264 // decryption key schedule base address, size 256 -+ -+// This macro performs a forward encryption cycle. It is entered with -+// the first previous round column values in %eax, %ebx, %esi and %edi and -+// exits with the final values in the same registers. 
-+ -+#define fwd_rnd(p1,p2) \ -+ mov %ebx,(%esp) ;\ -+ movzbl %al,%edx ;\ -+ mov %eax,%ecx ;\ -+ mov p2(%ebp),%eax ;\ -+ mov %edi,4(%esp) ;\ -+ mov p2+12(%ebp),%edi ;\ -+ xor p1(,%edx,4),%eax ;\ -+ movzbl %ch,%edx ;\ -+ shr $16,%ecx ;\ -+ mov p2+4(%ebp),%ebx ;\ -+ xor p1+tlen(,%edx,4),%edi ;\ -+ movzbl %cl,%edx ;\ -+ movzbl %ch,%ecx ;\ -+ xor p1+3*tlen(,%ecx,4),%ebx ;\ -+ mov %esi,%ecx ;\ -+ mov p1+2*tlen(,%edx,4),%esi ;\ -+ movzbl %cl,%edx ;\ -+ xor p1(,%edx,4),%esi ;\ -+ movzbl %ch,%edx ;\ -+ shr $16,%ecx ;\ -+ xor p1+tlen(,%edx,4),%ebx ;\ -+ movzbl %cl,%edx ;\ -+ movzbl %ch,%ecx ;\ -+ xor p1+2*tlen(,%edx,4),%eax ;\ -+ mov (%esp),%edx ;\ -+ xor p1+3*tlen(,%ecx,4),%edi ;\ -+ movzbl %dl,%ecx ;\ -+ xor p2+8(%ebp),%esi ;\ -+ xor p1(,%ecx,4),%ebx ;\ -+ movzbl %dh,%ecx ;\ -+ shr $16,%edx ;\ -+ xor p1+tlen(,%ecx,4),%eax ;\ -+ movzbl %dl,%ecx ;\ -+ movzbl %dh,%edx ;\ -+ xor p1+2*tlen(,%ecx,4),%edi ;\ -+ mov 4(%esp),%ecx ;\ -+ xor p1+3*tlen(,%edx,4),%esi ;\ -+ movzbl %cl,%edx ;\ -+ xor p1(,%edx,4),%edi ;\ -+ movzbl %ch,%edx ;\ -+ shr $16,%ecx ;\ -+ xor p1+tlen(,%edx,4),%esi ;\ -+ movzbl %cl,%edx ;\ -+ movzbl %ch,%ecx ;\ -+ xor p1+2*tlen(,%edx,4),%ebx ;\ -+ xor p1+3*tlen(,%ecx,4),%eax -+ -+// This macro performs an inverse encryption cycle. It is entered with -+// the first previous round column values in %eax, %ebx, %esi and %edi and -+// exits with the final values in the same registers. 
-+ -+#define inv_rnd(p1,p2) \ -+ movzbl %al,%edx ;\ -+ mov %ebx,(%esp) ;\ -+ mov %eax,%ecx ;\ -+ mov p2(%ebp),%eax ;\ -+ mov %edi,4(%esp) ;\ -+ mov p2+4(%ebp),%ebx ;\ -+ xor p1(,%edx,4),%eax ;\ -+ movzbl %ch,%edx ;\ -+ shr $16,%ecx ;\ -+ mov p2+12(%ebp),%edi ;\ -+ xor p1+tlen(,%edx,4),%ebx ;\ -+ movzbl %cl,%edx ;\ -+ movzbl %ch,%ecx ;\ -+ xor p1+3*tlen(,%ecx,4),%edi ;\ -+ mov %esi,%ecx ;\ -+ mov p1+2*tlen(,%edx,4),%esi ;\ -+ movzbl %cl,%edx ;\ -+ xor p1(,%edx,4),%esi ;\ -+ movzbl %ch,%edx ;\ -+ shr $16,%ecx ;\ -+ xor p1+tlen(,%edx,4),%edi ;\ -+ movzbl %cl,%edx ;\ -+ movzbl %ch,%ecx ;\ -+ xor p1+2*tlen(,%edx,4),%eax ;\ -+ mov (%esp),%edx ;\ -+ xor p1+3*tlen(,%ecx,4),%ebx ;\ -+ movzbl %dl,%ecx ;\ -+ xor p2+8(%ebp),%esi ;\ -+ xor p1(,%ecx,4),%ebx ;\ -+ movzbl %dh,%ecx ;\ -+ shr $16,%edx ;\ -+ xor p1+tlen(,%ecx,4),%esi ;\ -+ movzbl %dl,%ecx ;\ -+ movzbl %dh,%edx ;\ -+ xor p1+2*tlen(,%ecx,4),%edi ;\ -+ mov 4(%esp),%ecx ;\ -+ xor p1+3*tlen(,%edx,4),%eax ;\ -+ movzbl %cl,%edx ;\ -+ xor p1(,%edx,4),%edi ;\ -+ movzbl %ch,%edx ;\ -+ shr $16,%ecx ;\ -+ xor p1+tlen(,%edx,4),%eax ;\ -+ movzbl %cl,%edx ;\ -+ movzbl %ch,%ecx ;\ -+ xor p1+2*tlen(,%edx,4),%ebx ;\ -+ xor p1+3*tlen(,%ecx,4),%esi -+ -+// AES (Rijndael) Encryption Subroutine -+ -+ .text -+ .align ALIGN32BYTES -+aes_encrypt: -+ push %ebp -+ mov ctx(%esp),%ebp // pointer to context -+ mov in_blk(%esp),%ecx -+ push %ebx -+ push %esi -+ push %edi -+ mov nrnd(%ebp),%edx // number of rounds -+ lea ekey+16(%ebp),%ebp // key pointer -+ -+// input four columns and xor in first round key -+ -+ mov (%ecx),%eax -+ mov 4(%ecx),%ebx -+ mov 8(%ecx),%esi -+ mov 12(%ecx),%edi -+ xor -16(%ebp),%eax -+ xor -12(%ebp),%ebx -+ xor -8(%ebp),%esi -+ xor -4(%ebp),%edi -+ -+ sub $8,%esp // space for register saves on stack -+ -+ sub $10,%edx -+ je aes_15 -+ add $32,%ebp -+ sub $2,%edx -+ je aes_13 -+ add $32,%ebp -+ -+ fwd_rnd(aes_ft_tab,-64) // 14 rounds for 256-bit key -+ fwd_rnd(aes_ft_tab,-48) -+aes_13: fwd_rnd(aes_ft_tab,-32) // 12 rounds 
for 192-bit key -+ fwd_rnd(aes_ft_tab,-16) -+aes_15: fwd_rnd(aes_ft_tab,0) // 10 rounds for 128-bit key -+ fwd_rnd(aes_ft_tab,16) -+ fwd_rnd(aes_ft_tab,32) -+ fwd_rnd(aes_ft_tab,48) -+ fwd_rnd(aes_ft_tab,64) -+ fwd_rnd(aes_ft_tab,80) -+ fwd_rnd(aes_ft_tab,96) -+ fwd_rnd(aes_ft_tab,112) -+ fwd_rnd(aes_ft_tab,128) -+ fwd_rnd(aes_fl_tab,144) // last round uses a different table -+ -+// move final values to the output array. -+ -+ mov out_blk+20(%esp),%ebp -+ add $8,%esp -+ mov %eax,(%ebp) -+ mov %ebx,4(%ebp) -+ mov %esi,8(%ebp) -+ mov %edi,12(%ebp) -+ pop %edi -+ pop %esi -+ pop %ebx -+ pop %ebp -+ ret -+ -+ -+// AES (Rijndael) Decryption Subroutine -+ -+ .align ALIGN32BYTES -+aes_decrypt: -+ push %ebp -+ mov ctx(%esp),%ebp // pointer to context -+ mov in_blk(%esp),%ecx -+ push %ebx -+ push %esi -+ push %edi -+ mov nrnd(%ebp),%edx // number of rounds -+ lea dkey+16(%ebp),%ebp // key pointer -+ -+// input four columns and xor in first round key -+ -+ mov (%ecx),%eax -+ mov 4(%ecx),%ebx -+ mov 8(%ecx),%esi -+ mov 12(%ecx),%edi -+ xor -16(%ebp),%eax -+ xor -12(%ebp),%ebx -+ xor -8(%ebp),%esi -+ xor -4(%ebp),%edi -+ -+ sub $8,%esp // space for register saves on stack -+ -+ sub $10,%edx -+ je aes_25 -+ add $32,%ebp -+ sub $2,%edx -+ je aes_23 -+ add $32,%ebp -+ -+ inv_rnd(aes_it_tab,-64) // 14 rounds for 256-bit key -+ inv_rnd(aes_it_tab,-48) -+aes_23: inv_rnd(aes_it_tab,-32) // 12 rounds for 192-bit key -+ inv_rnd(aes_it_tab,-16) -+aes_25: inv_rnd(aes_it_tab,0) // 10 rounds for 128-bit key -+ inv_rnd(aes_it_tab,16) -+ inv_rnd(aes_it_tab,32) -+ inv_rnd(aes_it_tab,48) -+ inv_rnd(aes_it_tab,64) -+ inv_rnd(aes_it_tab,80) -+ inv_rnd(aes_it_tab,96) -+ inv_rnd(aes_it_tab,112) -+ inv_rnd(aes_it_tab,128) -+ inv_rnd(aes_il_tab,144) // last round uses a different table -+ -+// move final values to the output array. 
-+ -+ mov out_blk+20(%esp),%ebp -+ add $8,%esp -+ mov %eax,(%ebp) -+ mov %ebx,4(%ebp) -+ mov %esi,8(%ebp) -+ mov %edi,12(%ebp) -+ pop %edi -+ pop %esi -+ pop %ebx -+ pop %ebp -+ ret -+ -+// AES (Rijndael) Key Schedule Subroutine -+ -+// input/output parameters -+ -+#define aes_cx 12 // AES context -+#define in_key 16 // key input array address -+#define key_ln 20 // key length, bytes (16,24,32) or bits (128,192,256) -+#define ed_flg 24 // 0=create both encr/decr keys, 1=create encr key only -+ -+// offsets for locals -+ -+#define cnt -4 -+#define kpf -8 -+#define slen 8 -+ -+// This macro performs a column mixing operation on an input 32-bit -+// word to give a 32-bit result. It uses each of the 4 bytes in the -+// the input column to index 4 different tables of 256 32-bit words -+// that are xored together to form the output value. -+ -+#define mix_col(p1) \ -+ movzbl %bl,%ecx ;\ -+ mov p1(,%ecx,4),%eax ;\ -+ movzbl %bh,%ecx ;\ -+ ror $16,%ebx ;\ -+ xor p1+tlen(,%ecx,4),%eax ;\ -+ movzbl %bl,%ecx ;\ -+ xor p1+2*tlen(,%ecx,4),%eax ;\ -+ movzbl %bh,%ecx ;\ -+ xor p1+3*tlen(,%ecx,4),%eax -+ -+// Key Schedule Macros -+ -+#define ksc4(p1) \ -+ rol $24,%ebx ;\ -+ mix_col(aes_fl_tab) ;\ -+ ror $8,%ebx ;\ -+ xor 4*p1+aes_rcon_tab,%eax ;\ -+ xor %eax,%esi ;\ -+ xor %esi,%ebp ;\ -+ mov %esi,16*p1(%edi) ;\ -+ mov %ebp,16*p1+4(%edi) ;\ -+ xor %ebp,%edx ;\ -+ xor %edx,%ebx ;\ -+ mov %edx,16*p1+8(%edi) ;\ -+ mov %ebx,16*p1+12(%edi) -+ -+#define ksc6(p1) \ -+ rol $24,%ebx ;\ -+ mix_col(aes_fl_tab) ;\ -+ ror $8,%ebx ;\ -+ xor 4*p1+aes_rcon_tab,%eax ;\ -+ xor 24*p1-24(%edi),%eax ;\ -+ mov %eax,24*p1(%edi) ;\ -+ xor 24*p1-20(%edi),%eax ;\ -+ mov %eax,24*p1+4(%edi) ;\ -+ xor %eax,%esi ;\ -+ xor %esi,%ebp ;\ -+ mov %esi,24*p1+8(%edi) ;\ -+ mov %ebp,24*p1+12(%edi) ;\ -+ xor %ebp,%edx ;\ -+ xor %edx,%ebx ;\ -+ mov %edx,24*p1+16(%edi) ;\ -+ mov %ebx,24*p1+20(%edi) -+ -+#define ksc8(p1) \ -+ rol $24,%ebx ;\ -+ mix_col(aes_fl_tab) ;\ -+ ror $8,%ebx ;\ -+ xor 4*p1+aes_rcon_tab,%eax ;\ -+ 
xor 32*p1-32(%edi),%eax ;\ -+ mov %eax,32*p1(%edi) ;\ -+ xor 32*p1-28(%edi),%eax ;\ -+ mov %eax,32*p1+4(%edi) ;\ -+ xor 32*p1-24(%edi),%eax ;\ -+ mov %eax,32*p1+8(%edi) ;\ -+ xor 32*p1-20(%edi),%eax ;\ -+ mov %eax,32*p1+12(%edi) ;\ -+ push %ebx ;\ -+ mov %eax,%ebx ;\ -+ mix_col(aes_fl_tab) ;\ -+ pop %ebx ;\ -+ xor %eax,%esi ;\ -+ xor %esi,%ebp ;\ -+ mov %esi,32*p1+16(%edi) ;\ -+ mov %ebp,32*p1+20(%edi) ;\ -+ xor %ebp,%edx ;\ -+ xor %edx,%ebx ;\ -+ mov %edx,32*p1+24(%edi) ;\ -+ mov %ebx,32*p1+28(%edi) -+ -+ .align ALIGN32BYTES -+aes_set_key: -+ pushfl -+ push %ebp -+ mov %esp,%ebp -+ sub $slen,%esp -+ push %ebx -+ push %esi -+ push %edi -+ -+ mov aes_cx(%ebp),%edx // edx -> AES context -+ -+ mov key_ln(%ebp),%ecx // key length -+ cmpl $128,%ecx -+ jb aes_30 -+ shr $3,%ecx -+aes_30: cmpl $32,%ecx -+ je aes_32 -+ cmpl $24,%ecx -+ je aes_32 -+ mov $16,%ecx -+aes_32: shr $2,%ecx -+ mov %ecx,nkey(%edx) -+ -+ lea 6(%ecx),%eax // 10/12/14 for 4/6/8 32-bit key length -+ mov %eax,nrnd(%edx) -+ -+ mov in_key(%ebp),%esi // key input array -+ lea ekey(%edx),%edi // key position in AES context -+ cld -+ push %ebp -+ mov %ecx,%eax // save key length in eax -+ rep ; movsl // words in the key schedule -+ mov -4(%esi),%ebx // put some values in registers -+ mov -8(%esi),%edx // to allow faster code -+ mov -12(%esi),%ebp -+ mov -16(%esi),%esi -+ -+ cmpl $4,%eax // jump on key size -+ je aes_36 -+ cmpl $6,%eax -+ je aes_35 -+ -+ ksc8(0) -+ ksc8(1) -+ ksc8(2) -+ ksc8(3) -+ ksc8(4) -+ ksc8(5) -+ ksc8(6) -+ jmp aes_37 -+aes_35: ksc6(0) -+ ksc6(1) -+ ksc6(2) -+ ksc6(3) -+ ksc6(4) -+ ksc6(5) -+ ksc6(6) -+ ksc6(7) -+ jmp aes_37 -+aes_36: ksc4(0) -+ ksc4(1) -+ ksc4(2) -+ ksc4(3) -+ ksc4(4) -+ ksc4(5) -+ ksc4(6) -+ ksc4(7) -+ ksc4(8) -+ ksc4(9) -+aes_37: pop %ebp -+ mov aes_cx(%ebp),%edx // edx -> AES context -+ cmpl $0,ed_flg(%ebp) -+ jne aes_39 -+ -+// compile decryption key schedule from encryption schedule - reverse -+// order and do mix_column operation on round keys except first and 
last -+ -+ mov nrnd(%edx),%eax // kt = cx->d_key + nc * cx->Nrnd -+ shl $2,%eax -+ lea dkey(%edx,%eax,4),%edi -+ lea ekey(%edx),%esi // kf = cx->e_key -+ -+ movsl // copy first round key (unmodified) -+ movsl -+ movsl -+ movsl -+ sub $32,%edi -+ movl $1,cnt(%ebp) -+aes_38: // do mix column on each column of -+ lodsl // each round key -+ mov %eax,%ebx -+ mix_col(aes_im_tab) -+ stosl -+ lodsl -+ mov %eax,%ebx -+ mix_col(aes_im_tab) -+ stosl -+ lodsl -+ mov %eax,%ebx -+ mix_col(aes_im_tab) -+ stosl -+ lodsl -+ mov %eax,%ebx -+ mix_col(aes_im_tab) -+ stosl -+ sub $32,%edi -+ -+ incl cnt(%ebp) -+ mov cnt(%ebp),%eax -+ cmp nrnd(%edx),%eax -+ jb aes_38 -+ -+ movsl // copy last round key (unmodified) -+ movsl -+ movsl -+ movsl -+aes_39: pop %edi -+ pop %esi -+ pop %ebx -+ mov %ebp,%esp -+ pop %ebp -+ popfl -+ ret -+ -+ -+// finite field multiplies by {02}, {04} and {08} -+ -+#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) -+#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) -+#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) -+ -+// finite field multiplies required in table generation -+ -+#define f3(x) (f2(x) ^ x) -+#define f9(x) (f8(x) ^ x) -+#define fb(x) (f8(x) ^ f2(x) ^ x) -+#define fd(x) (f8(x) ^ f4(x) ^ x) -+#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) -+ -+// These defines generate the forward table entries -+ -+#define u0(x) ((f3(x) << 24) | (x << 16) | (x << 8) | f2(x)) -+#define u1(x) ((x << 24) | (x << 16) | (f2(x) << 8) | f3(x)) -+#define u2(x) ((x << 24) | (f2(x) << 16) | (f3(x) << 8) | x) -+#define u3(x) ((f2(x) << 24) | (f3(x) << 16) | (x << 8) | x) -+ -+// These defines generate the inverse table entries -+ -+#define v0(x) ((fb(x) << 24) | (fd(x) << 16) | (f9(x) << 8) | fe(x)) -+#define v1(x) ((fd(x) << 24) | (f9(x) << 16) | (fe(x) << 8) | fb(x)) -+#define v2(x) ((f9(x) << 24) | (fe(x) << 16) | (fb(x) << 8) | fd(x)) -+#define v3(x) ((fe(x) << 24) | (fb(x) << 16) | (fd(x) << 8) | f9(x)) -+ -+// These defines generate entries for 
the last round tables -+ -+#define w0(x) (x) -+#define w1(x) (x << 8) -+#define w2(x) (x << 16) -+#define w3(x) (x << 24) -+ -+// macro to generate inverse mix column tables (needed for the key schedule) -+ -+#define im_data0(p1) \ -+ .long p1(0x00),p1(0x01),p1(0x02),p1(0x03),p1(0x04),p1(0x05),p1(0x06),p1(0x07) ;\ -+ .long p1(0x08),p1(0x09),p1(0x0a),p1(0x0b),p1(0x0c),p1(0x0d),p1(0x0e),p1(0x0f) ;\ -+ .long p1(0x10),p1(0x11),p1(0x12),p1(0x13),p1(0x14),p1(0x15),p1(0x16),p1(0x17) ;\ -+ .long p1(0x18),p1(0x19),p1(0x1a),p1(0x1b),p1(0x1c),p1(0x1d),p1(0x1e),p1(0x1f) -+#define im_data1(p1) \ -+ .long p1(0x20),p1(0x21),p1(0x22),p1(0x23),p1(0x24),p1(0x25),p1(0x26),p1(0x27) ;\ -+ .long p1(0x28),p1(0x29),p1(0x2a),p1(0x2b),p1(0x2c),p1(0x2d),p1(0x2e),p1(0x2f) ;\ -+ .long p1(0x30),p1(0x31),p1(0x32),p1(0x33),p1(0x34),p1(0x35),p1(0x36),p1(0x37) ;\ -+ .long p1(0x38),p1(0x39),p1(0x3a),p1(0x3b),p1(0x3c),p1(0x3d),p1(0x3e),p1(0x3f) -+#define im_data2(p1) \ -+ .long p1(0x40),p1(0x41),p1(0x42),p1(0x43),p1(0x44),p1(0x45),p1(0x46),p1(0x47) ;\ -+ .long p1(0x48),p1(0x49),p1(0x4a),p1(0x4b),p1(0x4c),p1(0x4d),p1(0x4e),p1(0x4f) ;\ -+ .long p1(0x50),p1(0x51),p1(0x52),p1(0x53),p1(0x54),p1(0x55),p1(0x56),p1(0x57) ;\ -+ .long p1(0x58),p1(0x59),p1(0x5a),p1(0x5b),p1(0x5c),p1(0x5d),p1(0x5e),p1(0x5f) -+#define im_data3(p1) \ -+ .long p1(0x60),p1(0x61),p1(0x62),p1(0x63),p1(0x64),p1(0x65),p1(0x66),p1(0x67) ;\ -+ .long p1(0x68),p1(0x69),p1(0x6a),p1(0x6b),p1(0x6c),p1(0x6d),p1(0x6e),p1(0x6f) ;\ -+ .long p1(0x70),p1(0x71),p1(0x72),p1(0x73),p1(0x74),p1(0x75),p1(0x76),p1(0x77) ;\ -+ .long p1(0x78),p1(0x79),p1(0x7a),p1(0x7b),p1(0x7c),p1(0x7d),p1(0x7e),p1(0x7f) -+#define im_data4(p1) \ -+ .long p1(0x80),p1(0x81),p1(0x82),p1(0x83),p1(0x84),p1(0x85),p1(0x86),p1(0x87) ;\ -+ .long p1(0x88),p1(0x89),p1(0x8a),p1(0x8b),p1(0x8c),p1(0x8d),p1(0x8e),p1(0x8f) ;\ -+ .long p1(0x90),p1(0x91),p1(0x92),p1(0x93),p1(0x94),p1(0x95),p1(0x96),p1(0x97) ;\ -+ .long p1(0x98),p1(0x99),p1(0x9a),p1(0x9b),p1(0x9c),p1(0x9d),p1(0x9e),p1(0x9f) 
-+#define im_data5(p1) \ -+ .long p1(0xa0),p1(0xa1),p1(0xa2),p1(0xa3),p1(0xa4),p1(0xa5),p1(0xa6),p1(0xa7) ;\ -+ .long p1(0xa8),p1(0xa9),p1(0xaa),p1(0xab),p1(0xac),p1(0xad),p1(0xae),p1(0xaf) ;\ -+ .long p1(0xb0),p1(0xb1),p1(0xb2),p1(0xb3),p1(0xb4),p1(0xb5),p1(0xb6),p1(0xb7) ;\ -+ .long p1(0xb8),p1(0xb9),p1(0xba),p1(0xbb),p1(0xbc),p1(0xbd),p1(0xbe),p1(0xbf) -+#define im_data6(p1) \ -+ .long p1(0xc0),p1(0xc1),p1(0xc2),p1(0xc3),p1(0xc4),p1(0xc5),p1(0xc6),p1(0xc7) ;\ -+ .long p1(0xc8),p1(0xc9),p1(0xca),p1(0xcb),p1(0xcc),p1(0xcd),p1(0xce),p1(0xcf) ;\ -+ .long p1(0xd0),p1(0xd1),p1(0xd2),p1(0xd3),p1(0xd4),p1(0xd5),p1(0xd6),p1(0xd7) ;\ -+ .long p1(0xd8),p1(0xd9),p1(0xda),p1(0xdb),p1(0xdc),p1(0xdd),p1(0xde),p1(0xdf) -+#define im_data7(p1) \ -+ .long p1(0xe0),p1(0xe1),p1(0xe2),p1(0xe3),p1(0xe4),p1(0xe5),p1(0xe6),p1(0xe7) ;\ -+ .long p1(0xe8),p1(0xe9),p1(0xea),p1(0xeb),p1(0xec),p1(0xed),p1(0xee),p1(0xef) ;\ -+ .long p1(0xf0),p1(0xf1),p1(0xf2),p1(0xf3),p1(0xf4),p1(0xf5),p1(0xf6),p1(0xf7) ;\ -+ .long p1(0xf8),p1(0xf9),p1(0xfa),p1(0xfb),p1(0xfc),p1(0xfd),p1(0xfe),p1(0xff) -+ -+// S-box data - 256 entries -+ -+#define sb_data0(p1) \ -+ .long p1(0x63),p1(0x7c),p1(0x77),p1(0x7b),p1(0xf2),p1(0x6b),p1(0x6f),p1(0xc5) ;\ -+ .long p1(0x30),p1(0x01),p1(0x67),p1(0x2b),p1(0xfe),p1(0xd7),p1(0xab),p1(0x76) ;\ -+ .long p1(0xca),p1(0x82),p1(0xc9),p1(0x7d),p1(0xfa),p1(0x59),p1(0x47),p1(0xf0) ;\ -+ .long p1(0xad),p1(0xd4),p1(0xa2),p1(0xaf),p1(0x9c),p1(0xa4),p1(0x72),p1(0xc0) -+#define sb_data1(p1) \ -+ .long p1(0xb7),p1(0xfd),p1(0x93),p1(0x26),p1(0x36),p1(0x3f),p1(0xf7),p1(0xcc) ;\ -+ .long p1(0x34),p1(0xa5),p1(0xe5),p1(0xf1),p1(0x71),p1(0xd8),p1(0x31),p1(0x15) ;\ -+ .long p1(0x04),p1(0xc7),p1(0x23),p1(0xc3),p1(0x18),p1(0x96),p1(0x05),p1(0x9a) ;\ -+ .long p1(0x07),p1(0x12),p1(0x80),p1(0xe2),p1(0xeb),p1(0x27),p1(0xb2),p1(0x75) -+#define sb_data2(p1) \ -+ .long p1(0x09),p1(0x83),p1(0x2c),p1(0x1a),p1(0x1b),p1(0x6e),p1(0x5a),p1(0xa0) ;\ -+ .long 
p1(0x52),p1(0x3b),p1(0xd6),p1(0xb3),p1(0x29),p1(0xe3),p1(0x2f),p1(0x84) ;\ -+ .long p1(0x53),p1(0xd1),p1(0x00),p1(0xed),p1(0x20),p1(0xfc),p1(0xb1),p1(0x5b) ;\ -+ .long p1(0x6a),p1(0xcb),p1(0xbe),p1(0x39),p1(0x4a),p1(0x4c),p1(0x58),p1(0xcf) -+#define sb_data3(p1) \ -+ .long p1(0xd0),p1(0xef),p1(0xaa),p1(0xfb),p1(0x43),p1(0x4d),p1(0x33),p1(0x85) ;\ -+ .long p1(0x45),p1(0xf9),p1(0x02),p1(0x7f),p1(0x50),p1(0x3c),p1(0x9f),p1(0xa8) ;\ -+ .long p1(0x51),p1(0xa3),p1(0x40),p1(0x8f),p1(0x92),p1(0x9d),p1(0x38),p1(0xf5) ;\ -+ .long p1(0xbc),p1(0xb6),p1(0xda),p1(0x21),p1(0x10),p1(0xff),p1(0xf3),p1(0xd2) -+#define sb_data4(p1) \ -+ .long p1(0xcd),p1(0x0c),p1(0x13),p1(0xec),p1(0x5f),p1(0x97),p1(0x44),p1(0x17) ;\ -+ .long p1(0xc4),p1(0xa7),p1(0x7e),p1(0x3d),p1(0x64),p1(0x5d),p1(0x19),p1(0x73) ;\ -+ .long p1(0x60),p1(0x81),p1(0x4f),p1(0xdc),p1(0x22),p1(0x2a),p1(0x90),p1(0x88) ;\ -+ .long p1(0x46),p1(0xee),p1(0xb8),p1(0x14),p1(0xde),p1(0x5e),p1(0x0b),p1(0xdb) -+#define sb_data5(p1) \ -+ .long p1(0xe0),p1(0x32),p1(0x3a),p1(0x0a),p1(0x49),p1(0x06),p1(0x24),p1(0x5c) ;\ -+ .long p1(0xc2),p1(0xd3),p1(0xac),p1(0x62),p1(0x91),p1(0x95),p1(0xe4),p1(0x79) ;\ -+ .long p1(0xe7),p1(0xc8),p1(0x37),p1(0x6d),p1(0x8d),p1(0xd5),p1(0x4e),p1(0xa9) ;\ -+ .long p1(0x6c),p1(0x56),p1(0xf4),p1(0xea),p1(0x65),p1(0x7a),p1(0xae),p1(0x08) -+#define sb_data6(p1) \ -+ .long p1(0xba),p1(0x78),p1(0x25),p1(0x2e),p1(0x1c),p1(0xa6),p1(0xb4),p1(0xc6) ;\ -+ .long p1(0xe8),p1(0xdd),p1(0x74),p1(0x1f),p1(0x4b),p1(0xbd),p1(0x8b),p1(0x8a) ;\ -+ .long p1(0x70),p1(0x3e),p1(0xb5),p1(0x66),p1(0x48),p1(0x03),p1(0xf6),p1(0x0e) ;\ -+ .long p1(0x61),p1(0x35),p1(0x57),p1(0xb9),p1(0x86),p1(0xc1),p1(0x1d),p1(0x9e) -+#define sb_data7(p1) \ -+ .long p1(0xe1),p1(0xf8),p1(0x98),p1(0x11),p1(0x69),p1(0xd9),p1(0x8e),p1(0x94) ;\ -+ .long p1(0x9b),p1(0x1e),p1(0x87),p1(0xe9),p1(0xce),p1(0x55),p1(0x28),p1(0xdf) ;\ -+ .long p1(0x8c),p1(0xa1),p1(0x89),p1(0x0d),p1(0xbf),p1(0xe6),p1(0x42),p1(0x68) ;\ -+ .long 
p1(0x41),p1(0x99),p1(0x2d),p1(0x0f),p1(0xb0),p1(0x54),p1(0xbb),p1(0x16) -+ -+// Inverse S-box data - 256 entries -+ -+#define ib_data0(p1) \ -+ .long p1(0x52),p1(0x09),p1(0x6a),p1(0xd5),p1(0x30),p1(0x36),p1(0xa5),p1(0x38) ;\ -+ .long p1(0xbf),p1(0x40),p1(0xa3),p1(0x9e),p1(0x81),p1(0xf3),p1(0xd7),p1(0xfb) ;\ -+ .long p1(0x7c),p1(0xe3),p1(0x39),p1(0x82),p1(0x9b),p1(0x2f),p1(0xff),p1(0x87) ;\ -+ .long p1(0x34),p1(0x8e),p1(0x43),p1(0x44),p1(0xc4),p1(0xde),p1(0xe9),p1(0xcb) -+#define ib_data1(p1) \ -+ .long p1(0x54),p1(0x7b),p1(0x94),p1(0x32),p1(0xa6),p1(0xc2),p1(0x23),p1(0x3d) ;\ -+ .long p1(0xee),p1(0x4c),p1(0x95),p1(0x0b),p1(0x42),p1(0xfa),p1(0xc3),p1(0x4e) ;\ -+ .long p1(0x08),p1(0x2e),p1(0xa1),p1(0x66),p1(0x28),p1(0xd9),p1(0x24),p1(0xb2) ;\ -+ .long p1(0x76),p1(0x5b),p1(0xa2),p1(0x49),p1(0x6d),p1(0x8b),p1(0xd1),p1(0x25) -+#define ib_data2(p1) \ -+ .long p1(0x72),p1(0xf8),p1(0xf6),p1(0x64),p1(0x86),p1(0x68),p1(0x98),p1(0x16) ;\ -+ .long p1(0xd4),p1(0xa4),p1(0x5c),p1(0xcc),p1(0x5d),p1(0x65),p1(0xb6),p1(0x92) ;\ -+ .long p1(0x6c),p1(0x70),p1(0x48),p1(0x50),p1(0xfd),p1(0xed),p1(0xb9),p1(0xda) ;\ -+ .long p1(0x5e),p1(0x15),p1(0x46),p1(0x57),p1(0xa7),p1(0x8d),p1(0x9d),p1(0x84) -+#define ib_data3(p1) \ -+ .long p1(0x90),p1(0xd8),p1(0xab),p1(0x00),p1(0x8c),p1(0xbc),p1(0xd3),p1(0x0a) ;\ -+ .long p1(0xf7),p1(0xe4),p1(0x58),p1(0x05),p1(0xb8),p1(0xb3),p1(0x45),p1(0x06) ;\ -+ .long p1(0xd0),p1(0x2c),p1(0x1e),p1(0x8f),p1(0xca),p1(0x3f),p1(0x0f),p1(0x02) ;\ -+ .long p1(0xc1),p1(0xaf),p1(0xbd),p1(0x03),p1(0x01),p1(0x13),p1(0x8a),p1(0x6b) -+#define ib_data4(p1) \ -+ .long p1(0x3a),p1(0x91),p1(0x11),p1(0x41),p1(0x4f),p1(0x67),p1(0xdc),p1(0xea) ;\ -+ .long p1(0x97),p1(0xf2),p1(0xcf),p1(0xce),p1(0xf0),p1(0xb4),p1(0xe6),p1(0x73) ;\ -+ .long p1(0x96),p1(0xac),p1(0x74),p1(0x22),p1(0xe7),p1(0xad),p1(0x35),p1(0x85) ;\ -+ .long p1(0xe2),p1(0xf9),p1(0x37),p1(0xe8),p1(0x1c),p1(0x75),p1(0xdf),p1(0x6e) -+#define ib_data5(p1) \ -+ .long 
p1(0x47),p1(0xf1),p1(0x1a),p1(0x71),p1(0x1d),p1(0x29),p1(0xc5),p1(0x89) ;\ -+ .long p1(0x6f),p1(0xb7),p1(0x62),p1(0x0e),p1(0xaa),p1(0x18),p1(0xbe),p1(0x1b) ;\ -+ .long p1(0xfc),p1(0x56),p1(0x3e),p1(0x4b),p1(0xc6),p1(0xd2),p1(0x79),p1(0x20) ;\ -+ .long p1(0x9a),p1(0xdb),p1(0xc0),p1(0xfe),p1(0x78),p1(0xcd),p1(0x5a),p1(0xf4) -+#define ib_data6(p1) \ -+ .long p1(0x1f),p1(0xdd),p1(0xa8),p1(0x33),p1(0x88),p1(0x07),p1(0xc7),p1(0x31) ;\ -+ .long p1(0xb1),p1(0x12),p1(0x10),p1(0x59),p1(0x27),p1(0x80),p1(0xec),p1(0x5f) ;\ -+ .long p1(0x60),p1(0x51),p1(0x7f),p1(0xa9),p1(0x19),p1(0xb5),p1(0x4a),p1(0x0d) ;\ -+ .long p1(0x2d),p1(0xe5),p1(0x7a),p1(0x9f),p1(0x93),p1(0xc9),p1(0x9c),p1(0xef) -+#define ib_data7(p1) \ -+ .long p1(0xa0),p1(0xe0),p1(0x3b),p1(0x4d),p1(0xae),p1(0x2a),p1(0xf5),p1(0xb0) ;\ -+ .long p1(0xc8),p1(0xeb),p1(0xbb),p1(0x3c),p1(0x83),p1(0x53),p1(0x99),p1(0x61) ;\ -+ .long p1(0x17),p1(0x2b),p1(0x04),p1(0x7e),p1(0xba),p1(0x77),p1(0xd6),p1(0x26) ;\ -+ .long p1(0xe1),p1(0x69),p1(0x14),p1(0x63),p1(0x55),p1(0x21),p1(0x0c),p1(0x7d) -+ -+// The rcon_table (needed for the key schedule) -+// -+// Here is original Dr Brian Gladman's source code: -+// _rcon_tab: -+// %assign x 1 -+// %rep 29 -+// dd x -+// %assign x f2(x) -+// %endrep -+// -+// Here is precomputed output (it's more portable this way): -+ -+ .align ALIGN32BYTES -+aes_rcon_tab: -+ .long 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80 -+ .long 0x1b,0x36,0x6c,0xd8,0xab,0x4d,0x9a,0x2f -+ .long 0x5e,0xbc,0x63,0xc6,0x97,0x35,0x6a,0xd4 -+ .long 0xb3,0x7d,0xfa,0xef,0xc5 -+ -+// The forward xor tables -+ -+ .align ALIGN32BYTES -+aes_ft_tab: -+ sb_data0(u0) -+ sb_data1(u0) -+ sb_data2(u0) -+ sb_data3(u0) -+ sb_data4(u0) -+ sb_data5(u0) -+ sb_data6(u0) -+ sb_data7(u0) -+ -+ sb_data0(u1) -+ sb_data1(u1) -+ sb_data2(u1) -+ sb_data3(u1) -+ sb_data4(u1) -+ sb_data5(u1) -+ sb_data6(u1) -+ sb_data7(u1) -+ -+ sb_data0(u2) -+ sb_data1(u2) -+ sb_data2(u2) -+ sb_data3(u2) -+ sb_data4(u2) -+ sb_data5(u2) -+ sb_data6(u2) -+ sb_data7(u2) -+ -+ 
sb_data0(u3) -+ sb_data1(u3) -+ sb_data2(u3) -+ sb_data3(u3) -+ sb_data4(u3) -+ sb_data5(u3) -+ sb_data6(u3) -+ sb_data7(u3) -+ -+ .align ALIGN32BYTES -+aes_fl_tab: -+ sb_data0(w0) -+ sb_data1(w0) -+ sb_data2(w0) -+ sb_data3(w0) -+ sb_data4(w0) -+ sb_data5(w0) -+ sb_data6(w0) -+ sb_data7(w0) -+ -+ sb_data0(w1) -+ sb_data1(w1) -+ sb_data2(w1) -+ sb_data3(w1) -+ sb_data4(w1) -+ sb_data5(w1) -+ sb_data6(w1) -+ sb_data7(w1) -+ -+ sb_data0(w2) -+ sb_data1(w2) -+ sb_data2(w2) -+ sb_data3(w2) -+ sb_data4(w2) -+ sb_data5(w2) -+ sb_data6(w2) -+ sb_data7(w2) -+ -+ sb_data0(w3) -+ sb_data1(w3) -+ sb_data2(w3) -+ sb_data3(w3) -+ sb_data4(w3) -+ sb_data5(w3) -+ sb_data6(w3) -+ sb_data7(w3) -+ -+// The inverse xor tables -+ -+ .align ALIGN32BYTES -+aes_it_tab: -+ ib_data0(v0) -+ ib_data1(v0) -+ ib_data2(v0) -+ ib_data3(v0) -+ ib_data4(v0) -+ ib_data5(v0) -+ ib_data6(v0) -+ ib_data7(v0) -+ -+ ib_data0(v1) -+ ib_data1(v1) -+ ib_data2(v1) -+ ib_data3(v1) -+ ib_data4(v1) -+ ib_data5(v1) -+ ib_data6(v1) -+ ib_data7(v1) -+ -+ ib_data0(v2) -+ ib_data1(v2) -+ ib_data2(v2) -+ ib_data3(v2) -+ ib_data4(v2) -+ ib_data5(v2) -+ ib_data6(v2) -+ ib_data7(v2) -+ -+ ib_data0(v3) -+ ib_data1(v3) -+ ib_data2(v3) -+ ib_data3(v3) -+ ib_data4(v3) -+ ib_data5(v3) -+ ib_data6(v3) -+ ib_data7(v3) -+ -+ .align ALIGN32BYTES -+aes_il_tab: -+ ib_data0(w0) -+ ib_data1(w0) -+ ib_data2(w0) -+ ib_data3(w0) -+ ib_data4(w0) -+ ib_data5(w0) -+ ib_data6(w0) -+ ib_data7(w0) -+ -+ ib_data0(w1) -+ ib_data1(w1) -+ ib_data2(w1) -+ ib_data3(w1) -+ ib_data4(w1) -+ ib_data5(w1) -+ ib_data6(w1) -+ ib_data7(w1) -+ -+ ib_data0(w2) -+ ib_data1(w2) -+ ib_data2(w2) -+ ib_data3(w2) -+ ib_data4(w2) -+ ib_data5(w2) -+ ib_data6(w2) -+ ib_data7(w2) -+ -+ ib_data0(w3) -+ ib_data1(w3) -+ ib_data2(w3) -+ ib_data3(w3) -+ ib_data4(w3) -+ ib_data5(w3) -+ ib_data6(w3) -+ ib_data7(w3) -+ -+// The inverse mix column tables -+ -+ .align ALIGN32BYTES -+aes_im_tab: -+ im_data0(v0) -+ im_data1(v0) -+ im_data2(v0) -+ im_data3(v0) -+ im_data4(v0) -+ 
im_data5(v0) -+ im_data6(v0) -+ im_data7(v0) -+ -+ im_data0(v1) -+ im_data1(v1) -+ im_data2(v1) -+ im_data3(v1) -+ im_data4(v1) -+ im_data5(v1) -+ im_data6(v1) -+ im_data7(v1) -+ -+ im_data0(v2) -+ im_data1(v2) -+ im_data2(v2) -+ im_data3(v2) -+ im_data4(v2) -+ im_data5(v2) -+ im_data6(v2) -+ im_data7(v2) -+ -+ im_data0(v3) -+ im_data1(v3) -+ im_data2(v3) -+ im_data3(v3) -+ im_data4(v3) -+ im_data5(v3) -+ im_data6(v3) -+ im_data7(v3) ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/aes/aes.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,1427 @@ -+// I retain copyright in this code but I encourage its free use provided -+// that I don't carry any responsibility for the results. I am especially -+// happy to see it used in free and open source software. If you do use -+// it I would appreciate an acknowledgement of its origin in the code or -+// the product that results and I would also appreciate knowing a little -+// about the use to which it is being put. I am grateful to Frank Yellin -+// for some ideas that are used in this implementation. -+// -+// Dr B. R. Gladman 6th April 2001. -+// -+// This is an implementation of the AES encryption algorithm (Rijndael) -+// designed by Joan Daemen and Vincent Rijmen. This version is designed -+// to provide both fixed and dynamic block and key lengths and can also -+// run with either big or little endian internal byte order (see aes.h). -+// It inputs block and key lengths in bytes with the legal values being -+// 16, 24 and 32. -+ -+/* -+ * Modified by Jari Ruusu, May 1 2001 -+ * - Fixed some compile warnings, code was ok but gcc warned anyway. -+ * - Changed basic types: byte -> unsigned char, word -> u_int32_t -+ * - Major name space cleanup: Names visible to outside now begin -+ * with "aes_" or "AES_". A lot of stuff moved from aes.h to aes.c -+ * - Removed C++ and DLL support as part of name space cleanup. -+ * - Eliminated unnecessary recomputation of tables. 
(actual bug fix) -+ * - Merged precomputed constant tables to aes.c file. -+ * - Removed data alignment restrictions for portability reasons. -+ * - Made block and key lengths accept bit count (128/192/256) -+ * as well byte count (16/24/32). -+ * - Removed all error checks. This change also eliminated the need -+ * to preinitialize the context struct to zero. -+ * - Removed some totally unused constants. -+ */ -+ -+#include "klips-crypto/aes.h" -+ -+#ifdef OCF_ASSIST -+#include "klips-crypto/ocf_assist.h" -+#endif -+ -+// CONFIGURATION OPTIONS (see also aes.h) -+// -+// 1. Define UNROLL for full loop unrolling in encryption and decryption. -+// 2. Define PARTIAL_UNROLL to unroll two loops in encryption and decryption. -+// 3. Define FIXED_TABLES for compiled rather than dynamic tables. -+// 4. Define FF_TABLES to use tables for field multiplies and inverses. -+// Do not enable this without understanding stack space requirements. -+// 5. Define ARRAYS to use arrays to hold the local state block. If this -+// is not defined, individually declared 32-bit words are used. -+// 6. Define FAST_VARIABLE if a high speed variable block implementation -+// is needed (essentially three separate fixed block size code sequences) -+// 7. Define either ONE_TABLE or FOUR_TABLES for a fast table driven -+// version using 1 table (2 kbytes of table space) or 4 tables (8 -+// kbytes of table space) for higher speed. -+// 8. Define either ONE_LR_TABLE or FOUR_LR_TABLES for a further speed -+// increase by using tables for the last rounds but with more table -+// space (2 or 8 kbytes extra). -+// 9. If neither ONE_TABLE nor FOUR_TABLES is defined, a compact but -+// slower version is provided. -+// 10. If fast decryption key scheduling is needed define ONE_IM_TABLE -+// or FOUR_IM_TABLES for higher speed (2 or 8 kbytes extra). 
-+ -+#define UNROLL -+//#define PARTIAL_UNROLL -+ -+#define FIXED_TABLES -+//#define FF_TABLES -+//#define ARRAYS -+#define FAST_VARIABLE -+ -+//#define ONE_TABLE -+#define FOUR_TABLES -+ -+//#define ONE_LR_TABLE -+#define FOUR_LR_TABLES -+ -+//#define ONE_IM_TABLE -+#define FOUR_IM_TABLES -+ -+#if defined(UNROLL) && defined (PARTIAL_UNROLL) -+#error both UNROLL and PARTIAL_UNROLL are defined -+#endif -+ -+#if defined(ONE_TABLE) && defined (FOUR_TABLES) -+#error both ONE_TABLE and FOUR_TABLES are defined -+#endif -+ -+#if defined(ONE_LR_TABLE) && defined (FOUR_LR_TABLES) -+#error both ONE_LR_TABLE and FOUR_LR_TABLES are defined -+#endif -+ -+#if defined(ONE_IM_TABLE) && defined (FOUR_IM_TABLES) -+#error both ONE_IM_TABLE and FOUR_IM_TABLES are defined -+#endif -+ -+#if defined(AES_BLOCK_SIZE) && AES_BLOCK_SIZE != 16 && AES_BLOCK_SIZE != 24 && AES_BLOCK_SIZE != 32 -+#error an illegal block size has been specified -+#endif -+ -+// upr(x,n): rotates bytes within words by n positions, moving bytes -+// to higher index positions with wrap around into low positions -+// ups(x,n): moves bytes by n positions to higher index positions in -+// words but without wrap around -+// bval(x,n): extracts a byte from a word -+ -+#define upr(x,n) (((x) << 8 * (n)) | ((x) >> (32 - 8 * (n)))) -+#define ups(x,n) ((x) << 8 * (n)) -+#define bval(x,n) ((unsigned char)((x) >> 8 * (n))) -+#define bytes2word(b0, b1, b2, b3) \ -+ ((u_int32_t)(b3) << 24 | (u_int32_t)(b2) << 16 | (u_int32_t)(b1) << 8 | (b0)) -+ -+ -+/* little endian processor without data alignment restrictions: AES_LE_OK */ -+/* original code: i386 */ -+#if defined(i386) || defined(_I386) || defined(__i386__) || defined(__i386) -+#define AES_LE_OK 1 -+/* added (tested): alpha --jjo */ -+#elif defined(__alpha__)|| defined (__alpha) -+#define AES_LE_OK 1 -+/* added (tested): ia64 --jjo */ -+#elif defined(__ia64__)|| defined (__ia64) -+#define AES_LE_OK 1 -+#endif -+ -+#ifdef AES_LE_OK -+/* little endian processor without data 
alignment restrictions */ -+#define word_in(x) *(u_int32_t*)(x) -+#define const_word_in(x) *(const u_int32_t*)(x) -+#define word_out(x,v) *(u_int32_t*)(x) = (v) -+#define const_word_out(x,v) *(const u_int32_t*)(x) = (v) -+#else -+/* slower but generic big endian or with data alignment restrictions */ -+/* some additional "const" touches to stop "gcc -Wcast-qual" complains --jjo */ -+#define word_in(x) ((u_int32_t)(((unsigned char *)(x))[0])|((u_int32_t)(((unsigned char *)(x))[1])<<8)|((u_int32_t)(((unsigned char *)(x))[2])<<16)|((u_int32_t)(((unsigned char *)(x))[3])<<24)) -+#define const_word_in(x) ((const u_int32_t)(((const unsigned char *)(x))[0])|((const u_int32_t)(((const unsigned char *)(x))[1])<<8)|((const u_int32_t)(((const unsigned char *)(x))[2])<<16)|((const u_int32_t)(((const unsigned char *)(x))[3])<<24)) -+#define word_out(x,v) ((unsigned char *)(x))[0]=(v),((unsigned char *)(x))[1]=((v)>>8),((unsigned char *)(x))[2]=((v)>>16),((unsigned char *)(x))[3]=((v)>>24) -+#define const_word_out(x,v) ((const unsigned char *)(x))[0]=(v),((const unsigned char *)(x))[1]=((v)>>8),((const unsigned char *)(x))[2]=((v)>>16),((const unsigned char *)(x))[3]=((v)>>24) -+#endif -+ -+// Disable at least some poor combinations of options -+ -+#if !defined(ONE_TABLE) && !defined(FOUR_TABLES) -+#define FIXED_TABLES -+#undef UNROLL -+#undef ONE_LR_TABLE -+#undef FOUR_LR_TABLES -+#undef ONE_IM_TABLE -+#undef FOUR_IM_TABLES -+#elif !defined(FOUR_TABLES) -+#ifdef FOUR_LR_TABLES -+#undef FOUR_LR_TABLES -+#define ONE_LR_TABLE -+#endif -+#ifdef FOUR_IM_TABLES -+#undef FOUR_IM_TABLES -+#define ONE_IM_TABLE -+#endif -+#elif !defined(AES_BLOCK_SIZE) -+#if defined(UNROLL) -+#define PARTIAL_UNROLL -+#undef UNROLL -+#endif -+#endif -+ -+// the finite field modular polynomial and elements -+ -+#define ff_poly 0x011b -+#define ff_hi 0x80 -+ -+// multiply four bytes in GF(2^8) by 'x' {02} in parallel -+ -+#define m1 0x80808080 -+#define m2 0x7f7f7f7f -+#define m3 0x0000001b -+#define 
FFmulX(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * m3)) -+ -+// The following defines provide alternative definitions of FFmulX that might -+// give improved performance if a fast 32-bit multiply is not available. Note -+// that a temporary variable u needs to be defined where FFmulX is used. -+ -+// #define FFmulX(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ ((u >> 3) | (u >> 6)) -+// #define m4 0x1b1b1b1b -+// #define FFmulX(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) & m4) -+ -+// perform column mix operation on four bytes in parallel -+ -+#define fwd_mcol(x) (f2 = FFmulX(x), f2 ^ upr(x ^ f2,3) ^ upr(x,2) ^ upr(x,1)) -+ -+#if defined(FIXED_TABLES) -+ -+// the S-Box table -+ -+static const unsigned char s_box[256] = -+{ -+ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, -+ 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, -+ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, -+ 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, -+ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, -+ 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, -+ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, -+ 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, -+ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, -+ 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, -+ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, -+ 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, -+ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, -+ 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, -+ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, -+ 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, -+ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, -+ 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, -+ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, -+ 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, -+ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, -+ 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, -+ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, -+ 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, -+ 0xba, 0x78, 0x25, 0x2e, 
0x1c, 0xa6, 0xb4, 0xc6, -+ 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, -+ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, -+ 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, -+ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, -+ 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, -+ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, -+ 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 -+}; -+ -+// the inverse S-Box table -+ -+static const unsigned char inv_s_box[256] = -+{ -+ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, -+ 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, -+ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, -+ 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, -+ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, -+ 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, -+ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, -+ 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, -+ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, -+ 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, -+ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, -+ 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, -+ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, -+ 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, -+ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, -+ 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, -+ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, -+ 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, -+ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, -+ 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, -+ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, -+ 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, -+ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, -+ 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, -+ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, -+ 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, -+ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, -+ 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, -+ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, -+ 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, 
-+ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, -+ 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d -+}; -+ -+#define w0(p) 0x000000##p -+ -+// Number of elements required in this table for different -+// block and key lengths is: -+// -+// Nk = 4 6 8 -+// ---------- -+// Nb = 4 | 10 8 7 -+// 6 | 19 12 11 -+// 8 | 29 19 14 -+// -+// this table can be a table of bytes if the key schedule -+// code is adjusted accordingly -+ -+static const u_int32_t rcon_tab[29] = -+{ -+ w0(01), w0(02), w0(04), w0(08), -+ w0(10), w0(20), w0(40), w0(80), -+ w0(1b), w0(36), w0(6c), w0(d8), -+ w0(ab), w0(4d), w0(9a), w0(2f), -+ w0(5e), w0(bc), w0(63), w0(c6), -+ w0(97), w0(35), w0(6a), w0(d4), -+ w0(b3), w0(7d), w0(fa), w0(ef), -+ w0(c5) -+}; -+ -+#undef w0 -+ -+#define r0(p,q,r,s) 0x##p##q##r##s -+#define r1(p,q,r,s) 0x##q##r##s##p -+#define r2(p,q,r,s) 0x##r##s##p##q -+#define r3(p,q,r,s) 0x##s##p##q##r -+#define w0(p) 0x000000##p -+#define w1(p) 0x0000##p##00 -+#define w2(p) 0x00##p##0000 -+#define w3(p) 0x##p##000000 -+ -+#if defined(FIXED_TABLES) && (defined(ONE_TABLE) || defined(FOUR_TABLES)) -+ -+// data for forward tables (other than last round) -+ -+#define f_table \ -+ r(a5,63,63,c6), r(84,7c,7c,f8), r(99,77,77,ee), r(8d,7b,7b,f6),\ -+ r(0d,f2,f2,ff), r(bd,6b,6b,d6), r(b1,6f,6f,de), r(54,c5,c5,91),\ -+ r(50,30,30,60), r(03,01,01,02), r(a9,67,67,ce), r(7d,2b,2b,56),\ -+ r(19,fe,fe,e7), r(62,d7,d7,b5), r(e6,ab,ab,4d), r(9a,76,76,ec),\ -+ r(45,ca,ca,8f), r(9d,82,82,1f), r(40,c9,c9,89), r(87,7d,7d,fa),\ -+ r(15,fa,fa,ef), r(eb,59,59,b2), r(c9,47,47,8e), r(0b,f0,f0,fb),\ -+ r(ec,ad,ad,41), r(67,d4,d4,b3), r(fd,a2,a2,5f), r(ea,af,af,45),\ -+ r(bf,9c,9c,23), r(f7,a4,a4,53), r(96,72,72,e4), r(5b,c0,c0,9b),\ -+ r(c2,b7,b7,75), r(1c,fd,fd,e1), r(ae,93,93,3d), r(6a,26,26,4c),\ -+ r(5a,36,36,6c), r(41,3f,3f,7e), r(02,f7,f7,f5), r(4f,cc,cc,83),\ -+ r(5c,34,34,68), r(f4,a5,a5,51), r(34,e5,e5,d1), r(08,f1,f1,f9),\ -+ r(93,71,71,e2), r(73,d8,d8,ab), r(53,31,31,62), r(3f,15,15,2a),\ -+ 
r(0c,04,04,08), r(52,c7,c7,95), r(65,23,23,46), r(5e,c3,c3,9d),\ -+ r(28,18,18,30), r(a1,96,96,37), r(0f,05,05,0a), r(b5,9a,9a,2f),\ -+ r(09,07,07,0e), r(36,12,12,24), r(9b,80,80,1b), r(3d,e2,e2,df),\ -+ r(26,eb,eb,cd), r(69,27,27,4e), r(cd,b2,b2,7f), r(9f,75,75,ea),\ -+ r(1b,09,09,12), r(9e,83,83,1d), r(74,2c,2c,58), r(2e,1a,1a,34),\ -+ r(2d,1b,1b,36), r(b2,6e,6e,dc), r(ee,5a,5a,b4), r(fb,a0,a0,5b),\ -+ r(f6,52,52,a4), r(4d,3b,3b,76), r(61,d6,d6,b7), r(ce,b3,b3,7d),\ -+ r(7b,29,29,52), r(3e,e3,e3,dd), r(71,2f,2f,5e), r(97,84,84,13),\ -+ r(f5,53,53,a6), r(68,d1,d1,b9), r(00,00,00,00), r(2c,ed,ed,c1),\ -+ r(60,20,20,40), r(1f,fc,fc,e3), r(c8,b1,b1,79), r(ed,5b,5b,b6),\ -+ r(be,6a,6a,d4), r(46,cb,cb,8d), r(d9,be,be,67), r(4b,39,39,72),\ -+ r(de,4a,4a,94), r(d4,4c,4c,98), r(e8,58,58,b0), r(4a,cf,cf,85),\ -+ r(6b,d0,d0,bb), r(2a,ef,ef,c5), r(e5,aa,aa,4f), r(16,fb,fb,ed),\ -+ r(c5,43,43,86), r(d7,4d,4d,9a), r(55,33,33,66), r(94,85,85,11),\ -+ r(cf,45,45,8a), r(10,f9,f9,e9), r(06,02,02,04), r(81,7f,7f,fe),\ -+ r(f0,50,50,a0), r(44,3c,3c,78), r(ba,9f,9f,25), r(e3,a8,a8,4b),\ -+ r(f3,51,51,a2), r(fe,a3,a3,5d), r(c0,40,40,80), r(8a,8f,8f,05),\ -+ r(ad,92,92,3f), r(bc,9d,9d,21), r(48,38,38,70), r(04,f5,f5,f1),\ -+ r(df,bc,bc,63), r(c1,b6,b6,77), r(75,da,da,af), r(63,21,21,42),\ -+ r(30,10,10,20), r(1a,ff,ff,e5), r(0e,f3,f3,fd), r(6d,d2,d2,bf),\ -+ r(4c,cd,cd,81), r(14,0c,0c,18), r(35,13,13,26), r(2f,ec,ec,c3),\ -+ r(e1,5f,5f,be), r(a2,97,97,35), r(cc,44,44,88), r(39,17,17,2e),\ -+ r(57,c4,c4,93), r(f2,a7,a7,55), r(82,7e,7e,fc), r(47,3d,3d,7a),\ -+ r(ac,64,64,c8), r(e7,5d,5d,ba), r(2b,19,19,32), r(95,73,73,e6),\ -+ r(a0,60,60,c0), r(98,81,81,19), r(d1,4f,4f,9e), r(7f,dc,dc,a3),\ -+ r(66,22,22,44), r(7e,2a,2a,54), r(ab,90,90,3b), r(83,88,88,0b),\ -+ r(ca,46,46,8c), r(29,ee,ee,c7), r(d3,b8,b8,6b), r(3c,14,14,28),\ -+ r(79,de,de,a7), r(e2,5e,5e,bc), r(1d,0b,0b,16), r(76,db,db,ad),\ -+ r(3b,e0,e0,db), r(56,32,32,64), r(4e,3a,3a,74), r(1e,0a,0a,14),\ -+ r(db,49,49,92), 
r(0a,06,06,0c), r(6c,24,24,48), r(e4,5c,5c,b8),\ -+ r(5d,c2,c2,9f), r(6e,d3,d3,bd), r(ef,ac,ac,43), r(a6,62,62,c4),\ -+ r(a8,91,91,39), r(a4,95,95,31), r(37,e4,e4,d3), r(8b,79,79,f2),\ -+ r(32,e7,e7,d5), r(43,c8,c8,8b), r(59,37,37,6e), r(b7,6d,6d,da),\ -+ r(8c,8d,8d,01), r(64,d5,d5,b1), r(d2,4e,4e,9c), r(e0,a9,a9,49),\ -+ r(b4,6c,6c,d8), r(fa,56,56,ac), r(07,f4,f4,f3), r(25,ea,ea,cf),\ -+ r(af,65,65,ca), r(8e,7a,7a,f4), r(e9,ae,ae,47), r(18,08,08,10),\ -+ r(d5,ba,ba,6f), r(88,78,78,f0), r(6f,25,25,4a), r(72,2e,2e,5c),\ -+ r(24,1c,1c,38), r(f1,a6,a6,57), r(c7,b4,b4,73), r(51,c6,c6,97),\ -+ r(23,e8,e8,cb), r(7c,dd,dd,a1), r(9c,74,74,e8), r(21,1f,1f,3e),\ -+ r(dd,4b,4b,96), r(dc,bd,bd,61), r(86,8b,8b,0d), r(85,8a,8a,0f),\ -+ r(90,70,70,e0), r(42,3e,3e,7c), r(c4,b5,b5,71), r(aa,66,66,cc),\ -+ r(d8,48,48,90), r(05,03,03,06), r(01,f6,f6,f7), r(12,0e,0e,1c),\ -+ r(a3,61,61,c2), r(5f,35,35,6a), r(f9,57,57,ae), r(d0,b9,b9,69),\ -+ r(91,86,86,17), r(58,c1,c1,99), r(27,1d,1d,3a), r(b9,9e,9e,27),\ -+ r(38,e1,e1,d9), r(13,f8,f8,eb), r(b3,98,98,2b), r(33,11,11,22),\ -+ r(bb,69,69,d2), r(70,d9,d9,a9), r(89,8e,8e,07), r(a7,94,94,33),\ -+ r(b6,9b,9b,2d), r(22,1e,1e,3c), r(92,87,87,15), r(20,e9,e9,c9),\ -+ r(49,ce,ce,87), r(ff,55,55,aa), r(78,28,28,50), r(7a,df,df,a5),\ -+ r(8f,8c,8c,03), r(f8,a1,a1,59), r(80,89,89,09), r(17,0d,0d,1a),\ -+ r(da,bf,bf,65), r(31,e6,e6,d7), r(c6,42,42,84), r(b8,68,68,d0),\ -+ r(c3,41,41,82), r(b0,99,99,29), r(77,2d,2d,5a), r(11,0f,0f,1e),\ -+ r(cb,b0,b0,7b), r(fc,54,54,a8), r(d6,bb,bb,6d), r(3a,16,16,2c) -+ -+// data for inverse tables (other than last round) -+ -+#define i_table \ -+ r(50,a7,f4,51), r(53,65,41,7e), r(c3,a4,17,1a), r(96,5e,27,3a),\ -+ r(cb,6b,ab,3b), r(f1,45,9d,1f), r(ab,58,fa,ac), r(93,03,e3,4b),\ -+ r(55,fa,30,20), r(f6,6d,76,ad), r(91,76,cc,88), r(25,4c,02,f5),\ -+ r(fc,d7,e5,4f), r(d7,cb,2a,c5), r(80,44,35,26), r(8f,a3,62,b5),\ -+ r(49,5a,b1,de), r(67,1b,ba,25), r(98,0e,ea,45), r(e1,c0,fe,5d),\ -+ r(02,75,2f,c3), r(12,f0,4c,81), 
r(a3,97,46,8d), r(c6,f9,d3,6b),\ -+ r(e7,5f,8f,03), r(95,9c,92,15), r(eb,7a,6d,bf), r(da,59,52,95),\ -+ r(2d,83,be,d4), r(d3,21,74,58), r(29,69,e0,49), r(44,c8,c9,8e),\ -+ r(6a,89,c2,75), r(78,79,8e,f4), r(6b,3e,58,99), r(dd,71,b9,27),\ -+ r(b6,4f,e1,be), r(17,ad,88,f0), r(66,ac,20,c9), r(b4,3a,ce,7d),\ -+ r(18,4a,df,63), r(82,31,1a,e5), r(60,33,51,97), r(45,7f,53,62),\ -+ r(e0,77,64,b1), r(84,ae,6b,bb), r(1c,a0,81,fe), r(94,2b,08,f9),\ -+ r(58,68,48,70), r(19,fd,45,8f), r(87,6c,de,94), r(b7,f8,7b,52),\ -+ r(23,d3,73,ab), r(e2,02,4b,72), r(57,8f,1f,e3), r(2a,ab,55,66),\ -+ r(07,28,eb,b2), r(03,c2,b5,2f), r(9a,7b,c5,86), r(a5,08,37,d3),\ -+ r(f2,87,28,30), r(b2,a5,bf,23), r(ba,6a,03,02), r(5c,82,16,ed),\ -+ r(2b,1c,cf,8a), r(92,b4,79,a7), r(f0,f2,07,f3), r(a1,e2,69,4e),\ -+ r(cd,f4,da,65), r(d5,be,05,06), r(1f,62,34,d1), r(8a,fe,a6,c4),\ -+ r(9d,53,2e,34), r(a0,55,f3,a2), r(32,e1,8a,05), r(75,eb,f6,a4),\ -+ r(39,ec,83,0b), r(aa,ef,60,40), r(06,9f,71,5e), r(51,10,6e,bd),\ -+ r(f9,8a,21,3e), r(3d,06,dd,96), r(ae,05,3e,dd), r(46,bd,e6,4d),\ -+ r(b5,8d,54,91), r(05,5d,c4,71), r(6f,d4,06,04), r(ff,15,50,60),\ -+ r(24,fb,98,19), r(97,e9,bd,d6), r(cc,43,40,89), r(77,9e,d9,67),\ -+ r(bd,42,e8,b0), r(88,8b,89,07), r(38,5b,19,e7), r(db,ee,c8,79),\ -+ r(47,0a,7c,a1), r(e9,0f,42,7c), r(c9,1e,84,f8), r(00,00,00,00),\ -+ r(83,86,80,09), r(48,ed,2b,32), r(ac,70,11,1e), r(4e,72,5a,6c),\ -+ r(fb,ff,0e,fd), r(56,38,85,0f), r(1e,d5,ae,3d), r(27,39,2d,36),\ -+ r(64,d9,0f,0a), r(21,a6,5c,68), r(d1,54,5b,9b), r(3a,2e,36,24),\ -+ r(b1,67,0a,0c), r(0f,e7,57,93), r(d2,96,ee,b4), r(9e,91,9b,1b),\ -+ r(4f,c5,c0,80), r(a2,20,dc,61), r(69,4b,77,5a), r(16,1a,12,1c),\ -+ r(0a,ba,93,e2), r(e5,2a,a0,c0), r(43,e0,22,3c), r(1d,17,1b,12),\ -+ r(0b,0d,09,0e), r(ad,c7,8b,f2), r(b9,a8,b6,2d), r(c8,a9,1e,14),\ -+ r(85,19,f1,57), r(4c,07,75,af), r(bb,dd,99,ee), r(fd,60,7f,a3),\ -+ r(9f,26,01,f7), r(bc,f5,72,5c), r(c5,3b,66,44), r(34,7e,fb,5b),\ -+ r(76,29,43,8b), r(dc,c6,23,cb), r(68,fc,ed,b6), 
r(63,f1,e4,b8),\ -+ r(ca,dc,31,d7), r(10,85,63,42), r(40,22,97,13), r(20,11,c6,84),\ -+ r(7d,24,4a,85), r(f8,3d,bb,d2), r(11,32,f9,ae), r(6d,a1,29,c7),\ -+ r(4b,2f,9e,1d), r(f3,30,b2,dc), r(ec,52,86,0d), r(d0,e3,c1,77),\ -+ r(6c,16,b3,2b), r(99,b9,70,a9), r(fa,48,94,11), r(22,64,e9,47),\ -+ r(c4,8c,fc,a8), r(1a,3f,f0,a0), r(d8,2c,7d,56), r(ef,90,33,22),\ -+ r(c7,4e,49,87), r(c1,d1,38,d9), r(fe,a2,ca,8c), r(36,0b,d4,98),\ -+ r(cf,81,f5,a6), r(28,de,7a,a5), r(26,8e,b7,da), r(a4,bf,ad,3f),\ -+ r(e4,9d,3a,2c), r(0d,92,78,50), r(9b,cc,5f,6a), r(62,46,7e,54),\ -+ r(c2,13,8d,f6), r(e8,b8,d8,90), r(5e,f7,39,2e), r(f5,af,c3,82),\ -+ r(be,80,5d,9f), r(7c,93,d0,69), r(a9,2d,d5,6f), r(b3,12,25,cf),\ -+ r(3b,99,ac,c8), r(a7,7d,18,10), r(6e,63,9c,e8), r(7b,bb,3b,db),\ -+ r(09,78,26,cd), r(f4,18,59,6e), r(01,b7,9a,ec), r(a8,9a,4f,83),\ -+ r(65,6e,95,e6), r(7e,e6,ff,aa), r(08,cf,bc,21), r(e6,e8,15,ef),\ -+ r(d9,9b,e7,ba), r(ce,36,6f,4a), r(d4,09,9f,ea), r(d6,7c,b0,29),\ -+ r(af,b2,a4,31), r(31,23,3f,2a), r(30,94,a5,c6), r(c0,66,a2,35),\ -+ r(37,bc,4e,74), r(a6,ca,82,fc), r(b0,d0,90,e0), r(15,d8,a7,33),\ -+ r(4a,98,04,f1), r(f7,da,ec,41), r(0e,50,cd,7f), r(2f,f6,91,17),\ -+ r(8d,d6,4d,76), r(4d,b0,ef,43), r(54,4d,aa,cc), r(df,04,96,e4),\ -+ r(e3,b5,d1,9e), r(1b,88,6a,4c), r(b8,1f,2c,c1), r(7f,51,65,46),\ -+ r(04,ea,5e,9d), r(5d,35,8c,01), r(73,74,87,fa), r(2e,41,0b,fb),\ -+ r(5a,1d,67,b3), r(52,d2,db,92), r(33,56,10,e9), r(13,47,d6,6d),\ -+ r(8c,61,d7,9a), r(7a,0c,a1,37), r(8e,14,f8,59), r(89,3c,13,eb),\ -+ r(ee,27,a9,ce), r(35,c9,61,b7), r(ed,e5,1c,e1), r(3c,b1,47,7a),\ -+ r(59,df,d2,9c), r(3f,73,f2,55), r(79,ce,14,18), r(bf,37,c7,73),\ -+ r(ea,cd,f7,53), r(5b,aa,fd,5f), r(14,6f,3d,df), r(86,db,44,78),\ -+ r(81,f3,af,ca), r(3e,c4,68,b9), r(2c,34,24,38), r(5f,40,a3,c2),\ -+ r(72,c3,1d,16), r(0c,25,e2,bc), r(8b,49,3c,28), r(41,95,0d,ff),\ -+ r(71,01,a8,39), r(de,b3,0c,08), r(9c,e4,b4,d8), r(90,c1,56,64),\ -+ r(61,84,cb,7b), r(70,b6,32,d5), r(74,5c,6c,48), r(42,57,b8,d0) -+ -+// 
generate the required tables in the desired endian format -+ -+#undef r -+#define r r0 -+ -+#if defined(ONE_TABLE) -+static const u_int32_t ft_tab[256] = -+ { f_table }; -+#elif defined(FOUR_TABLES) -+static const u_int32_t ft_tab[4][256] = -+{ { f_table }, -+#undef r -+#define r r1 -+ { f_table }, -+#undef r -+#define r r2 -+ { f_table }, -+#undef r -+#define r r3 -+ { f_table } -+}; -+#endif -+ -+#undef r -+#define r r0 -+#if defined(ONE_TABLE) -+static const u_int32_t it_tab[256] = -+ { i_table }; -+#elif defined(FOUR_TABLES) -+static const u_int32_t it_tab[4][256] = -+{ { i_table }, -+#undef r -+#define r r1 -+ { i_table }, -+#undef r -+#define r r2 -+ { i_table }, -+#undef r -+#define r r3 -+ { i_table } -+}; -+#endif -+ -+#endif -+ -+#if defined(FIXED_TABLES) && (defined(ONE_LR_TABLE) || defined(FOUR_LR_TABLES)) -+ -+// data for inverse tables (last round) -+ -+#define li_table \ -+ w(52), w(09), w(6a), w(d5), w(30), w(36), w(a5), w(38),\ -+ w(bf), w(40), w(a3), w(9e), w(81), w(f3), w(d7), w(fb),\ -+ w(7c), w(e3), w(39), w(82), w(9b), w(2f), w(ff), w(87),\ -+ w(34), w(8e), w(43), w(44), w(c4), w(de), w(e9), w(cb),\ -+ w(54), w(7b), w(94), w(32), w(a6), w(c2), w(23), w(3d),\ -+ w(ee), w(4c), w(95), w(0b), w(42), w(fa), w(c3), w(4e),\ -+ w(08), w(2e), w(a1), w(66), w(28), w(d9), w(24), w(b2),\ -+ w(76), w(5b), w(a2), w(49), w(6d), w(8b), w(d1), w(25),\ -+ w(72), w(f8), w(f6), w(64), w(86), w(68), w(98), w(16),\ -+ w(d4), w(a4), w(5c), w(cc), w(5d), w(65), w(b6), w(92),\ -+ w(6c), w(70), w(48), w(50), w(fd), w(ed), w(b9), w(da),\ -+ w(5e), w(15), w(46), w(57), w(a7), w(8d), w(9d), w(84),\ -+ w(90), w(d8), w(ab), w(00), w(8c), w(bc), w(d3), w(0a),\ -+ w(f7), w(e4), w(58), w(05), w(b8), w(b3), w(45), w(06),\ -+ w(d0), w(2c), w(1e), w(8f), w(ca), w(3f), w(0f), w(02),\ -+ w(c1), w(af), w(bd), w(03), w(01), w(13), w(8a), w(6b),\ -+ w(3a), w(91), w(11), w(41), w(4f), w(67), w(dc), w(ea),\ -+ w(97), w(f2), w(cf), w(ce), w(f0), w(b4), w(e6), w(73),\ -+ w(96), w(ac), 
w(74), w(22), w(e7), w(ad), w(35), w(85),\ -+ w(e2), w(f9), w(37), w(e8), w(1c), w(75), w(df), w(6e),\ -+ w(47), w(f1), w(1a), w(71), w(1d), w(29), w(c5), w(89),\ -+ w(6f), w(b7), w(62), w(0e), w(aa), w(18), w(be), w(1b),\ -+ w(fc), w(56), w(3e), w(4b), w(c6), w(d2), w(79), w(20),\ -+ w(9a), w(db), w(c0), w(fe), w(78), w(cd), w(5a), w(f4),\ -+ w(1f), w(dd), w(a8), w(33), w(88), w(07), w(c7), w(31),\ -+ w(b1), w(12), w(10), w(59), w(27), w(80), w(ec), w(5f),\ -+ w(60), w(51), w(7f), w(a9), w(19), w(b5), w(4a), w(0d),\ -+ w(2d), w(e5), w(7a), w(9f), w(93), w(c9), w(9c), w(ef),\ -+ w(a0), w(e0), w(3b), w(4d), w(ae), w(2a), w(f5), w(b0),\ -+ w(c8), w(eb), w(bb), w(3c), w(83), w(53), w(99), w(61),\ -+ w(17), w(2b), w(04), w(7e), w(ba), w(77), w(d6), w(26),\ -+ w(e1), w(69), w(14), w(63), w(55), w(21), w(0c), w(7d), -+ -+// generate the required tables in the desired endian format -+ -+#undef r -+#define r(p,q,r,s) w0(q) -+#if defined(ONE_LR_TABLE) -+static const u_int32_t fl_tab[256] = -+ { f_table }; -+#elif defined(FOUR_LR_TABLES) -+static const u_int32_t fl_tab[4][256] = -+{ { f_table }, -+#undef r -+#define r(p,q,r,s) w1(q) -+ { f_table }, -+#undef r -+#define r(p,q,r,s) w2(q) -+ { f_table }, -+#undef r -+#define r(p,q,r,s) w3(q) -+ { f_table } -+}; -+#endif -+ -+#undef w -+#define w w0 -+#if defined(ONE_LR_TABLE) -+static const u_int32_t il_tab[256] = -+ { li_table }; -+#elif defined(FOUR_LR_TABLES) -+static const u_int32_t il_tab[4][256] = -+{ { li_table }, -+#undef w -+#define w w1 -+ { li_table }, -+#undef w -+#define w w2 -+ { li_table }, -+#undef w -+#define w w3 -+ { li_table } -+}; -+#endif -+ -+#endif -+ -+#if defined(FIXED_TABLES) && (defined(ONE_IM_TABLE) || defined(FOUR_IM_TABLES)) -+ -+#define m_table \ -+ r(00,00,00,00), r(0b,0d,09,0e), r(16,1a,12,1c), r(1d,17,1b,12),\ -+ r(2c,34,24,38), r(27,39,2d,36), r(3a,2e,36,24), r(31,23,3f,2a),\ -+ r(58,68,48,70), r(53,65,41,7e), r(4e,72,5a,6c), r(45,7f,53,62),\ -+ r(74,5c,6c,48), r(7f,51,65,46), r(62,46,7e,54), 
r(69,4b,77,5a),\ -+ r(b0,d0,90,e0), r(bb,dd,99,ee), r(a6,ca,82,fc), r(ad,c7,8b,f2),\ -+ r(9c,e4,b4,d8), r(97,e9,bd,d6), r(8a,fe,a6,c4), r(81,f3,af,ca),\ -+ r(e8,b8,d8,90), r(e3,b5,d1,9e), r(fe,a2,ca,8c), r(f5,af,c3,82),\ -+ r(c4,8c,fc,a8), r(cf,81,f5,a6), r(d2,96,ee,b4), r(d9,9b,e7,ba),\ -+ r(7b,bb,3b,db), r(70,b6,32,d5), r(6d,a1,29,c7), r(66,ac,20,c9),\ -+ r(57,8f,1f,e3), r(5c,82,16,ed), r(41,95,0d,ff), r(4a,98,04,f1),\ -+ r(23,d3,73,ab), r(28,de,7a,a5), r(35,c9,61,b7), r(3e,c4,68,b9),\ -+ r(0f,e7,57,93), r(04,ea,5e,9d), r(19,fd,45,8f), r(12,f0,4c,81),\ -+ r(cb,6b,ab,3b), r(c0,66,a2,35), r(dd,71,b9,27), r(d6,7c,b0,29),\ -+ r(e7,5f,8f,03), r(ec,52,86,0d), r(f1,45,9d,1f), r(fa,48,94,11),\ -+ r(93,03,e3,4b), r(98,0e,ea,45), r(85,19,f1,57), r(8e,14,f8,59),\ -+ r(bf,37,c7,73), r(b4,3a,ce,7d), r(a9,2d,d5,6f), r(a2,20,dc,61),\ -+ r(f6,6d,76,ad), r(fd,60,7f,a3), r(e0,77,64,b1), r(eb,7a,6d,bf),\ -+ r(da,59,52,95), r(d1,54,5b,9b), r(cc,43,40,89), r(c7,4e,49,87),\ -+ r(ae,05,3e,dd), r(a5,08,37,d3), r(b8,1f,2c,c1), r(b3,12,25,cf),\ -+ r(82,31,1a,e5), r(89,3c,13,eb), r(94,2b,08,f9), r(9f,26,01,f7),\ -+ r(46,bd,e6,4d), r(4d,b0,ef,43), r(50,a7,f4,51), r(5b,aa,fd,5f),\ -+ r(6a,89,c2,75), r(61,84,cb,7b), r(7c,93,d0,69), r(77,9e,d9,67),\ -+ r(1e,d5,ae,3d), r(15,d8,a7,33), r(08,cf,bc,21), r(03,c2,b5,2f),\ -+ r(32,e1,8a,05), r(39,ec,83,0b), r(24,fb,98,19), r(2f,f6,91,17),\ -+ r(8d,d6,4d,76), r(86,db,44,78), r(9b,cc,5f,6a), r(90,c1,56,64),\ -+ r(a1,e2,69,4e), r(aa,ef,60,40), r(b7,f8,7b,52), r(bc,f5,72,5c),\ -+ r(d5,be,05,06), r(de,b3,0c,08), r(c3,a4,17,1a), r(c8,a9,1e,14),\ -+ r(f9,8a,21,3e), r(f2,87,28,30), r(ef,90,33,22), r(e4,9d,3a,2c),\ -+ r(3d,06,dd,96), r(36,0b,d4,98), r(2b,1c,cf,8a), r(20,11,c6,84),\ -+ r(11,32,f9,ae), r(1a,3f,f0,a0), r(07,28,eb,b2), r(0c,25,e2,bc),\ -+ r(65,6e,95,e6), r(6e,63,9c,e8), r(73,74,87,fa), r(78,79,8e,f4),\ -+ r(49,5a,b1,de), r(42,57,b8,d0), r(5f,40,a3,c2), r(54,4d,aa,cc),\ -+ r(f7,da,ec,41), r(fc,d7,e5,4f), r(e1,c0,fe,5d), r(ea,cd,f7,53),\ -+ 
r(db,ee,c8,79), r(d0,e3,c1,77), r(cd,f4,da,65), r(c6,f9,d3,6b),\ -+ r(af,b2,a4,31), r(a4,bf,ad,3f), r(b9,a8,b6,2d), r(b2,a5,bf,23),\ -+ r(83,86,80,09), r(88,8b,89,07), r(95,9c,92,15), r(9e,91,9b,1b),\ -+ r(47,0a,7c,a1), r(4c,07,75,af), r(51,10,6e,bd), r(5a,1d,67,b3),\ -+ r(6b,3e,58,99), r(60,33,51,97), r(7d,24,4a,85), r(76,29,43,8b),\ -+ r(1f,62,34,d1), r(14,6f,3d,df), r(09,78,26,cd), r(02,75,2f,c3),\ -+ r(33,56,10,e9), r(38,5b,19,e7), r(25,4c,02,f5), r(2e,41,0b,fb),\ -+ r(8c,61,d7,9a), r(87,6c,de,94), r(9a,7b,c5,86), r(91,76,cc,88),\ -+ r(a0,55,f3,a2), r(ab,58,fa,ac), r(b6,4f,e1,be), r(bd,42,e8,b0),\ -+ r(d4,09,9f,ea), r(df,04,96,e4), r(c2,13,8d,f6), r(c9,1e,84,f8),\ -+ r(f8,3d,bb,d2), r(f3,30,b2,dc), r(ee,27,a9,ce), r(e5,2a,a0,c0),\ -+ r(3c,b1,47,7a), r(37,bc,4e,74), r(2a,ab,55,66), r(21,a6,5c,68),\ -+ r(10,85,63,42), r(1b,88,6a,4c), r(06,9f,71,5e), r(0d,92,78,50),\ -+ r(64,d9,0f,0a), r(6f,d4,06,04), r(72,c3,1d,16), r(79,ce,14,18),\ -+ r(48,ed,2b,32), r(43,e0,22,3c), r(5e,f7,39,2e), r(55,fa,30,20),\ -+ r(01,b7,9a,ec), r(0a,ba,93,e2), r(17,ad,88,f0), r(1c,a0,81,fe),\ -+ r(2d,83,be,d4), r(26,8e,b7,da), r(3b,99,ac,c8), r(30,94,a5,c6),\ -+ r(59,df,d2,9c), r(52,d2,db,92), r(4f,c5,c0,80), r(44,c8,c9,8e),\ -+ r(75,eb,f6,a4), r(7e,e6,ff,aa), r(63,f1,e4,b8), r(68,fc,ed,b6),\ -+ r(b1,67,0a,0c), r(ba,6a,03,02), r(a7,7d,18,10), r(ac,70,11,1e),\ -+ r(9d,53,2e,34), r(96,5e,27,3a), r(8b,49,3c,28), r(80,44,35,26),\ -+ r(e9,0f,42,7c), r(e2,02,4b,72), r(ff,15,50,60), r(f4,18,59,6e),\ -+ r(c5,3b,66,44), r(ce,36,6f,4a), r(d3,21,74,58), r(d8,2c,7d,56),\ -+ r(7a,0c,a1,37), r(71,01,a8,39), r(6c,16,b3,2b), r(67,1b,ba,25),\ -+ r(56,38,85,0f), r(5d,35,8c,01), r(40,22,97,13), r(4b,2f,9e,1d),\ -+ r(22,64,e9,47), r(29,69,e0,49), r(34,7e,fb,5b), r(3f,73,f2,55),\ -+ r(0e,50,cd,7f), r(05,5d,c4,71), r(18,4a,df,63), r(13,47,d6,6d),\ -+ r(ca,dc,31,d7), r(c1,d1,38,d9), r(dc,c6,23,cb), r(d7,cb,2a,c5),\ -+ r(e6,e8,15,ef), r(ed,e5,1c,e1), r(f0,f2,07,f3), r(fb,ff,0e,fd),\ -+ r(92,b4,79,a7), 
r(99,b9,70,a9), r(84,ae,6b,bb), r(8f,a3,62,b5),\ -+ r(be,80,5d,9f), r(b5,8d,54,91), r(a8,9a,4f,83), r(a3,97,46,8d) -+ -+#undef r -+#define r r0 -+ -+#if defined(ONE_IM_TABLE) -+static const u_int32_t im_tab[256] = -+ { m_table }; -+#elif defined(FOUR_IM_TABLES) -+static const u_int32_t im_tab[4][256] = -+{ { m_table }, -+#undef r -+#define r r1 -+ { m_table }, -+#undef r -+#define r r2 -+ { m_table }, -+#undef r -+#define r r3 -+ { m_table } -+}; -+#endif -+ -+#endif -+ -+#else -+ -+static int tab_gen = 0; -+ -+static unsigned char s_box[256]; // the S box -+static unsigned char inv_s_box[256]; // the inverse S box -+static u_int32_t rcon_tab[AES_RC_LENGTH]; // table of round constants -+ -+#if defined(ONE_TABLE) -+static u_int32_t ft_tab[256]; -+static u_int32_t it_tab[256]; -+#elif defined(FOUR_TABLES) -+static u_int32_t ft_tab[4][256]; -+static u_int32_t it_tab[4][256]; -+#endif -+ -+#if defined(ONE_LR_TABLE) -+static u_int32_t fl_tab[256]; -+static u_int32_t il_tab[256]; -+#elif defined(FOUR_LR_TABLES) -+static u_int32_t fl_tab[4][256]; -+static u_int32_t il_tab[4][256]; -+#endif -+ -+#if defined(ONE_IM_TABLE) -+static u_int32_t im_tab[256]; -+#elif defined(FOUR_IM_TABLES) -+static u_int32_t im_tab[4][256]; -+#endif -+ -+// Generate the tables for the dynamic table option -+ -+#if !defined(FF_TABLES) -+ -+// It will generally be sensible to use tables to compute finite -+// field multiplies and inverses but where memory is scarse this -+// code might sometimes be better. -+ -+// return 2 ^ (n - 1) where n is the bit number of the highest bit -+// set in x with x in the range 1 < x < 0x00000200. 
This form is -+// used so that locals within FFinv can be bytes rather than words -+ -+static unsigned char hibit(const u_int32_t x) -+{ unsigned char r = (unsigned char)((x >> 1) | (x >> 2)); -+ -+ r |= (r >> 2); -+ r |= (r >> 4); -+ return (r + 1) >> 1; -+} -+ -+// return the inverse of the finite field element x -+ -+static unsigned char FFinv(const unsigned char x) -+{ unsigned char p1 = x, p2 = 0x1b, n1 = hibit(x), n2 = 0x80, v1 = 1, v2 = 0; -+ -+ if(x < 2) return x; -+ -+ for(;;) -+ { -+ if(!n1) return v1; -+ -+ while(n2 >= n1) -+ { -+ n2 /= n1; p2 ^= p1 * n2; v2 ^= v1 * n2; n2 = hibit(p2); -+ } -+ -+ if(!n2) return v2; -+ -+ while(n1 >= n2) -+ { -+ n1 /= n2; p1 ^= p2 * n1; v1 ^= v2 * n1; n1 = hibit(p1); -+ } -+ } -+} -+ -+// define the finite field multiplies required for Rijndael -+ -+#define FFmul02(x) ((((x) & 0x7f) << 1) ^ ((x) & 0x80 ? 0x1b : 0)) -+#define FFmul03(x) ((x) ^ FFmul02(x)) -+#define FFmul09(x) ((x) ^ FFmul02(FFmul02(FFmul02(x)))) -+#define FFmul0b(x) ((x) ^ FFmul02((x) ^ FFmul02(FFmul02(x)))) -+#define FFmul0d(x) ((x) ^ FFmul02(FFmul02((x) ^ FFmul02(x)))) -+#define FFmul0e(x) FFmul02((x) ^ FFmul02((x) ^ FFmul02(x))) -+ -+#else -+ -+#define FFinv(x) ((x) ? pow[255 - log[x]]: 0) -+ -+#define FFmul02(x) (x ? pow[log[x] + 0x19] : 0) -+#define FFmul03(x) (x ? pow[log[x] + 0x01] : 0) -+#define FFmul09(x) (x ? pow[log[x] + 0xc7] : 0) -+#define FFmul0b(x) (x ? pow[log[x] + 0x68] : 0) -+#define FFmul0d(x) (x ? pow[log[x] + 0xee] : 0) -+#define FFmul0e(x) (x ? 
pow[log[x] + 0xdf] : 0) -+ -+#endif -+ -+// The forward and inverse affine transformations used in the S-box -+ -+#define fwd_affine(x) \ -+ (w = (u_int32_t)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(unsigned char)(w^(w>>8))) -+ -+#define inv_affine(x) \ -+ (w = (u_int32_t)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(unsigned char)(w^(w>>8))) -+ -+static void gen_tabs(void) -+{ u_int32_t i, w; -+ -+#if defined(FF_TABLES) -+ -+ unsigned char pow[512], log[256]; -+ -+ // log and power tables for GF(2^8) finite field with -+ // 0x011b as modular polynomial - the simplest primitive -+ // root is 0x03, used here to generate the tables -+ -+ i = 0; w = 1; -+ do -+ { -+ pow[i] = (unsigned char)w; -+ pow[i + 255] = (unsigned char)w; -+ log[w] = (unsigned char)i++; -+ w ^= (w << 1) ^ (w & ff_hi ? ff_poly : 0); -+ } -+ while (w != 1); -+ -+#endif -+ -+ for(i = 0, w = 1; i < AES_RC_LENGTH; ++i) -+ { -+ rcon_tab[i] = bytes2word(w, 0, 0, 0); -+ w = (w << 1) ^ (w & ff_hi ? ff_poly : 0); -+ } -+ -+ for(i = 0; i < 256; ++i) -+ { unsigned char b; -+ -+ s_box[i] = b = fwd_affine(FFinv((unsigned char)i)); -+ -+ w = bytes2word(b, 0, 0, 0); -+#if defined(ONE_LR_TABLE) -+ fl_tab[i] = w; -+#elif defined(FOUR_LR_TABLES) -+ fl_tab[0][i] = w; -+ fl_tab[1][i] = upr(w,1); -+ fl_tab[2][i] = upr(w,2); -+ fl_tab[3][i] = upr(w,3); -+#endif -+ w = bytes2word(FFmul02(b), b, b, FFmul03(b)); -+#if defined(ONE_TABLE) -+ ft_tab[i] = w; -+#elif defined(FOUR_TABLES) -+ ft_tab[0][i] = w; -+ ft_tab[1][i] = upr(w,1); -+ ft_tab[2][i] = upr(w,2); -+ ft_tab[3][i] = upr(w,3); -+#endif -+ inv_s_box[i] = b = FFinv(inv_affine((unsigned char)i)); -+ -+ w = bytes2word(b, 0, 0, 0); -+#if defined(ONE_LR_TABLE) -+ il_tab[i] = w; -+#elif defined(FOUR_LR_TABLES) -+ il_tab[0][i] = w; -+ il_tab[1][i] = upr(w,1); -+ il_tab[2][i] = upr(w,2); -+ il_tab[3][i] = upr(w,3); -+#endif -+ w = bytes2word(FFmul0e(b), FFmul09(b), FFmul0d(b), FFmul0b(b)); -+#if defined(ONE_TABLE) -+ it_tab[i] = w; -+#elif defined(FOUR_TABLES) -+ it_tab[0][i] = w; 
-+ it_tab[1][i] = upr(w,1); -+ it_tab[2][i] = upr(w,2); -+ it_tab[3][i] = upr(w,3); -+#endif -+#if defined(ONE_IM_TABLE) -+ im_tab[b] = w; -+#elif defined(FOUR_IM_TABLES) -+ im_tab[0][b] = w; -+ im_tab[1][b] = upr(w,1); -+ im_tab[2][b] = upr(w,2); -+ im_tab[3][b] = upr(w,3); -+#endif -+ -+ } -+} -+ -+#endif -+ -+#define no_table(x,box,vf,rf,c) bytes2word( \ -+ box[bval(vf(x,0,c),rf(0,c))], \ -+ box[bval(vf(x,1,c),rf(1,c))], \ -+ box[bval(vf(x,2,c),rf(2,c))], \ -+ box[bval(vf(x,3,c),rf(3,c))]) -+ -+#define one_table(x,op,tab,vf,rf,c) \ -+ ( tab[bval(vf(x,0,c),rf(0,c))] \ -+ ^ op(tab[bval(vf(x,1,c),rf(1,c))],1) \ -+ ^ op(tab[bval(vf(x,2,c),rf(2,c))],2) \ -+ ^ op(tab[bval(vf(x,3,c),rf(3,c))],3)) -+ -+#define four_tables(x,tab,vf,rf,c) \ -+ ( tab[0][bval(vf(x,0,c),rf(0,c))] \ -+ ^ tab[1][bval(vf(x,1,c),rf(1,c))] \ -+ ^ tab[2][bval(vf(x,2,c),rf(2,c))] \ -+ ^ tab[3][bval(vf(x,3,c),rf(3,c))]) -+ -+#define vf1(x,r,c) (x) -+#define rf1(r,c) (r) -+#define rf2(r,c) ((r-c)&3) -+ -+#if defined(FOUR_LR_TABLES) -+#define ls_box(x,c) four_tables(x,fl_tab,vf1,rf2,c) -+#elif defined(ONE_LR_TABLE) -+#define ls_box(x,c) one_table(x,upr,fl_tab,vf1,rf2,c) -+#else -+#define ls_box(x,c) no_table(x,s_box,vf1,rf2,c) -+#endif -+ -+#if defined(FOUR_IM_TABLES) -+#define inv_mcol(x) four_tables(x,im_tab,vf1,rf1,0) -+#elif defined(ONE_IM_TABLE) -+#define inv_mcol(x) one_table(x,upr,im_tab,vf1,rf1,0) -+#else -+#define inv_mcol(x) \ -+ (f9 = (x),f2 = FFmulX(f9), f4 = FFmulX(f2), f8 = FFmulX(f4), f9 ^= f8, \ -+ f2 ^= f4 ^ f8 ^ upr(f2 ^ f9,3) ^ upr(f4 ^ f9,2) ^ upr(f9,1)) -+#endif -+ -+// Subroutine to set the block size (if variable) in bytes, legal -+// values being 16, 24 and 32. 
-+ -+#if defined(AES_BLOCK_SIZE) -+#define nc (AES_BLOCK_SIZE / 4) -+#else -+#define nc (cx->aes_Ncol) -+ -+void aes_set_blk(aes_context *cx, int n_bytes) -+{ -+#if !defined(FIXED_TABLES) -+ if(!tab_gen) { gen_tabs(); tab_gen = 1; } -+#endif -+ -+ switch(n_bytes) { -+ case 32: /* bytes */ -+ case 256: /* bits */ -+ nc = 8; -+ break; -+ case 24: /* bytes */ -+ case 192: /* bits */ -+ nc = 6; -+ break; -+ case 16: /* bytes */ -+ case 128: /* bits */ -+ default: -+ nc = 4; -+ break; -+ } -+} -+ -+#endif -+ -+// Initialise the key schedule from the user supplied key. The key -+// length is now specified in bytes - 16, 24 or 32 as appropriate. -+// This corresponds to bit lengths of 128, 192 and 256 bits, and -+// to Nk values of 4, 6 and 8 respectively. -+ -+#define mx(t,f) (*t++ = inv_mcol(*f),f++) -+#define cp(t,f) *t++ = *f++ -+ -+#if AES_BLOCK_SIZE == 16 -+#define cpy(d,s) cp(d,s); cp(d,s); cp(d,s); cp(d,s) -+#define mix(d,s) mx(d,s); mx(d,s); mx(d,s); mx(d,s) -+#elif AES_BLOCK_SIZE == 24 -+#define cpy(d,s) cp(d,s); cp(d,s); cp(d,s); cp(d,s); \ -+ cp(d,s); cp(d,s) -+#define mix(d,s) mx(d,s); mx(d,s); mx(d,s); mx(d,s); \ -+ mx(d,s); mx(d,s) -+#elif AES_BLOCK_SIZE == 32 -+#define cpy(d,s) cp(d,s); cp(d,s); cp(d,s); cp(d,s); \ -+ cp(d,s); cp(d,s); cp(d,s); cp(d,s) -+#define mix(d,s) mx(d,s); mx(d,s); mx(d,s); mx(d,s); \ -+ mx(d,s); mx(d,s); mx(d,s); mx(d,s) -+#else -+ -+#define cpy(d,s) \ -+switch(nc) \ -+{ case 8: cp(d,s); cp(d,s); \ -+ case 6: cp(d,s); cp(d,s); \ -+ case 4: cp(d,s); cp(d,s); \ -+ cp(d,s); cp(d,s); \ -+} -+ -+#define mix(d,s) \ -+switch(nc) \ -+{ case 8: mx(d,s); mx(d,s); \ -+ case 6: mx(d,s); mx(d,s); \ -+ case 4: mx(d,s); mx(d,s); \ -+ mx(d,s); mx(d,s); \ -+} -+ -+#endif -+ -+void aes_set_key(aes_context *cx, const unsigned char in_key[], int n_bytes, const int f) -+{ u_int32_t *kf, *kt, rci; -+ -+#if !defined(FIXED_TABLES) -+ if(!tab_gen) { gen_tabs(); tab_gen = 1; } -+#endif -+ -+/* only need to do a special set_key for the cryptodev hw 
acceleration */ -+#ifdef OCF_ASSIST -+ if (ocf_aes_assist() & OCF_PROVIDES_AES) { -+ ocf_aes_set_key(cx, in_key, n_bytes, f); -+ return; -+ } -+#endif -+ -+ switch(n_bytes) { -+ case 32: /* bytes */ -+ case 256: /* bits */ -+ cx->aes_Nkey = 8; -+ break; -+ case 24: /* bytes */ -+ case 192: /* bits */ -+ cx->aes_Nkey = 6; -+ break; -+ case 16: /* bytes */ -+ case 128: /* bits */ -+ default: -+ cx->aes_Nkey = 4; -+ break; -+ } -+ -+ cx->aes_Nrnd = (cx->aes_Nkey > nc ? cx->aes_Nkey : nc) + 6; -+ -+ cx->aes_e_key[0] = const_word_in(in_key ); -+ cx->aes_e_key[1] = const_word_in(in_key + 4); -+ cx->aes_e_key[2] = const_word_in(in_key + 8); -+ cx->aes_e_key[3] = const_word_in(in_key + 12); -+ -+ kf = cx->aes_e_key; -+ kt = kf + nc * (cx->aes_Nrnd + 1) - cx->aes_Nkey; -+ rci = 0; -+ -+ switch(cx->aes_Nkey) -+ { -+ case 4: do -+ { kf[4] = kf[0] ^ ls_box(kf[3],3) ^ rcon_tab[rci++]; -+ kf[5] = kf[1] ^ kf[4]; -+ kf[6] = kf[2] ^ kf[5]; -+ kf[7] = kf[3] ^ kf[6]; -+ kf += 4; -+ } -+ while(kf < kt); -+ break; -+ -+ case 6: cx->aes_e_key[4] = const_word_in(in_key + 16); -+ cx->aes_e_key[5] = const_word_in(in_key + 20); -+ do -+ { kf[ 6] = kf[0] ^ ls_box(kf[5],3) ^ rcon_tab[rci++]; -+ kf[ 7] = kf[1] ^ kf[ 6]; -+ kf[ 8] = kf[2] ^ kf[ 7]; -+ kf[ 9] = kf[3] ^ kf[ 8]; -+ kf[10] = kf[4] ^ kf[ 9]; -+ kf[11] = kf[5] ^ kf[10]; -+ kf += 6; -+ } -+ while(kf < kt); -+ break; -+ -+ case 8: cx->aes_e_key[4] = const_word_in(in_key + 16); -+ cx->aes_e_key[5] = const_word_in(in_key + 20); -+ cx->aes_e_key[6] = const_word_in(in_key + 24); -+ cx->aes_e_key[7] = const_word_in(in_key + 28); -+ do -+ { kf[ 8] = kf[0] ^ ls_box(kf[7],3) ^ rcon_tab[rci++]; -+ kf[ 9] = kf[1] ^ kf[ 8]; -+ kf[10] = kf[2] ^ kf[ 9]; -+ kf[11] = kf[3] ^ kf[10]; -+ kf[12] = kf[4] ^ ls_box(kf[11],0); -+ kf[13] = kf[5] ^ kf[12]; -+ kf[14] = kf[6] ^ kf[13]; -+ kf[15] = kf[7] ^ kf[14]; -+ kf += 8; -+ } -+ while (kf < kt); -+ break; -+ } -+ -+ if(!f) -+ { u_int32_t i; -+ -+ kt = cx->aes_d_key + nc * cx->aes_Nrnd; -+ kf = 
cx->aes_e_key; -+ -+ cpy(kt, kf); kt -= 2 * nc; -+ -+ for(i = 1; i < cx->aes_Nrnd; ++i) -+ { -+#if defined(ONE_TABLE) || defined(FOUR_TABLES) -+#if !defined(ONE_IM_TABLE) && !defined(FOUR_IM_TABLES) -+ u_int32_t f2, f4, f8, f9; -+#endif -+ mix(kt, kf); -+#else -+ cpy(kt, kf); -+#endif -+ kt -= 2 * nc; -+ } -+ -+ cpy(kt, kf); -+ } -+} -+ -+// y = output word, x = input word, r = row, c = column -+// for r = 0, 1, 2 and 3 = column accessed for row r -+ -+#if defined(ARRAYS) -+#define s(x,c) x[c] -+#else -+#define s(x,c) x##c -+#endif -+ -+// I am grateful to Frank Yellin for the following constructions -+// which, given the column (c) of the output state variable that -+// is being computed, return the input state variables which are -+// needed for each row (r) of the state -+ -+// For the fixed block size options, compilers reduce these two -+// expressions to fixed variable references. For variable block -+// size code conditional clauses will sometimes be returned -+ -+#define unused 77 // Sunset Strip -+ -+#define fwd_var(x,r,c) \ -+ ( r==0 ? \ -+ ( c==0 ? s(x,0) \ -+ : c==1 ? s(x,1) \ -+ : c==2 ? s(x,2) \ -+ : c==3 ? s(x,3) \ -+ : c==4 ? s(x,4) \ -+ : c==5 ? s(x,5) \ -+ : c==6 ? s(x,6) \ -+ : s(x,7)) \ -+ : r==1 ? \ -+ ( c==0 ? s(x,1) \ -+ : c==1 ? s(x,2) \ -+ : c==2 ? s(x,3) \ -+ : c==3 ? nc==4 ? s(x,0) : s(x,4) \ -+ : c==4 ? s(x,5) \ -+ : c==5 ? nc==8 ? s(x,6) : s(x,0) \ -+ : c==6 ? s(x,7) \ -+ : s(x,0)) \ -+ : r==2 ? \ -+ ( c==0 ? nc==8 ? s(x,3) : s(x,2) \ -+ : c==1 ? nc==8 ? s(x,4) : s(x,3) \ -+ : c==2 ? nc==4 ? s(x,0) : nc==8 ? s(x,5) : s(x,4) \ -+ : c==3 ? nc==4 ? s(x,1) : nc==8 ? s(x,6) : s(x,5) \ -+ : c==4 ? nc==8 ? s(x,7) : s(x,0) \ -+ : c==5 ? nc==8 ? s(x,0) : s(x,1) \ -+ : c==6 ? s(x,1) \ -+ : s(x,2)) \ -+ : \ -+ ( c==0 ? nc==8 ? s(x,4) : s(x,3) \ -+ : c==1 ? nc==4 ? s(x,0) : nc==8 ? s(x,5) : s(x,4) \ -+ : c==2 ? nc==4 ? s(x,1) : nc==8 ? s(x,6) : s(x,5) \ -+ : c==3 ? nc==4 ? s(x,2) : nc==8 ? s(x,7) : s(x,0) \ -+ : c==4 ? nc==8 ? 
s(x,0) : s(x,1) \ -+ : c==5 ? nc==8 ? s(x,1) : s(x,2) \ -+ : c==6 ? s(x,2) \ -+ : s(x,3))) -+ -+#define inv_var(x,r,c) \ -+ ( r==0 ? \ -+ ( c==0 ? s(x,0) \ -+ : c==1 ? s(x,1) \ -+ : c==2 ? s(x,2) \ -+ : c==3 ? s(x,3) \ -+ : c==4 ? s(x,4) \ -+ : c==5 ? s(x,5) \ -+ : c==6 ? s(x,6) \ -+ : s(x,7)) \ -+ : r==1 ? \ -+ ( c==0 ? nc==4 ? s(x,3) : nc==8 ? s(x,7) : s(x,5) \ -+ : c==1 ? s(x,0) \ -+ : c==2 ? s(x,1) \ -+ : c==3 ? s(x,2) \ -+ : c==4 ? s(x,3) \ -+ : c==5 ? s(x,4) \ -+ : c==6 ? s(x,5) \ -+ : s(x,6)) \ -+ : r==2 ? \ -+ ( c==0 ? nc==4 ? s(x,2) : nc==8 ? s(x,5) : s(x,4) \ -+ : c==1 ? nc==4 ? s(x,3) : nc==8 ? s(x,6) : s(x,5) \ -+ : c==2 ? nc==8 ? s(x,7) : s(x,0) \ -+ : c==3 ? nc==8 ? s(x,0) : s(x,1) \ -+ : c==4 ? nc==8 ? s(x,1) : s(x,2) \ -+ : c==5 ? nc==8 ? s(x,2) : s(x,3) \ -+ : c==6 ? s(x,3) \ -+ : s(x,4)) \ -+ : \ -+ ( c==0 ? nc==4 ? s(x,1) : nc==8 ? s(x,4) : s(x,3) \ -+ : c==1 ? nc==4 ? s(x,2) : nc==8 ? s(x,5) : s(x,4) \ -+ : c==2 ? nc==4 ? s(x,3) : nc==8 ? s(x,6) : s(x,5) \ -+ : c==3 ? nc==8 ? s(x,7) : s(x,0) \ -+ : c==4 ? nc==8 ? s(x,0) : s(x,1) \ -+ : c==5 ? nc==8 ? s(x,1) : s(x,2) \ -+ : c==6 ? 
s(x,2) \ -+ : s(x,3))) -+ -+#define si(y,x,k,c) s(y,c) = const_word_in(x + 4 * c) ^ k[c] -+#define so(y,x,c) word_out(y + 4 * c, s(x,c)) -+ -+#if defined(FOUR_TABLES) -+#define fwd_rnd(y,x,k,c) s(y,c)= (k)[c] ^ four_tables(x,ft_tab,fwd_var,rf1,c) -+#define inv_rnd(y,x,k,c) s(y,c)= (k)[c] ^ four_tables(x,it_tab,inv_var,rf1,c) -+#elif defined(ONE_TABLE) -+#define fwd_rnd(y,x,k,c) s(y,c)= (k)[c] ^ one_table(x,upr,ft_tab,fwd_var,rf1,c) -+#define inv_rnd(y,x,k,c) s(y,c)= (k)[c] ^ one_table(x,upr,it_tab,inv_var,rf1,c) -+#else -+#define fwd_rnd(y,x,k,c) s(y,c) = fwd_mcol(no_table(x,s_box,fwd_var,rf1,c)) ^ (k)[c] -+#define inv_rnd(y,x,k,c) s(y,c) = inv_mcol(no_table(x,inv_s_box,inv_var,rf1,c) ^ (k)[c]) -+#endif -+ -+#if defined(FOUR_LR_TABLES) -+#define fwd_lrnd(y,x,k,c) s(y,c)= (k)[c] ^ four_tables(x,fl_tab,fwd_var,rf1,c) -+#define inv_lrnd(y,x,k,c) s(y,c)= (k)[c] ^ four_tables(x,il_tab,inv_var,rf1,c) -+#elif defined(ONE_LR_TABLE) -+#define fwd_lrnd(y,x,k,c) s(y,c)= (k)[c] ^ one_table(x,ups,fl_tab,fwd_var,rf1,c) -+#define inv_lrnd(y,x,k,c) s(y,c)= (k)[c] ^ one_table(x,ups,il_tab,inv_var,rf1,c) -+#else -+#define fwd_lrnd(y,x,k,c) s(y,c) = no_table(x,s_box,fwd_var,rf1,c) ^ (k)[c] -+#define inv_lrnd(y,x,k,c) s(y,c) = no_table(x,inv_s_box,inv_var,rf1,c) ^ (k)[c] -+#endif -+ -+#if AES_BLOCK_SIZE == 16 -+ -+#if defined(ARRAYS) -+#define locals(y,x) x[4],y[4] -+#else -+#define locals(y,x) x##0,x##1,x##2,x##3,y##0,y##1,y##2,y##3 -+// the following defines prevent the compiler requiring the declaration -+// of generated but unused variables in the fwd_var and inv_var macros -+#define b04 unused -+#define b05 unused -+#define b06 unused -+#define b07 unused -+#define b14 unused -+#define b15 unused -+#define b16 unused -+#define b17 unused -+#endif -+#define l_copy(y, x) s(y,0) = s(x,0); s(y,1) = s(x,1); \ -+ s(y,2) = s(x,2); s(y,3) = s(x,3); -+#define state_in(y,x,k) si(y,x,k,0); si(y,x,k,1); si(y,x,k,2); si(y,x,k,3) -+#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); 
so(y,x,3) -+#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3) -+ -+#elif AES_BLOCK_SIZE == 24 -+ -+#if defined(ARRAYS) -+#define locals(y,x) x[6],y[6] -+#else -+#define locals(y,x) x##0,x##1,x##2,x##3,x##4,x##5, \ -+ y##0,y##1,y##2,y##3,y##4,y##5 -+#define b06 unused -+#define b07 unused -+#define b16 unused -+#define b17 unused -+#endif -+#define l_copy(y, x) s(y,0) = s(x,0); s(y,1) = s(x,1); \ -+ s(y,2) = s(x,2); s(y,3) = s(x,3); \ -+ s(y,4) = s(x,4); s(y,5) = s(x,5); -+#define state_in(y,x,k) si(y,x,k,0); si(y,x,k,1); si(y,x,k,2); \ -+ si(y,x,k,3); si(y,x,k,4); si(y,x,k,5) -+#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); \ -+ so(y,x,3); so(y,x,4); so(y,x,5) -+#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); \ -+ rm(y,x,k,3); rm(y,x,k,4); rm(y,x,k,5) -+#else -+ -+#if defined(ARRAYS) -+#define locals(y,x) x[8],y[8] -+#else -+#define locals(y,x) x##0,x##1,x##2,x##3,x##4,x##5,x##6,x##7, \ -+ y##0,y##1,y##2,y##3,y##4,y##5,y##6,y##7 -+#endif -+#define l_copy(y, x) s(y,0) = s(x,0); s(y,1) = s(x,1); \ -+ s(y,2) = s(x,2); s(y,3) = s(x,3); \ -+ s(y,4) = s(x,4); s(y,5) = s(x,5); \ -+ s(y,6) = s(x,6); s(y,7) = s(x,7); -+ -+#if AES_BLOCK_SIZE == 32 -+ -+#define state_in(y,x,k) si(y,x,k,0); si(y,x,k,1); si(y,x,k,2); si(y,x,k,3); \ -+ si(y,x,k,4); si(y,x,k,5); si(y,x,k,6); si(y,x,k,7) -+#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3); \ -+ so(y,x,4); so(y,x,5); so(y,x,6); so(y,x,7) -+#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3); \ -+ rm(y,x,k,4); rm(y,x,k,5); rm(y,x,k,6); rm(y,x,k,7) -+#else -+ -+#define state_in(y,x,k) \ -+switch(nc) \ -+{ case 8: si(y,x,k,7); si(y,x,k,6); \ -+ case 6: si(y,x,k,5); si(y,x,k,4); \ -+ case 4: si(y,x,k,3); si(y,x,k,2); \ -+ si(y,x,k,1); si(y,x,k,0); \ -+} -+ -+#define state_out(y,x) \ -+switch(nc) \ -+{ case 8: so(y,x,7); so(y,x,6); \ -+ case 6: so(y,x,5); so(y,x,4); \ -+ case 4: so(y,x,3); so(y,x,2); \ -+ so(y,x,1); so(y,x,0); \ -+} -+ -+#if 
defined(FAST_VARIABLE) -+ -+#define round(rm,y,x,k) \ -+switch(nc) \ -+{ case 8: rm(y,x,k,7); rm(y,x,k,6); \ -+ rm(y,x,k,5); rm(y,x,k,4); \ -+ rm(y,x,k,3); rm(y,x,k,2); \ -+ rm(y,x,k,1); rm(y,x,k,0); \ -+ break; \ -+ case 6: rm(y,x,k,5); rm(y,x,k,4); \ -+ rm(y,x,k,3); rm(y,x,k,2); \ -+ rm(y,x,k,1); rm(y,x,k,0); \ -+ break; \ -+ case 4: rm(y,x,k,3); rm(y,x,k,2); \ -+ rm(y,x,k,1); rm(y,x,k,0); \ -+ break; \ -+} -+#else -+ -+#define round(rm,y,x,k) \ -+switch(nc) \ -+{ case 8: rm(y,x,k,7); rm(y,x,k,6); \ -+ case 6: rm(y,x,k,5); rm(y,x,k,4); \ -+ case 4: rm(y,x,k,3); rm(y,x,k,2); \ -+ rm(y,x,k,1); rm(y,x,k,0); \ -+} -+ -+#endif -+ -+#endif -+#endif -+ -+void aes_encrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[]) -+{ u_int32_t locals(b0, b1); -+ const u_int32_t *kp = cx->aes_e_key; -+ -+#if !defined(ONE_TABLE) && !defined(FOUR_TABLES) -+ u_int32_t f2; -+#endif -+ -+ state_in(b0, in_blk, kp); kp += nc; -+ -+#if defined(UNROLL) -+ -+ switch(cx->aes_Nrnd) -+ { -+ case 14: round(fwd_rnd, b1, b0, kp ); -+ round(fwd_rnd, b0, b1, kp + nc ); kp += 2 * nc; -+ case 12: round(fwd_rnd, b1, b0, kp ); -+ round(fwd_rnd, b0, b1, kp + nc ); kp += 2 * nc; -+ case 10: round(fwd_rnd, b1, b0, kp ); -+ round(fwd_rnd, b0, b1, kp + nc); -+ round(fwd_rnd, b1, b0, kp + 2 * nc); -+ round(fwd_rnd, b0, b1, kp + 3 * nc); -+ round(fwd_rnd, b1, b0, kp + 4 * nc); -+ round(fwd_rnd, b0, b1, kp + 5 * nc); -+ round(fwd_rnd, b1, b0, kp + 6 * nc); -+ round(fwd_rnd, b0, b1, kp + 7 * nc); -+ round(fwd_rnd, b1, b0, kp + 8 * nc); -+ round(fwd_lrnd, b0, b1, kp + 9 * nc); -+ } -+ -+#elif defined(PARTIAL_UNROLL) -+ { u_int32_t rnd; -+ -+ for(rnd = 0; rnd < (cx->aes_Nrnd >> 1) - 1; ++rnd) -+ { -+ round(fwd_rnd, b1, b0, kp); -+ round(fwd_rnd, b0, b1, kp + nc); kp += 2 * nc; -+ } -+ -+ round(fwd_rnd, b1, b0, kp); -+ round(fwd_lrnd, b0, b1, kp + nc); -+ } -+#else -+ { u_int32_t rnd; -+ -+ for(rnd = 0; rnd < cx->aes_Nrnd - 1; ++rnd) -+ { -+ round(fwd_rnd, b1, b0, kp); -+ l_copy(b0, 
b1); kp += nc; -+ } -+ -+ round(fwd_lrnd, b0, b1, kp); -+ } -+#endif -+ -+ state_out(out_blk, b0); -+} -+ -+void aes_decrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[]) -+{ u_int32_t locals(b0, b1); -+ const u_int32_t *kp = cx->aes_d_key; -+ -+#if !defined(ONE_TABLE) && !defined(FOUR_TABLES) -+ u_int32_t f2, f4, f8, f9; -+#endif -+ -+ state_in(b0, in_blk, kp); kp += nc; -+ -+#if defined(UNROLL) -+ -+ switch(cx->aes_Nrnd) -+ { -+ case 14: round(inv_rnd, b1, b0, kp ); -+ round(inv_rnd, b0, b1, kp + nc ); kp += 2 * nc; -+ case 12: round(inv_rnd, b1, b0, kp ); -+ round(inv_rnd, b0, b1, kp + nc ); kp += 2 * nc; -+ case 10: round(inv_rnd, b1, b0, kp ); -+ round(inv_rnd, b0, b1, kp + nc); -+ round(inv_rnd, b1, b0, kp + 2 * nc); -+ round(inv_rnd, b0, b1, kp + 3 * nc); -+ round(inv_rnd, b1, b0, kp + 4 * nc); -+ round(inv_rnd, b0, b1, kp + 5 * nc); -+ round(inv_rnd, b1, b0, kp + 6 * nc); -+ round(inv_rnd, b0, b1, kp + 7 * nc); -+ round(inv_rnd, b1, b0, kp + 8 * nc); -+ round(inv_lrnd, b0, b1, kp + 9 * nc); -+ } -+ -+#elif defined(PARTIAL_UNROLL) -+ { u_int32_t rnd; -+ -+ for(rnd = 0; rnd < (cx->aes_Nrnd >> 1) - 1; ++rnd) -+ { -+ round(inv_rnd, b1, b0, kp); -+ round(inv_rnd, b0, b1, kp + nc); kp += 2 * nc; -+ } -+ -+ round(inv_rnd, b1, b0, kp); -+ round(inv_lrnd, b0, b1, kp + nc); -+ } -+#else -+ { u_int32_t rnd; -+ -+ for(rnd = 0; rnd < cx->aes_Nrnd - 1; ++rnd) -+ { -+ round(inv_rnd, b1, b0, kp); -+ l_copy(b0, b1); kp += nc; -+ } -+ -+ round(inv_lrnd, b0, b1, kp); -+ } -+#endif -+ -+ state_out(out_blk, b0); -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/aes/aes_cbc.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,67 @@ -+/* -+// I retain copyright in this code but I encourage its free use provided -+// that I don't carry any responsibility for the results. I am especially -+// happy to see it used in free and open source software. 
If you do use -+// it I would appreciate an acknowledgement of its origin in the code or -+// the product that results and I would also appreciate knowing a little -+// about the use to which it is being put. I am grateful to Frank Yellin -+// for some ideas that are used in this implementation. -+// -+// Dr B. R. Gladman 6th April 2001. -+// -+// This is an implementation of the AES encryption algorithm (Rijndael) -+// designed by Joan Daemen and Vincent Rijmen. This version is designed -+// to provide both fixed and dynamic block and key lengths and can also -+// run with either big or little endian internal byte order (see aes.h). -+// It inputs block and key lengths in bytes with the legal values being -+// 16, 24 and 32. -+* -+*/ -+ -+#ifdef __KERNEL__ -+#include -+#else -+#include -+#endif -+#include "klips-crypto/aes_cbc.h" -+#include "klips-crypto/cbc_generic.h" -+#ifdef OCF_ASSIST -+#include "klips-crypto/ocf_assist.h" -+#endif -+ -+/* returns bool success */ -+int AES_set_key(aes_context *aes_ctx, const u_int8_t *key, int keysize) { -+ aes_set_key(aes_ctx, key, keysize, 0); -+ return 1; -+} -+ -+#ifdef OCF_ASSIST -+ -+CBC_IMPL_BLK16(_AES_cbc_encrypt, aes_context, u_int8_t *, aes_encrypt, aes_decrypt); -+ -+int -+AES_cbc_encrypt(aes_context *ctx, const u_int8_t *in, u_int8_t *out, int ilen, -+ const u_int8_t *iv, int encrypt) -+{ -+ if (ocf_aes_assist() & OCF_PROVIDES_AES) { -+ return ocf_aes_cbc_encrypt(ctx, in, out, ilen, iv, encrypt); -+ } else { -+ return _AES_cbc_encrypt(ctx, in, out, ilen, iv, encrypt); -+ } -+} -+ -+#else -+CBC_IMPL_BLK16(AES_cbc_encrypt, aes_context, u_int8_t *, aes_encrypt, aes_decrypt); -+#endif -+ -+ -+/* -+ * $Log: aes_cbc.c,v $ -+ * Revision 1.2 2004/07/10 07:48:40 mcr -+ * Moved from linux/crypto/ciphers/aes/aes_cbc.c,v -+ * -+ * Revision 1.1 2004/04/06 02:48:12 mcr -+ * pullup of AES cipher from alg-branch. 
-+ * -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/aes/aes_xcbc_mac.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,67 @@ -+#ifdef __KERNEL__ -+#include -+#include -+#define AES_DEBUG(x) -+#else -+#include -+#include -+#define AES_DEBUG(x) x -+#endif -+ -+#include "klips-crypto/aes.h" -+#include "klips-crypto/aes_xcbc_mac.h" -+ -+int AES_xcbc_mac_set_key(aes_context_mac *ctxm, const u_int8_t *key, int keylen) -+{ -+ int ret=1; -+ aes_block kn[3] = { -+ { 0x01010101, 0x01010101, 0x01010101, 0x01010101 }, -+ { 0x02020202, 0x02020202, 0x02020202, 0x02020202 }, -+ { 0x03030303, 0x03030303, 0x03030303, 0x03030303 }, -+ }; -+ aes_set_key(&ctxm->ctx_k1, key, keylen, 0); -+ aes_encrypt(&ctxm->ctx_k1, (u_int8_t *) kn[0], (u_int8_t *) kn[0]); -+ aes_encrypt(&ctxm->ctx_k1, (u_int8_t *) kn[1], (u_int8_t *) ctxm->k2); -+ aes_encrypt(&ctxm->ctx_k1, (u_int8_t *) kn[2], (u_int8_t *) ctxm->k3); -+ aes_set_key(&ctxm->ctx_k1, (u_int8_t *) kn[0], 16, 0); -+ return ret; -+} -+static void do_pad_xor(u_int8_t *out, const u_int8_t *in, int len) { -+ int pos=0; -+ for (pos=1; pos <= 16; pos++, in++, out++) { -+ if (pos <= len) -+ *out ^= *in; -+ if (pos > len) { -+ AES_DEBUG(printf("put 0x80 at pos=%d\n", pos)); -+ *out ^= 0x80; -+ break; -+ } -+ } -+} -+static void xor_block(aes_block res, const aes_block op) { -+ res[0] ^= op[0]; -+ res[1] ^= op[1]; -+ res[2] ^= op[2]; -+ res[3] ^= op[3]; -+} -+int AES_xcbc_mac_hash(const aes_context_mac *ctxm, const u_int8_t * in, int ilen, u_int8_t hash[16]) { -+ int ret=ilen; -+ u_int32_t out[4] = { 0, 0, 0, 0 }; -+ for (; ilen > 16 ; ilen-=16) { -+ xor_block(out, (const u_int32_t*) &in[0]); -+ aes_encrypt(&ctxm->ctx_k1, in, (u_int8_t *)&out[0]); -+ in+=16; -+ } -+ do_pad_xor((u_int8_t *)&out, in, ilen); -+ if (ilen==16) { -+ AES_DEBUG(printf("using k3\n")); -+ xor_block(out, ctxm->k3); -+ } -+ else -+ { -+ AES_DEBUG(printf("using k2\n")); -+ xor_block(out, ctxm->k2); -+ } -+ aes_encrypt(&ctxm->ctx_k1, (u_int8_t *)out, hash); -+ return 
ret; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/aes/ipsec_alg_aes.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,300 @@ -+/* -+ * ipsec_alg AES cipher stubs -+ * -+ * Author: JuanJo Ciarlante -+ * -+ * ipsec_alg_aes.c,v 1.1.2.1 2003/11/21 18:12:23 jjo Exp -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * Fixes by: -+ * PK: Pawel Krawczyk -+ * Fixes list: -+ * PK: make XCBC comply with latest draft (keylength) -+ * -+ */ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+ -+/* -+ * special case: ipsec core modular with this static algo inside: -+ * must avoid MODULE magic for this file -+ */ -+#if defined(CONFIG_KLIPS_MODULE) && defined(CONFIG_KLIPS_ENC_AES) -+#undef MODULE -+#endif -+ -+#include -+#include -+ -+#include /* printk() */ -+#include /* error codes */ -+#include /* size_t */ -+#include -+ -+/* Check if __exit is defined, if not null it */ -+#ifndef __exit -+#define __exit -+#endif -+ -+/* Low freeswan header coupling */ -+#include -+#include "openswan/ipsec_alg.h" -+#include "klips-crypto/aes_cbc.h" -+ -+#define CONFIG_KLIPS_ENC_AES_MAC 1 -+ -+#define AES_CONTEXT_T aes_context -+static int debug_aes=0; -+static int test_aes=0; -+static int excl_aes=0; -+static int keyminbits=0; -+static int keymaxbits=0; -+#if defined(CONFIG_KLIPS_ENC_AES_MODULE) -+MODULE_AUTHOR("JuanJo Ciarlante "); -+#ifdef module_param -+module_param(debug_aes,int,0664); -+module_param(test_aes,int,0664); -+module_param(excl_aes,int,0664); -+module_param(keyminbits,int,0664); 
-+module_param(keymaxbits,int,0664); -+#else -+MODULE_PARM(debug_aes, "i"); -+MODULE_PARM(test_aes, "i"); -+MODULE_PARM(excl_aes, "i"); -+MODULE_PARM(keyminbits, "i"); -+MODULE_PARM(keymaxbits, "i"); -+#endif -+#endif -+ -+#if CONFIG_KLIPS_ENC_AES_MAC -+#include "klips-crypto/aes_xcbc_mac.h" -+ -+/* -+ * Not IANA number yet (draft-ietf-ipsec-ciph-aes-xcbc-mac-00.txt). -+ * We use 9 for non-modular algorithm and none for modular, thus -+ * forcing user to specify one on module load. -kravietz -+ */ -+#ifdef MODULE -+static int auth_id=0; -+#else -+static int auth_id=9; -+#endif -+#if 0 -+#ifdef MODULE_PARM -+MODULE_PARM(auth_id, "i"); -+#else -+module_param(auth_id,int,0664); -+#endif -+#endif -+#endif -+ -+#define ESP_AES 12 /* truely _constant_ :) */ -+ -+/* 128, 192 or 256 */ -+#define ESP_AES_KEY_SZ_MIN 16 /* 128 bit secret key */ -+#define ESP_AES_KEY_SZ_MAX 32 /* 256 bit secret key */ -+#define ESP_AES_CBC_BLK_LEN 16 /* AES-CBC block size */ -+ -+/* Values according to draft-ietf-ipsec-ciph-aes-xcbc-mac-02.txt -+ * -kravietz -+ */ -+#define ESP_AES_MAC_KEY_SZ 16 /* 128 bit MAC key */ -+#define ESP_AES_MAC_BLK_LEN 16 /* 128 bit block */ -+ -+static int _aes_set_key(struct ipsec_alg_enc *alg, -+ __u8 * key_e, const __u8 * key, -+ size_t keysize) -+{ -+ int ret; -+ AES_CONTEXT_T *ctx=(AES_CONTEXT_T*)key_e; -+ ret=AES_set_key(ctx, key, keysize)!=0? 
0: -EINVAL; -+ if (debug_aes > 0) -+ printk(KERN_DEBUG "klips_debug:_aes_set_key:" -+ "ret=%d key_e=%p key=%p keysize=%ld\n", -+ ret, key_e, key, (unsigned long int) keysize); -+ return ret; -+} -+ -+static int _aes_cbc_encrypt(struct ipsec_alg_enc *alg, __u8 * key_e, -+ const __u8 * in, int ilen, const __u8 * iv, -+ int encrypt) -+{ -+ AES_CONTEXT_T *ctx=(AES_CONTEXT_T*)key_e; -+ if (debug_aes > 0) -+ printk(KERN_DEBUG "klips_debug:_aes_cbc_encrypt:" -+ "key_e=%p in=%p ilen=%d iv=%p encrypt=%d\n", -+ key_e, in, ilen, iv, encrypt); -+ return AES_cbc_encrypt(ctx, in, in, ilen, iv, encrypt); -+} -+#if CONFIG_KLIPS_ENC_AES_MAC -+static int _aes_mac_set_key(struct ipsec_alg_auth *alg, __u8 * key_a, const __u8 * key, int keylen) { -+ aes_context_mac *ctxm=(aes_context_mac *)key_a; -+ return AES_xcbc_mac_set_key(ctxm, key, keylen)? 0 : -EINVAL; -+} -+static int _aes_mac_hash(struct ipsec_alg_auth *alg, __u8 * key_a, const __u8 * dat, int len, __u8 * hash, int hashlen) { -+ int ret; -+ char hash_buf[16]; -+ aes_context_mac *ctxm=(aes_context_mac *)key_a; -+ ret=AES_xcbc_mac_hash(ctxm, dat, len, hash_buf); -+ memcpy(hash, hash_buf, hashlen); -+ return ret; -+} -+static struct ipsec_alg_auth ipsec_alg_AES_MAC = { -+ ixt_common: { ixt_version: IPSEC_ALG_VERSION, -+ ixt_refcnt: ATOMIC_INIT(0), -+ ixt_name: "aes_mac", -+ ixt_blocksize: ESP_AES_MAC_BLK_LEN, -+ ixt_support: { -+ ias_exttype: IPSEC_ALG_TYPE_AUTH, -+ ias_id: 0, -+ ias_keyminbits: ESP_AES_MAC_KEY_SZ*8, -+ ias_keymaxbits: ESP_AES_MAC_KEY_SZ*8, -+ }, -+ }, -+#if defined(CONFIG_KLIPS_ENC_AES_MODULE) -+ ixt_module: THIS_MODULE, -+#endif -+ ixt_a_keylen: ESP_AES_MAC_KEY_SZ, -+ ixt_a_ctx_size: sizeof(aes_context_mac), -+ ixt_a_hmac_set_key: _aes_mac_set_key, -+ ixt_a_hmac_hash:_aes_mac_hash, -+}; -+#endif /* CONFIG_KLIPS_ENC_AES_MAC */ -+static struct ipsec_alg_enc ipsec_alg_AES = { -+ ixt_common: { ixt_version: IPSEC_ALG_VERSION, -+ ixt_refcnt: ATOMIC_INIT(0), -+ ixt_name: "aes", -+ ixt_blocksize: ESP_AES_CBC_BLK_LEN, 
-+ ixt_support: { -+ ias_exttype: IPSEC_ALG_TYPE_ENCRYPT, -+ //ias_ivlen: 128, -+ ias_id: ESP_AES, -+ ias_keyminbits: ESP_AES_KEY_SZ_MIN*8, -+ ias_keymaxbits: ESP_AES_KEY_SZ_MAX*8, -+ }, -+ }, -+#if defined(CONFIG_KLIPS_ENC_AES_MODULE) -+ ixt_module: THIS_MODULE, -+#endif -+ ixt_e_keylen: ESP_AES_KEY_SZ_MAX, -+ ixt_e_ctx_size: sizeof(AES_CONTEXT_T), -+ ixt_e_set_key: _aes_set_key, -+ ixt_e_cbc_encrypt:_aes_cbc_encrypt, -+}; -+ -+#if defined(CONFIG_KLIPS_ENC_AES_MODULE) -+IPSEC_ALG_MODULE_INIT_MOD( ipsec_aes_init ) -+#else -+IPSEC_ALG_MODULE_INIT_STATIC( ipsec_aes_init ) -+#endif -+{ -+ int ret, test_ret; -+ -+ if (keyminbits) -+ ipsec_alg_AES.ixt_common.ixt_support.ias_keyminbits=keyminbits; -+ if (keymaxbits) { -+ ipsec_alg_AES.ixt_common.ixt_support.ias_keymaxbits=keymaxbits; -+ if (keymaxbits*8>ipsec_alg_AES.ixt_common.ixt_support.ias_keymaxbits) -+ ipsec_alg_AES.ixt_e_keylen=keymaxbits*8; -+ } -+ if (excl_aes) ipsec_alg_AES.ixt_common.ixt_state |= IPSEC_ALG_ST_EXCL; -+ ret=register_ipsec_alg_enc(&ipsec_alg_AES); -+ printk("ipsec_aes_init(alg_type=%d alg_id=%d name=%s): ret=%d\n", -+ ipsec_alg_AES.ixt_common.ixt_support.ias_exttype, -+ ipsec_alg_AES.ixt_common.ixt_support.ias_id, -+ ipsec_alg_AES.ixt_common.ixt_name, -+ ret); -+ if (ret==0 && test_aes) { -+ test_ret=ipsec_alg_test( -+ ipsec_alg_AES.ixt_common.ixt_support.ias_exttype , -+ ipsec_alg_AES.ixt_common.ixt_support.ias_id, -+ test_aes); -+ printk("ipsec_aes_init(alg_type=%d alg_id=%d): test_ret=%d\n", -+ ipsec_alg_AES.ixt_common.ixt_support.ias_exttype , -+ ipsec_alg_AES.ixt_common.ixt_support.ias_id, -+ test_ret); -+ } -+#if CONFIG_KLIPS_ENC_AES_MAC -+ if (auth_id!=0){ -+ int ret; -+ ipsec_alg_AES_MAC.ixt_common.ixt_support.ias_id=auth_id; -+ ret=register_ipsec_alg_auth(&ipsec_alg_AES_MAC); -+ printk("ipsec_aes_init(alg_type=%d alg_id=%d name=%s): ret=%d\n", -+ ipsec_alg_AES_MAC.ixt_common.ixt_support.ias_exttype, -+ ipsec_alg_AES_MAC.ixt_common.ixt_support.ias_id, -+ 
ipsec_alg_AES_MAC.ixt_common.ixt_name, -+ ret); -+ if (ret==0 && test_aes) { -+ test_ret=ipsec_alg_test( -+ ipsec_alg_AES_MAC.ixt_common.ixt_support.ias_exttype, -+ ipsec_alg_AES_MAC.ixt_common.ixt_support.ias_id, -+ test_aes); -+ printk("ipsec_aes_init(alg_type=%d alg_id=%d): test_ret=%d\n", -+ ipsec_alg_AES_MAC.ixt_common.ixt_support.ias_exttype, -+ ipsec_alg_AES_MAC.ixt_common.ixt_support.ias_id, -+ test_ret); -+ } -+ } else { -+ printk(KERN_DEBUG "klips_debug: experimental ipsec_alg_AES_MAC not registered [Ok] (auth_id=%d)\n", auth_id); -+ } -+#endif /* CONFIG_KLIPS_ENC_AES_MAC */ -+ return ret; -+} -+ -+#if defined(CONFIG_KLIPS_ENC_AES_MODULE) -+IPSEC_ALG_MODULE_EXIT_MOD( ipsec_aes_fini ) -+#else -+IPSEC_ALG_MODULE_EXIT_STATIC( ipsec_aes_fini ) -+#endif -+{ -+#if CONFIG_KLIPS_ENC_AES_MAC -+ if (auth_id) unregister_ipsec_alg_auth(&ipsec_alg_AES_MAC); -+#endif /* CONFIG_KLIPS_ENC_AES_MAC */ -+ unregister_ipsec_alg_enc(&ipsec_alg_AES); -+ return; -+} -+#ifdef MODULE_LICENSE -+MODULE_LICENSE("GPL"); -+#endif -+ -+#if 0 /* +NOT_YET */ -+#ifndef MODULE -+/* -+ * This is intended for static module setups, currently -+ * doesn't work for modular ipsec.o with static algos inside -+ */ -+static int setup_keybits(const char *str) -+{ -+ unsigned aux; -+ char *end; -+ -+ aux = simple_strtoul(str,&end,0); -+ if (aux != 128 && aux != 192 && aux != 256) -+ return 0; -+ keyminbits = aux; -+ -+ if (*end == 0 || *end != ',') -+ return 1; -+ str=end+1; -+ aux = simple_strtoul(str, NULL, 0); -+ if (aux != 128 && aux != 192 && aux != 256) -+ return 0; -+ if (aux >= keyminbits) -+ keymaxbits = aux; -+ return 1; -+} -+__setup("ipsec_aes_keybits=", setup_keybits); -+#endif -+#endif -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/alg/Config.alg_aes.in Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,3 @@ -+if [ "$CONFIG_IPSEC_ALG" = "y" ]; then -+ tristate ' AES encryption algorithm' CONFIG_IPSEC_ENC_AES -+fi ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ 
linux/net/ipsec/alg/Config.alg_cryptoapi.in Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,6 @@ -+if [ "$CONFIG_IPSEC_ALG" = "y" ]; then -+ dep_tristate ' CRYPTOAPI ciphers support (needs cryptoapi patch)' CONFIG_IPSEC_ALG_CRYPTOAPI $CONFIG_CRYPTO -+ if [ "$CONFIG_IPSEC_ALG_CRYPTOAPI" != "n" ]; then -+ bool ' CRYPTOAPI proprietary ciphers ' CONFIG_IPSEC_ALG_NON_LIBRE -+ fi -+fi ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/alg/Config.in Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,3 @@ -+#Placeholder -+source net/ipsec/alg/Config.alg_aes.in -+source net/ipsec/alg/Config.alg_cryptoapi.in ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/alg/Makefile.alg_aes Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,18 @@ -+MOD_AES := ipsec_aes.o -+ -+ALG_MODULES += $(MOD_AES) -+ALG_SUBDIRS += libaes -+ -+obj-$(CONFIG_IPSEC_ALG_AES) += $(MOD_AES) -+static_init-func-$(CONFIG_IPSEC_ALG_AES)+= ipsec_aes_init -+alg_obj-$(CONFIG_IPSEC_ALG_AES) += ipsec_alg_aes.o -+ -+AES_OBJS := ipsec_alg_aes.o $(LIBCRYPTO)/libaes/libaes.a -+ -+ -+$(MOD_AES): $(AES_OBJS) -+ $(LD) $(EXTRA_LDFLAGS) -r $(AES_OBJS) -o $@ -+ -+$(LIBCRYPTO)/libaes/libaes.a: -+ $(MAKE) -C $(LIBCRYPTO)/libaes CC='$(CC)' 'ARCH_ASM=$(ARCH_ASM)' CFLAGS='$(CFLAGS) $(EXTRA_CFLAGS)' libaes.a -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/alg/Makefile.alg_cryptoapi Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,14 @@ -+MOD_CRYPTOAPI := ipsec_cryptoapi.o -+ -+ifneq ($(wildcard $(TOPDIR)/include/linux/crypto.h),) -+ALG_MODULES += $(MOD_CRYPTOAPI) -+obj-$(CONFIG_IPSEC_ALG_CRYPTOAPI) += $(MOD_CRYPTOAPI) -+static_init-func-$(CONFIG_IPSEC_ALG_CRYPTOAPI)+= ipsec_cryptoapi_init -+alg_obj-$(CONFIG_IPSEC_ALG_CRYPTOAPI) += ipsec_alg_cryptoapi.o -+else -+$(warning "Linux CryptoAPI (2.4.22+ or 2.6.x) not found, not building ipsec_cryptoapi.o") -+endif -+ -+CRYPTOAPI_OBJS := ipsec_alg_cryptoapi.o -+$(MOD_CRYPTOAPI): $(CRYPTOAPI_OBJS) -+ $(LD) -r $(CRYPTOAPI_OBJS) -o $@ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ 
linux/net/ipsec/alg/ipsec_alg_cryptoapi.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,442 @@ -+/* -+ * ipsec_alg to linux cryptoapi GLUE -+ * -+ * Authors: CODE.ar TEAM -+ * Harpo MAxx -+ * JuanJo Ciarlante -+ * Luciano Ruete -+ * -+ * ipsec_alg_cryptoapi.c,v 1.1.2.1 2003/11/21 18:12:23 jjo Exp -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * Example usage: -+ * modinfo -p ipsec_cryptoapi (quite useful info, including supported algos) -+ * modprobe ipsec_cryptoapi -+ * modprobe ipsec_cryptoapi test=1 -+ * modprobe ipsec_cryptoapi excl=1 (exclusive cipher/algo) -+ * modprobe ipsec_cryptoapi noauto=1 aes=1 twofish=1 (only these ciphers) -+ * modprobe ipsec_cryptoapi aes=128,128 (force these keylens) -+ * modprobe ipsec_cryptoapi des_ede3=0 (everything but 3DES) -+ */ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+ -+/* -+ * special case: ipsec core modular with this static algo inside: -+ * must avoid MODULE magic for this file -+ */ -+#if CONFIG_IPSEC_MODULE && CONFIG_IPSEC_ALG_CRYPTOAPI -+#undef MODULE -+#endif -+ -+#include -+#include -+ -+#include /* printk() */ -+#include /* error codes */ -+#include /* size_t */ -+#include -+ -+/* Check if __exit is defined, if not null it */ -+#ifndef __exit -+#define __exit -+#endif -+ -+/* warn the innocent */ -+#if !defined (CONFIG_CRYPTO) && !defined (CONFIG_CRYPTO_MODULE) -+#warning "No linux CryptoAPI found, install 2.4.22+ or 2.6.x" -+#define NO_CRYPTOAPI_SUPPORT -+#endif -+/* Low freeswan header coupling */ -+#include 
"openswan/ipsec_alg.h" -+ -+#include -+#ifdef CRYPTO_API_VERSION_CODE -+#warning "Old CryptoAPI is not supported. Only linux-2.4.22+ or linux-2.6.x are supported" -+#define NO_CRYPTOAPI_SUPPORT -+#endif -+ -+#ifdef NO_CRYPTOAPI_SUPPORT -+#warning "Building an unusable module :P" -+/* Catch old CryptoAPI by not allowing module to load */ -+IPSEC_ALG_MODULE_INIT( ipsec_cryptoapi_init ) -+{ -+ printk(KERN_WARNING "ipsec_cryptoapi.o was not built on stock Linux CryptoAPI (2.4.22+ or 2.6.x), not loading.\n"); -+ return -EINVAL; -+} -+#else -+#include -+#include -+#include -+ -+#define CIPHERNAME_AES "aes" -+#define CIPHERNAME_3DES "des3_ede" -+#define CIPHERNAME_BLOWFISH "blowfish" -+#define CIPHERNAME_CAST "cast5" -+#define CIPHERNAME_SERPENT "serpent" -+#define CIPHERNAME_TWOFISH "twofish" -+ -+#define ESP_3DES 3 -+#define ESP_AES 12 -+#define ESP_BLOWFISH 7 /* truely _constant_ :) */ -+#define ESP_CAST 6 /* quite constant :) */ -+#define ESP_SERPENT 252 /* from ipsec drafts */ -+#define ESP_TWOFISH 253 /* from ipsec drafts */ -+ -+#define AH_MD5 2 -+#define AH_SHA 3 -+#define DIGESTNAME_MD5 "md5" -+#define DIGESTNAME_SHA1 "sha1" -+ -+MODULE_AUTHOR("Juanjo Ciarlante, Harpo MAxx, Luciano Ruete"); -+static int debug=0; -+static int test=0; -+static int excl=0; -+#ifdef module_param -+module_param(debug, int, 0664); -+module_param(test, int, 0664); -+module_param(excl, int, 0664); -+#else -+MODULE_PARM(debug, "i"); -+MODULE_PARM(test, "i"); -+MODULE_PARM(excl, "i"); -+#endif -+ -+static int noauto = 0; -+#ifdef module_param -+module_param(noauto,int, 0664); -+#else -+MODULE_PARM(noauto,"i"); -+#endif -+MODULE_PARM_DESC(noauto, "Dont try all known algos, just setup enabled ones"); -+ -+static int des_ede3[] = {-1, -1}; -+static int aes[] = {-1, -1}; -+static int blowfish[] = {-1, -1}; -+static int cast[] = {-1, -1}; -+static int serpent[] = {-1, -1}; -+static int twofish[] = {-1, -1}; -+ -+#ifdef module_param_array -+module_param_array(des_ede3,int,NULL,0); 
-+module_param_array(aes,int,NULL,0); -+module_param_array(blowfish,int,NULL,0); -+module_param_array(cast,int,NULL,0); -+module_param_array(serpent,int,NULL,0); -+module_param_array(twofish,int,NULL,0); -+#else -+MODULE_PARM(des_ede3,"1-2i"); -+MODULE_PARM(aes,"1-2i"); -+MODULE_PARM(blowfish,"1-2i"); -+MODULE_PARM(cast,"1-2i"); -+MODULE_PARM(serpent,"1-2i"); -+MODULE_PARM(twofish,"1-2i"); -+#endif -+MODULE_PARM_DESC(des_ede3, "0: disable | 1: force_enable | min,max: dontuse"); -+MODULE_PARM_DESC(aes, "0: disable | 1: force_enable | min,max: keybitlens"); -+MODULE_PARM_DESC(blowfish, "0: disable | 1: force_enable | min,max: keybitlens"); -+MODULE_PARM_DESC(cast, "0: disable | 1: force_enable | min,max: keybitlens"); -+MODULE_PARM_DESC(serpent, "0: disable | 1: force_enable | min,max: keybitlens"); -+MODULE_PARM_DESC(twofish, "0: disable | 1: force_enable | min,max: keybitlens"); -+ -+struct ipsec_alg_capi_cipher { -+ const char *ciphername; /* cryptoapi's ciphername */ -+ unsigned blocksize; -+ unsigned short minbits; -+ unsigned short maxbits; -+ int *parm; /* lkm param for this cipher */ -+ struct ipsec_alg_enc alg; /* note it's not a pointer */ -+}; -+static struct ipsec_alg_capi_cipher alg_capi_carray[] = { -+ { CIPHERNAME_AES , 16, 128, 256, aes , { ixt_alg_id: ESP_AES, }}, -+ { CIPHERNAME_TWOFISH , 16, 128, 256, twofish, { ixt_alg_id: ESP_TWOFISH, }}, -+ { CIPHERNAME_SERPENT , 16, 128, 256, serpent, { ixt_alg_id: ESP_SERPENT, }}, -+ { CIPHERNAME_CAST , 8, 128, 128, cast , { ixt_alg_id: ESP_CAST, }}, -+ { CIPHERNAME_BLOWFISH , 8, 96, 448, blowfish,{ ixt_alg_id: ESP_BLOWFISH, }}, -+ { CIPHERNAME_3DES , 8, 192, 192, des_ede3,{ ixt_alg_id: ESP_3DES, }}, -+ { NULL, 0, 0, 0, NULL, {} } -+}; -+#ifdef NOT_YET -+struct ipsec_alg_capi_digest { -+ const char *digestname; /* cryptoapi's digestname */ -+ struct digest_implementation *di; -+ struct ipsec_alg_auth alg; /* note it's not a pointer */ -+}; -+static struct ipsec_alg_capi_cipher alg_capi_darray[] = { -+ { 
DIGESTNAME_MD5, NULL, { ixt_alg_id: AH_MD5, }}, -+ { DIGESTNAME_SHA1, NULL, { ixt_alg_id: AH_SHA, }}, -+ { NULL, NULL, {} } -+}; -+#endif -+/* -+ * "generic" linux cryptoapi setup_cipher() function -+ */ -+int setup_cipher(const char *ciphername) -+{ -+ return crypto_alg_available(ciphername, 0); -+} -+ -+/* -+ * setups ipsec_alg_capi_cipher "hyper" struct components, calling -+ * register_ipsec_alg for cointaned ipsec_alg object -+ */ -+static void _capi_destroy_key (struct ipsec_alg_enc *alg, __u8 *key_e); -+static __u8 * _capi_new_key (struct ipsec_alg_enc *alg, const __u8 *key, size_t keylen); -+static int _capi_cbc_encrypt(struct ipsec_alg_enc *alg, __u8 * key_e, __u8 * in, int ilen, const __u8 * iv, int encrypt); -+ -+static int -+setup_ipsec_alg_capi_cipher(struct ipsec_alg_capi_cipher *cptr) -+{ -+ int ret; -+ cptr->alg.ixt_version = IPSEC_ALG_VERSION; -+ cptr->alg.ixt_module = THIS_MODULE; -+ atomic_set (& cptr->alg.ixt_refcnt, 0); -+ strncpy (cptr->alg.ixt_name , cptr->ciphername, sizeof (cptr->alg.ixt_name)); -+ -+ cptr->alg.ixt_blocksize=cptr->blocksize; -+ cptr->alg.ixt_keyminbits=cptr->minbits; -+ cptr->alg.ixt_keymaxbits=cptr->maxbits; -+ cptr->alg.ixt_state = 0; -+ if (excl) cptr->alg.ixt_state |= IPSEC_ALG_ST_EXCL; -+ cptr->alg.ixt_e_keylen=cptr->alg.ixt_keymaxbits/8; -+ cptr->alg.ixt_e_ctx_size = 0; -+ cptr->alg.ixt_alg_type = IPSEC_ALG_TYPE_ENCRYPT; -+ cptr->alg.ixt_e_new_key = _capi_new_key; -+ cptr->alg.ixt_e_destroy_key = _capi_destroy_key; -+ cptr->alg.ixt_e_cbc_encrypt = _capi_cbc_encrypt; -+ cptr->alg.ixt_data = cptr; -+ -+ ret=register_ipsec_alg_enc(&cptr->alg); -+ printk("setup_ipsec_alg_capi_cipher(): " -+ "alg_type=%d alg_id=%d name=%s " -+ "keyminbits=%d keymaxbits=%d, ret=%d\n", -+ cptr->alg.ixt_alg_type, -+ cptr->alg.ixt_alg_id, -+ cptr->alg.ixt_name, -+ cptr->alg.ixt_keyminbits, -+ cptr->alg.ixt_keymaxbits, -+ ret); -+ return ret; -+} -+/* -+ * called in ipsec_sa_wipe() time, will destroy key contexts -+ * and do 1 unbind() -+ */ 
-+static void -+_capi_destroy_key (struct ipsec_alg_enc *alg, __u8 *key_e) -+{ -+ struct crypto_tfm *tfm=(struct crypto_tfm*)key_e; -+ -+ if (debug > 0) -+ printk(KERN_DEBUG "klips_debug: _capi_destroy_key:" -+ "name=%s key_e=%p \n", -+ alg->ixt_name, key_e); -+ if (!key_e) { -+ printk(KERN_ERR "klips_debug: _capi_destroy_key:" -+ "name=%s NULL key_e!\n", -+ alg->ixt_name); -+ return; -+ } -+ crypto_free_tfm(tfm); -+} -+ -+/* -+ * create new key context, need alg->ixt_data to know which -+ * (of many) cipher inside this module is the target -+ */ -+static __u8 * -+_capi_new_key (struct ipsec_alg_enc *alg, const __u8 *key, size_t keylen) -+{ -+ struct ipsec_alg_capi_cipher *cptr; -+ struct crypto_tfm *tfm=NULL; -+ -+ cptr = alg->ixt_data; -+ if (!cptr) { -+ printk(KERN_ERR "_capi_new_key(): " -+ "NULL ixt_data (?!) for \"%s\" algo\n" -+ , alg->ixt_name); -+ goto err; -+ } -+ if (debug > 0) -+ printk(KERN_DEBUG "klips_debug:_capi_new_key:" -+ "name=%s cptr=%p key=%p keysize=%d\n", -+ alg->ixt_name, cptr, key, keylen); -+ -+ /* -+ * alloc tfm -+ */ -+ tfm = crypto_alloc_tfm(cptr->ciphername, CRYPTO_TFM_MODE_CBC); -+ if (!tfm) { -+ printk(KERN_ERR "_capi_new_key(): " -+ "NULL tfm for \"%s\" cryptoapi (\"%s\") algo\n" -+ , alg->ixt_name, cptr->ciphername); -+ goto err; -+ } -+ if (crypto_cipher_setkey(tfm, key, keylen) < 0) { -+ printk(KERN_ERR "_capi_new_key(): " -+ "failed new_key() for \"%s\" cryptoapi algo (keylen=%d)\n" -+ , alg->ixt_name, keylen); -+ crypto_free_tfm(tfm); -+ tfm=NULL; -+ } -+err: -+ if (debug > 0) -+ printk(KERN_DEBUG "klips_debug:_capi_new_key:" -+ "name=%s key=%p keylen=%d tfm=%p\n", -+ alg->ixt_name, key, keylen, tfm); -+ return (__u8 *) tfm; -+} -+/* -+ * core encryption function: will use cx->ci to call actual cipher's -+ * cbc function -+ */ -+static int -+_capi_cbc_encrypt(struct ipsec_alg_enc *alg, __u8 * key_e, __u8 * in, int ilen, const __u8 * iv, int encrypt) { -+ int error =0; -+ struct crypto_tfm *tfm=(struct crypto_tfm *)key_e; -+ 
struct scatterlist sg = { -+ .page = virt_to_page(in), -+ .offset = (unsigned long)(in) % PAGE_SIZE, -+ .length=ilen, -+ }; -+ if (debug > 1) -+ printk(KERN_DEBUG "klips_debug:_capi_cbc_encrypt:" -+ "key_e=%p " -+ "in=%p out=%p ilen=%d iv=%p encrypt=%d\n" -+ , key_e -+ , in, in, ilen, iv, encrypt); -+ crypto_cipher_set_iv(tfm, iv, crypto_tfm_alg_ivsize(tfm)); -+ if (encrypt) -+ error = crypto_cipher_encrypt (tfm, &sg, &sg, ilen); -+ else -+ error = crypto_cipher_decrypt (tfm, &sg, &sg, ilen); -+ if (debug > 1) -+ printk(KERN_DEBUG "klips_debug:_capi_cbc_encrypt:" -+ "error=%d\n" -+ , error); -+ return (error<0)? error : ilen; -+} -+/* -+ * main initialization loop: for each cipher in list, do -+ * 1) setup cryptoapi cipher else continue -+ * 2) register ipsec_alg object -+ */ -+static int -+setup_cipher_list (struct ipsec_alg_capi_cipher* clist) -+{ -+ struct ipsec_alg_capi_cipher *cptr; -+ /* foreach cipher in list ... */ -+ for (cptr=clist;cptr->ciphername;cptr++) { -+ /* -+ * see if cipher has been disabled (0) or -+ * if noauto set and not enabled (1) -+ */ -+ if (cptr->parm[0] == 0 || (noauto && cptr->parm[0] < 0)) { -+ if (debug>0) -+ printk(KERN_INFO "setup_cipher_list(): " -+ "ciphername=%s skipped at user request: " -+ "noauto=%d parm[0]=%d parm[1]=%d\n" -+ , cptr->ciphername -+ , noauto -+ , cptr->parm[0] -+ , cptr->parm[1]); -+ continue; -+ } -+ /* -+ * use a local ci to avoid touching cptr->ci, -+ * if register ipsec_alg success then bind cipher -+ */ -+ if( setup_cipher(cptr->ciphername) ) { -+ if (debug > 0) -+ printk(KERN_DEBUG "klips_debug:" -+ "setup_cipher_list():" -+ "ciphername=%s found\n" -+ , cptr->ciphername); -+ if (setup_ipsec_alg_capi_cipher(cptr) == 0) { -+ -+ -+ } else { -+ printk(KERN_ERR "klips_debug:" -+ "setup_cipher_list():" -+ "ciphername=%s failed ipsec_alg_register\n" -+ , cptr->ciphername); -+ } -+ } else { -+ if (debug>0) -+ printk(KERN_INFO "setup_cipher_list(): lookup for ciphername=%s: not found \n", -+ cptr->ciphername); -+ 
} -+ } -+ return 0; -+} -+/* -+ * deregister ipsec_alg objects and unbind ciphers -+ */ -+static int -+unsetup_cipher_list (struct ipsec_alg_capi_cipher* clist) -+{ -+ struct ipsec_alg_capi_cipher *cptr; -+ /* foreach cipher in list ... */ -+ for (cptr=clist;cptr->ciphername;cptr++) { -+ if (cptr->alg.ixt_state & IPSEC_ALG_ST_REGISTERED) { -+ unregister_ipsec_alg_enc(&cptr->alg); -+ } -+ } -+ return 0; -+} -+/* -+ * test loop for registered algos -+ */ -+static int -+test_cipher_list (struct ipsec_alg_capi_cipher* clist) -+{ -+ int test_ret; -+ struct ipsec_alg_capi_cipher *cptr; -+ /* foreach cipher in list ... */ -+ for (cptr=clist;cptr->ciphername;cptr++) { -+ if (cptr->alg.ixt_state & IPSEC_ALG_ST_REGISTERED) { -+ test_ret=ipsec_alg_test( -+ cptr->alg.ixt_alg_type, -+ cptr->alg.ixt_alg_id, -+ test); -+ printk("test_cipher_list(alg_type=%d alg_id=%d): test_ret=%d\n", -+ cptr->alg.ixt_alg_type, -+ cptr->alg.ixt_alg_id, -+ test_ret); -+ } -+ } -+ return 0; -+} -+ -+IPSEC_ALG_MODULE_INIT( ipsec_cryptoapi_init ) -+{ -+ int ret, test_ret; -+ if ((ret=setup_cipher_list(alg_capi_carray)) < 0) -+ return -EPROTONOSUPPORT; -+ if (ret==0 && test) { -+ test_ret=test_cipher_list(alg_capi_carray); -+ } -+ return ret; -+} -+IPSEC_ALG_MODULE_EXIT( ipsec_cryptoapi_fini ) -+{ -+ unsetup_cipher_list(alg_capi_carray); -+ return; -+} -+#ifdef MODULE_LICENSE -+MODULE_LICENSE("GPL"); -+#endif -+ -+EXPORT_NO_SYMBOLS; -+#endif /* NO_CRYPTOAPI_SUPPORT */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/alg/scripts/mk-static_init.c.sh Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,18 @@ -+#!/bin/sh -+cat << EOF -+#include -+#include -+#include "freeswan/ipsec_alg.h" -+$(for i in $*; do -+ test -z "$i" && continue -+ echo "extern int $i(void);" -+done) -+void ipsec_alg_static_init(void){ -+ int __attribute__ ((unused)) err=0; -+$(for i in $*; do -+ test -z "$i" && continue -+ echo " if ((err=$i()) < 0)" -+ echo " printk(KERN_WARNING \"$i() returned %d\", err);" -+done) -+} -+EOF ---- 
/dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/anyaddr.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,150 @@ -+/* -+ * special addresses -+ * Copyright (C) 2000 Henry Spencer. -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: anyaddr.c,v 1.10 2004/07/10 07:43:47 mcr Exp $ -+ */ -+#include "openswan.h" -+ -+/* these are mostly fallbacks for the no-IPv6-support-in-library case */ -+#ifndef IN6ADDR_ANY_INIT -+#define IN6ADDR_ANY_INIT {{{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }}} -+#endif -+#ifndef IN6ADDR_LOOPBACK_INIT -+#define IN6ADDR_LOOPBACK_INIT {{{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 }}} -+#endif -+ -+static struct in6_addr v6any = IN6ADDR_ANY_INIT; -+static struct in6_addr v6loop = IN6ADDR_LOOPBACK_INIT; -+ -+/* -+ - anyaddr - initialize to the any-address value -+ */ -+err_t /* NULL for success, else string literal */ -+anyaddr(af, dst) -+int af; /* address family */ -+ip_address *dst; -+{ -+ uint32_t v4any = htonl(INADDR_ANY); -+ -+ switch (af) { -+ case AF_INET: -+ return initaddr((unsigned char *)&v4any, sizeof(v4any), af, dst); -+ break; -+ case AF_INET6: -+ return initaddr((unsigned char *)&v6any, sizeof(v6any), af, dst); -+ break; -+ default: -+ return "unknown address family in anyaddr/unspecaddr"; -+ break; -+ } -+} -+ -+/* -+ - unspecaddr - initialize to the unspecified-address value -+ */ -+err_t /* NULL for success, else string literal */ -+unspecaddr(af, dst) -+int af; /* address family */ -+ip_address *dst; -+{ -+ return anyaddr(af, dst); -+} -+ -+/* 
-+ - loopbackaddr - initialize to the loopback-address value -+ */ -+err_t /* NULL for success, else string literal */ -+loopbackaddr(af, dst) -+int af; /* address family */ -+ip_address *dst; -+{ -+ uint32_t v4loop = htonl(INADDR_LOOPBACK); -+ -+ switch (af) { -+ case AF_INET: -+ return initaddr((unsigned char *)&v4loop, sizeof(v4loop), af, dst); -+ break; -+ case AF_INET6: -+ return initaddr((unsigned char *)&v6loop, sizeof(v6loop), af, dst); -+ break; -+ default: -+ return "unknown address family in loopbackaddr"; -+ break; -+ } -+} -+ -+/* -+ - isanyaddr - test for the any-address value -+ */ -+int -+isanyaddr(src) -+const ip_address *src; -+{ -+ uint32_t v4any = htonl(INADDR_ANY); -+ int cmp; -+ -+ switch (src->u.v4.sin_family) { -+ case AF_INET: -+ cmp = memcmp(&src->u.v4.sin_addr.s_addr, &v4any, sizeof(v4any)); -+ break; -+ case AF_INET6: -+ cmp = memcmp(&src->u.v6.sin6_addr, &v6any, sizeof(v6any)); -+ break; -+ -+ case 0: -+ /* a zeroed structure is considered any address */ -+ return 1; -+ -+ default: -+ return 0; -+ break; -+ } -+ -+ return (cmp == 0) ? 1 : 0; -+} -+ -+/* -+ - isunspecaddr - test for the unspecified-address value -+ */ -+int -+isunspecaddr(src) -+const ip_address *src; -+{ -+ return isanyaddr(src); -+} -+ -+/* -+ - isloopbackaddr - test for the loopback-address value -+ */ -+int -+isloopbackaddr(src) -+const ip_address *src; -+{ -+ uint32_t v4loop = htonl(INADDR_LOOPBACK); -+ int cmp; -+ -+ switch (src->u.v4.sin_family) { -+ case AF_INET: -+ cmp = memcmp(&src->u.v4.sin_addr.s_addr, &v4loop, sizeof(v4loop)); -+ break; -+ case AF_INET6: -+ cmp = memcmp(&src->u.v6.sin6_addr, &v6loop, sizeof(v6loop)); -+ break; -+ default: -+ return 0; -+ break; -+ } -+ -+ return (cmp == 0) ? 1 : 0; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/datatot.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,234 @@ -+/* -+ * convert from binary data (e.g. key) to text form -+ * Copyright (C) 2000 Henry Spencer. 
-+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: datatot.c,v 1.9 2005/08/30 21:15:26 mcr Exp $ -+ */ -+#include "openswan.h" -+ -+static void convert(const char *src, size_t nreal, int format, char *out); -+ -+/* -+ - datatot - convert data bytes to text -+ */ -+size_t /* true length (with NUL) for success */ -+datatot(src, srclen, format, dst, dstlen) -+const unsigned char *src; -+size_t srclen; -+int format; /* character indicating what format */ -+char *dst; /* need not be valid if dstlen is 0 */ -+size_t dstlen; -+{ -+ size_t inblocksize; /* process this many bytes at a time */ -+ size_t outblocksize; /* producing this many */ -+ size_t breakevery; /* add a _ every this many (0 means don't) */ -+ size_t sincebreak; /* output bytes since last _ */ -+ char breakchar; /* character used to break between groups */ -+ unsigned char inblock[10]; /* enough for any format */ -+ char outblock[10]; /* enough for any format */ -+ char fake[1]; /* fake output area for dstlen == 0 */ -+ size_t needed; /* return value */ -+ char *stop; /* where the terminating NUL will go */ -+ size_t ntodo; /* remaining input */ -+ size_t nreal; -+ char *out; -+ char *prefix; -+ -+ breakevery = 0; -+ breakchar = '_'; -+ -+ switch (format) { -+ case 0: -+ case 'h': -+ format = 'x'; -+ breakevery = 8; -+ /* FALLTHROUGH */ -+ case 'x': -+ inblocksize = 1; -+ outblocksize = 2; -+ prefix = "0x"; -+ break; -+ case ':': -+ format = 'x'; -+ breakevery = 2; -+ breakchar = ':'; -+ /* 
FALLTHROUGH */ -+ case 16: -+ inblocksize = 1; -+ outblocksize = 2; -+ prefix = ""; -+ format = 'x'; -+ break; -+ case 's': -+ inblocksize = 3; -+ outblocksize = 4; -+ prefix = "0s"; -+ break; -+ case 64: /* beware, equals ' ' */ -+ inblocksize = 3; -+ outblocksize = 4; -+ prefix = ""; -+ format = 's'; -+ break; -+ default: -+ return 0; -+ break; -+ } -+ -+ user_assert(inblocksize < sizeof(inblock)); -+ user_assert(outblocksize < sizeof(outblock)); -+ user_assert(breakevery % outblocksize == 0); -+ -+ if (srclen == 0) -+ return 0; -+ ntodo = srclen; -+ -+ if (dstlen == 0) { /* dispose of awkward special case */ -+ dst = fake; -+ dstlen = 1; -+ } -+ stop = dst + dstlen - 1; -+ -+ nreal = strlen(prefix); -+ needed = nreal; /* for starters */ -+ if (dstlen <= nreal) { /* prefix won't fit */ -+ strncpy(dst, prefix, dstlen - 1); -+ dst += dstlen - 1; -+ } else { -+ strcpy(dst, prefix); -+ dst += nreal; -+ } -+ -+ user_assert(dst <= stop); -+ sincebreak = 0; -+ -+ while (ntodo > 0) { -+ if (ntodo < inblocksize) { /* incomplete input */ -+ memset(inblock, 0, sizeof(inblock)); -+ memcpy(inblock, src, ntodo); -+ src = inblock; -+ nreal = ntodo; -+ ntodo = inblocksize; -+ } else -+ nreal = inblocksize; -+ out = (outblocksize > stop - dst) ? 
outblock : dst; -+ -+ convert((const char *)src, nreal, format, out); -+ needed += outblocksize; -+ sincebreak += outblocksize; -+ if (dst < stop) { -+ if (out != dst) { -+ user_assert(outblocksize > stop - dst); -+ memcpy(dst, out, stop - dst); -+ dst = stop; -+ } else -+ dst += outblocksize; -+ } -+ -+ src += inblocksize; -+ ntodo -= inblocksize; -+ if (breakevery != 0 && sincebreak >= breakevery && ntodo > 0) { -+ if (dst < stop) -+ *dst++ = breakchar; -+ needed++; -+ sincebreak = 0; -+ } -+ } -+ -+ user_assert(dst <= stop); -+ *dst++ = '\0'; -+ needed++; -+ -+ return needed; -+} -+ -+/* -+ - convert - convert one input block to one output block -+ */ -+static void -+convert(src, nreal, format, out) -+const char *src; -+size_t nreal; /* how much of the input block is real */ -+int format; -+char *out; -+{ -+ static char hex[] = "0123456789abcdef"; -+ static char base64[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" -+ "abcdefghijklmnopqrstuvwxyz" -+ "0123456789+/"; -+ unsigned char c; -+ unsigned char c1, c2, c3; -+ -+ user_assert(nreal > 0); -+ switch (format) { -+ case 'x': -+ user_assert(nreal == 1); -+ c = (unsigned char)*src; -+ *out++ = hex[c >> 4]; -+ *out++ = hex[c & 0xf]; -+ break; -+ case 's': -+ c1 = (unsigned char)*src++; -+ c2 = (unsigned char)*src++; -+ c3 = (unsigned char)*src++; -+ *out++ = base64[c1 >> 2]; /* top 6 bits of c1 */ -+ c = (c1 & 0x3) << 4; /* bottom 2 of c1... */ -+ c |= c2 >> 4; /* ...top 4 of c2 */ -+ *out++ = base64[c]; -+ if (nreal == 1) -+ *out++ = '='; -+ else { -+ c = (c2 & 0xf) << 2; /* bottom 4 of c2... 
*/ -+ c |= c3 >> 6; /* ...top 2 of c3 */ -+ *out++ = base64[c]; -+ } -+ if (nreal <= 2) -+ *out++ = '='; -+ else -+ *out++ = base64[c3 & 0x3f]; /* bottom 6 of c3 */ -+ break; -+ default: -+ user_assert(nreal == 0); /* unknown format */ -+ break; -+ } -+} -+ -+/* -+ - datatoa - convert data to ASCII -+ * backward-compatibility synonym for datatot -+ */ -+size_t /* true length (with NUL) for success */ -+datatoa(src, srclen, format, dst, dstlen) -+const unsigned char *src; -+size_t srclen; -+int format; /* character indicating what format */ -+char *dst; /* need not be valid if dstlen is 0 */ -+size_t dstlen; -+{ -+ return datatot(src, srclen, format, dst, dstlen); -+} -+ -+/* -+ - bytestoa - convert data bytes to ASCII -+ * backward-compatibility synonym for datatot -+ */ -+size_t /* true length (with NUL) for success */ -+bytestoa(src, srclen, format, dst, dstlen) -+const unsigned char *src; -+size_t srclen; -+int format; /* character indicating what format */ -+char *dst; /* need not be valid if dstlen is 0 */ -+size_t dstlen; -+{ -+ return datatot(src, srclen, format, dst, dstlen); -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/defconfig Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,63 @@ -+ -+# -+# RCSID $Id: defconfig,v 1.30 2005/09/15 02:31:12 paul Exp $ -+# -+ -+# -+# Openswan IPSec implementation, KLIPS kernel config defaults -+# -+ -+# -+# First, lets override stuff already set or not in the kernel config. -+# -+# We can't even think about leaving this off... -+CONFIG_INET=y -+ -+# -+# This must be on for subnet protection. -+CONFIG_IP_FORWARD=y -+ -+# Shut off IPSEC masquerading if it has been enabled, since it will -+# break the compile. IPPROTO_ESP and IPPROTO_AH were included in -+# net/ipv4/ip_masq.c when they should have gone into include/linux/in.h. -+CONFIG_IP_MASQUERADE_IPSEC=n -+ -+# -+# Next, lets set the recommended FreeS/WAN configuration. -+# -+ -+# To config as static (preferred), 'y'. To config as module, 'm'. 
-+CONFIG_KLIPS=m -+ -+# To do tunnel mode IPSec, this must be enabled. -+CONFIG_KLIPS_IPIP=y -+ -+# To enable authentication, say 'y'. (Highly recommended) -+CONFIG_KLIPS_AH=y -+ -+# Authentication algorithm(s): -+CONFIG_KLIPS_AUTH_HMAC_MD5=y -+CONFIG_KLIPS_AUTH_HMAC_SHA1=y -+ -+# To enable encryption, say 'y'. (Highly recommended) -+CONFIG_KLIPS_ESP=y -+ -+# modular algo extensions (and new ALGOs) -+CONFIG_KLIPS_ALG=y -+ -+# Encryption algorithm(s): -+CONFIG_KLIPS_ENC_3DES=y -+CONFIG_KLIPS_ENC_AES=y -+ -+# Use CryptoAPI for ALG? - by default, no. -+CONFIG_KLIPS_ENC_CRYPTOAPI=n -+ -+# IP Compression: new, probably still has minor bugs. -+CONFIG_KLIPS_IPCOMP=y -+ -+# To enable userspace-switchable KLIPS debugging, say 'y'. -+CONFIG_KLIPS_DEBUG=y -+ -+# OCF HW offloading, requires kernel patch -+# CONFIG_KLIPS_OCF is not set -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/deflate.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,1351 @@ -+/* deflate.c -- compress data using the deflation algorithm -+ * Copyright (C) 1995-2002 Jean-loup Gailly. -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* -+ * ALGORITHM -+ * -+ * The "deflation" process depends on being able to identify portions -+ * of the input text which are identical to earlier input (within a -+ * sliding window trailing behind the input currently being processed). -+ * -+ * The most straightforward technique turns out to be the fastest for -+ * most input files: try all possible matches and select the longest. -+ * The key feature of this algorithm is that insertions into the string -+ * dictionary are very simple and thus fast, and deletions are avoided -+ * completely. Insertions are performed at each input character, whereas -+ * string matches are performed only when the previous match ends. So it -+ * is preferable to spend more time in matches to allow very fast string -+ * insertions and avoid deletions. 
The matching algorithm for small -+ * strings is inspired from that of Rabin & Karp. A brute force approach -+ * is used to find longer strings when a small match has been found. -+ * A similar algorithm is used in comic (by Jan-Mark Wams) and freeze -+ * (by Leonid Broukhis). -+ * A previous version of this file used a more sophisticated algorithm -+ * (by Fiala and Greene) which is guaranteed to run in linear amortized -+ * time, but has a larger average cost, uses more memory and is patented. -+ * However the F&G algorithm may be faster for some highly redundant -+ * files if the parameter max_chain_length (described below) is too large. -+ * -+ * ACKNOWLEDGEMENTS -+ * -+ * The idea of lazy evaluation of matches is due to Jan-Mark Wams, and -+ * I found it in 'freeze' written by Leonid Broukhis. -+ * Thanks to many people for bug reports and testing. -+ * -+ * REFERENCES -+ * -+ * Deutsch, L.P.,"DEFLATE Compressed Data Format Specification". -+ * Available in ftp://ds.internic.net/rfc/rfc1951.txt -+ * -+ * A description of the Rabin and Karp algorithm is given in the book -+ * "Algorithms" by R. Sedgewick, Addison-Wesley, p252. -+ * -+ * Fiala,E.R., and Greene,D.H. -+ * Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-595 -+ * -+ */ -+ -+/* @(#) $Id: deflate.c,v 1.4 2004/07/10 07:48:37 mcr Exp $ */ -+ -+#include "deflate.h" -+ -+local const char deflate_copyright[] = -+ " deflate 1.1.4 Copyright 1995-2002 Jean-loup Gailly "; -+/* -+ If you use the zlib library in a product, an acknowledgment is welcome -+ in the documentation of your product. If for some reason you cannot -+ include such an acknowledgment, I would appreciate that you keep this -+ copyright string in the executable of your product. -+ */ -+ -+/* =========================================================================== -+ * Function prototypes. 
-+ */ -+typedef enum { -+ need_more, /* block not completed, need more input or more output */ -+ block_done, /* block flush performed */ -+ finish_started, /* finish started, need only more output at next deflate */ -+ finish_done /* finish done, accept no more input or output */ -+} block_state; -+ -+typedef block_state (*compress_func) OF((deflate_state *s, int flush)); -+/* Compression function. Returns the block state after the call. */ -+ -+local void fill_window OF((deflate_state *s)); -+local block_state deflate_stored OF((deflate_state *s, int flush)); -+local block_state deflate_fast OF((deflate_state *s, int flush)); -+local block_state deflate_slow OF((deflate_state *s, int flush)); -+local void lm_init OF((deflate_state *s)); -+local void putShortMSB OF((deflate_state *s, uInt b)); -+local void flush_pending OF((z_streamp strm)); -+local int read_buf OF((z_streamp strm, Bytef *buf, unsigned size)); -+#ifdef ASMV -+ void match_init OF((void)); /* asm code initialization */ -+ uInt longest_match OF((deflate_state *s, IPos cur_match)); -+#else -+local uInt longest_match OF((deflate_state *s, IPos cur_match)); -+#endif -+ -+#ifdef DEBUG -+local void check_match OF((deflate_state *s, IPos start, IPos match, -+ int length)); -+#endif -+ -+/* =========================================================================== -+ * Local data -+ */ -+ -+#define NIL 0 -+/* Tail of hash chains */ -+ -+#ifndef TOO_FAR -+# define TOO_FAR 4096 -+#endif -+/* Matches of length 3 are discarded if their distance exceeds TOO_FAR */ -+ -+#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) -+/* Minimum amount of lookahead, except at the end of the input file. -+ * See deflate.c for comments about the MIN_MATCH+1. -+ */ -+ -+/* Values for max_lazy_match, good_match and max_chain_length, depending on -+ * the desired pack level (0..9). The values given below have been tuned to -+ * exclude worst case performance for pathological files. Better values may be -+ * found for specific files. 
-+ */ -+typedef struct config_s { -+ ush good_length; /* reduce lazy search above this match length */ -+ ush max_lazy; /* do not perform lazy search above this match length */ -+ ush nice_length; /* quit search above this match length */ -+ ush max_chain; -+ compress_func func; -+} config; -+ -+local const config configuration_table[10] = { -+/* good lazy nice chain */ -+/* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */ -+/* 1 */ {4, 4, 8, 4, deflate_fast}, /* maximum speed, no lazy matches */ -+/* 2 */ {4, 5, 16, 8, deflate_fast}, -+/* 3 */ {4, 6, 32, 32, deflate_fast}, -+ -+/* 4 */ {4, 4, 16, 16, deflate_slow}, /* lazy matches */ -+/* 5 */ {8, 16, 32, 32, deflate_slow}, -+/* 6 */ {8, 16, 128, 128, deflate_slow}, -+/* 7 */ {8, 32, 128, 256, deflate_slow}, -+/* 8 */ {32, 128, 258, 1024, deflate_slow}, -+/* 9 */ {32, 258, 258, 4096, deflate_slow}}; /* maximum compression */ -+ -+/* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4 -+ * For deflate_fast() (levels <= 3) good is ignored and lazy has a different -+ * meaning. -+ */ -+ -+#define EQUAL 0 -+/* result of memcmp for equal strings */ -+ -+struct static_tree_desc_s {int dummy;}; /* for buggy compilers */ -+ -+/* =========================================================================== -+ * Update a hash value with the given input byte -+ * IN assertion: all calls to to UPDATE_HASH are made with consecutive -+ * input characters, so that a running hash key can be computed from the -+ * previous key instead of complete recalculation each time. -+ */ -+#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask) -+ -+ -+/* =========================================================================== -+ * Insert string str in the dictionary and set match_head to the previous head -+ * of the hash chain (the most recent string with same hash key). Return -+ * the previous length of the hash chain.
-+ * If this file is compiled with -DFASTEST, the compression level is forced -+ * to 1, and no hash chains are maintained. -+ * IN assertion: all calls to to INSERT_STRING are made with consecutive -+ * input characters and the first MIN_MATCH bytes of str are valid -+ * (except for the last MIN_MATCH-1 bytes of the input file). -+ */ -+#ifdef FASTEST -+#define INSERT_STRING(s, str, match_head) \ -+ (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \ -+ match_head = s->head[s->ins_h], \ -+ s->head[s->ins_h] = (Pos)(str)) -+#else -+#define INSERT_STRING(s, str, match_head) \ -+ (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \ -+ s->prev[(str) & s->w_mask] = match_head = s->head[s->ins_h], \ -+ s->head[s->ins_h] = (Pos)(str)) -+#endif -+ -+/* =========================================================================== -+ * Initialize the hash table (avoiding 64K overflow for 16 bit systems). -+ * prev[] will be initialized on the fly. -+ */ -+#define CLEAR_HASH(s) \ -+ s->head[s->hash_size-1] = NIL; \ -+ zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head)); -+ -+/* ========================================================================= */ -+int ZEXPORT deflateInit_(strm, level, version, stream_size) -+ z_streamp strm; -+ int level; -+ const char *version; -+ int stream_size; -+{ -+ return deflateInit2_(strm, level, Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL, -+ Z_DEFAULT_STRATEGY, version, stream_size); -+ /* To do: ignore strm->next_in if we use it as window */ -+} -+ -+/* ========================================================================= */ -+int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy, -+ version, stream_size) -+ z_streamp strm; -+ int level; -+ int method; -+ int windowBits; -+ int memLevel; -+ int strategy; -+ const char *version; -+ int stream_size; -+{ -+ deflate_state *s; -+ int noheader = 0; -+ static const char* my_version = ZLIB_VERSION; -+ -+ ushf *overlay; -+ /* We overlay 
pending_buf and d_buf+l_buf. This works since the average -+ * output size for (length,distance) codes is <= 24 bits. -+ */ -+ -+ if (version == Z_NULL || version[0] != my_version[0] || -+ stream_size != sizeof(z_stream)) { -+ return Z_VERSION_ERROR; -+ } -+ if (strm == Z_NULL) return Z_STREAM_ERROR; -+ -+ strm->msg = Z_NULL; -+ if (strm->zalloc == Z_NULL) { -+ return Z_STREAM_ERROR; -+/* strm->zalloc = zcalloc; -+ strm->opaque = (voidpf)0;*/ -+ } -+ if (strm->zfree == Z_NULL) return Z_STREAM_ERROR; /* strm->zfree = zcfree; */ -+ -+ if (level == Z_DEFAULT_COMPRESSION) level = 6; -+#ifdef FASTEST -+ level = 1; -+#endif -+ -+ if (windowBits < 0) { /* undocumented feature: suppress zlib header */ -+ noheader = 1; -+ windowBits = -windowBits; -+ } -+ if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED || -+ windowBits < 9 || windowBits > 15 || level < 0 || level > 9 || -+ strategy < 0 || strategy > Z_HUFFMAN_ONLY) { -+ return Z_STREAM_ERROR; -+ } -+ s = (deflate_state *) ZALLOC(strm, 1, sizeof(deflate_state)); -+ if (s == Z_NULL) return Z_MEM_ERROR; -+ strm->state = (struct internal_state FAR *)s; -+ s->strm = strm; -+ -+ s->noheader = noheader; -+ s->w_bits = windowBits; -+ s->w_size = 1 << s->w_bits; -+ s->w_mask = s->w_size - 1; -+ -+ s->hash_bits = memLevel + 7; -+ s->hash_size = 1 << s->hash_bits; -+ s->hash_mask = s->hash_size - 1; -+ s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH); -+ -+ s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte)); -+ s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos)); -+ s->head = (Posf *) ZALLOC(strm, s->hash_size, sizeof(Pos)); -+ -+ s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */ -+ -+ overlay = (ushf *) ZALLOC(strm, s->lit_bufsize, sizeof(ush)+2); -+ s->pending_buf = (uchf *) overlay; -+ s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L); -+ -+ if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL || -+ s->pending_buf == Z_NULL) { -+ strm->msg = 
ERR_MSG(Z_MEM_ERROR); -+ deflateEnd (strm); -+ return Z_MEM_ERROR; -+ } -+ s->d_buf = overlay + s->lit_bufsize/sizeof(ush); -+ s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize; -+ -+ s->level = level; -+ s->strategy = strategy; -+ s->method = (Byte)method; -+ -+ return deflateReset(strm); -+} -+ -+/* ========================================================================= */ -+int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength) -+ z_streamp strm; -+ const Bytef *dictionary; -+ uInt dictLength; -+{ -+ deflate_state *s; -+ uInt length = dictLength; -+ uInt n; -+ IPos hash_head = 0; -+ -+ if (strm == Z_NULL || strm->state == Z_NULL || dictionary == Z_NULL || -+ strm->state->status != INIT_STATE) return Z_STREAM_ERROR; -+ -+ s = strm->state; -+ strm->adler = adler32(strm->adler, dictionary, dictLength); -+ -+ if (length < MIN_MATCH) return Z_OK; -+ if (length > MAX_DIST(s)) { -+ length = MAX_DIST(s); -+#ifndef USE_DICT_HEAD -+ dictionary += dictLength - length; /* use the tail of the dictionary */ -+#endif -+ } -+ zmemcpy(s->window, dictionary, length); -+ s->strstart = length; -+ s->block_start = (long)length; -+ -+ /* Insert all strings in the hash table (except for the last two bytes). -+ * s->lookahead stays null, so s->ins_h will be recomputed at the next -+ * call of fill_window. 
-+ */ -+ s->ins_h = s->window[0]; -+ UPDATE_HASH(s, s->ins_h, s->window[1]); -+ for (n = 0; n <= length - MIN_MATCH; n++) { -+ INSERT_STRING(s, n, hash_head); -+ } -+ if (hash_head) hash_head = 0; /* to make compiler happy */ -+ return Z_OK; -+} -+ -+/* ========================================================================= */ -+int ZEXPORT deflateReset (strm) -+ z_streamp strm; -+{ -+ deflate_state *s; -+ -+ if (strm == Z_NULL || strm->state == Z_NULL || -+ strm->zalloc == Z_NULL || strm->zfree == Z_NULL) return Z_STREAM_ERROR; -+ -+ strm->total_in = strm->total_out = 0; -+ strm->msg = Z_NULL; /* use zfree if we ever allocate msg dynamically */ -+ strm->data_type = Z_UNKNOWN; -+ -+ s = (deflate_state *)strm->state; -+ s->pending = 0; -+ s->pending_out = s->pending_buf; -+ -+ if (s->noheader < 0) { -+ s->noheader = 0; /* was set to -1 by deflate(..., Z_FINISH); */ -+ } -+ s->status = s->noheader ? BUSY_STATE : INIT_STATE; -+ strm->adler = 1; -+ s->last_flush = Z_NO_FLUSH; -+ -+ _tr_init(s); -+ lm_init(s); -+ -+ return Z_OK; -+} -+ -+/* ========================================================================= */ -+int ZEXPORT deflateParams(strm, level, strategy) -+ z_streamp strm; -+ int level; -+ int strategy; -+{ -+ deflate_state *s; -+ compress_func func; -+ int err = Z_OK; -+ -+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; -+ s = strm->state; -+ -+ if (level == Z_DEFAULT_COMPRESSION) { -+ level = 6; -+ } -+ if (level < 0 || level > 9 || strategy < 0 || strategy > Z_HUFFMAN_ONLY) { -+ return Z_STREAM_ERROR; -+ } -+ func = configuration_table[s->level].func; -+ -+ if (func != configuration_table[level].func && strm->total_in != 0) { -+ /* Flush the last buffer: */ -+ err = deflate(strm, Z_PARTIAL_FLUSH); -+ } -+ if (s->level != level) { -+ s->level = level; -+ s->max_lazy_match = configuration_table[level].max_lazy; -+ s->good_match = configuration_table[level].good_length; -+ s->nice_match = configuration_table[level].nice_length; -+ 
s->max_chain_length = configuration_table[level].max_chain; -+ } -+ s->strategy = strategy; -+ return err; -+} -+ -+/* ========================================================================= -+ * Put a short in the pending buffer. The 16-bit value is put in MSB order. -+ * IN assertion: the stream state is correct and there is enough room in -+ * pending_buf. -+ */ -+local void putShortMSB (s, b) -+ deflate_state *s; -+ uInt b; -+{ -+ put_byte(s, (Byte)(b >> 8)); -+ put_byte(s, (Byte)(b & 0xff)); -+} -+ -+/* ========================================================================= -+ * Flush as much pending output as possible. All deflate() output goes -+ * through this function so some applications may wish to modify it -+ * to avoid allocating a large strm->next_out buffer and copying into it. -+ * (See also read_buf()). -+ */ -+local void flush_pending(strm) -+ z_streamp strm; -+{ -+ unsigned len = strm->state->pending; -+ -+ if (len > strm->avail_out) len = strm->avail_out; -+ if (len == 0) return; -+ -+ zmemcpy(strm->next_out, strm->state->pending_out, len); -+ strm->next_out += len; -+ strm->state->pending_out += len; -+ strm->total_out += len; -+ strm->avail_out -= len; -+ strm->state->pending -= len; -+ if (strm->state->pending == 0) { -+ strm->state->pending_out = strm->state->pending_buf; -+ } -+} -+ -+/* ========================================================================= */ -+int ZEXPORT deflate (strm, flush) -+ z_streamp strm; -+ int flush; -+{ -+ int old_flush; /* value of flush param for previous deflate call */ -+ deflate_state *s; -+ -+ if (strm == Z_NULL || strm->state == Z_NULL || -+ flush > Z_FINISH || flush < 0) { -+ return Z_STREAM_ERROR; -+ } -+ s = strm->state; -+ -+ if (strm->next_out == Z_NULL || -+ (strm->next_in == Z_NULL && strm->avail_in != 0) || -+ (s->status == FINISH_STATE && flush != Z_FINISH)) { -+ ERR_RETURN(strm, Z_STREAM_ERROR); -+ } -+ if (strm->avail_out == 0) ERR_RETURN(strm, Z_BUF_ERROR); -+ -+ s->strm = strm; /* 
just in case */ -+ old_flush = s->last_flush; -+ s->last_flush = flush; -+ -+ /* Write the zlib header */ -+ if (s->status == INIT_STATE) { -+ -+ uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8; -+ uInt level_flags = (s->level-1) >> 1; -+ -+ if (level_flags > 3) level_flags = 3; -+ header |= (level_flags << 6); -+ if (s->strstart != 0) header |= PRESET_DICT; -+ header += 31 - (header % 31); -+ -+ s->status = BUSY_STATE; -+ putShortMSB(s, header); -+ -+ /* Save the adler32 of the preset dictionary: */ -+ if (s->strstart != 0) { -+ putShortMSB(s, (uInt)(strm->adler >> 16)); -+ putShortMSB(s, (uInt)(strm->adler & 0xffff)); -+ } -+ strm->adler = 1L; -+ } -+ -+ /* Flush as much pending output as possible */ -+ if (s->pending != 0) { -+ flush_pending(strm); -+ if (strm->avail_out == 0) { -+ /* Since avail_out is 0, deflate will be called again with -+ * more output space, but possibly with both pending and -+ * avail_in equal to zero. There won't be anything to do, -+ * but this is not an error situation so make sure we -+ * return OK instead of BUF_ERROR at next call of deflate: -+ */ -+ s->last_flush = -1; -+ return Z_OK; -+ } -+ -+ /* Make sure there is something to do and avoid duplicate consecutive -+ * flushes. For repeated and useless calls with Z_FINISH, we keep -+ * returning Z_STREAM_END instead of Z_BUFF_ERROR. -+ */ -+ } else if (strm->avail_in == 0 && flush <= old_flush && -+ flush != Z_FINISH) { -+ ERR_RETURN(strm, Z_BUF_ERROR); -+ } -+ -+ /* User must not provide more input after the first FINISH: */ -+ if (s->status == FINISH_STATE && strm->avail_in != 0) { -+ ERR_RETURN(strm, Z_BUF_ERROR); -+ } -+ -+ /* Start a new block or continue the current one. 
-+ */ -+ if (strm->avail_in != 0 || s->lookahead != 0 || -+ (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) { -+ block_state bstate; -+ -+ bstate = (*(configuration_table[s->level].func))(s, flush); -+ -+ if (bstate == finish_started || bstate == finish_done) { -+ s->status = FINISH_STATE; -+ } -+ if (bstate == need_more || bstate == finish_started) { -+ if (strm->avail_out == 0) { -+ s->last_flush = -1; /* avoid BUF_ERROR next call, see above */ -+ } -+ return Z_OK; -+ /* If flush != Z_NO_FLUSH && avail_out == 0, the next call -+ * of deflate should use the same flush parameter to make sure -+ * that the flush is complete. So we don't have to output an -+ * empty block here, this will be done at next call. This also -+ * ensures that for a very small output buffer, we emit at most -+ * one empty block. -+ */ -+ } -+ if (bstate == block_done) { -+ if (flush == Z_PARTIAL_FLUSH) { -+ _tr_align(s); -+ } else { /* FULL_FLUSH or SYNC_FLUSH */ -+ _tr_stored_block(s, (char*)0, 0L, 0); -+ /* For a full flush, this empty block will be recognized -+ * as a special marker by inflate_sync(). -+ */ -+ if (flush == Z_FULL_FLUSH) { -+ CLEAR_HASH(s); /* forget history */ -+ } -+ } -+ flush_pending(strm); -+ if (strm->avail_out == 0) { -+ s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */ -+ return Z_OK; -+ } -+ } -+ } -+ Assert(strm->avail_out > 0, "bug2"); -+ -+ if (flush != Z_FINISH) return Z_OK; -+ if (s->noheader) return Z_STREAM_END; -+ -+ /* Write the zlib trailer (adler32) */ -+ putShortMSB(s, (uInt)(strm->adler >> 16)); -+ putShortMSB(s, (uInt)(strm->adler & 0xffff)); -+ flush_pending(strm); -+ /* If avail_out is zero, the application will call deflate again -+ * to flush the rest. -+ */ -+ s->noheader = -1; /* write the trailer only once! */ -+ return s->pending != 0 ? 
Z_OK : Z_STREAM_END; -+} -+ -+/* ========================================================================= */ -+int ZEXPORT deflateEnd (strm) -+ z_streamp strm; -+{ -+ int status; -+ -+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; -+ -+ status = strm->state->status; -+ if (status != INIT_STATE && status != BUSY_STATE && -+ status != FINISH_STATE) { -+ return Z_STREAM_ERROR; -+ } -+ -+ /* Deallocate in reverse order of allocations: */ -+ TRY_FREE(strm, strm->state->pending_buf); -+ TRY_FREE(strm, strm->state->head); -+ TRY_FREE(strm, strm->state->prev); -+ TRY_FREE(strm, strm->state->window); -+ -+ ZFREE(strm, strm->state); -+ strm->state = Z_NULL; -+ -+ return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; -+} -+ -+/* ========================================================================= -+ * Copy the source state to the destination state. -+ * To simplify the source, this is not supported for 16-bit MSDOS (which -+ * doesn't have enough memory anyway to duplicate compression states). 
-+ */ -+int ZEXPORT deflateCopy (dest, source) -+ z_streamp dest; -+ z_streamp source; -+{ -+#ifdef MAXSEG_64K -+ return Z_STREAM_ERROR; -+#else -+ deflate_state *ds; -+ deflate_state *ss; -+ ushf *overlay; -+ -+ -+ if (source == Z_NULL || dest == Z_NULL || source->state == Z_NULL) { -+ return Z_STREAM_ERROR; -+ } -+ -+ ss = source->state; -+ -+ *dest = *source; -+ -+ ds = (deflate_state *) ZALLOC(dest, 1, sizeof(deflate_state)); -+ if (ds == Z_NULL) return Z_MEM_ERROR; -+ dest->state = (struct internal_state FAR *) ds; -+ *ds = *ss; -+ ds->strm = dest; -+ -+ ds->window = (Bytef *) ZALLOC(dest, ds->w_size, 2*sizeof(Byte)); -+ ds->prev = (Posf *) ZALLOC(dest, ds->w_size, sizeof(Pos)); -+ ds->head = (Posf *) ZALLOC(dest, ds->hash_size, sizeof(Pos)); -+ overlay = (ushf *) ZALLOC(dest, ds->lit_bufsize, sizeof(ush)+2); -+ ds->pending_buf = (uchf *) overlay; -+ -+ if (ds->window == Z_NULL || ds->prev == Z_NULL || ds->head == Z_NULL || -+ ds->pending_buf == Z_NULL) { -+ deflateEnd (dest); -+ return Z_MEM_ERROR; -+ } -+ /* following zmemcpy do not work for 16-bit MSDOS */ -+ zmemcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte)); -+ zmemcpy(ds->prev, ss->prev, ds->w_size * sizeof(Pos)); -+ zmemcpy(ds->head, ss->head, ds->hash_size * sizeof(Pos)); -+ zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size); -+ -+ ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf); -+ ds->d_buf = overlay + ds->lit_bufsize/sizeof(ush); -+ ds->l_buf = ds->pending_buf + (1+sizeof(ush))*ds->lit_bufsize; -+ -+ ds->l_desc.dyn_tree = ds->dyn_ltree; -+ ds->d_desc.dyn_tree = ds->dyn_dtree; -+ ds->bl_desc.dyn_tree = ds->bl_tree; -+ -+ return Z_OK; -+#endif -+} -+ -+/* =========================================================================== -+ * Read a new buffer from the current input stream, update the adler32 -+ * and total number of bytes read. 
All deflate() input goes through -+ * this function so some applications may wish to modify it to avoid -+ * allocating a large strm->next_in buffer and copying from it. -+ * (See also flush_pending()). -+ */ -+local int read_buf(strm, buf, size) -+ z_streamp strm; -+ Bytef *buf; -+ unsigned size; -+{ -+ unsigned len = strm->avail_in; -+ -+ if (len > size) len = size; -+ if (len == 0) return 0; -+ -+ strm->avail_in -= len; -+ -+ if (!strm->state->noheader) { -+ strm->adler = adler32(strm->adler, strm->next_in, len); -+ } -+ zmemcpy(buf, strm->next_in, len); -+ strm->next_in += len; -+ strm->total_in += len; -+ -+ return (int)len; -+} -+ -+/* =========================================================================== -+ * Initialize the "longest match" routines for a new zlib stream -+ */ -+local void lm_init (s) -+ deflate_state *s; -+{ -+ s->window_size = (ulg)2L*s->w_size; -+ -+ CLEAR_HASH(s); -+ -+ /* Set the default configuration parameters: -+ */ -+ s->max_lazy_match = configuration_table[s->level].max_lazy; -+ s->good_match = configuration_table[s->level].good_length; -+ s->nice_match = configuration_table[s->level].nice_length; -+ s->max_chain_length = configuration_table[s->level].max_chain; -+ -+ s->strstart = 0; -+ s->block_start = 0L; -+ s->lookahead = 0; -+ s->match_length = s->prev_length = MIN_MATCH-1; -+ s->match_available = 0; -+ s->ins_h = 0; -+#ifdef ASMV -+ match_init(); /* initialize the asm code */ -+#endif -+} -+ -+/* =========================================================================== -+ * Set match_start to the longest match starting at the given string and -+ * return its length. Matches shorter or equal to prev_length are discarded, -+ * in which case the result is equal to prev_length and match_start is -+ * garbage. 
-+ * IN assertions: cur_match is the head of the hash chain for the current -+ * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1 -+ * OUT assertion: the match length is not greater than s->lookahead. -+ */ -+#ifndef ASMV -+/* For 80x86 and 680x0, an optimized version will be provided in match.asm or -+ * match.S. The code will be functionally equivalent. -+ */ -+#ifndef FASTEST -+local uInt longest_match(s, cur_match) -+ deflate_state *s; -+ IPos cur_match; /* current match */ -+{ -+ unsigned chain_length = s->max_chain_length;/* max hash chain length */ -+ register Bytef *scan = s->window + s->strstart; /* current string */ -+ register Bytef *match; /* matched string */ -+ register int len; /* length of current match */ -+ int best_len = s->prev_length; /* best match length so far */ -+ int nice_match = s->nice_match; /* stop if match long enough */ -+ IPos limit = s->strstart > (IPos)MAX_DIST(s) ? -+ s->strstart - (IPos)MAX_DIST(s) : NIL; -+ /* Stop when cur_match becomes <= limit. To simplify the code, -+ * we prevent matches with the string of window index 0. -+ */ -+ Posf *prev = s->prev; -+ uInt wmask = s->w_mask; -+ -+#ifdef UNALIGNED_OK -+ /* Compare two bytes at a time. Note: this is not always beneficial. -+ * Try with and without -DUNALIGNED_OK to check. -+ */ -+ register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1; -+ register ush scan_start = *(ushf*)scan; -+ register ush scan_end = *(ushf*)(scan+best_len-1); -+#else -+ register Bytef *strend = s->window + s->strstart + MAX_MATCH; -+ register Byte scan_end1 = scan[best_len-1]; -+ register Byte scan_end = scan[best_len]; -+#endif -+ -+ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. -+ * It is easy to get rid of this optimization if necessary. 
-+ */ -+ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); -+ -+ /* Do not waste too much time if we already have a good match: */ -+ if (s->prev_length >= s->good_match) { -+ chain_length >>= 2; -+ } -+ /* Do not look for matches beyond the end of the input. This is necessary -+ * to make deflate deterministic. -+ */ -+ if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; -+ -+ Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); -+ -+ do { -+ Assert(cur_match < s->strstart, "no future"); -+ match = s->window + cur_match; -+ -+ /* Skip to next match if the match length cannot increase -+ * or if the match length is less than 2: -+ */ -+#if (defined(UNALIGNED_OK) && MAX_MATCH == 258) -+ /* This code assumes sizeof(unsigned short) == 2. Do not use -+ * UNALIGNED_OK if your compiler uses a different size. -+ */ -+ if (*(ushf*)(match+best_len-1) != scan_end || -+ *(ushf*)match != scan_start) continue; -+ -+ /* It is not necessary to compare scan[2] and match[2] since they are -+ * always equal when the other bytes match, given that the hash keys -+ * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at -+ * strstart+3, +5, ... up to strstart+257. We check for insufficient -+ * lookahead only every 4th comparison; the 128th check will be made -+ * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is -+ * necessary to put more guard bytes at the end of the window, or -+ * to check more often for insufficient lookahead. 
-+ */ -+ Assert(scan[2] == match[2], "scan[2]?"); -+ scan++, match++; -+ do { -+ } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) && -+ *(ushf*)(scan+=2) == *(ushf*)(match+=2) && -+ *(ushf*)(scan+=2) == *(ushf*)(match+=2) && -+ *(ushf*)(scan+=2) == *(ushf*)(match+=2) && -+ scan < strend); -+ /* The funny "do {}" generates better code on most compilers */ -+ -+ /* Here, scan <= window+strstart+257 */ -+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); -+ if (*scan == *match) scan++; -+ -+ len = (MAX_MATCH - 1) - (int)(strend-scan); -+ scan = strend - (MAX_MATCH-1); -+ -+#else /* UNALIGNED_OK */ -+ -+ if (match[best_len] != scan_end || -+ match[best_len-1] != scan_end1 || -+ *match != *scan || -+ *++match != scan[1]) continue; -+ -+ /* The check at best_len-1 can be removed because it will be made -+ * again later. (This heuristic is not always a win.) -+ * It is not necessary to compare scan[2] and match[2] since they -+ * are always equal when the other bytes match, given that -+ * the hash keys are equal and that HASH_BITS >= 8. -+ */ -+ scan += 2, match++; -+ Assert(*scan == *match, "match[2]?"); -+ -+ /* We check for insufficient lookahead only every 8th comparison; -+ * the 256th check will be made at strstart+258. 
-+ */ -+ do { -+ } while (*++scan == *++match && *++scan == *++match && -+ *++scan == *++match && *++scan == *++match && -+ *++scan == *++match && *++scan == *++match && -+ *++scan == *++match && *++scan == *++match && -+ scan < strend); -+ -+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); -+ -+ len = MAX_MATCH - (int)(strend - scan); -+ scan = strend - MAX_MATCH; -+ -+#endif /* UNALIGNED_OK */ -+ -+ if (len > best_len) { -+ s->match_start = cur_match; -+ best_len = len; -+ if (len >= nice_match) break; -+#ifdef UNALIGNED_OK -+ scan_end = *(ushf*)(scan+best_len-1); -+#else -+ scan_end1 = scan[best_len-1]; -+ scan_end = scan[best_len]; -+#endif -+ } -+ } while ((cur_match = prev[cur_match & wmask]) > limit -+ && --chain_length != 0); -+ -+ if ((uInt)best_len <= s->lookahead) return (uInt)best_len; -+ return s->lookahead; -+} -+ -+#else /* FASTEST */ -+/* --------------------------------------------------------------------------- -+ * Optimized version for level == 1 only -+ */ -+local uInt longest_match(s, cur_match) -+ deflate_state *s; -+ IPos cur_match; /* current match */ -+{ -+ register Bytef *scan = s->window + s->strstart; /* current string */ -+ register Bytef *match; /* matched string */ -+ register int len; /* length of current match */ -+ register Bytef *strend = s->window + s->strstart + MAX_MATCH; -+ -+ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. -+ * It is easy to get rid of this optimization if necessary. -+ */ -+ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); -+ -+ Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); -+ -+ Assert(cur_match < s->strstart, "no future"); -+ -+ match = s->window + cur_match; -+ -+ /* Return failure if the match length is less than 2: -+ */ -+ if (match[0] != scan[0] || match[1] != scan[1]) return MIN_MATCH-1; -+ -+ /* The check at best_len-1 can be removed because it will be made -+ * again later. 
(This heuristic is not always a win.) -+ * It is not necessary to compare scan[2] and match[2] since they -+ * are always equal when the other bytes match, given that -+ * the hash keys are equal and that HASH_BITS >= 8. -+ */ -+ scan += 2, match += 2; -+ Assert(*scan == *match, "match[2]?"); -+ -+ /* We check for insufficient lookahead only every 8th comparison; -+ * the 256th check will be made at strstart+258. -+ */ -+ do { -+ } while (*++scan == *++match && *++scan == *++match && -+ *++scan == *++match && *++scan == *++match && -+ *++scan == *++match && *++scan == *++match && -+ *++scan == *++match && *++scan == *++match && -+ scan < strend); -+ -+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); -+ -+ len = MAX_MATCH - (int)(strend - scan); -+ -+ if (len < MIN_MATCH) return MIN_MATCH - 1; -+ -+ s->match_start = cur_match; -+ return len <= s->lookahead ? len : s->lookahead; -+} -+#endif /* FASTEST */ -+#endif /* ASMV */ -+ -+#ifdef DEBUG -+/* =========================================================================== -+ * Check that the match at match_start is indeed a match. -+ */ -+local void check_match(s, start, match, length) -+ deflate_state *s; -+ IPos start, match; -+ int length; -+{ -+ /* check that the match is indeed a match */ -+ if (zmemcmp(s->window + match, -+ s->window + start, length) != EQUAL) { -+ fprintf(stderr, " start %u, match %u, length %d\n", -+ start, match, length); -+ do { -+ fprintf(stderr, "%c%c", s->window[match++], s->window[start++]); -+ } while (--length != 0); -+ z_error("invalid match"); -+ } -+ if (z_verbose > 1) { -+ fprintf(stderr,"\\[%d,%d]", start-match, length); -+ do { putc(s->window[start++], stderr); } while (--length != 0); -+ } -+} -+#else -+# define check_match(s, start, match, length) -+#endif -+ -+/* =========================================================================== -+ * Fill the window when the lookahead becomes insufficient. -+ * Updates strstart and lookahead. 
-+ * -+ * IN assertion: lookahead < MIN_LOOKAHEAD -+ * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD -+ * At least one byte has been read, or avail_in == 0; reads are -+ * performed for at least two bytes (required for the zip translate_eol -+ * option -- not supported here). -+ */ -+local void fill_window(s) -+ deflate_state *s; -+{ -+ register unsigned n, m; -+ register Posf *p; -+ unsigned more; /* Amount of free space at the end of the window. */ -+ uInt wsize = s->w_size; -+ -+ do { -+ more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart); -+ -+ /* Deal with !@#$% 64K limit: */ -+ if (more == 0 && s->strstart == 0 && s->lookahead == 0) { -+ more = wsize; -+ -+ } else if (more == (unsigned)(-1)) { -+ /* Very unlikely, but possible on 16 bit machine if strstart == 0 -+ * and lookahead == 1 (input done one byte at time) -+ */ -+ more--; -+ -+ /* If the window is almost full and there is insufficient lookahead, -+ * move the upper half to the lower one to make room in the upper half. -+ */ -+ } else if (s->strstart >= wsize+MAX_DIST(s)) { -+ -+ zmemcpy(s->window, s->window+wsize, (unsigned)wsize); -+ s->match_start -= wsize; -+ s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ -+ s->block_start -= (long) wsize; -+ -+ /* Slide the hash table (could be avoided with 32 bit values -+ at the expense of memory usage). We slide even when level == 0 -+ to keep the hash table consistent if we switch back to level > 0 -+ later. (Using level 0 permanently is not an optimal usage of -+ zlib, so we don't care about this pathological case.) -+ */ -+ n = s->hash_size; -+ p = &s->head[n]; -+ do { -+ m = *--p; -+ *p = (Pos)(m >= wsize ? m-wsize : NIL); -+ } while (--n); -+ -+ n = wsize; -+#ifndef FASTEST -+ p = &s->prev[n]; -+ do { -+ m = *--p; -+ *p = (Pos)(m >= wsize ? m-wsize : NIL); -+ /* If n is not on any hash chain, prev[n] is garbage but -+ * its value will never be used. 
-+ */ -+ } while (--n); -+#endif -+ more += wsize; -+ } -+ if (s->strm->avail_in == 0) return; -+ -+ /* If there was no sliding: -+ * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && -+ * more == window_size - lookahead - strstart -+ * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) -+ * => more >= window_size - 2*WSIZE + 2 -+ * In the BIG_MEM or MMAP case (not yet supported), -+ * window_size == input_size + MIN_LOOKAHEAD && -+ * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD. -+ * Otherwise, window_size == 2*WSIZE so more >= 2. -+ * If there was sliding, more >= WSIZE. So in all cases, more >= 2. -+ */ -+ Assert(more >= 2, "more < 2"); -+ -+ n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more); -+ s->lookahead += n; -+ -+ /* Initialize the hash value now that we have some input: */ -+ if (s->lookahead >= MIN_MATCH) { -+ s->ins_h = s->window[s->strstart]; -+ UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); -+#if MIN_MATCH != 3 -+ Call UPDATE_HASH() MIN_MATCH-3 more times -+#endif -+ } -+ /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, -+ * but this is not important since only literal bytes will be emitted. -+ */ -+ -+ } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); -+} -+ -+/* =========================================================================== -+ * Flush the current block, with given end-of-file flag. -+ * IN assertion: strstart is set to the end of the current match. -+ */ -+#define FLUSH_BLOCK_ONLY(s, eof) { \ -+ _tr_flush_block(s, (s->block_start >= 0L ? \ -+ (charf *)&s->window[(unsigned)s->block_start] : \ -+ (charf *)Z_NULL), \ -+ (ulg)((long)s->strstart - s->block_start), \ -+ (eof)); \ -+ s->block_start = s->strstart; \ -+ flush_pending(s->strm); \ -+ Tracev((stderr,"[FLUSH]")); \ -+} -+ -+/* Same but force premature exit if necessary. 
*/ -+#define FLUSH_BLOCK(s, eof) { \ -+ FLUSH_BLOCK_ONLY(s, eof); \ -+ if (s->strm->avail_out == 0) return (eof) ? finish_started : need_more; \ -+} -+ -+/* =========================================================================== -+ * Copy without compression as much as possible from the input stream, return -+ * the current block state. -+ * This function does not insert new strings in the dictionary since -+ * uncompressible data is probably not useful. This function is used -+ * only for the level=0 compression option. -+ * NOTE: this function should be optimized to avoid extra copying from -+ * window to pending_buf. -+ */ -+local block_state deflate_stored(s, flush) -+ deflate_state *s; -+ int flush; -+{ -+ /* Stored blocks are limited to 0xffff bytes, pending_buf is limited -+ * to pending_buf_size, and each stored block has a 5 byte header: -+ */ -+ ulg max_block_size = 0xffff; -+ ulg max_start; -+ -+ if (max_block_size > s->pending_buf_size - 5) { -+ max_block_size = s->pending_buf_size - 5; -+ } -+ -+ /* Copy as much as possible from input to output: */ -+ for (;;) { -+ /* Fill the window as much as possible: */ -+ if (s->lookahead <= 1) { -+ -+ Assert(s->strstart < s->w_size+MAX_DIST(s) || -+ s->block_start >= (long)s->w_size, "slide too late"); -+ -+ fill_window(s); -+ if (s->lookahead == 0 && flush == Z_NO_FLUSH) return need_more; -+ -+ if (s->lookahead == 0) break; /* flush the current block */ -+ } -+ Assert(s->block_start >= 0L, "block gone"); -+ -+ s->strstart += s->lookahead; -+ s->lookahead = 0; -+ -+ /* Emit a stored block if pending_buf will be full: */ -+ max_start = s->block_start + max_block_size; -+ if (s->strstart == 0 || (ulg)s->strstart >= max_start) { -+ /* strstart == 0 is possible when wraparound on 16-bit machine */ -+ s->lookahead = (uInt)(s->strstart - max_start); -+ s->strstart = (uInt)max_start; -+ FLUSH_BLOCK(s, 0); -+ } -+ /* Flush if we may have to slide, otherwise block_start may become -+ * negative and the data will be 
gone: -+ */ -+ if (s->strstart - (uInt)s->block_start >= MAX_DIST(s)) { -+ FLUSH_BLOCK(s, 0); -+ } -+ } -+ FLUSH_BLOCK(s, flush == Z_FINISH); -+ return flush == Z_FINISH ? finish_done : block_done; -+} -+ -+/* =========================================================================== -+ * Compress as much as possible from the input stream, return the current -+ * block state. -+ * This function does not perform lazy evaluation of matches and inserts -+ * new strings in the dictionary only for unmatched strings or for short -+ * matches. It is used only for the fast compression options. -+ */ -+local block_state deflate_fast(s, flush) -+ deflate_state *s; -+ int flush; -+{ -+ IPos hash_head = NIL; /* head of the hash chain */ -+ int bflush; /* set if current block must be flushed */ -+ -+ for (;;) { -+ /* Make sure that we always have enough lookahead, except -+ * at the end of the input file. We need MAX_MATCH bytes -+ * for the next match, plus MIN_MATCH bytes to insert the -+ * string following the next match. -+ */ -+ if (s->lookahead < MIN_LOOKAHEAD) { -+ fill_window(s); -+ if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { -+ return need_more; -+ } -+ if (s->lookahead == 0) break; /* flush the current block */ -+ } -+ -+ /* Insert the string window[strstart .. strstart+2] in the -+ * dictionary, and set hash_head to the head of the hash chain: -+ */ -+ if (s->lookahead >= MIN_MATCH) { -+ INSERT_STRING(s, s->strstart, hash_head); -+ } -+ -+ /* Find the longest match, discarding those <= prev_length. -+ * At this point we have always match_length < MIN_MATCH -+ */ -+ if (hash_head != NIL && s->strstart - hash_head <= MAX_DIST(s)) { -+ /* To simplify the code, we prevent matches with the string -+ * of window index 0 (in particular we have to avoid a match -+ * of the string with itself at the start of the input file). 
-+ */ -+ if (s->strategy != Z_HUFFMAN_ONLY) { -+ s->match_length = longest_match (s, hash_head); -+ } -+ /* longest_match() sets match_start */ -+ } -+ if (s->match_length >= MIN_MATCH) { -+ check_match(s, s->strstart, s->match_start, s->match_length); -+ -+ _tr_tally_dist(s, s->strstart - s->match_start, -+ s->match_length - MIN_MATCH, bflush); -+ -+ s->lookahead -= s->match_length; -+ -+ /* Insert new strings in the hash table only if the match length -+ * is not too large. This saves time but degrades compression. -+ */ -+#ifndef FASTEST -+ if (s->match_length <= s->max_insert_length && -+ s->lookahead >= MIN_MATCH) { -+ s->match_length--; /* string at strstart already in hash table */ -+ do { -+ s->strstart++; -+ INSERT_STRING(s, s->strstart, hash_head); -+ /* strstart never exceeds WSIZE-MAX_MATCH, so there are -+ * always MIN_MATCH bytes ahead. -+ */ -+ } while (--s->match_length != 0); -+ s->strstart++; -+ } else -+#endif -+ { -+ s->strstart += s->match_length; -+ s->match_length = 0; -+ s->ins_h = s->window[s->strstart]; -+ UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); -+#if MIN_MATCH != 3 -+ Call UPDATE_HASH() MIN_MATCH-3 more times -+#endif -+ /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not -+ * matter since it will be recomputed at next deflate call. -+ */ -+ } -+ } else { -+ /* No match, output a literal byte */ -+ Tracevv((stderr,"%c", s->window[s->strstart])); -+ _tr_tally_lit (s, s->window[s->strstart], bflush); -+ s->lookahead--; -+ s->strstart++; -+ } -+ if (bflush) FLUSH_BLOCK(s, 0); -+ } -+ FLUSH_BLOCK(s, flush == Z_FINISH); -+ return flush == Z_FINISH ? finish_done : block_done; -+} -+ -+/* =========================================================================== -+ * Same as above, but achieves better compression. We use a lazy -+ * evaluation for matches: a match is finally adopted only if there is -+ * no better match at the next window position. 
-+ */ -+local block_state deflate_slow(s, flush) -+ deflate_state *s; -+ int flush; -+{ -+ IPos hash_head = NIL; /* head of hash chain */ -+ int bflush; /* set if current block must be flushed */ -+ -+ /* Process the input block. */ -+ for (;;) { -+ /* Make sure that we always have enough lookahead, except -+ * at the end of the input file. We need MAX_MATCH bytes -+ * for the next match, plus MIN_MATCH bytes to insert the -+ * string following the next match. -+ */ -+ if (s->lookahead < MIN_LOOKAHEAD) { -+ fill_window(s); -+ if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { -+ return need_more; -+ } -+ if (s->lookahead == 0) break; /* flush the current block */ -+ } -+ -+ /* Insert the string window[strstart .. strstart+2] in the -+ * dictionary, and set hash_head to the head of the hash chain: -+ */ -+ if (s->lookahead >= MIN_MATCH) { -+ INSERT_STRING(s, s->strstart, hash_head); -+ } -+ -+ /* Find the longest match, discarding those <= prev_length. -+ */ -+ s->prev_length = s->match_length, s->prev_match = s->match_start; -+ s->match_length = MIN_MATCH-1; -+ -+ if (hash_head != NIL && s->prev_length < s->max_lazy_match && -+ s->strstart - hash_head <= MAX_DIST(s)) { -+ /* To simplify the code, we prevent matches with the string -+ * of window index 0 (in particular we have to avoid a match -+ * of the string with itself at the start of the input file). -+ */ -+ if (s->strategy != Z_HUFFMAN_ONLY) { -+ s->match_length = longest_match (s, hash_head); -+ } -+ /* longest_match() sets match_start */ -+ -+ if (s->match_length <= 5 && (s->strategy == Z_FILTERED || -+ (s->match_length == MIN_MATCH && -+ s->strstart - s->match_start > TOO_FAR))) { -+ -+ /* If prev_match is also MIN_MATCH, match_start is garbage -+ * but we will ignore the current match anyway. 
-+ */ -+ s->match_length = MIN_MATCH-1; -+ } -+ } -+ /* If there was a match at the previous step and the current -+ * match is not better, output the previous match: -+ */ -+ if (s->prev_length >= MIN_MATCH && s->match_length <= s->prev_length) { -+ uInt max_insert = s->strstart + s->lookahead - MIN_MATCH; -+ /* Do not insert strings in hash table beyond this. */ -+ -+ check_match(s, s->strstart-1, s->prev_match, s->prev_length); -+ -+ _tr_tally_dist(s, s->strstart -1 - s->prev_match, -+ s->prev_length - MIN_MATCH, bflush); -+ -+ /* Insert in hash table all strings up to the end of the match. -+ * strstart-1 and strstart are already inserted. If there is not -+ * enough lookahead, the last two strings are not inserted in -+ * the hash table. -+ */ -+ s->lookahead -= s->prev_length-1; -+ s->prev_length -= 2; -+ do { -+ if (++s->strstart <= max_insert) { -+ INSERT_STRING(s, s->strstart, hash_head); -+ } -+ } while (--s->prev_length != 0); -+ s->match_available = 0; -+ s->match_length = MIN_MATCH-1; -+ s->strstart++; -+ -+ if (bflush) FLUSH_BLOCK(s, 0); -+ -+ } else if (s->match_available) { -+ /* If there was no match at the previous position, output a -+ * single literal. If there was a match but the current match -+ * is longer, truncate the previous match to a single literal. -+ */ -+ Tracevv((stderr,"%c", s->window[s->strstart-1])); -+ _tr_tally_lit(s, s->window[s->strstart-1], bflush); -+ if (bflush) { -+ FLUSH_BLOCK_ONLY(s, 0); -+ } -+ s->strstart++; -+ s->lookahead--; -+ if (s->strm->avail_out == 0) return need_more; -+ } else { -+ /* There is no previous match to compare with, wait for -+ * the next step to decide. 
-+ */ -+ s->match_available = 1; -+ s->strstart++; -+ s->lookahead--; -+ } -+ } -+ Assert (flush != Z_NO_FLUSH, "no flush?"); -+ if (s->match_available) { -+ Tracevv((stderr,"%c", s->window[s->strstart-1])); -+ _tr_tally_lit(s, s->window[s->strstart-1], bflush); -+ s->match_available = 0; -+ } -+ FLUSH_BLOCK(s, flush == Z_FINISH); -+ return flush == Z_FINISH ? finish_done : block_done; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/deflate.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,318 @@ -+/* deflate.h -- internal compression state -+ * Copyright (C) 1995-2002 Jean-loup Gailly -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* WARNING: this file should *not* be used by applications. It is -+ part of the implementation of the compression library and is -+ subject to change. Applications should only use zlib.h. -+ */ -+ -+/* @(#) $Id: deflate.h,v 1.5 2004/07/10 07:48:38 mcr Exp $ */ -+ -+#ifndef _DEFLATE_H -+#define _DEFLATE_H -+ -+#include "zlib/zutil.h" -+ -+/* =========================================================================== -+ * Internal compression state. -+ */ -+ -+#define LENGTH_CODES 29 -+/* number of length codes, not counting the special END_BLOCK code */ -+ -+#define LITERALS 256 -+/* number of literal bytes 0..255 */ -+ -+#define L_CODES (LITERALS+1+LENGTH_CODES) -+/* number of Literal or Length codes, including the END_BLOCK code */ -+ -+#define D_CODES 30 -+/* number of distance codes */ -+ -+#define BL_CODES 19 -+/* number of codes used to transfer the bit lengths */ -+ -+#define HEAP_SIZE (2*L_CODES+1) -+/* maximum heap size */ -+ -+#define MAX_BITS 15 -+/* All codes must not exceed MAX_BITS bits */ -+ -+#define INIT_STATE 42 -+#define BUSY_STATE 113 -+#define FINISH_STATE 666 -+/* Stream status */ -+ -+ -+/* Data structure describing a single value and its code string. 
*/ -+typedef struct ct_data_s { -+ union { -+ ush freq; /* frequency count */ -+ ush code; /* bit string */ -+ } fc; -+ union { -+ ush dad; /* father node in Huffman tree */ -+ ush len; /* length of bit string */ -+ } dl; -+} FAR ct_data; -+ -+#define Freq fc.freq -+#define Code fc.code -+#define Dad dl.dad -+#define Len dl.len -+ -+typedef struct static_tree_desc_s static_tree_desc; -+ -+typedef struct tree_desc_s { -+ ct_data *dyn_tree; /* the dynamic tree */ -+ int max_code; /* largest code with non zero frequency */ -+ static_tree_desc *stat_desc; /* the corresponding static tree */ -+} FAR tree_desc; -+ -+typedef ush Pos; -+typedef Pos FAR Posf; -+typedef unsigned IPos; -+ -+/* A Pos is an index in the character window. We use short instead of int to -+ * save space in the various tables. IPos is used only for parameter passing. -+ */ -+ -+typedef struct internal_state { -+ z_streamp strm; /* pointer back to this zlib stream */ -+ int status; /* as the name implies */ -+ Bytef *pending_buf; /* output still pending */ -+ ulg pending_buf_size; /* size of pending_buf */ -+ Bytef *pending_out; /* next pending byte to output to the stream */ -+ int pending; /* nb of bytes in the pending buffer */ -+ int noheader; /* suppress zlib header and adler32 */ -+ Byte data_type; /* UNKNOWN, BINARY or ASCII */ -+ Byte method; /* STORED (for zip only) or DEFLATED */ -+ int last_flush; /* value of flush param for previous deflate call */ -+ -+ /* used by deflate.c: */ -+ -+ uInt w_size; /* LZ77 window size (32K by default) */ -+ uInt w_bits; /* log2(w_size) (8..16) */ -+ uInt w_mask; /* w_size - 1 */ -+ -+ Bytef *window; -+ /* Sliding window. Input bytes are read into the second half of the window, -+ * and move to the first half later to keep a dictionary of at least wSize -+ * bytes. With this organization, matches are limited to a distance of -+ * wSize-MAX_MATCH bytes, but this ensures that IO is always -+ * performed with a length multiple of the block size. 
Also, it limits -+ * the window size to 64K, which is quite useful on MSDOS. -+ * To do: use the user input buffer as sliding window. -+ */ -+ -+ ulg window_size; -+ /* Actual size of window: 2*wSize, except when the user input buffer -+ * is directly used as sliding window. -+ */ -+ -+ Posf *prev; -+ /* Link to older string with same hash index. To limit the size of this -+ * array to 64K, this link is maintained only for the last 32K strings. -+ * An index in this array is thus a window index modulo 32K. -+ */ -+ -+ Posf *head; /* Heads of the hash chains or NIL. */ -+ -+ uInt ins_h; /* hash index of string to be inserted */ -+ uInt hash_size; /* number of elements in hash table */ -+ uInt hash_bits; /* log2(hash_size) */ -+ uInt hash_mask; /* hash_size-1 */ -+ -+ uInt hash_shift; -+ /* Number of bits by which ins_h must be shifted at each input -+ * step. It must be such that after MIN_MATCH steps, the oldest -+ * byte no longer takes part in the hash key, that is: -+ * hash_shift * MIN_MATCH >= hash_bits -+ */ -+ -+ long block_start; -+ /* Window position at the beginning of the current output block. Gets -+ * negative when the window is moved backwards. -+ */ -+ -+ uInt match_length; /* length of best match */ -+ IPos prev_match; /* previous match */ -+ int match_available; /* set if previous match exists */ -+ uInt strstart; /* start of string to insert */ -+ uInt match_start; /* start of matching string */ -+ uInt lookahead; /* number of valid bytes ahead in window */ -+ -+ uInt prev_length; -+ /* Length of the best match at previous step. Matches not greater than this -+ * are discarded. This is used in the lazy match evaluation. -+ */ -+ -+ uInt max_chain_length; -+ /* To speed up deflation, hash chains are never searched beyond this -+ * length. A higher limit improves compression ratio but degrades the -+ * speed. -+ */ -+ -+ uInt max_lazy_match; -+ /* Attempt to find a better match only when the current match is strictly -+ * smaller than this value. 
This mechanism is used only for compression -+ * levels >= 4. -+ */ -+# define max_insert_length max_lazy_match -+ /* Insert new strings in the hash table only if the match length is not -+ * greater than this length. This saves time but degrades compression. -+ * max_insert_length is used only for compression levels <= 3. -+ */ -+ -+ int level; /* compression level (1..9) */ -+ int strategy; /* favor or force Huffman coding*/ -+ -+ uInt good_match; -+ /* Use a faster search when the previous match is longer than this */ -+ -+ int nice_match; /* Stop searching when current match exceeds this */ -+ -+ /* used by trees.c: */ -+ /* Didn't use ct_data typedef below to supress compiler warning */ -+ struct ct_data_s dyn_ltree[HEAP_SIZE]; /* literal and length tree */ -+ struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */ -+ struct ct_data_s bl_tree[2*BL_CODES+1]; /* Huffman tree for bit lengths */ -+ -+ struct tree_desc_s l_desc; /* desc. for literal tree */ -+ struct tree_desc_s d_desc; /* desc. for distance tree */ -+ struct tree_desc_s bl_desc; /* desc. for bit length tree */ -+ -+ ush bl_count[MAX_BITS+1]; -+ /* number of codes at each bit length for an optimal tree */ -+ -+ int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */ -+ int heap_len; /* number of elements in the heap */ -+ int heap_max; /* element of largest frequency */ -+ /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used. -+ * The same heap array is used to build all trees. -+ */ -+ -+ uch depth[2*L_CODES+1]; -+ /* Depth of each subtree used as tie breaker for trees of equal frequency -+ */ -+ -+ uchf *l_buf; /* buffer for literals or lengths */ -+ -+ uInt lit_bufsize; -+ /* Size of match buffer for literals/lengths. 
There are 4 reasons for -+ * limiting lit_bufsize to 64K: -+ * - frequencies can be kept in 16 bit counters -+ * - if compression is not successful for the first block, all input -+ * data is still in the window so we can still emit a stored block even -+ * when input comes from standard input. (This can also be done for -+ * all blocks if lit_bufsize is not greater than 32K.) -+ * - if compression is not successful for a file smaller than 64K, we can -+ * even emit a stored file instead of a stored block (saving 5 bytes). -+ * This is applicable only for zip (not gzip or zlib). -+ * - creating new Huffman trees less frequently may not provide fast -+ * adaptation to changes in the input data statistics. (Take for -+ * example a binary file with poorly compressible code followed by -+ * a highly compressible string table.) Smaller buffer sizes give -+ * fast adaptation but have of course the overhead of transmitting -+ * trees more frequently. -+ * - I can't count above 4 -+ */ -+ -+ uInt last_lit; /* running index in l_buf */ -+ -+ ushf *d_buf; -+ /* Buffer for distances. To simplify the code, d_buf and l_buf have -+ * the same number of elements. To use different lengths, an extra flag -+ * array would be necessary. -+ */ -+ -+ ulg opt_len; /* bit length of current block with optimal trees */ -+ ulg static_len; /* bit length of current block with static trees */ -+ uInt matches; /* number of string matches in current block */ -+ int last_eob_len; /* bit length of EOB code for last block */ -+ -+#ifdef DEBUG -+ ulg compressed_len; /* total bit length of compressed file mod 2^32 */ -+ ulg bits_sent; /* bit length of compressed data sent mod 2^32 */ -+#endif -+ -+ ush bi_buf; -+ /* Output buffer. bits are inserted starting at the bottom (least -+ * significant bits). -+ */ -+ int bi_valid; -+ /* Number of valid bits in bi_buf. All bits above the last valid bit -+ * are always zero. -+ */ -+ -+} FAR deflate_state; -+ -+/* Output a byte on the stream. 
-+ * IN assertion: there is enough room in pending_buf. -+ */ -+#define put_byte(s, c) {s->pending_buf[s->pending++] = (c);} -+ -+ -+#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) -+/* Minimum amount of lookahead, except at the end of the input file. -+ * See deflate.c for comments about the MIN_MATCH+1. -+ */ -+ -+#define MAX_DIST(s) ((s)->w_size-MIN_LOOKAHEAD) -+/* In order to simplify the code, particularly on 16 bit machines, match -+ * distances are limited to MAX_DIST instead of WSIZE. -+ */ -+ -+ /* in trees.c */ -+void _tr_init OF((deflate_state *s)); -+int _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc)); -+void _tr_flush_block OF((deflate_state *s, charf *buf, ulg stored_len, -+ int eof)); -+void _tr_align OF((deflate_state *s)); -+void _tr_stored_block OF((deflate_state *s, charf *buf, ulg stored_len, -+ int eof)); -+ -+#define d_code(dist) \ -+ ((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)]) -+/* Mapping from a distance to a distance code. dist is the distance - 1 and -+ * must not have side effects. _dist_code[256] and _dist_code[257] are never -+ * used. 
-+ */ -+ -+#ifndef DEBUG -+/* Inline versions of _tr_tally for speed: */ -+ -+#if defined(GEN_TREES_H) || !defined(STDC) -+ extern uch _length_code[]; -+ extern uch _dist_code[]; -+#else -+ extern const uch _length_code[]; -+ extern const uch _dist_code[]; -+#endif -+ -+# define _tr_tally_lit(s, c, flush) \ -+ { uch cc = (c); \ -+ s->d_buf[s->last_lit] = 0; \ -+ s->l_buf[s->last_lit++] = cc; \ -+ s->dyn_ltree[cc].Freq++; \ -+ flush = (s->last_lit == s->lit_bufsize-1); \ -+ } -+# define _tr_tally_dist(s, distance, length, flush) \ -+ { uch len = (length); \ -+ ush dist = (distance); \ -+ s->d_buf[s->last_lit] = dist; \ -+ s->l_buf[s->last_lit++] = len; \ -+ dist--; \ -+ s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \ -+ s->dyn_dtree[d_code(dist)].Freq++; \ -+ flush = (s->last_lit == s->lit_bufsize-1); \ -+ } -+#else -+# define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c) -+# define _tr_tally_dist(s, distance, length, flush) \ -+ flush = _tr_tally(s, distance, length) -+#endif -+ -+#endif /* _DEFLATE_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/COPYRIGHT Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,50 @@ -+Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) -+All rights reserved. -+ -+This package is an DES implementation written by Eric Young (eay@cryptsoft.com). -+The implementation was written so as to conform with MIT's libdes. -+ -+This library is free for commercial and non-commercial use as long as -+the following conditions are aheared to. The following conditions -+apply to all code found in this distribution. -+ -+Copyright remains Eric Young's, and as such any Copyright notices in -+the code are not to be removed. -+If this package is used in a product, Eric Young should be given attribution -+as the author of that the SSL library. This can be in the form of a textual -+message at program startup or in documentation (online or textual) provided -+with the package. 
-+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions -+are met: -+1. Redistributions of source code must retain the copyright -+ notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+3. All advertising materials mentioning features or use of this software -+ must display the following acknowledgement: -+ This product includes software developed by Eric Young (eay@cryptsoft.com) -+ -+THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+SUCH DAMAGE. -+ -+The license and distribution terms for any publically available version or -+derivative of this code cannot be changed. i.e. this code cannot simply be -+copied and put under another distrubution license -+[including the GNU Public License.] -+ -+The reason behind this being stated in this direct manner is past -+experience in code simply being copied and the attribution removed -+from it and then being distributed as part of other packages. This -+implementation was a non-trivial and unpaid effort. 
---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/INSTALL Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,69 @@ -+Check the CC and CFLAGS lines in the makefile -+ -+If your C library does not support the times(3) function, change the -+#define TIMES to -+#undef TIMES in speed.c -+If it does, check the HZ value for the times(3) function. -+If your system does not define CLK_TCK it will be assumed to -+be 100.0. -+ -+If possible use gcc v 2.7.? -+Turn on the maximum optimising (normally '-O3 -fomit-frame-pointer' for gcc) -+In recent times, some system compilers give better performance. -+ -+type 'make' -+ -+run './destest' to check things are ok. -+run './rpw' to check the tty code for reading passwords works. -+run './speed' to see how fast those optimisations make the library run :-) -+run './des_opts' to determine the best compile time options. -+ -+The output from des_opts should be put in the makefile options and des_enc.c -+should be rebuilt. For 64 bit computers, do not use the DES_PTR option. -+For the DEC Alpha, edit des.h and change DES_LONG to 'unsigned int' -+and then you can use the 'DES_PTR' option. -+ -+The file options.txt has the options listed for best speed on quite a -+few systems. Look at the options (UNROLL, PTR, RISC2 etc) and then -+turn on the relevant option in the Makefile -+ -+There are some special Makefile targets that make life easier. -+make cc - standard cc build -+make gcc - standard gcc build -+make x86-elf - x86 assembler (elf), linux-elf. -+make x86-out - x86 assembler (a.out), FreeBSD -+make x86-solaris- x86 assembler -+make x86-bsdi - x86 assembler (a.out with primitive assembler). -+ -+If at all possible use the assembler (for Windows NT/95, use -+asm/win32.obj to link with). The x86 assembler is very very fast.
-+ -+A make install will by default install -+libdes.a in /usr/local/lib/libdes.a -+des in /usr/local/bin/des -+des_crypt.man in /usr/local/man/man3/des_crypt.3 -+des.man in /usr/local/man/man1/des.1 -+des.h in /usr/include/des.h -+ -+des(1) should be compatible with sunOS's but I have been unable to -+test it. -+ -+These routines should compile on MSDOS, most 32bit and 64bit versions -+of Unix (BSD and SYSV) and VMS, without modification. -+The only problems should be #include files that are in the wrong places. -+ -+These routines can be compiled under MSDOS. -+I have successfully encrypted files using des(1) under MSDOS and then -+decrypted the files on a SparcStation. -+I have been able to compile and test the routines with -+Microsoft C v 5.1 and Turbo C v 2.0. -+The code in this library is in no way optimised for the 16bit -+operation of MSDOS. -+ -+When building for glibc, ignore all of the above and just unpack into -+glibc-1.??/des and then gmake as per normal. -+ -+As a final note on performance. Certain CPUs like sparcs and Alpha often give -+a 10% speed difference depending on the link order. It is rather annoying -+when one program reports 'x' DES encrypts a second and another reports -+'x*0.9' the speed. ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/Makefile Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,60 @@ -+# Makefile for KLIPS kernel code as a module for 2.6 kernels -+# -+# Makefile for KLIPS kernel code as a module -+# Copyright (C) 1998, 1999, 2000,2001 Richard Guy Briggs. -+# Copyright (C) 2002-2004 Michael Richardson -+# -+# This program is free software; you can redistribute it and/or modify it -+# under the terms of the GNU General Public License as published by the -+# Free Software Foundation; either version 2 of the License, or (at your -+# option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
-+# -+# This program is distributed in the hope that it will be useful, but -+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+# for more details. -+# -+# RCSID $Id: Makefile.fs2_6,v 1.3 2005/08/12 14:13:59 mcr Exp $ -+# -+# Note! Dependencies are done automagically by 'make dep', which also -+# removes any old dependencies. DON'T put your own dependencies here -+# unless it's something special (ie not a .c file). -+# -+ -+obj-$(CONFIG_KLIPS_ENC_3DES) += ipsec_alg_3des.o -+obj-$(CONFIG_KLIPS_ENC_3DES) += cbc_enc.o -+obj-$(CONFIG_KLIPS_ENC_3DES) += ecb_enc.o -+obj-$(CONFIG_KLIPS_ENC_3DES) += set_key.o -+ -+ifeq ($(strip ${SUBARCH}),) -+SUBARCH:=${ARCH} -+endif -+ -+# the assembly version expects frame pointers, which are -+# optional in many kernel builds. If you want speed, you should -+# probably use cryptoapi code instead. -+USEASSEMBLY=${SUBARCH}${CONFIG_FRAME_POINTER} -+ifeq (${USEASSEMBLY},i386y) -+obj-$(CONFIG_KLIPS_ENC_3DES) += dx86unix.o -+else -+obj-$(CONFIG_KLIPS_ENC_3DES) += des_enc.o -+endif -+ -+# -+# $Log: Makefile.fs2_6,v $ -+# Revision 1.3 2005/08/12 14:13:59 mcr -+# do not use assembly code with there are no frame pointers, -+# as it does not have the right linkages. -+# -+# Revision 1.2 2005/04/29 05:13:07 mcr -+# 3DES algorithm code. -+# -+# Revision 1.1 2004/08/17 03:27:30 mcr -+# klips 2.6 edits. -+# -+# -+# Local Variables: -+# compile-command: "(cd ../../.. && source umlsetup.sh && make -C ${POOLSPACE} module/ipsec.o)" -+# End Variables: -+# -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/README Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,54 @@ -+ -+ libdes, Version 4.01 10-Jan-97 -+ -+ Copyright (c) 1997, Eric Young -+ All rights reserved. -+ -+ This program is free software; you can redistribute it and/or modify -+ it under the terms specified in COPYRIGHT. 
-+ -+-- -+The primary ftp site for this library is -+ftp://ftp.psy.uq.oz.au/pub/Crypto/DES/libdes-x.xx.tar.gz -+libdes is now also shipped with SSLeay. Primary ftp site of -+ftp://ftp.psy.uq.oz.au/pub/Crypto/SSL/SSLeay-x.x.x.tar.gz -+ -+The best way to build this library is to build it as part of SSLeay. -+ -+This kit builds a DES encryption library and a DES encryption program. -+It supports ecb, cbc, ofb, cfb, triple ecb, triple cbc, triple ofb, -+triple cfb, desx, and MIT's pcbc encryption modes and also has a fast -+implementation of crypt(3). -+It contains support routines to read keys from a terminal, -+generate a random key, generate a key from an arbitrary length string, -+read/write encrypted data from/to a file descriptor. -+ -+The implementation was written so as to conform with the manual entry -+for the des_crypt(3) library routines from MIT's project Athena. -+ -+destest should be run after compilation to test the des routines. -+rpw should be run after compilation to test the read password routines. -+The des program is a replacement for the sun des command. I believe it -+conforms to the sun version. -+ -+The Imakefile is setup for use in the kerberos distribution. -+ -+These routines are best compiled with gcc or any other good -+optimising compiler. -+Just turn you optimiser up to the highest settings and run destest -+after the build to make sure everything works. -+ -+I believe these routines are close to the fastest and most portable DES -+routines that use small lookup tables (4.5k) that are publicly available. -+The fcrypt routine is faster than ufc's fcrypt (when compiling with -+gcc2 -O2) on the sparc 2 (1410 vs 1270) but is not so good on other machines -+(on a sun3/260 168 vs 336). It is a function of CPU on chip cache size. -+[ 10-Jan-97 and a function of an incorrect speed testing program in -+ ufc which gave much better test figures that reality ]. 
-+ -+It is worth noting that on sparc and Alpha CPUs, performance of the DES -+library can vary by upto %10 due to the positioning of files after application -+linkage. -+ -+Eric Young (eay@cryptsoft.com) -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/README.freeswan Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,33 @@ -+The only changes the FreeS/WAN project has made to libdes-lite 4.04b are: -+ -+We #ifdef-ed the declaration of DES_LONG in des.h, so it's more efficient -+on the Alpha, instead of just noting the issue in a comment. -+ -+We #ifdef-ed out the des_options() function in ecb_enc.c, because we don't -+use it, and its call to sprintf() can cause subtle difficulties when KLIPS -+is built as a module (depending on details of Linux configuration options). -+ -+We changed some instances of CC=$(CC) in the Makefile to CC='$(CC)' to make -+it cope better with Linux kernel Makefile stupidities, and took out an -+explicit CC=gcc (unwise on systems with strange compilers). -+ -+We deleted some references to and , and a declaration -+of one function found only in the full libdes (not in libdes-lite), to -+avoid dragging in bits of stdio/stdlib unnecessarily. (Our thanks to Hans -+Schultz for spotting this and pointing out the fixes.) -+ -+We deleted a couple of .obj files in the asm subdirectory, which appear to -+have been included in the original library by accident. -+ -+We have added an include of our Makefile.inc file, to permit overriding -+things like choice of compiler (although the libdes Makefile would -+probably need some work to make this effective). -+ -+ -+ -+Note that Eric Young is no longer at the email address listed in these -+files, and is (alas) no longer working on free crypto software. -+ -+ -+ -+This file is RCSID $Id: README.freeswan,v 1.12 2004/07/10 08:06:51 mcr Exp $ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/VERSION Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,406 @@ -+Version 4.04 -+ Fixed a few tests in destest. 
Also added x86 assember for -+ des_ncbc_encrypt() which is the standard cbc mode function. -+ This makes a very very large performace difference. -+ Ariel Glenn ariel@columbia.edu reports that the terminal -+ 'turn echo off' can return (errno == EINVAL) under solaris -+ when redirection is used. So I now catch that as well as ENOTTY. -+ -+ -+Version 4.03 -+ Left a static out of enc_write.c, which caused to buffer to be -+ continiously malloc()ed. Does anyone use these functions? I keep -+ on feeling like removing them since I only had these in there -+ for a version of kerberised login. Anyway, this was pointed out -+ by Theo de Raadt -+ The 'n' bit ofb code was wrong, it was not shifting the shift -+ register. It worked correctly for n == 64. Thanks to -+ Gigi Ankeny for pointing this one out. -+ -+Version 4.02 -+ I was doing 'if (memcmp(weak_keys[i],key,sizeof(key)) == 0)' -+ when checking for weak keys which is wrong :-(, pointed out by -+ Markus F.X.J. Oberhumer . -+ -+Version 4.01 -+ Even faster inner loop in the DES assembler for x86 and a modification -+ for IP/FP which is faster on x86. Both of these changes are -+ from Svend Olaf Mikkelsen . His -+ changes make the assembler run %40 faster on a pentium. This is just -+ a case of getting the instruction sequence 'just right'. -+ All credit to 'Svend' :-) -+ Quite a few special x86 'make' targets. -+ A libdes-l (lite) distribution. -+ -+Version 4.00 -+ After a bit of a pause, I'll up the major version number since this -+ is mostly a performace release. I've added x86 assembler and -+ added more options for performance. A %28 speedup for gcc -+ on a pentium and the assembler is a %50 speedup. -+ MIPS CPU's, sparc and Alpha are the main CPU's with speedups. -+ Run des_opts to work out which options should be used. -+ DES_RISC1/DES_RISC2 use alternative inner loops which use -+ more registers but should give speedups on any CPU that does -+ dual issue (pentium). 
DES_UNROLL unrolls the inner loop, -+ which costs in code size. -+ -+Version 3.26 -+ I've finally removed one of the shifts in D_ENCRYPT. This -+ meant I've changed the des_SPtrans table (spr.h), the set_key() -+ function and some things in des_enc.c. This has definitly -+ made things faster :-). I've known about this one for some -+ time but I've been too lazy to follow it up :-). -+ Noticed that in the D_ENCRYPT() macro, we can just do L^=(..)^(..)^.. -+ instead of L^=((..)|(..)|(..).. This should save a register at -+ least. -+ Assember for x86. The file to replace is des_enc.c, which is replaced -+ by one of the assembler files found in asm. Look at des/asm/readme -+ for more info. -+ -+ /* Modification to fcrypt so it can be compiled to support -+ HPUX 10.x's long password format, define -DLONGCRYPT to use this. -+ Thanks to Jens Kupferschmidt . */ -+ -+ SIGWINCH case put in des_read_passwd() so the function does not -+ 'exit' if this function is recieved. -+ -+Version 3.25 17/07/96 -+ Modified read_pwd.c so that stdin can be read if not a tty. -+ Thanks to Jeff Barber for the patches. -+ des_init_random_number_generator() shortened due to VMS linker -+ limits. -+ Added RSA's DESX cbc mode. It is a form of cbc encryption, with 2 -+ 8 byte quantites xored before and after encryption. -+ des_xcbc_encryption() - the name is funny to preserve the des_ -+ prefix on all functions. -+ -+Version 3.24 20/04/96 -+ The DES_PTR macro option checked and used by SSLeay configuration -+ -+Version 3.23 11/04/96 -+ Added DES_LONG. If defined to 'unsigned int' on the DEC Alpha, -+ it gives a %20 speedup :-) -+ Fixed the problem with des.pl under perl5. The patches were -+ sent by Ed Kubaitis (ejk@uiuc.edu). -+ if fcrypt.c, changed values to handle illegal salt values the way -+ normal crypt() implementations do. Some programs apparently use -+ them :-(. 
The patch was sent by Bjorn Gronvall -+ -+Version 3.22 29/11/95 -+ Bug in des(1), an error with the uuencoding stuff when the -+ 'data' is small, thanks to Geoff Keating -+ for the patch. -+ -+Version 3.21 22/11/95 -+ After some emailing back and forth with -+ Colin Plumb , I've tweaked a few things -+ and in a future version I will probably put in some of the -+ optimisation he suggested for use with the DES_USE_PTR option. -+ Extra routines from Mark Murray for use in -+ freeBSD. They mostly involve random number generation for use -+ with kerberos. They involve evil machine specific system calls -+ etc so I would normally suggest pushing this stuff into the -+ application and/or using RAND_seed()/RAND_bytes() if you are -+ using this DES library as part of SSLeay. -+ Redone the read_pw() function so that it is cleaner and -+ supports termios, thanks to Sameer Parekh -+ for the initial patches for this. -+ Renamed 3ecb_encrypt() to ecb3_encrypt(). This has been -+ done just to make things more consistent. -+ I have also now added triple DES versions of cfb and ofb. -+ -+Version 3.20 -+ Damn, Damn, Damn, as pointed out by Mike_Spreitzer.PARC@xerox.com, -+ my des_random_seed() function was only copying 4 bytes of the -+ passed seed into the init structure. It is now fixed to copy 8. -+ My own suggestion is to used something like MD5 :-) -+ -+Version 3.19 -+ While looking at my code one day, I though, why do I keep on -+ calling des_encrypt(in,out,ks,enc) when every function that -+ calls it has in and out the same. So I dropped the 'out' -+ parameter, people should not be using this function. -+ -+Version 3.18 30/08/95 -+ Fixed a few bit with the distribution and the filenames. -+ 3.17 had been munged via a move to DOS and back again. -+ NO CODE CHANGES -+ -+Version 3.17 14/07/95 -+ Fixed ede3 cbc which I had broken in 3.16. I have also -+ removed some unneeded variables in 7-8 of the routines. 
-+ -+Version 3.16 26/06/95 -+ Added des_encrypt2() which does not use IP/FP, used by triple -+ des routines. Tweaked things a bit elsewhere. %13 speedup on -+ sparc and %6 on a R4400 for ede3 cbc mode. -+ -+Version 3.15 06/06/95 -+ Added des_ncbc_encrypt(), it is des_cbc mode except that it is -+ 'normal' and copies the new iv value back over the top of the -+ passed parameter. -+ CHANGED des_ede3_cbc_encrypt() so that it too now overwrites -+ the iv. THIS WILL BREAK EXISTING CODE, but since this function -+ only new, I feel I can change it, not so with des_cbc_encrypt :-(. -+ I need to update the documentation. -+ -+Version 3.14 31/05/95 -+ New release upon the world, as part of my SSL implementation. -+ New copyright and usage stuff. Basically free for all to use -+ as long as you say it came from me :-) -+ -+Version 3.13 31/05/95 -+ A fix in speed.c, if HZ is not defined, I set it to 100.0 -+ which is reasonable for most unixes except SunOS 4.x. -+ I now have a #ifdef sun but timing for SunOS 4.x looked very -+ good :-(. At my last job where I used SunOS 4.x, it was -+ defined to be 60.0 (look at the old INSTALL documentation), at -+ the last release had it changed to 100.0 since I now work with -+ Solaris2 and SVR4 boxes. -+ Thanks to Rory Chisholm for pointing this -+ one out. -+ -+Version 3.12 08/05/95 -+ As pointed out by The Crypt Keeper , -+ my D_ENCRYPT macro in crypt() had an un-necessary variable. -+ It has been removed. -+ -+Version 3.11 03/05/95 -+ Added des_ede3_cbc_encrypt() which is cbc mode des with 3 keys -+ and one iv. It is a standard and I needed it for my SSL code. -+ It makes more sense to use this for triple DES than -+ 3cbc_encrypt(). I have also added (or should I say tested :-) -+ cfb64_encrypt() which is cfb64 but it will encrypt a partial -+ number of bytes - 3 bytes in 3 bytes out. Again this is for -+ my SSL library, as a form of encryption to use with SSL -+ telnet. -+ -+Version 3.10 22/03/95 -+ Fixed a bug in 3cbc_encrypt() :-(. 
When making repeated calls -+ to cbc3_encrypt, the 2 iv values that were being returned to -+ be used in the next call were reversed :-(. -+ Many thanks to Bill Wade for pointing out -+ this error. -+ -+Version 3.09 01/02/95 -+ Fixed des_random_key to far more random, it was rather feeble -+ with regards to picking the initial seed. The problem was -+ pointed out by Olaf Kirch . -+ -+Version 3.08 14/12/94 -+ Added Makefile.PL so libdes can be built into perl5. -+ Changed des_locl.h so RAND is always defined. -+ -+Version 3.07 05/12/94 -+ Added GNUmake and stuff so the library can be build with -+ glibc. -+ -+Version 3.06 30/08/94 -+ Added rpc_enc.c which contains _des_crypt. This is for use in -+ secure_rpc v 4.0 -+ Finally fixed the cfb_enc problems. -+ Fixed a few parameter parsing bugs in des (-3 and -b), thanks -+ to Rob McMillan -+ -+Version 3.05 21/04/94 -+ for unsigned long l; gcc does not produce ((l>>34) == 0) -+ This causes bugs in cfb_enc. -+ Thanks to Hadmut Danisch -+ -+Version 3.04 20/04/94 -+ Added a version number to des.c and libdes.a -+ -+Version 3.03 12/01/94 -+ Fixed a bug in non zero iv in 3cbc_enc. -+ -+Version 3.02 29/10/93 -+ I now work in a place where there are 6+ architectures and 14+ -+ OS versions :-). -+ Fixed TERMIO definition so the most sys V boxes will work :-) -+ -+Release upon comp.sources.misc -+Version 3.01 08/10/93 -+ Added des_3cbc_encrypt() -+ -+Version 3.00 07/10/93 -+ Fixed up documentation. -+ quad_cksum definitely compatible with MIT's now. -+ -+Version 2.30 24/08/93 -+ Triple DES now defaults to triple cbc but can do triple ecb -+ with the -b flag. -+ Fixed some MSDOS uuen/uudecoding problems, thanks to -+ Added prototypes. -+ -+Version 2.22 29/06/93 -+ Fixed a bug in des_is_weak_key() which stopped it working :-( -+ thanks to engineering@MorningStar.Com. -+ -+Version 2.21 03/06/93 -+ des(1) with no arguments gives quite a bit of help. -+ Added -c (generate ckecksum) flag to des(1). 
-+ Added -3 (triple DES) flag to des(1). -+ Added cfb and ofb routines to the library. -+ -+Version 2.20 11/03/93 -+ Added -u (uuencode) flag to des(1). -+ I have been playing with byte order in quad_cksum to make it -+ compatible with MIT's version. All I can say is avid this -+ function if possible since MIT's output is endian dependent. -+ -+Version 2.12 14/10/92 -+ Added MSDOS specific macro in ecb_encrypt which gives a %70 -+ speed up when the code is compiled with turbo C. -+ -+Version 2.11 12/10/92 -+ Speedup in set_key (recoding of PC-1) -+ I now do it in 47 simple operations, down from 60. -+ Thanks to John Fletcher (john_fletcher@lccmail.ocf.llnl.gov) -+ for motivating me to look for a faster system :-) -+ The speedup is probably less that 1% but it is still 13 -+ instructions less :-). -+ -+Version 2.10 06/10/92 -+ The code now works on the 64bit ETA10 and CRAY without modifications or -+ #defines. I believe the code should work on any machine that -+ defines long, int or short to be 8 bytes long. -+ Thanks to Shabbir J. Safdar (shabby@mentor.cc.purdue.edu) -+ for helping me fix the code to run on 64bit machines (he had -+ access to an ETA10). -+ Thanks also to John Fletcher -+ for testing the routines on a CRAY. -+ read_password.c has been renamed to read_passwd.c -+ string_to_key.c has been renamed to string2key.c -+ -+Version 2.00 14/09/92 -+ Made mods so that the library should work on 64bit CPU's. -+ Removed all my uchar and ulong defs. To many different -+ versions of unix define them in their header files in too many -+ different combinations :-) -+ IRIX - Sillicon Graphics mods (mostly in read_password.c). -+ Thanks to Andrew Daviel (advax@erich.triumf.ca) -+ -+Version 1.99 26/08/92 -+ Fixed a bug or 2 in enc_read.c -+ Fixed a bug in enc_write.c -+ Fixed a pseudo bug in fcrypt.c (very obscure). -+ -+Version 1.98 31/07/92 -+ Support for the ETA10. This is a strange machine that defines -+ longs and ints as 8 bytes and shorts as 4 bytes. 
-+ Since I do evil things with long * that assume that they are 4 -+ bytes. Look in the Makefile for the option to compile for -+ this machine. quad_cksum appears to have problems but I -+ will don't have the time to fix it right now, and this is not -+ a function that uses DES and so will not effect the main uses -+ of the library. -+ -+Version 1.97 20/05/92 eay -+ Fixed the Imakefile and made some changes to des.h to fix some -+ problems when building this package with Kerberos v 4. -+ -+Version 1.96 18/05/92 eay -+ Fixed a small bug in string_to_key() where problems could -+ occur if des_check_key was set to true and the string -+ generated a weak key. -+ -+Patch2 posted to comp.sources.misc -+Version 1.95 13/05/92 eay -+ Added an alternative version of the D_ENCRYPT macro in -+ ecb_encrypt and fcrypt. Depending on the compiler, one version or the -+ other will be faster. This was inspired by -+ Dana How , and her pointers about doing the -+ *(ulong *)((uchar *)ptr+(value&0xfc)) -+ vs -+ ptr[value&0x3f] -+ to stop the C compiler doing a <<2 to convert the long array index. -+ -+Version 1.94 05/05/92 eay -+ Fixed an incompatibility between my string_to_key and the MIT -+ version. When the key is longer than 8 chars, I was wrapping -+ with a different method. To use the old version, define -+ OLD_STR_TO_KEY in the makefile. Thanks to -+ viktor@newsu.shearson.com (Viktor Dukhovni). -+ -+Version 1.93 28/04/92 eay -+ Fixed the VMS mods so that echo is now turned off in -+ read_password. Thanks again to brennan@coco.cchs.su.oz.AU. -+ MSDOS support added. The routines can be compiled with -+ Turbo C (v2.0) and MSC (v5.1). Make sure MSDOS is defined. -+ -+Patch1 posted to comp.sources.misc -+Version 1.92 13/04/92 eay -+ Changed D_ENCRYPT so that the rotation of R occurs outside of -+ the loop. This required rotating all the longs in sp.h (now -+ called spr.h). 
Thanks to Richard Outerbridge <71755.204@CompuServe.COM> -+ speed.c has been changed so it will work without SIGALRM. If -+ times(3) is not present it will try to use ftime() instead. -+ -+Version 1.91 08/04/92 eay -+ Added -E/-D options to des(1) so it can use string_to_key. -+ Added SVR4 mods suggested by witr@rwwa.COM -+ Added VMS mods suggested by brennan@coco.cchs.su.oz.AU. If -+ anyone knows how to turn of tty echo in VMS please tell me or -+ implement it yourself :-). -+ Changed FILE *IN/*OUT to *DES_IN/*DES_OUT since it appears VMS -+ does not like IN/OUT being used. -+ -+Libdes posted to comp.sources.misc -+Version 1.9 24/03/92 eay -+ Now contains a fast small crypt replacement. -+ Added des(1) command. -+ Added des_rw_mode so people can use cbc encryption with -+ enc_read and enc_write. -+ -+Version 1.8 15/10/91 eay -+ Bug in cbc_cksum. -+ Many thanks to Keith Reynolds (keithr@sco.COM) for pointing this -+ one out. -+ -+Version 1.7 24/09/91 eay -+ Fixed set_key :-) -+ set_key is 4 times faster and takes less space. -+ There are a few minor changes that could be made. -+ -+Version 1.6 19/09/1991 eay -+ Finally go IP and FP finished. -+ Now I need to fix set_key. -+ This version is quite a bit faster that 1.51 -+ -+Version 1.52 15/06/1991 eay -+ 20% speedup in ecb_encrypt by changing the E bit selection -+ to use 2 32bit words. This also required modification of the -+ sp table. There is still a way to speedup the IP and IP-1 -+ (hints from outer@sq.com) still working on this one :-(. -+ -+Version 1.51 07/06/1991 eay -+ Faster des_encrypt by loop unrolling -+ Fixed bug in quad_cksum.c (thanks to hughes@logos.ucs.indiana.edu) -+ -+Version 1.50 28/05/1991 eay -+ Optimised the code a bit more for the sparc. I have improved the -+ speed of the inner des_encrypt by speeding up the initial and -+ final permutations. 
-+ -+Version 1.40 23/10/1990 eay -+ Fixed des_random_key, it did not produce a random key :-( -+ -+Version 1.30 2/10/1990 eay -+ Have made des_quad_cksum the same as MIT's, the full package -+ should be compatible with MIT's -+ Have tested on a DECstation 3100 -+ Still need to fix des_set_key (make it faster). -+ Does des_cbc_encrypts at 70.5k/sec on a 3100. -+ -+Version 1.20 18/09/1990 eay -+ Fixed byte order dependencies. -+ Fixed (I hope) all the word alignment problems. -+ Speedup in des_ecb_encrypt. -+ -+Version 1.10 11/09/1990 eay -+ Added des_enc_read and des_enc_write. -+ Still need to fix des_quad_cksum. -+ Still need to document des_enc_read and des_enc_write. -+ -+Version 1.00 27/08/1990 eay -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/asm/des-586.pl Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,251 @@ -+#!/usr/local/bin/perl -+# -+# The inner loop instruction sequence and the IP/FP modifications are from -+# Svend Olaf Mikkelsen -+# -+ -+push(@INC,"perlasm","../../perlasm"); -+require "x86asm.pl"; -+require "cbc.pl"; -+require "desboth.pl"; -+ -+# base code is in microsft -+# op dest, source -+# format. 
-+# -+ -+&asm_init($ARGV[0],"des-586.pl"); -+ -+$L="edi"; -+$R="esi"; -+ -+&external_label("des_SPtrans"); -+&des_encrypt("des_encrypt",1); -+&des_encrypt("des_encrypt2",0); -+&des_encrypt3("des_encrypt3",1); -+&des_encrypt3("des_decrypt3",0); -+&cbc("des_ncbc_encrypt","des_encrypt","des_encrypt",0,4,5,3,5,-1); -+&cbc("des_ede3_cbc_encrypt","des_encrypt3","des_decrypt3",0,6,7,3,4,5); -+ -+&asm_finish(); -+ -+sub des_encrypt -+ { -+ local($name,$do_ip)=@_; -+ -+ &function_begin_B($name,"EXTRN _des_SPtrans:DWORD"); -+ -+ &push("esi"); -+ &push("edi"); -+ -+ &comment(""); -+ &comment("Load the 2 words"); -+ $ks="ebp"; -+ -+ if ($do_ip) -+ { -+ &mov($R,&wparam(0)); -+ &xor( "ecx", "ecx" ); -+ -+ &push("ebx"); -+ &push("ebp"); -+ -+ &mov("eax",&DWP(0,$R,"",0)); -+ &mov("ebx",&wparam(2)); # get encrypt flag -+ &mov($L,&DWP(4,$R,"",0)); -+ &comment(""); -+ &comment("IP"); -+ &IP_new("eax",$L,$R,3); -+ } -+ else -+ { -+ &mov("eax",&wparam(0)); -+ &xor( "ecx", "ecx" ); -+ -+ &push("ebx"); -+ &push("ebp"); -+ -+ &mov($R,&DWP(0,"eax","",0)); -+ &mov("ebx",&wparam(2)); # get encrypt flag -+ &rotl($R,3); -+ &mov($L,&DWP(4,"eax","",0)); -+ &rotl($L,3); -+ } -+ -+ &mov( $ks, &wparam(1) ); -+ &cmp("ebx","0"); -+ &je(&label("start_decrypt")); -+ -+ for ($i=0; $i<16; $i+=2) -+ { -+ &comment(""); -+ &comment("Round $i"); -+ &D_ENCRYPT($i,$L,$R,$i*2,$ks,"des_SPtrans","eax","ebx","ecx","edx"); -+ -+ &comment(""); -+ &comment("Round ".sprintf("%d",$i+1)); -+ &D_ENCRYPT($i+1,$R,$L,($i+1)*2,$ks,"des_SPtrans","eax","ebx","ecx","edx"); -+ } -+ &jmp(&label("end")); -+ -+ &set_label("start_decrypt"); -+ -+ for ($i=15; $i>0; $i-=2) -+ { -+ &comment(""); -+ &comment("Round $i"); -+ &D_ENCRYPT(15-$i,$L,$R,$i*2,$ks,"des_SPtrans","eax","ebx","ecx","edx"); -+ &comment(""); -+ &comment("Round ".sprintf("%d",$i-1)); -+ &D_ENCRYPT(15-$i+1,$R,$L,($i-1)*2,$ks,"des_SPtrans","eax","ebx","ecx","edx"); -+ } -+ -+ &set_label("end"); -+ -+ if ($do_ip) -+ { -+ &comment(""); -+ &comment("FP"); -+ 
&mov("edx",&wparam(0)); -+ &FP_new($L,$R,"eax",3); -+ -+ &mov(&DWP(0,"edx","",0),"eax"); -+ &mov(&DWP(4,"edx","",0),$R); -+ } -+ else -+ { -+ &comment(""); -+ &comment("Fixup"); -+ &rotr($L,3); # r -+ &mov("eax",&wparam(0)); -+ &rotr($R,3); # l -+ &mov(&DWP(0,"eax","",0),$L); -+ &mov(&DWP(4,"eax","",0),$R); -+ } -+ -+ &pop("ebp"); -+ &pop("ebx"); -+ &pop("edi"); -+ &pop("esi"); -+ &ret(); -+ -+ &function_end_B($name); -+ } -+ -+sub D_ENCRYPT -+ { -+ local($r,$L,$R,$S,$ks,$desSP,$u,$tmp1,$tmp2,$t)=@_; -+ -+ &mov( $u, &DWP(&n2a($S*4),$ks,"",0)); -+ &xor( $tmp1, $tmp1); -+ &mov( $t, &DWP(&n2a(($S+1)*4),$ks,"",0)); -+ &xor( $u, $R); -+ &xor( $t, $R); -+ &and( $u, "0xfcfcfcfc" ); -+ &and( $t, "0xcfcfcfcf" ); -+ &movb( &LB($tmp1), &LB($u) ); -+ &movb( &LB($tmp2), &HB($u) ); -+ &rotr( $t, 4 ); -+ &mov( $ks, &DWP(" $desSP",$tmp1,"",0)); -+ &movb( &LB($tmp1), &LB($t) ); -+ &xor( $L, $ks); -+ &mov( $ks, &DWP("0x200+$desSP",$tmp2,"",0)); -+ &xor( $L, $ks); ###### -+ &movb( &LB($tmp2), &HB($t) ); -+ &shr( $u, 16); -+ &mov( $ks, &DWP("0x100+$desSP",$tmp1,"",0)); -+ &xor( $L, $ks); ###### -+ &movb( &LB($tmp1), &HB($u) ); -+ &shr( $t, 16); -+ &mov( $ks, &DWP("0x300+$desSP",$tmp2,"",0)); -+ &xor( $L, $ks); -+ &mov( $ks, &wparam(1) ); -+ &movb( &LB($tmp2), &HB($t) ); -+ &and( $u, "0xff" ); -+ &and( $t, "0xff" ); -+ &mov( $tmp1, &DWP("0x600+$desSP",$tmp1,"",0)); -+ &xor( $L, $tmp1); -+ &mov( $tmp1, &DWP("0x700+$desSP",$tmp2,"",0)); -+ &xor( $L, $tmp1); -+ &mov( $tmp1, &DWP("0x400+$desSP",$u,"",0)); -+ &xor( $L, $tmp1); -+ &mov( $tmp1, &DWP("0x500+$desSP",$t,"",0)); -+ &xor( $L, $tmp1); -+ } -+ -+sub n2a -+ { -+ sprintf("%d",$_[0]); -+ } -+ -+# now has a side affect of rotating $a by $shift -+sub R_PERM_OP -+ { -+ local($a,$b,$tt,$shift,$mask,$last)=@_; -+ -+ &rotl( $a, $shift ) if ($shift != 0); -+ &mov( $tt, $a ); -+ &xor( $a, $b ); -+ &and( $a, $mask ); -+ if (!$last eq $b) -+ { -+ &xor( $b, $a ); -+ &xor( $tt, $a ); -+ } -+ else -+ { -+ &xor( $tt, $a ); -+ &xor( $b, $a ); -+ } -+ 
&comment(""); -+ } -+ -+sub IP_new -+ { -+ local($l,$r,$tt,$lr)=@_; -+ -+ &R_PERM_OP($l,$r,$tt, 4,"0xf0f0f0f0",$l); -+ &R_PERM_OP($r,$tt,$l,20,"0xfff0000f",$l); -+ &R_PERM_OP($l,$tt,$r,14,"0x33333333",$r); -+ &R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r); -+ &R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r); -+ -+ if ($lr != 3) -+ { -+ if (($lr-3) < 0) -+ { &rotr($tt, 3-$lr); } -+ else { &rotl($tt, $lr-3); } -+ } -+ if ($lr != 2) -+ { -+ if (($lr-2) < 0) -+ { &rotr($r, 2-$lr); } -+ else { &rotl($r, $lr-2); } -+ } -+ } -+ -+sub FP_new -+ { -+ local($l,$r,$tt,$lr)=@_; -+ -+ if ($lr != 2) -+ { -+ if (($lr-2) < 0) -+ { &rotl($r, 2-$lr); } -+ else { &rotr($r, $lr-2); } -+ } -+ if ($lr != 3) -+ { -+ if (($lr-3) < 0) -+ { &rotl($l, 3-$lr); } -+ else { &rotr($l, $lr-3); } -+ } -+ -+ &R_PERM_OP($l,$r,$tt, 0,"0xaaaaaaaa",$r); -+ &R_PERM_OP($tt,$r,$l,23,"0x03fc03fc",$r); -+ &R_PERM_OP($l,$r,$tt,10,"0x33333333",$l); -+ &R_PERM_OP($r,$tt,$l,18,"0xfff0000f",$l); -+ &R_PERM_OP($l,$tt,$r,12,"0xf0f0f0f0",$r); -+ &rotr($tt , 4); -+ } -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/asm/des686.pl Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,230 @@ -+#!/usr/local/bin/perl -+ -+$prog="des686.pl"; -+ -+# base code is in microsft -+# op dest, source -+# format. 
-+# -+ -+# WILL NOT WORK ANYMORE WITH desboth.pl -+require "desboth.pl"; -+ -+if ( ($ARGV[0] eq "elf")) -+ { require "x86unix.pl"; } -+elsif ( ($ARGV[0] eq "a.out")) -+ { $aout=1; require "x86unix.pl"; } -+elsif ( ($ARGV[0] eq "sol")) -+ { $sol=1; require "x86unix.pl"; } -+elsif ( ($ARGV[0] eq "cpp")) -+ { $cpp=1; require "x86unix.pl"; } -+elsif ( ($ARGV[0] eq "win32")) -+ { require "x86ms.pl"; } -+else -+ { -+ print STDERR <<"EOF"; -+Pick one target type from -+ elf - linux, FreeBSD etc -+ a.out - old linux -+ sol - x86 solaris -+ cpp - format so x86unix.cpp can be used -+ win32 - Windows 95/Windows NT -+EOF -+ exit(1); -+ } -+ -+&comment("Don't even think of reading this code"); -+&comment("It was automatically generated by $prog"); -+&comment("Which is a perl program used to generate the x86 assember for"); -+&comment("any of elf, a.out, Win32, or Solaris"); -+&comment("It can be found in SSLeay 0.6.5+ or in libdes 3.26+"); -+&comment("eric "); -+&comment(""); -+ -+&file("dx86xxxx"); -+ -+$L="edi"; -+$R="esi"; -+ -+&des_encrypt("des_encrypt",1); -+&des_encrypt("des_encrypt2",0); -+ -+&des_encrypt3("des_encrypt3",1); -+&des_encrypt3("des_decrypt3",0); -+ -+&file_end(); -+ -+sub des_encrypt -+ { -+ local($name,$do_ip)=@_; -+ -+ &function_begin($name,"EXTRN _des_SPtrans:DWORD"); -+ -+ &comment(""); -+ &comment("Load the 2 words"); -+ &mov("eax",&wparam(0)); -+ &mov($L,&DWP(0,"eax","",0)); -+ &mov($R,&DWP(4,"eax","",0)); -+ -+ $ksp=&wparam(1); -+ -+ if ($do_ip) -+ { -+ &comment(""); -+ &comment("IP"); -+ &IP_new($L,$R,"eax"); -+ } -+ -+ &comment(""); -+ &comment("fixup rotate"); -+ &rotl($R,3); -+ &rotl($L,3); -+ &exch($L,$R); -+ -+ &comment(""); -+ &comment("load counter, key_schedule and enc flag"); -+ &mov("eax",&wparam(2)); # get encrypt flag -+ &mov("ebp",&wparam(1)); # get ks -+ &cmp("eax","0"); -+ &je(&label("start_decrypt")); -+ -+ # encrypting part -+ -+ for ($i=0; $i<16; $i+=2) -+ { -+ &comment(""); -+ &comment("Round $i"); -+ 
&D_ENCRYPT($L,$R,$i*2,"ebp","des_SPtrans","ecx","edx","eax","ebx"); -+ -+ &comment(""); -+ &comment("Round ".sprintf("%d",$i+1)); -+ &D_ENCRYPT($R,$L,($i+1)*2,"ebp","des_SPtrans","ecx","edx","eax","ebx"); -+ } -+ &jmp(&label("end")); -+ -+ &set_label("start_decrypt"); -+ -+ for ($i=15; $i>0; $i-=2) -+ { -+ &comment(""); -+ &comment("Round $i"); -+ &D_ENCRYPT($L,$R,$i*2,"ebp","des_SPtrans","ecx","edx","eax","ebx"); -+ &comment(""); -+ &comment("Round ".sprintf("%d",$i-1)); -+ &D_ENCRYPT($R,$L,($i-1)*2,"ebp","des_SPtrans","ecx","edx","eax","ebx"); -+ } -+ -+ &set_label("end"); -+ -+ &comment(""); -+ &comment("Fixup"); -+ &rotr($L,3); # r -+ &rotr($R,3); # l -+ -+ if ($do_ip) -+ { -+ &comment(""); -+ &comment("FP"); -+ &FP_new($R,$L,"eax"); -+ } -+ -+ &mov("eax",&wparam(0)); -+ &mov(&DWP(0,"eax","",0),$L); -+ &mov(&DWP(4,"eax","",0),$R); -+ -+ &function_end($name); -+ } -+ -+ -+# The logic is to load R into 2 registers and operate on both at the same time. -+# We also load the 2 R's into 2 more registers so we can do the 'move word down a byte' -+# while also masking the other copy and doing a lookup. We then also accumulate the -+# L value in 2 registers then combine them at the end. 
-+sub D_ENCRYPT -+ { -+ local($L,$R,$S,$ks,$desSP,$u,$t,$tmp1,$tmp2,$tmp3)=@_; -+ -+ &mov( $u, &DWP(&n2a($S*4),$ks,"",0)); -+ &mov( $t, &DWP(&n2a(($S+1)*4),$ks,"",0)); -+ &xor( $u, $R ); -+ &xor( $t, $R ); -+ &rotr( $t, 4 ); -+ -+ # the numbers at the end of the line are origional instruction order -+ &mov( $tmp2, $u ); # 1 2 -+ &mov( $tmp1, $t ); # 1 1 -+ &and( $tmp2, "0xfc" ); # 1 4 -+ &and( $tmp1, "0xfc" ); # 1 3 -+ &shr( $t, 8 ); # 1 5 -+ &xor( $L, &DWP("0x100+$desSP",$tmp1,"",0)); # 1 7 -+ &shr( $u, 8 ); # 1 6 -+ &mov( $tmp1, &DWP(" $desSP",$tmp2,"",0)); # 1 8 -+ -+ &mov( $tmp2, $u ); # 2 2 -+ &xor( $L, $tmp1 ); # 1 9 -+ &and( $tmp2, "0xfc" ); # 2 4 -+ &mov( $tmp1, $t ); # 2 1 -+ &and( $tmp1, "0xfc" ); # 2 3 -+ &shr( $t, 8 ); # 2 5 -+ &xor( $L, &DWP("0x300+$desSP",$tmp1,"",0)); # 2 7 -+ &shr( $u, 8 ); # 2 6 -+ &mov( $tmp1, &DWP("0x200+$desSP",$tmp2,"",0)); # 2 8 -+ &mov( $tmp2, $u ); # 3 2 -+ -+ &xor( $L, $tmp1 ); # 2 9 -+ &and( $tmp2, "0xfc" ); # 3 4 -+ -+ &mov( $tmp1, $t ); # 3 1 -+ &shr( $u, 8 ); # 3 6 -+ &and( $tmp1, "0xfc" ); # 3 3 -+ &shr( $t, 8 ); # 3 5 -+ &xor( $L, &DWP("0x500+$desSP",$tmp1,"",0)); # 3 7 -+ &mov( $tmp1, &DWP("0x400+$desSP",$tmp2,"",0)); # 3 8 -+ -+ &and( $t, "0xfc" ); # 4 1 -+ &xor( $L, $tmp1 ); # 3 9 -+ -+ &and( $u, "0xfc" ); # 4 2 -+ &xor( $L, &DWP("0x700+$desSP",$t,"",0)); # 4 3 -+ &xor( $L, &DWP("0x600+$desSP",$u,"",0)); # 4 4 -+ } -+ -+sub PERM_OP -+ { -+ local($a,$b,$tt,$shift,$mask)=@_; -+ -+ &mov( $tt, $a ); -+ &shr( $tt, $shift ); -+ &xor( $tt, $b ); -+ &and( $tt, $mask ); -+ &xor( $b, $tt ); -+ &shl( $tt, $shift ); -+ &xor( $a, $tt ); -+ } -+ -+sub IP_new -+ { -+ local($l,$r,$tt)=@_; -+ -+ &PERM_OP($r,$l,$tt, 4,"0x0f0f0f0f"); -+ &PERM_OP($l,$r,$tt,16,"0x0000ffff"); -+ &PERM_OP($r,$l,$tt, 2,"0x33333333"); -+ &PERM_OP($l,$r,$tt, 8,"0x00ff00ff"); -+ &PERM_OP($r,$l,$tt, 1,"0x55555555"); -+ } -+ -+sub FP_new -+ { -+ local($l,$r,$tt)=@_; -+ -+ &PERM_OP($l,$r,$tt, 1,"0x55555555"); -+ &PERM_OP($r,$l,$tt, 8,"0x00ff00ff"); -+ 
&PERM_OP($l,$r,$tt, 2,"0x33333333"); -+ &PERM_OP($r,$l,$tt,16,"0x0000ffff"); -+ &PERM_OP($l,$r,$tt, 4,"0x0f0f0f0f"); -+ } -+ -+sub n2a -+ { -+ sprintf("%d",$_[0]); -+ } ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/asm/desboth.pl Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,79 @@ -+#!/usr/local/bin/perl -+ -+$L="edi"; -+$R="esi"; -+ -+sub des_encrypt3 -+ { -+ local($name,$enc)=@_; -+ -+ &function_begin_B($name,""); -+ &push("ebx"); -+ &mov("ebx",&wparam(0)); -+ -+ &push("ebp"); -+ &push("esi"); -+ -+ &push("edi"); -+ -+ &comment(""); -+ &comment("Load the data words"); -+ &mov($L,&DWP(0,"ebx","",0)); -+ &mov($R,&DWP(4,"ebx","",0)); -+ &stack_push(3); -+ -+ &comment(""); -+ &comment("IP"); -+ &IP_new($L,$R,"edx",0); -+ -+ # put them back -+ -+ if ($enc) -+ { -+ &mov(&DWP(4,"ebx","",0),$R); -+ &mov("eax",&wparam(1)); -+ &mov(&DWP(0,"ebx","",0),"edx"); -+ &mov("edi",&wparam(2)); -+ &mov("esi",&wparam(3)); -+ } -+ else -+ { -+ &mov(&DWP(4,"ebx","",0),$R); -+ &mov("esi",&wparam(1)); -+ &mov(&DWP(0,"ebx","",0),"edx"); -+ &mov("edi",&wparam(2)); -+ &mov("eax",&wparam(3)); -+ } -+ &mov(&swtmp(2), (($enc)?"1":"0")); -+ &mov(&swtmp(1), "eax"); -+ &mov(&swtmp(0), "ebx"); -+ &call("des_encrypt2"); -+ &mov(&swtmp(2), (($enc)?"0":"1")); -+ &mov(&swtmp(1), "edi"); -+ &mov(&swtmp(0), "ebx"); -+ &call("des_encrypt2"); -+ &mov(&swtmp(2), (($enc)?"1":"0")); -+ &mov(&swtmp(1), "esi"); -+ &mov(&swtmp(0), "ebx"); -+ &call("des_encrypt2"); -+ -+ &stack_pop(3); -+ &mov($L,&DWP(0,"ebx","",0)); -+ &mov($R,&DWP(4,"ebx","",0)); -+ -+ &comment(""); -+ &comment("FP"); -+ &FP_new($L,$R,"eax",0); -+ -+ &mov(&DWP(0,"ebx","",0),"eax"); -+ &mov(&DWP(4,"ebx","",0),$R); -+ -+ &pop("edi"); -+ &pop("esi"); -+ &pop("ebp"); -+ &pop("ebx"); -+ &ret(); -+ &function_end_B($name); -+ } -+ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/asm/readme Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,131 @@ -+First up, let me say I don't like writing in assembler. 
It is not portable, -+dependent on the particular CPU architecture release and is generally a pig -+to debug and get right. Having said that, the x86 architecture is probably -+the most important for speed due to the number of boxes and since -+it appears to be the worst architecture to get -+good C compilers for. So due to this, I have lowered myself to do -+assembler for the inner DES routines in libdes :-). -+ -+The file to implement in assembler is des_enc.c. Replace the following -+4 functions -+des_encrypt(DES_LONG data[2],des_key_schedule ks, int encrypt); -+des_encrypt2(DES_LONG data[2],des_key_schedule ks, int encrypt); -+des_encrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3); -+des_decrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3); -+ -+They encrypt/decrypt the 64 bits held in 'data' using -+the 'ks' key schedules. The only difference between the 4 functions is that -+des_encrypt2() does not perform IP() or FP() on the data (this is an -+optimization for when doing triple DES) and des_encrypt3() and des_decrypt3() -+perform triple DES. The triple DES routines are in here because it does -+make a big difference to have them located near the des_encrypt2 function -+at link time. -+ -+Now as we all know, there are lots of different operating systems running on -+x86 boxes, and unfortunately they normally try to make sure their assembler -+formatting is not the same as other people's. -+The 4 main formats I know of are -+Microsoft Windows 95/Windows NT -+Elf Includes Linux and FreeBSD(?). -+a.out The older Linux. -+Solaris Same as Elf but different comments :-(. -+ -+Now I was not overly keen to write 4 different copies of the same code, -+so I wrote a few perl routines to output the correct assembler, given -+a target assembler type. This code is ugly and is just a hack. -+The libraries are x86unix.pl and x86ms.pl. -+des586.pl, des686.pl and des-som[23].pl are the programs to actually -+generate the assembler.
-+ -+So to generate elf assembler -+perl des-som3.pl elf >dx86-elf.s -+For Windows 95/NT -+perl des-som2.pl win32 >win32.asm -+ -+[ update 4 Jan 1996 ] -+I have added another way to do things. -+perl des-som3.pl cpp >dx86-cpp.s -+generates a file that will be included by dx86unix.cpp when it is compiled. -+To build for elf, a.out, solaris, bsdi etc, -+cc -E -DELF asm/dx86unix.cpp | as -o asm/dx86-elf.o -+cc -E -DSOL asm/dx86unix.cpp | as -o asm/dx86-sol.o -+cc -E -DOUT asm/dx86unix.cpp | as -o asm/dx86-out.o -+cc -E -DBSDI asm/dx86unix.cpp | as -o asm/dx86bsdi.o -+This was done to cut down the number of files in the distribution. -+ -+Now the ugly part. I acquired my copy of Intels -+"Optimization's For Intel's 32-Bit Processors" and found a few interesting -+things. First, the aim of the exersize is to 'extract' one byte at a time -+from a word and do an array lookup. This involves getting the byte from -+the 4 locations in the word and moving it to a new word and doing the lookup. -+The most obvious way to do this is -+xor eax, eax # clear word -+movb al, cl # get low byte -+xor edi DWORD PTR 0x100+des_SP[eax] # xor in word -+movb al, ch # get next byte -+xor edi DWORD PTR 0x300+des_SP[eax] # xor in word -+shr ecx 16 -+which seems ok. For the pentium, this system appears to be the best. -+One has to do instruction interleaving to keep both functional units -+operating, but it is basically very efficient. -+ -+Now the crunch. When a full register is used after a partial write, eg. -+mov al, cl -+xor edi, DWORD PTR 0x100+des_SP[eax] -+386 - 1 cycle stall -+486 - 1 cycle stall -+586 - 0 cycle stall -+686 - at least 7 cycle stall (page 22 of the above mentioned document). -+ -+So the technique that produces the best results on a pentium, according to -+the documentation, will produce hideous results on a pentium pro. -+ -+To get around this, des686.pl will generate code that is not as fast on -+a pentium, should be very good on a pentium pro. 
-+mov eax, ecx # copy word -+shr ecx, 8 # line up next byte -+and eax, 0fch # mask byte -+xor edi, DWORD PTR 0x100+des_SP[eax] # xor in array lookup -+mov eax, ecx # get word -+shr ecx, 8 # line up next byte -+and eax, 0fch # mask byte -+xor edi, DWORD PTR 0x300+des_SP[eax] # xor in array lookup -+ -+Due to the execution units in the pentium, this actually works quite well. -+For a pentium pro it should be very good. This is the type of output -+Visual C++ generates. -+ -+There is a third option. Instead of using -+mov al, ch -+which is bad on the pentium pro, one may be able to use -+movzx eax, ch -+which may not incur the partial write penalty. On the pentium, -+this instruction takes 4 cycles so is not worth using but on the -+pentium pro it appears it may be worthwhile. I need access to one to -+experiment :-). -+ -+eric (20 Oct 1996) -+ -+22 Nov 1996 - I have asked people to run the 2 different versions on pentium -+pros and it appears that the intel documentation is wrong. The -+mov al,bh is still faster on a pentium pro, so just use des586.pl -+instead of des686.pl. -+ -+3 Dec 1996 - I added des_encrypt3/des_decrypt3 because I have moved these -+functions into des_enc.c because it does make a massive performance -+difference on some boxes to have the functions' code located close to -+the des_encrypt2() function. -+ -+9 Jan 1997 - des-som2.pl is now the correct perl script to use for -+pentiums. It contains an inner loop from -+Svend Olaf Mikkelsen which does raw ecb DES calls at -+273,000 per second. He had a previous version at 250,000 and the best -+I was able to get was 203,000. The content has not changed, this is all -+due to instruction sequencing (and actual instruction choice) which is able -+to keep both functional units of the pentium going. -+We may have lost the ugly register usage restrictions when x86 went 32 bit -+but for the pentium it has been replaced by evil instruction ordering tricks.
-+ -+13 Jan 1997 - des-som3.pl, more optimizations from Svend Olaf. -+raw DES at 281,000 per second on a pentium 100. -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/cbc_enc.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,142 @@ -+/* crypto/des/cbc_enc.c */ -+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) -+ * All rights reserved. -+ * -+ * This package is an SSL implementation written -+ * by Eric Young (eay@cryptsoft.com). -+ * The implementation was written so as to conform with Netscapes SSL. -+ * -+ * This library is free for commercial and non-commercial use as long as -+ * the following conditions are aheared to. The following conditions -+ * apply to all code found in this distribution, be it the RC4, RSA, -+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation -+ * included with this distribution is covered by the same copyright terms -+ * except that the holder is Tim Hudson (tjh@cryptsoft.com). -+ * -+ * Copyright remains Eric Young's, and as such any Copyright notices in -+ * the code are not to be removed. -+ * If this package is used in a product, Eric Young should be given attribution -+ * as the author of the parts of the library used. -+ * This can be in the form of a textual message at program startup or -+ * in documentation (online or textual) provided with the package. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. 
All advertising materials mentioning features or use of this software -+ * must display the following acknowledgement: -+ * "This product includes cryptographic software written by -+ * Eric Young (eay@cryptsoft.com)" -+ * The word 'cryptographic' can be left out if the rouines from the library -+ * being used are not cryptographic related :-). -+ * 4. If you include any Windows specific code (or a derivative thereof) from -+ * the apps directory (application code) you must include an acknowledgement: -+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * The licence and distribution terms for any publically available version or -+ * derivative of this code cannot be changed. i.e. this code cannot simply be -+ * copied and put under another distribution licence -+ * [including the GNU Public Licence.] 
-+ */ -+ -+#include "des/des_locl.h" -+/* des_cbc_encrypt: DES in CBC mode over `length` bytes from `input` to `output` using key schedule `schedule`. `enc` non-zero encrypts, zero decrypts. `ivec` supplies the initial chaining value; it is only read here and never written back. If length is not a multiple of 8, encryption still writes one extra full 8-byte block, while decryption processes one extra full 8-byte block but writes only the leftover byte count. c2l/c2ln/l2c/l2cn are defined outside this chunk (presumably in des_locl.h); they appear to load/store 32-bit words and advance the pointer argument. TODO confirm. */ -+void des_cbc_encrypt(input, output, length, schedule, ivec, enc) -+des_cblock (*input); -+des_cblock (*output); -+long length; -+des_key_schedule schedule; -+des_cblock (*ivec); -+int enc; -+ { -+ register DES_LONG tin0,tin1; -+ register DES_LONG tout0,tout1,xor0,xor1; -+ register unsigned char *in,*out; -+ register long l=length; -+ DES_LONG tin[2]; -+ unsigned char *iv; -+ /* With OCF_ASSIST defined, hand the whole request to ocf_des_cbc_encrypt() when ocf_des_assist() reports OCF_PROVIDES_DES_3DES. */ -+#ifdef OCF_ASSIST -+ if (ocf_des_assist() & OCF_PROVIDES_DES_3DES) { -+ ocf_des_cbc_encrypt(input, output, length, schedule, ivec, enc); -+ return; -+ } -+#endif -+ -+ in=(unsigned char *)input; -+ out=(unsigned char *)output; -+ iv=(unsigned char *)ivec; -+ /* l counts remaining bytes. For length >= 0 each loop below exits with l == -8 exactly when length was a multiple of 8; otherwise l+8 is the number of leftover bytes. */ -+ if (enc) -+ { -+ c2l(iv,tout0); -+ c2l(iv,tout1); /* tout0/tout1 hold the previous ciphertext block, starting with the IV */ -+ for (l-=8; l>=0; l-=8) -+ { -+ c2l(in,tin0); -+ c2l(in,tin1); -+ tin0^=tout0; tin[0]=tin0; -+ tin1^=tout1; tin[1]=tin1; -+ des_encrypt((DES_LONG *)tin,schedule,DES_ENCRYPT); -+ tout0=tin[0]; l2c(tout0,out); -+ tout1=tin[1]; l2c(tout1,out); -+ } -+ if (l != -8) /* leftover bytes: build a final block from the l+8 remaining input bytes and emit a full 8-byte ciphertext block */ -+ { -+ c2ln(in,tin0,tin1,l+8); -+ tin0^=tout0; tin[0]=tin0; -+ tin1^=tout1; tin[1]=tin1; -+ des_encrypt((DES_LONG *)tin,schedule,DES_ENCRYPT); -+ tout0=tin[0]; l2c(tout0,out); -+ tout1=tin[1]; l2c(tout1,out); -+ } -+ } -+ else -+ { -+ c2l(iv,xor0); -+ c2l(iv,xor1); /* xor0/xor1 hold the previous ciphertext block, starting with the IV */ -+ for (l-=8; l>=0; l-=8) -+ { -+ c2l(in,tin0); tin[0]=tin0; -+ c2l(in,tin1); tin[1]=tin1; -+ des_encrypt((DES_LONG *)tin,schedule,DES_DECRYPT); -+ tout0=tin[0]^xor0; -+ tout1=tin[1]^xor1; -+ l2c(tout0,out); -+ l2c(tout1,out); -+ xor0=tin0; -+ xor1=tin1; -+ } -+ if (l != -8) /* leftover bytes: decrypt one more full 8-byte input block but write only l+8 bytes of it */ -+ { -+ c2l(in,tin0); tin[0]=tin0; -+ c2l(in,tin1); tin[1]=tin1; -+ des_encrypt((DES_LONG *)tin,schedule,DES_DECRYPT); -+ tout0=tin[0]^xor0; -+ tout1=tin[1]^xor1; -+ l2cn(tout0,tout1,out,l+8); -+ /* xor0=tin0; -+ xor1=tin1; */ -+ } -+ } -+ tin0=tin1=tout0=tout1=xor0=xor1=0; /* clear the locals that held data and chaining values */ -+ tin[0]=tin[1]=0; -+ } -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/des.doc Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,505 @@ -+The DES library.
-+ -+Please note that this library was originally written to operate with -+eBones, a version of Kerberos that had had encryption removed when it left -+the USA and then put back in. As such there are some routines that I will -+advise not using but they are still in the library for historical reasons. -+For all calls that have an 'input' and 'output' variables, they can be the -+same. -+ -+This library requires the inclusion of 'des.h'. -+ -+All of the encryption functions take what is called a des_key_schedule as an -+argument. A des_key_schedule is an expanded form of the des key. -+A des_key is 8 bytes of odd parity, the type used to hold the key is a -+des_cblock. A des_cblock is an array of 8 bytes, often in this library -+description I will refer to input bytes when the function specifies -+des_cblock's as input or output, this just means that the variable should -+be a multiple of 8 bytes. -+ -+The define DES_ENCRYPT is passed to specify encryption, DES_DECRYPT to -+specify decryption. The functions and global variable are as follows: -+ -+int des_check_key; -+ DES keys are supposed to be odd parity. If this variable is set to -+ a non-zero value, des_set_key() will check that the key has odd -+ parity and is not one of the known weak DES keys. By default this -+ variable is turned off; -+ -+void des_set_odd_parity( -+des_cblock *key ); -+ This function takes a DES key (8 bytes) and sets the parity to odd. -+ -+int des_is_weak_key( -+des_cblock *key ); -+ This function returns a non-zero value if the DES key passed is a -+ weak, DES key. If it is a weak key, don't use it, try a different -+ one. If you are using 'random' keys, the chances of hitting a weak -+ key are 1/2^52 so it is probably not worth checking for them. -+ -+int des_set_key( -+des_cblock *key, -+des_key_schedule schedule); -+ Des_set_key converts an 8 byte DES key into a des_key_schedule. -+ A des_key_schedule is an expanded form of the key which is used to -+ perform actual encryption. 
It can be regenerated from the DES key -+ so it only needs to be kept when encryption or decryption is about -+ to occur. Don't save or pass around des_key_schedule's since they -+ are CPU architecture dependent, DES keys are not. If des_check_key -+ is non zero, zero is returned if the key has the wrong parity or -+ the key is a weak key, else 1 is returned. -+ -+int des_key_sched( -+des_cblock *key, -+des_key_schedule schedule); -+ An alternative name for des_set_key(). -+ -+int des_rw_mode; /* defaults to DES_PCBC_MODE */ -+ This flag holds either DES_CBC_MODE or DES_PCBC_MODE (default). -+ This specifies the function to use in the enc_read() and enc_write() -+ functions. -+ -+void des_encrypt( -+unsigned long *data, -+des_key_schedule ks, -+int enc); -+ This is the DES encryption function that gets called by just about -+ every other DES routine in the library. You should not use this -+ function except to implement 'modes' of DES. I say this because the -+ functions that call this routine do the conversion from 'char *' to -+ long, and this needs to be done to make sure 'non-aligned' memory -+ access do not occur. The characters are loaded 'little endian', -+ have a look at my source code for more details on how I use this -+ function. -+ Data is a pointer to 2 unsigned long's and ks is the -+ des_key_schedule to use. enc, is non zero specifies encryption, -+ zero if decryption. -+ -+void des_encrypt2( -+unsigned long *data, -+des_key_schedule ks, -+int enc); -+ This functions is the same as des_encrypt() except that the DES -+ initial permutation (IP) and final permutation (FP) have been left -+ out. As for des_encrypt(), you should not use this function. -+ It is used by the routines in my library that implement triple DES. -+ IP() des_encrypt2() des_encrypt2() des_encrypt2() FP() is the same -+ as des_encrypt() des_encrypt() des_encrypt() except faster :-). 
-+ -+void des_ecb_encrypt( -+des_cblock *input, -+des_cblock *output, -+des_key_schedule ks, -+int enc); -+ This is the basic Electronic Code Book form of DES, the most basic -+ form. Input is encrypted into output using the key represented by -+ ks. If enc is non zero (DES_ENCRYPT), encryption occurs, otherwise -+ decryption occurs. Input is 8 bytes long and output is 8 bytes. -+ (the des_cblock structure is 8 chars). -+ -+void des_ecb3_encrypt( -+des_cblock *input, -+des_cblock *output, -+des_key_schedule ks1, -+des_key_schedule ks2, -+des_key_schedule ks3, -+int enc); -+ This is the 3 key EDE mode of ECB DES. What this means is that -+ the 8 bytes of input is encrypted with ks1, decrypted with ks2 and -+ then encrypted again with ks3, before being put into output; -+ C=E(ks3,D(ks2,E(ks1,M))). There is a macro, des_ecb2_encrypt() -+ that only takes 2 des_key_schedules that implements, -+ C=E(ks1,D(ks2,E(ks1,M))) in that the final encrypt is done with ks1. -+ -+void des_cbc_encrypt( -+des_cblock *input, -+des_cblock *output, -+long length, -+des_key_schedule ks, -+des_cblock *ivec, -+int enc); -+ This routine implements DES in Cipher Block Chaining mode. -+ Input, which should be a multiple of 8 bytes is encrypted -+ (or decrypted) to output which will also be a multiple of 8 bytes. -+ The number of bytes is in length (and from what I've said above, -+ should be a multiple of 8). If length is not a multiple of 8, I'm -+ not being held responsible :-). ivec is the initialisation vector. -+ This function does not modify this variable. To correctly implement -+ cbc mode, you need to do one of 2 things; copy the last 8 bytes of -+ cipher text for use as the next ivec in your application, -+ or use des_ncbc_encrypt(). -+ Only this routine has this problem with updating the ivec, all -+ other routines that are implementing cbc mode update ivec. 
-+ -+void des_ncbc_encrypt( -+des_cblock *input, -+des_cblock *output, -+long length, -+des_key_schedule sk, -+des_cblock *ivec, -+int enc); -+ For historical reasons, des_cbc_encrypt() did not update the -+ ivec with the value required so that subsequent calls to -+ des_cbc_encrypt() would 'chain'. This was needed so that the same -+ 'length' values would not need to be used when decrypting. -+ des_ncbc_encrypt() does the right thing. It is the same as -+ des_cbc_encrypt() except that ivec is updated with the correct value -+ to pass in subsequent calls to des_ncbc_encrypt(). I advise using -+ des_ncbc_encrypt() instead of des_cbc_encrypt(). -+ -+void des_xcbc_encrypt( -+des_cblock *input, -+des_cblock *output, -+long length, -+des_key_schedule sk, -+des_cblock *ivec, -+des_cblock *inw, -+des_cblock *outw, -+int enc); -+ This is RSA's DESX mode of DES. It uses inw and outw to -+ 'whiten' the encryption. inw and outw are secret (unlike the iv) -+ and are, as such, part of the key. So the key is sort of 24 bytes. -+ This is much better than cbc des. -+ -+void des_3cbc_encrypt( -+des_cblock *input, -+des_cblock *output, -+long length, -+des_key_schedule sk1, -+des_key_schedule sk2, -+des_cblock *ivec1, -+des_cblock *ivec2, -+int enc); -+ This function is flawed, do not use it. I have left it in the -+ library because it is used in my des(1) program and will function -+ correctly when used by des(1). If I removed the function, people -+ could end up unable to decrypt files. -+ This routine implements outer triple cbc encryption using 2 ks and -+ 2 ivec's. Use des_ede2_cbc_encrypt() instead. -+ -+void des_ede3_cbc_encrypt( -+des_cblock *input, -+des_cblock *output, -+long length, -+des_key_schedule ks1, -+des_key_schedule ks2, -+des_key_schedule ks3, -+des_cblock *ivec, -+int enc); -+ This function implements inner triple CBC DES encryption with 3 -+ keys. What this means is that each 'DES' operation -+ inside the cbc mode is really a C=E(ks3,D(ks2,E(ks1,M))).
-+ Again, this is cbc mode so an ivec is required. -+ This mode is used by SSL. -+ There is also a des_ede2_cbc_encrypt() that only uses 2 -+ des_key_schedule's, the first being reused for the final -+ encryption. C=E(ks1,D(ks2,E(ks1,M))). This form of triple DES -+ is used by the RSAref library. -+ -+void des_pcbc_encrypt( -+des_cblock *input, -+des_cblock *output, -+long length, -+des_key_schedule ks, -+des_cblock *ivec, -+int enc); -+ This is Propagating Cipher Block Chaining mode of DES. It is used -+ by Kerberos v4. Its parameters are the same as des_ncbc_encrypt(). -+ -+void des_cfb_encrypt( -+unsigned char *in, -+unsigned char *out, -+int numbits, -+long length, -+des_key_schedule ks, -+des_cblock *ivec, -+int enc); -+ Cipher Feedback mode of DES. This implementation 'feeds back' -+ in numbits-bit blocks. The input (and output) is in multiples of numbits -+ bits. numbits should be a multiple of 8 bits. Length is the -+ number of bytes input. If numbits is not a multiple of 8 bits, -+ the extra bits in the bytes will be considered padding. So if -+ numbits is 12, for each 2 input bytes, the 4 high bits of the -+ second byte will be ignored. So to encode 72 bits when using -+ a numbits of 12 takes 12 bytes. To encode 72 bits when using -+ numbits of 9 will take 16 bytes. To encode 80 bits when using -+ numbits of 16 will take 10 bytes. etc, etc. This padding will -+ apply to both input and output. -+ -+ -+void des_cfb64_encrypt( -+unsigned char *in, -+unsigned char *out, -+long length, -+des_key_schedule ks, -+des_cblock *ivec, -+int *num, -+int enc); -+ This is one of the more useful functions in this DES library, it -+ implements CFB mode of DES with 64bit feedback. Why is this -+ useful you ask? Because this routine will allow you to encrypt an -+ arbitrary number of bytes, no 8 byte padding. Each call to this -+ routine will encrypt the input bytes to output and then update ivec -+ and num. num contains 'how far' we are through ivec.
If this does -+ not make much sense, read more about cfb mode of DES :-). -+ -+void des_ede3_cfb64_encrypt( -+unsigned char *in, -+unsigned char *out, -+long length, -+des_key_schedule ks1, -+des_key_schedule ks2, -+des_key_schedule ks3, -+des_cblock *ivec, -+int *num, -+int enc); -+ Same as des_cfb64_encrypt() except that the DES operation is -+ triple DES. As usual, there is a macro for -+ des_ede2_cfb64_encrypt() which reuses ks1. -+ -+void des_ofb_encrypt( -+unsigned char *in, -+unsigned char *out, -+int numbits, -+long length, -+des_key_schedule ks, -+des_cblock *ivec); -+ This is an implementation of Output Feed Back mode of DES. It is -+ the same as des_cfb_encrypt() in that numbits is the size of the -+ units dealt with during input and output (in bits). -+ -+void des_ofb64_encrypt( -+unsigned char *in, -+unsigned char *out, -+long length, -+des_key_schedule ks, -+des_cblock *ivec, -+int *num); -+ The same as des_cfb64_encrypt() except that it is Output Feed Back -+ mode. -+ -+void des_ede3_ofb64_encrypt( -+unsigned char *in, -+unsigned char *out, -+long length, -+des_key_schedule ks1, -+des_key_schedule ks2, -+des_key_schedule ks3, -+des_cblock *ivec, -+int *num); -+ Same as des_ofb64_encrypt() except that the DES operation is -+ triple DES. As usual, there is a macro for -+ des_ede2_ofb64_encrypt() which reuses ks1. -+ -+int des_read_pw_string( -+char *buf, -+int length, -+char *prompt, -+int verify); -+ This routine is used to get a password from the terminal with echo -+ turned off. Buf is where the string will end up and length is the -+ size of buf. Prompt is a string presented to the 'user' and if -+ verify is set, the key is asked for twice and unless the 2 copies -+ match, an error is returned. A return code of -1 indicates a -+ system error, 1 failure due to user interaction, and 0 is success.
-+ -+unsigned long des_cbc_cksum( -+des_cblock *input, -+des_cblock *output, -+long length, -+des_key_schedule ks, -+des_cblock *ivec); -+ This function produces an 8 byte checksum from input that it puts in -+ output and returns the last 4 bytes as a long. The checksum is -+ generated via cbc mode of DES in which only the last 8 bytes are -+ kept. I would recommend not using this function but instead using -+ the EVP_Digest routines, or at least using MD5 or SHA. This -+ function is used by Kerberos v4 so that is why it stays in the -+ library. -+ -+char *des_fcrypt( -+const char *buf, -+const char *salt, -+char *ret); -+ This is my fast version of the unix crypt(3) function. This version -+ takes only a small amount of space relative to other fast -+ crypt() implementations. This is different to the normal crypt -+ in that the third parameter is the buffer that the return value -+ is written into. It needs to be at least 14 bytes long. This -+ function is thread safe, unlike the normal crypt. -+ -+char *crypt( -+const char *buf, -+const char *salt); -+ This function calls des_fcrypt() with a static array passed as the -+ third parameter. This emulates the normal non-thread safe semantics -+ of crypt(3). -+ -+void des_string_to_key( -+char *str, -+des_cblock *key); -+ This function takes str and converts it into a DES key. I would -+ recommend using MD5 instead and use the first 8 bytes of output. -+ When I wrote the first version of these routines back in 1990, MD5 -+ did not exist but I feel these routines are still sound. This -+ routine is compatible with the one in MIT's libdes. -+ -+void des_string_to_2keys( -+char *str, -+des_cblock *key1, -+des_cblock *key2); -+ This function takes str and converts it into 2 DES keys. -+ I would recommend using MD5 and using the 16 bytes as the 2 keys.
-+ I have nothing against these 2 'string_to_key' routines, it's just -+ that if you say that your encryption key is generated by using the -+ 16 bytes of an MD5 hash, everyone knows how you generated your -+ keys. -+ -+int des_read_password( -+des_cblock *key, -+char *prompt, -+int verify); -+ This routine combines des_read_pw_string() with des_string_to_key(). -+ -+int des_read_2passwords( -+des_cblock *key1, -+des_cblock *key2, -+char *prompt, -+int verify); -+ This routine combines des_read_pw_string() with des_string_to_2keys(). -+ -+void des_random_seed( -+des_cblock key); -+ This routine sets a starting point for des_random_key(). -+ -+void des_random_key( -+des_cblock ret); -+ This function returns a random key. Make sure to 'seed' the random -+ number generator (with des_random_seed()) before using this function. -+ I personally now use an MD5 based random number system. -+ -+int des_enc_write( -+int fd, -+char *buf, -+int len, -+des_key_schedule ks, -+des_cblock *iv); -+ This function will write to a file descriptor the encrypted data -+ from buf. This data will be preceded by a 4 byte 'byte count' and -+ will be padded out to 8 bytes. The encryption is either CBC or -+ PCBC depending on the value of des_rw_mode. If it is DES_PCBC_MODE, -+ pcbc is used, if DES_CBC_MODE, cbc is used. The default is to use -+ DES_PCBC_MODE. -+ -+int des_enc_read( -+int fd, -+char *buf, -+int len, -+des_key_schedule ks, -+des_cblock *iv); -+ This routine reads data written by des_enc_write() and decrypts it. -+ I have used these routines quite a lot but I don't believe they are -+ suitable for non-blocking io. If you are after a full -+ authentication/encryption over networks, have a look at SSL instead. -+ -+unsigned long des_quad_cksum( -+des_cblock *input, -+des_cblock *output, -+long length, -+int out_count, -+des_cblock *seed); -+ This is a function from Kerberos v4 that is not anything to do with -+ DES but was needed.
It is a cksum that is quicker to generate than -+ des_cbc_cksum(); I personally would use MD5 routines now. -+===== -+Modes of DES -+Quite a bit of the following information has been taken from -+ AS 2805.5.2 -+ Australian Standard -+ Electronic funds transfer - Requirements for interfaces, -+ Part 5.2: Modes of operation for an n-bit block cipher algorithm -+ Appendix A -+ -+There are several different modes in which DES can be used, they are -+as follows. -+ -+Electronic Codebook Mode (ECB) (des_ecb_encrypt()) -+- 64 bits are enciphered at a time. -+- The order of the blocks can be rearranged without detection. -+- The same plaintext block always produces the same ciphertext block -+ (for the same key) making it vulnerable to a 'dictionary attack'. -+- An error will only affect one ciphertext block. -+ -+Cipher Block Chaining Mode (CBC) (des_cbc_encrypt()) -+- a multiple of 64 bits are enciphered at a time. -+- The CBC mode produces the same ciphertext whenever the same -+ plaintext is encrypted using the same key and starting variable. -+- The chaining operation makes the ciphertext blocks dependent on the -+ current and all preceding plaintext blocks and therefore blocks can not -+ be rearranged. -+- The use of different starting variables prevents the same plaintext -+ enciphering to the same ciphertext. -+- An error will affect the current and the following ciphertext blocks. -+ -+Cipher Feedback Mode (CFB) (des_cfb_encrypt()) -+- a number of bits (j) <= 64 are enciphered at a time. -+- The CFB mode produces the same ciphertext whenever the same -+ plaintext is encrypted using the same key and starting variable. -+- The chaining operation makes the ciphertext variables dependent on the -+ current and all preceding variables and therefore j-bit variables are -+ chained together and can not be rearranged. -+- The use of different starting variables prevents the same plaintext -+ enciphering to the same ciphertext. 
-+- The strength of the CFB mode depends on the size of k (maximal if -+ j == k). In my implementation this is always the case. -+- Selection of a small value for j will require more cycles through -+ the encipherment algorithm per unit of plaintext and thus cause -+ greater processing overheads. -+- Only multiples of j bits can be enciphered. -+- An error will affect the current and the following ciphertext variables. -+ -+Output Feedback Mode (OFB) (des_ofb_encrypt()) -+- a number of bits (j) <= 64 are enciphered at a time. -+- The OFB mode produces the same ciphertext whenever the same -+ plaintext enciphered using the same key and starting variable. More -+ over, in the OFB mode the same key stream is produced when the same -+ key and start variable are used. Consequently, for security reasons -+ a specific start variable should be used only once for a given key. -+- The absence of chaining makes the OFB more vulnerable to specific attacks. -+- The use of different start variables values prevents the same -+ plaintext enciphering to the same ciphertext, by producing different -+ key streams. -+- Selection of a small value for j will require more cycles through -+ the encipherment algorithm per unit of plaintext and thus cause -+ greater processing overheads. -+- Only multiples of j bits can be enciphered. -+- OFB mode of operation does not extend ciphertext errors in the -+ resultant plaintext output. Every bit error in the ciphertext causes -+ only one bit to be in error in the deciphered plaintext. -+- OFB mode is not self-synchronising. If the two operation of -+ encipherment and decipherment get out of synchronism, the system needs -+ to be re-initialised. -+- Each re-initialisation should use a value of the start variable -+ different from the start variable values used before with the same -+ key. The reason for this is that an identical bit stream would be -+ produced each time from the same parameters. 
This would be -+ susceptible to a ' known plaintext' attack. -+ -+Triple ECB Mode (des_ecb3_encrypt()) -+- Encrypt with key1, decrypt with key2 and encrypt with key3 again. -+- As for ECB encryption but increases the key length to 168 bits. -+ There are theoretic attacks that can be used that make the effective -+ key length 112 bits, but this attack also requires 2^56 blocks of -+ memory, not very likely, even for the NSA. -+- If both keys are the same it is equivalent to encrypting once with -+ just one key. -+- If the first and last key are the same, the key length is 112 bits. -+ There are attacks that could reduce the key space to 55 bit's but it -+ requires 2^56 blocks of memory. -+- If all 3 keys are the same, this is effectively the same as normal -+ ecb mode. -+ -+Triple CBC Mode (des_ede3_cbc_encrypt()) -+- Encrypt with key1, decrypt with key2 and then encrypt with key3. -+- As for CBC encryption but increases the key length to 168 bits with -+ the same restrictions as for triple ecb mode. ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/des_enc.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,524 @@ -+/* crypto/des/des_enc.c */ -+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) -+ * All rights reserved. -+ * -+ * This package is an SSL implementation written -+ * by Eric Young (eay@cryptsoft.com). -+ * The implementation was written so as to conform with Netscapes SSL. -+ * -+ * This library is free for commercial and non-commercial use as long as -+ * the following conditions are aheared to. The following conditions -+ * apply to all code found in this distribution, be it the RC4, RSA, -+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation -+ * included with this distribution is covered by the same copyright terms -+ * except that the holder is Tim Hudson (tjh@cryptsoft.com). -+ * -+ * Copyright remains Eric Young's, and as such any Copyright notices in -+ * the code are not to be removed. 
-+ * If this package is used in a product, Eric Young should be given attribution -+ * as the author of the parts of the library used. -+ * This can be in the form of a textual message at program startup or -+ * in documentation (online or textual) provided with the package. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. All advertising materials mentioning features or use of this software -+ * must display the following acknowledgement: -+ * "This product includes cryptographic software written by -+ * Eric Young (eay@cryptsoft.com)" -+ * The word 'cryptographic' can be left out if the rouines from the library -+ * being used are not cryptographic related :-). -+ * 4. If you include any Windows specific code (or a derivative thereof) from -+ * the apps directory (application code) you must include an acknowledgement: -+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * The licence and distribution terms for any publically available version or -+ * derivative of this code cannot be changed. i.e. this code cannot simply be -+ * copied and put under another distribution licence -+ * [including the GNU Public Licence.] -+ */ -+ -+#include "des/des_locl.h" -+ -+void des_encrypt(data, ks, enc) -+DES_LONG *data; -+des_key_schedule ks; -+int enc; -+ { -+ register DES_LONG l,r,t,u; -+#ifdef DES_PTR -+ register unsigned char *des_SP=(unsigned char *)des_SPtrans; -+#endif -+#ifndef DES_UNROLL -+ register int i; -+#endif -+ register DES_LONG *s; -+ -+#ifdef OCF_ASSIST -+ if (ocf_des_assist() & OCF_PROVIDES_DES_3DES) { -+ ocf_des_encrypt(data, ks, enc); -+ return; -+ } -+#endif -+ -+ r=data[0]; -+ l=data[1]; -+ -+ IP(r,l); -+ /* Things have been modified so that the initial rotate is -+ * done outside the loop. This required the -+ * des_SPtrans values in sp.h to be rotated 1 bit to the right. -+ * One perl script later and things have a 5% speed up on a sparc2. -+ * Thanks to Richard Outerbridge <71755.204@CompuServe.COM> -+ * for pointing this out. 
*/ -+ /* clear the top bits on machines with 8byte longs */ -+ /* shift left by 2 */ -+ r=ROTATE(r,29)&0xffffffffL; -+ l=ROTATE(l,29)&0xffffffffL; -+ -+ s=(DES_LONG *)ks; -+ /* I don't know if it is worth the effort of loop unrolling the -+ * inner loop */ -+ if (enc) -+ { -+#ifdef DES_UNROLL -+ D_ENCRYPT(l,r, 0); /* 1 */ -+ D_ENCRYPT(r,l, 2); /* 2 */ -+ D_ENCRYPT(l,r, 4); /* 3 */ -+ D_ENCRYPT(r,l, 6); /* 4 */ -+ D_ENCRYPT(l,r, 8); /* 5 */ -+ D_ENCRYPT(r,l,10); /* 6 */ -+ D_ENCRYPT(l,r,12); /* 7 */ -+ D_ENCRYPT(r,l,14); /* 8 */ -+ D_ENCRYPT(l,r,16); /* 9 */ -+ D_ENCRYPT(r,l,18); /* 10 */ -+ D_ENCRYPT(l,r,20); /* 11 */ -+ D_ENCRYPT(r,l,22); /* 12 */ -+ D_ENCRYPT(l,r,24); /* 13 */ -+ D_ENCRYPT(r,l,26); /* 14 */ -+ D_ENCRYPT(l,r,28); /* 15 */ -+ D_ENCRYPT(r,l,30); /* 16 */ -+#else -+ for (i=0; i<32; i+=8) -+ { -+ D_ENCRYPT(l,r,i+0); /* 1 */ -+ D_ENCRYPT(r,l,i+2); /* 2 */ -+ D_ENCRYPT(l,r,i+4); /* 3 */ -+ D_ENCRYPT(r,l,i+6); /* 4 */ -+ } -+#endif -+ } -+ else -+ { -+#ifdef DES_UNROLL -+ D_ENCRYPT(l,r,30); /* 16 */ -+ D_ENCRYPT(r,l,28); /* 15 */ -+ D_ENCRYPT(l,r,26); /* 14 */ -+ D_ENCRYPT(r,l,24); /* 13 */ -+ D_ENCRYPT(l,r,22); /* 12 */ -+ D_ENCRYPT(r,l,20); /* 11 */ -+ D_ENCRYPT(l,r,18); /* 10 */ -+ D_ENCRYPT(r,l,16); /* 9 */ -+ D_ENCRYPT(l,r,14); /* 8 */ -+ D_ENCRYPT(r,l,12); /* 7 */ -+ D_ENCRYPT(l,r,10); /* 6 */ -+ D_ENCRYPT(r,l, 8); /* 5 */ -+ D_ENCRYPT(l,r, 6); /* 4 */ -+ D_ENCRYPT(r,l, 4); /* 3 */ -+ D_ENCRYPT(l,r, 2); /* 2 */ -+ D_ENCRYPT(r,l, 0); /* 1 */ -+#else -+ for (i=30; i>0; i-=8) -+ { -+ D_ENCRYPT(l,r,i-0); /* 16 */ -+ D_ENCRYPT(r,l,i-2); /* 15 */ -+ D_ENCRYPT(l,r,i-4); /* 14 */ -+ D_ENCRYPT(r,l,i-6); /* 13 */ -+ } -+#endif -+ } -+ -+ /* rotate and clear the top bits on machines with 8byte longs */ -+ l=ROTATE(l,3)&0xffffffffL; -+ r=ROTATE(r,3)&0xffffffffL; -+ -+ FP(r,l); -+ data[0]=l; -+ data[1]=r; -+ l=r=t=u=0; -+ } -+ -+void des_encrypt2(data, ks, enc) -+DES_LONG *data; -+des_key_schedule ks; -+int enc; -+ { -+ register DES_LONG l,r,t,u; -+#ifdef 
DES_PTR -+ register unsigned char *des_SP=(unsigned char *)des_SPtrans; -+#endif -+#ifndef DES_UNROLL -+ register int i; -+#endif -+ register DES_LONG *s; -+ -+ r=data[0]; -+ l=data[1]; -+ -+ /* Things have been modified so that the initial rotate is -+ * done outside the loop. This required the -+ * des_SPtrans values in sp.h to be rotated 1 bit to the right. -+ * One perl script later and things have a 5% speed up on a sparc2. -+ * Thanks to Richard Outerbridge <71755.204@CompuServe.COM> -+ * for pointing this out. */ -+ /* clear the top bits on machines with 8byte longs */ -+ r=ROTATE(r,29)&0xffffffffL; -+ l=ROTATE(l,29)&0xffffffffL; -+ -+ s=(DES_LONG *)ks; -+ /* I don't know if it is worth the effort of loop unrolling the -+ * inner loop */ -+ if (enc) -+ { -+#ifdef DES_UNROLL -+ D_ENCRYPT(l,r, 0); /* 1 */ -+ D_ENCRYPT(r,l, 2); /* 2 */ -+ D_ENCRYPT(l,r, 4); /* 3 */ -+ D_ENCRYPT(r,l, 6); /* 4 */ -+ D_ENCRYPT(l,r, 8); /* 5 */ -+ D_ENCRYPT(r,l,10); /* 6 */ -+ D_ENCRYPT(l,r,12); /* 7 */ -+ D_ENCRYPT(r,l,14); /* 8 */ -+ D_ENCRYPT(l,r,16); /* 9 */ -+ D_ENCRYPT(r,l,18); /* 10 */ -+ D_ENCRYPT(l,r,20); /* 11 */ -+ D_ENCRYPT(r,l,22); /* 12 */ -+ D_ENCRYPT(l,r,24); /* 13 */ -+ D_ENCRYPT(r,l,26); /* 14 */ -+ D_ENCRYPT(l,r,28); /* 15 */ -+ D_ENCRYPT(r,l,30); /* 16 */ -+#else -+ for (i=0; i<32; i+=8) -+ { -+ D_ENCRYPT(l,r,i+0); /* 1 */ -+ D_ENCRYPT(r,l,i+2); /* 2 */ -+ D_ENCRYPT(l,r,i+4); /* 3 */ -+ D_ENCRYPT(r,l,i+6); /* 4 */ -+ } -+#endif -+ } -+ else -+ { -+#ifdef DES_UNROLL -+ D_ENCRYPT(l,r,30); /* 16 */ -+ D_ENCRYPT(r,l,28); /* 15 */ -+ D_ENCRYPT(l,r,26); /* 14 */ -+ D_ENCRYPT(r,l,24); /* 13 */ -+ D_ENCRYPT(l,r,22); /* 12 */ -+ D_ENCRYPT(r,l,20); /* 11 */ -+ D_ENCRYPT(l,r,18); /* 10 */ -+ D_ENCRYPT(r,l,16); /* 9 */ -+ D_ENCRYPT(l,r,14); /* 8 */ -+ D_ENCRYPT(r,l,12); /* 7 */ -+ D_ENCRYPT(l,r,10); /* 6 */ -+ D_ENCRYPT(r,l, 8); /* 5 */ -+ D_ENCRYPT(l,r, 6); /* 4 */ -+ D_ENCRYPT(r,l, 4); /* 3 */ -+ D_ENCRYPT(l,r, 2); /* 2 */ -+ D_ENCRYPT(r,l, 0); /* 1 */ -+#else -+ for 
(i=30; i>0; i-=8) -+ { -+ D_ENCRYPT(l,r,i-0); /* 16 */ -+ D_ENCRYPT(r,l,i-2); /* 15 */ -+ D_ENCRYPT(l,r,i-4); /* 14 */ -+ D_ENCRYPT(r,l,i-6); /* 13 */ -+ } -+#endif -+ } -+ /* rotate and clear the top bits on machines with 8byte longs */ -+ data[0]=ROTATE(l,3)&0xffffffffL; -+ data[1]=ROTATE(r,3)&0xffffffffL; -+ l=r=t=u=0; -+ } -+ -+void des_encrypt3(data,ks1,ks2,ks3) -+DES_LONG *data; -+des_key_schedule ks1; -+des_key_schedule ks2; -+des_key_schedule ks3; -+ { -+ register DES_LONG l,r; -+ -+ l=data[0]; -+ r=data[1]; -+ IP(l,r); -+ data[0]=l; -+ data[1]=r; -+ des_encrypt2((DES_LONG *)data,ks1,DES_ENCRYPT); -+ des_encrypt2((DES_LONG *)data,ks2,DES_DECRYPT); -+ des_encrypt2((DES_LONG *)data,ks3,DES_ENCRYPT); -+ l=data[0]; -+ r=data[1]; -+ FP(r,l); -+ data[0]=l; -+ data[1]=r; -+ } -+ -+void des_decrypt3(data,ks1,ks2,ks3) -+DES_LONG *data; -+des_key_schedule ks1; -+des_key_schedule ks2; -+des_key_schedule ks3; -+ { -+ register DES_LONG l,r; -+ -+ l=data[0]; -+ r=data[1]; -+ IP(l,r); -+ data[0]=l; -+ data[1]=r; -+ des_encrypt2((DES_LONG *)data,ks3,DES_DECRYPT); -+ des_encrypt2((DES_LONG *)data,ks2,DES_ENCRYPT); -+ des_encrypt2((DES_LONG *)data,ks1,DES_DECRYPT); -+ l=data[0]; -+ r=data[1]; -+ FP(r,l); -+ data[0]=l; -+ data[1]=r; -+ } -+ -+#ifndef DES_DEFAULT_OPTIONS -+ -+void des_ncbc_encrypt(input, output, length, schedule, ivec, enc) -+des_cblock (*input); -+des_cblock (*output); -+long length; -+des_key_schedule schedule; -+des_cblock (*ivec); -+int enc; -+ { -+ register DES_LONG tin0,tin1; -+ register DES_LONG tout0,tout1,xor0,xor1; -+ register unsigned char *in,*out; -+ register long l=length; -+ DES_LONG tin[2]; -+ unsigned char *iv; -+ -+#ifdef OCF_ASSIST -+ if (ocf_des_assist() & OCF_PROVIDES_DES_3DES) { -+ ocf_des_ncbc_encrypt(input, output, length, schedule, ivec, enc); -+ return; -+ } -+#endif -+ -+ in=(unsigned char *)input; -+ out=(unsigned char *)output; -+ iv=(unsigned char *)ivec; -+ -+ if (enc) -+ { -+ c2l(iv,tout0); -+ c2l(iv,tout1); -+ for (l-=8; l>=0; 
l-=8) -+ { -+ c2l(in,tin0); -+ c2l(in,tin1); -+ tin0^=tout0; tin[0]=tin0; -+ tin1^=tout1; tin[1]=tin1; -+ des_encrypt((DES_LONG *)tin,schedule,DES_ENCRYPT); -+ tout0=tin[0]; l2c(tout0,out); -+ tout1=tin[1]; l2c(tout1,out); -+ } -+ if (l != -8) -+ { -+ c2ln(in,tin0,tin1,l+8); -+ tin0^=tout0; tin[0]=tin0; -+ tin1^=tout1; tin[1]=tin1; -+ des_encrypt((DES_LONG *)tin,schedule,DES_ENCRYPT); -+ tout0=tin[0]; l2c(tout0,out); -+ tout1=tin[1]; l2c(tout1,out); -+ } -+ iv=(unsigned char *)ivec; -+ l2c(tout0,iv); -+ l2c(tout1,iv); -+ } -+ else -+ { -+ c2l(iv,xor0); -+ c2l(iv,xor1); -+ for (l-=8; l>=0; l-=8) -+ { -+ c2l(in,tin0); tin[0]=tin0; -+ c2l(in,tin1); tin[1]=tin1; -+ des_encrypt((DES_LONG *)tin,schedule,DES_DECRYPT); -+ tout0=tin[0]^xor0; -+ tout1=tin[1]^xor1; -+ l2c(tout0,out); -+ l2c(tout1,out); -+ xor0=tin0; -+ xor1=tin1; -+ } -+ if (l != -8) -+ { -+ c2l(in,tin0); tin[0]=tin0; -+ c2l(in,tin1); tin[1]=tin1; -+ des_encrypt((DES_LONG *)tin,schedule,DES_DECRYPT); -+ tout0=tin[0]^xor0; -+ tout1=tin[1]^xor1; -+ l2cn(tout0,tout1,out,l+8); -+ xor0=tin0; -+ xor1=tin1; -+ } -+ -+ iv=(unsigned char *)ivec; -+ l2c(xor0,iv); -+ l2c(xor1,iv); -+ } -+ tin0=tin1=tout0=tout1=xor0=xor1=0; -+ tin[0]=tin[1]=0; -+ } -+ -+void des_ede3_cbc_encrypt(input, output, length, ks1, ks2, ks3, ivec, enc) -+des_cblock (*input); -+des_cblock (*output); -+long length; -+des_key_schedule ks1; -+des_key_schedule ks2; -+des_key_schedule ks3; -+des_cblock (*ivec); -+int enc; -+ { -+ register DES_LONG tin0,tin1; -+ register DES_LONG tout0,tout1,xor0,xor1; -+ register unsigned char *in,*out; -+ register long l=length; -+ DES_LONG tin[2]; -+ unsigned char *iv; -+ -+#ifdef OCF_ASSIST -+ if (ocf_des_assist() & OCF_PROVIDES_DES_3DES) { -+ ocf_des_ede3_cbc_encrypt(input,output,length,ks1,ks2,ks3,ivec,enc); -+ return; -+ } -+#endif -+ -+ -+ in=(unsigned char *)input; -+ out=(unsigned char *)output; -+ iv=(unsigned char *)ivec; -+ -+ if (enc) -+ { -+ c2l(iv,tout0); -+ c2l(iv,tout1); -+ for (l-=8; l>=0; l-=8) -+ { 
-+ c2l(in,tin0); -+ c2l(in,tin1); -+ tin0^=tout0; -+ tin1^=tout1; -+ -+ tin[0]=tin0; -+ tin[1]=tin1; -+ des_encrypt3((DES_LONG *)tin,ks1,ks2,ks3); -+ tout0=tin[0]; -+ tout1=tin[1]; -+ -+ l2c(tout0,out); -+ l2c(tout1,out); -+ } -+ if (l != -8) -+ { -+ c2ln(in,tin0,tin1,l+8); -+ tin0^=tout0; -+ tin1^=tout1; -+ -+ tin[0]=tin0; -+ tin[1]=tin1; -+ des_encrypt3((DES_LONG *)tin,ks1,ks2,ks3); -+ tout0=tin[0]; -+ tout1=tin[1]; -+ -+ l2c(tout0,out); -+ l2c(tout1,out); -+ } -+ iv=(unsigned char *)ivec; -+ l2c(tout0,iv); -+ l2c(tout1,iv); -+ } -+ else -+ { -+ register DES_LONG t0,t1; -+ -+ c2l(iv,xor0); -+ c2l(iv,xor1); -+ for (l-=8; l>=0; l-=8) -+ { -+ c2l(in,tin0); -+ c2l(in,tin1); -+ -+ t0=tin0; -+ t1=tin1; -+ -+ tin[0]=tin0; -+ tin[1]=tin1; -+ des_decrypt3((DES_LONG *)tin,ks1,ks2,ks3); -+ tout0=tin[0]; -+ tout1=tin[1]; -+ -+ tout0^=xor0; -+ tout1^=xor1; -+ l2c(tout0,out); -+ l2c(tout1,out); -+ xor0=t0; -+ xor1=t1; -+ } -+ if (l != -8) -+ { -+ c2l(in,tin0); -+ c2l(in,tin1); -+ -+ t0=tin0; -+ t1=tin1; -+ -+ tin[0]=tin0; -+ tin[1]=tin1; -+ des_decrypt3((DES_LONG *)tin,ks1,ks2,ks3); -+ tout0=tin[0]; -+ tout1=tin[1]; -+ -+ tout0^=xor0; -+ tout1^=xor1; -+ l2cn(tout0,tout1,out,l+8); -+ xor0=t0; -+ xor1=t1; -+ } -+ -+ iv=(unsigned char *)ivec; -+ l2c(xor0,iv); -+ l2c(xor1,iv); -+ } -+ tin0=tin1=tout0=tout1=xor0=xor1=0; -+ tin[0]=tin[1]=0; -+ } -+ -+#endif /* DES_DEFAULT_OPTIONS */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/des_opts.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,620 @@ -+/* crypto/des/des_opts.c */ -+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) -+ * All rights reserved. -+ * -+ * This package is an SSL implementation written -+ * by Eric Young (eay@cryptsoft.com). -+ * The implementation was written so as to conform with Netscapes SSL. -+ * -+ * This library is free for commercial and non-commercial use as long as -+ * the following conditions are aheared to. 
The following conditions -+ * apply to all code found in this distribution, be it the RC4, RSA, -+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation -+ * included with this distribution is covered by the same copyright terms -+ * except that the holder is Tim Hudson (tjh@cryptsoft.com). -+ * -+ * Copyright remains Eric Young's, and as such any Copyright notices in -+ * the code are not to be removed. -+ * If this package is used in a product, Eric Young should be given attribution -+ * as the author of the parts of the library used. -+ * This can be in the form of a textual message at program startup or -+ * in documentation (online or textual) provided with the package. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. All advertising materials mentioning features or use of this software -+ * must display the following acknowledgement: -+ * "This product includes cryptographic software written by -+ * Eric Young (eay@cryptsoft.com)" -+ * The word 'cryptographic' can be left out if the rouines from the library -+ * being used are not cryptographic related :-). -+ * 4. 
If you include any Windows specific code (or a derivative thereof) from -+ * the apps directory (application code) you must include an acknowledgement: -+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * The licence and distribution terms for any publically available version or -+ * derivative of this code cannot be changed. i.e. this code cannot simply be -+ * copied and put under another distribution licence -+ * [including the GNU Public Licence.] -+ */ -+ -+/* define PART1, PART2, PART3 or PART4 to build only with a few of the options. -+ * This is for machines with 64k code segment size restrictions. */ -+ -+#ifndef MSDOS -+#define TIMES -+#endif -+ -+#include -+#ifndef MSDOS -+#include -+#else -+#include -+extern void exit(); -+#endif -+#include -+#ifndef VMS -+#ifndef _IRIX -+#include -+#endif -+#ifdef TIMES -+#include -+#include -+#endif -+#else /* VMS */ -+#include -+struct tms { -+ time_t tms_utime; -+ time_t tms_stime; -+ time_t tms_uchild; /* I dunno... 
*/ -+ time_t tms_uchildsys; /* so these names are a guess :-) */ -+ } -+#endif -+#ifndef TIMES -+#include -+#endif -+ -+#ifdef sun -+#include -+#include -+#endif -+ -+#include "des/des_locl.h" -+#include "des/spr.h" -+ -+#define DES_DEFAULT_OPTIONS -+ -+#if !defined(PART1) && !defined(PART2) && !defined(PART3) && !defined(PART4) -+#define PART1 -+#define PART2 -+#define PART3 -+#define PART4 -+#endif -+ -+#ifdef PART1 -+ -+#undef DES_UNROLL -+#undef DES_RISC1 -+#undef DES_RISC2 -+#undef DES_PTR -+#undef D_ENCRYPT -+#define des_encrypt des_encrypt_u4_cisc_idx -+#define des_encrypt2 des_encrypt2_u4_cisc_idx -+#define des_encrypt3 des_encrypt3_u4_cisc_idx -+#define des_decrypt3 des_decrypt3_u4_cisc_idx -+#undef HEADER_DES_LOCL_H -+#include "des_enc.c" -+ -+#define DES_UNROLL -+#undef DES_RISC1 -+#undef DES_RISC2 -+#undef DES_PTR -+#undef D_ENCRYPT -+#undef des_encrypt -+#undef des_encrypt2 -+#undef des_encrypt3 -+#undef des_decrypt3 -+#define des_encrypt des_encrypt_u16_cisc_idx -+#define des_encrypt2 des_encrypt2_u16_cisc_idx -+#define des_encrypt3 des_encrypt3_u16_cisc_idx -+#define des_decrypt3 des_decrypt3_u16_cisc_idx -+#undef HEADER_DES_LOCL_H -+#include "des_enc.c" -+ -+#undef DES_UNROLL -+#define DES_RISC1 -+#undef DES_RISC2 -+#undef DES_PTR -+#undef D_ENCRYPT -+#undef des_encrypt -+#undef des_encrypt2 -+#undef des_encrypt3 -+#undef des_decrypt3 -+#define des_encrypt des_encrypt_u4_risc1_idx -+#define des_encrypt2 des_encrypt2_u4_risc1_idx -+#define des_encrypt3 des_encrypt3_u4_risc1_idx -+#define des_decrypt3 des_decrypt3_u4_risc1_idx -+#undef HEADER_DES_LOCL_H -+#include "des_enc.c" -+ -+#endif -+ -+#ifdef PART2 -+ -+#undef DES_UNROLL -+#undef DES_RISC1 -+#define DES_RISC2 -+#undef DES_PTR -+#undef D_ENCRYPT -+#undef des_encrypt -+#undef des_encrypt2 -+#undef des_encrypt3 -+#undef des_decrypt3 -+#define des_encrypt des_encrypt_u4_risc2_idx -+#define des_encrypt2 des_encrypt2_u4_risc2_idx -+#define des_encrypt3 des_encrypt3_u4_risc2_idx -+#define des_decrypt3 
des_decrypt3_u4_risc2_idx -+#undef HEADER_DES_LOCL_H -+#include "des_enc.c" -+ -+#define DES_UNROLL -+#define DES_RISC1 -+#undef DES_RISC2 -+#undef DES_PTR -+#undef D_ENCRYPT -+#undef des_encrypt -+#undef des_encrypt2 -+#undef des_encrypt3 -+#undef des_decrypt3 -+#define des_encrypt des_encrypt_u16_risc1_idx -+#define des_encrypt2 des_encrypt2_u16_risc1_idx -+#define des_encrypt3 des_encrypt3_u16_risc1_idx -+#define des_decrypt3 des_decrypt3_u16_risc1_idx -+#undef HEADER_DES_LOCL_H -+#include "des_enc.c" -+ -+#define DES_UNROLL -+#undef DES_RISC1 -+#define DES_RISC2 -+#undef DES_PTR -+#undef D_ENCRYPT -+#undef des_encrypt -+#undef des_encrypt2 -+#undef des_encrypt3 -+#undef des_decrypt3 -+#define des_encrypt des_encrypt_u16_risc2_idx -+#define des_encrypt2 des_encrypt2_u16_risc2_idx -+#define des_encrypt3 des_encrypt3_u16_risc2_idx -+#define des_decrypt3 des_decrypt3_u16_risc2_idx -+#undef HEADER_DES_LOCL_H -+#include "des_enc.c" -+ -+#endif -+ -+#ifdef PART3 -+ -+#undef DES_UNROLL -+#undef DES_RISC1 -+#undef DES_RISC2 -+#define DES_PTR -+#undef D_ENCRYPT -+#undef des_encrypt -+#undef des_encrypt2 -+#undef des_encrypt3 -+#undef des_decrypt3 -+#define des_encrypt des_encrypt_u4_cisc_ptr -+#define des_encrypt2 des_encrypt2_u4_cisc_ptr -+#define des_encrypt3 des_encrypt3_u4_cisc_ptr -+#define des_decrypt3 des_decrypt3_u4_cisc_ptr -+#undef HEADER_DES_LOCL_H -+#include "des_enc.c" -+ -+#define DES_UNROLL -+#undef DES_RISC1 -+#undef DES_RISC2 -+#define DES_PTR -+#undef D_ENCRYPT -+#undef des_encrypt -+#undef des_encrypt2 -+#undef des_encrypt3 -+#undef des_decrypt3 -+#define des_encrypt des_encrypt_u16_cisc_ptr -+#define des_encrypt2 des_encrypt2_u16_cisc_ptr -+#define des_encrypt3 des_encrypt3_u16_cisc_ptr -+#define des_decrypt3 des_decrypt3_u16_cisc_ptr -+#undef HEADER_DES_LOCL_H -+#include "des_enc.c" -+ -+#undef DES_UNROLL -+#define DES_RISC1 -+#undef DES_RISC2 -+#define DES_PTR -+#undef D_ENCRYPT -+#undef des_encrypt -+#undef des_encrypt2 -+#undef des_encrypt3 
-+#undef des_decrypt3 -+#define des_encrypt des_encrypt_u4_risc1_ptr -+#define des_encrypt2 des_encrypt2_u4_risc1_ptr -+#define des_encrypt3 des_encrypt3_u4_risc1_ptr -+#define des_decrypt3 des_decrypt3_u4_risc1_ptr -+#undef HEADER_DES_LOCL_H -+#include "des_enc.c" -+ -+#endif -+ -+#ifdef PART4 -+ -+#undef DES_UNROLL -+#undef DES_RISC1 -+#define DES_RISC2 -+#define DES_PTR -+#undef D_ENCRYPT -+#undef des_encrypt -+#undef des_encrypt2 -+#undef des_encrypt3 -+#undef des_decrypt3 -+#define des_encrypt des_encrypt_u4_risc2_ptr -+#define des_encrypt2 des_encrypt2_u4_risc2_ptr -+#define des_encrypt3 des_encrypt3_u4_risc2_ptr -+#define des_decrypt3 des_decrypt3_u4_risc2_ptr -+#undef HEADER_DES_LOCL_H -+#include "des_enc.c" -+ -+#define DES_UNROLL -+#define DES_RISC1 -+#undef DES_RISC2 -+#define DES_PTR -+#undef D_ENCRYPT -+#undef des_encrypt -+#undef des_encrypt2 -+#undef des_encrypt3 -+#undef des_decrypt3 -+#define des_encrypt des_encrypt_u16_risc1_ptr -+#define des_encrypt2 des_encrypt2_u16_risc1_ptr -+#define des_encrypt3 des_encrypt3_u16_risc1_ptr -+#define des_decrypt3 des_decrypt3_u16_risc1_ptr -+#undef HEADER_DES_LOCL_H -+#include "des_enc.c" -+ -+#define DES_UNROLL -+#undef DES_RISC1 -+#define DES_RISC2 -+#define DES_PTR -+#undef D_ENCRYPT -+#undef des_encrypt -+#undef des_encrypt2 -+#undef des_encrypt3 -+#undef des_decrypt3 -+#define des_encrypt des_encrypt_u16_risc2_ptr -+#define des_encrypt2 des_encrypt2_u16_risc2_ptr -+#define des_encrypt3 des_encrypt3_u16_risc2_ptr -+#define des_decrypt3 des_decrypt3_u16_risc2_ptr -+#undef HEADER_DES_LOCL_H -+#include "des_enc.c" -+ -+#endif -+ -+/* The following if from times(3) man page. 
It may need to be changed */ -+#ifndef HZ -+# ifndef CLK_TCK -+# ifndef _BSD_CLK_TCK_ /* FreeBSD fix */ -+# ifndef VMS -+# define HZ 100.0 -+# else /* VMS */ -+# define HZ 100.0 -+# endif -+# else /* _BSD_CLK_TCK_ */ -+# define HZ ((double)_BSD_CLK_TCK_) -+# endif -+# else /* CLK_TCK */ -+# define HZ ((double)CLK_TCK) -+# endif -+#endif -+ -+#define BUFSIZE ((long)1024) -+long run=0; -+ -+#ifndef NOPROTO -+double Time_F(int s); -+#else -+double Time_F(); -+#endif -+ -+#ifdef SIGALRM -+#if defined(__STDC__) || defined(sgi) -+#define SIGRETTYPE void -+#else -+#define SIGRETTYPE int -+#endif -+ -+#ifndef NOPROTO -+SIGRETTYPE sig_done(int sig); -+#else -+SIGRETTYPE sig_done(); -+#endif -+ -+SIGRETTYPE sig_done(sig) -+int sig; -+ { -+ signal(SIGALRM,sig_done); -+ run=0; -+#ifdef LINT -+ sig=sig; -+#endif -+ } -+#endif -+ -+#define START 0 -+#define STOP 1 -+ -+double Time_F(s) -+int s; -+ { -+ double ret; -+#ifdef TIMES -+ static struct tms tstart,tend; -+ -+ if (s == START) -+ { -+ times(&tstart); -+ return(0); -+ } -+ else -+ { -+ times(&tend); -+ ret=((double)(tend.tms_utime-tstart.tms_utime))/HZ; -+ return((ret == 0.0)?1e-6:ret); -+ } -+#else /* !times() */ -+ static struct timeb tstart,tend; -+ long i; -+ -+ if (s == START) -+ { -+ ftime(&tstart); -+ return(0); -+ } -+ else -+ { -+ ftime(&tend); -+ i=(long)tend.millitm-(long)tstart.millitm; -+ ret=((double)(tend.time-tstart.time))+((double)i)/1000.0; -+ return((ret == 0.0)?1e-6:ret); -+ } -+#endif -+ } -+ -+#ifdef SIGALRM -+#define print_name(name) fprintf(stderr,"Doing %s's for 10 seconds\n",name); alarm(10); -+#else -+#define print_name(name) fprintf(stderr,"Doing %s %ld times\n",name,cb); -+#endif -+ -+#define time_it(func,name,index) \ -+ print_name(name); \ -+ Time_F(START); \ -+ for (count=0,run=1; COND(cb); count++) \ -+ { \ -+ unsigned long d[2]; \ -+ func(d,&(sch[0]),DES_ENCRYPT); \ -+ } \ -+ tm[index]=Time_F(STOP); \ -+ fprintf(stderr,"%ld %s's in %.2f second\n",count,name,tm[index]); \ -+ 
tm[index]=((double)COUNT(cb))/tm[index]; -+ -+#define print_it(name,index) \ -+ fprintf(stderr,"%s bytes per sec = %12.2f (%5.1fuS)\n",name, \ -+ tm[index]*8,1.0e6/tm[index]); -+ -+int main(argc,argv) -+int argc; -+char **argv; -+ { -+ long count; -+ static unsigned char buf[BUFSIZE]; -+ static des_cblock key ={0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0}; -+ static des_cblock key2={0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0x12}; -+ static des_cblock key3={0x56,0x78,0x9a,0xbc,0xde,0xf0,0x12,0x34}; -+ des_key_schedule sch,sch2,sch3; -+ double d,tm[16],max=0; -+ int rank[16]; -+ char *str[16]; -+ int max_idx=0,i,num=0,j; -+#ifndef SIGALARM -+ long ca,cb,cc,cd,ce; -+#endif -+ -+ for (i=0; i<12; i++) -+ { -+ tm[i]=0.0; -+ rank[i]=0; -+ } -+ -+#ifndef TIMES -+ fprintf(stderr,"To get the most acurate results, try to run this\n"); -+ fprintf(stderr,"program when this computer is idle.\n"); -+#endif -+ -+ des_set_key((C_Block *)key,sch); -+ des_set_key((C_Block *)key2,sch2); -+ des_set_key((C_Block *)key3,sch3); -+ -+#ifndef SIGALRM -+ fprintf(stderr,"First we calculate the approximate speed ...\n"); -+ des_set_key((C_Block *)key,sch); -+ count=10; -+ do { -+ long i; -+ unsigned long data[2]; -+ -+ count*=2; -+ Time_F(START); -+ for (i=count; i; i--) -+ des_encrypt(data,&(sch[0]),DES_ENCRYPT); -+ d=Time_F(STOP); -+ } while (d < 3.0); -+ ca=count; -+ cb=count*3; -+ cc=count*3*8/BUFSIZE+1; -+ cd=count*8/BUFSIZE+1; -+ -+ ce=count/20+1; -+#define COND(d) (count != (d)) -+#define COUNT(d) (d) -+#else -+#define COND(c) (run) -+#define COUNT(d) (count) -+ signal(SIGALRM,sig_done); -+ alarm(10); -+#endif -+ -+#ifdef PART1 -+ time_it(des_encrypt_u4_cisc_idx, "des_encrypt_u4_cisc_idx ", 0); -+ time_it(des_encrypt_u16_cisc_idx, "des_encrypt_u16_cisc_idx ", 1); -+ time_it(des_encrypt_u4_risc1_idx, "des_encrypt_u4_risc1_idx ", 2); -+ num+=3; -+#endif -+#ifdef PART2 -+ time_it(des_encrypt_u16_risc1_idx,"des_encrypt_u16_risc1_idx", 3); -+ time_it(des_encrypt_u4_risc2_idx, "des_encrypt_u4_risc2_idx 
", 4); -+ time_it(des_encrypt_u16_risc2_idx,"des_encrypt_u16_risc2_idx", 5); -+ num+=3; -+#endif -+#ifdef PART3 -+ time_it(des_encrypt_u4_cisc_ptr, "des_encrypt_u4_cisc_ptr ", 6); -+ time_it(des_encrypt_u16_cisc_ptr, "des_encrypt_u16_cisc_ptr ", 7); -+ time_it(des_encrypt_u4_risc1_ptr, "des_encrypt_u4_risc1_ptr ", 8); -+ num+=3; -+#endif -+#ifdef PART4 -+ time_it(des_encrypt_u16_risc1_ptr,"des_encrypt_u16_risc1_ptr", 9); -+ time_it(des_encrypt_u4_risc2_ptr, "des_encrypt_u4_risc2_ptr ",10); -+ time_it(des_encrypt_u16_risc2_ptr,"des_encrypt_u16_risc2_ptr",11); -+ num+=3; -+#endif -+ -+#ifdef PART1 -+ str[0]=" 4 c i"; -+ print_it("des_encrypt_u4_cisc_idx ",0); -+ max=tm[0]; -+ max_idx=0; -+ str[1]="16 c i"; -+ print_it("des_encrypt_u16_cisc_idx ",1); -+ if (max < tm[1]) { max=tm[1]; max_idx=1; } -+ str[2]=" 4 r1 i"; -+ print_it("des_encrypt_u4_risc1_idx ",2); -+ if (max < tm[2]) { max=tm[2]; max_idx=2; } -+#endif -+#ifdef PART2 -+ str[3]="16 r1 i"; -+ print_it("des_encrypt_u16_risc1_idx",3); -+ if (max < tm[3]) { max=tm[3]; max_idx=3; } -+ str[4]=" 4 r2 i"; -+ print_it("des_encrypt_u4_risc2_idx ",4); -+ if (max < tm[4]) { max=tm[4]; max_idx=4; } -+ str[5]="16 r2 i"; -+ print_it("des_encrypt_u16_risc2_idx",5); -+ if (max < tm[5]) { max=tm[5]; max_idx=5; } -+#endif -+#ifdef PART3 -+ str[6]=" 4 c p"; -+ print_it("des_encrypt_u4_cisc_ptr ",6); -+ if (max < tm[6]) { max=tm[6]; max_idx=6; } -+ str[7]="16 c p"; -+ print_it("des_encrypt_u16_cisc_ptr ",7); -+ if (max < tm[7]) { max=tm[7]; max_idx=7; } -+ str[8]=" 4 r1 p"; -+ print_it("des_encrypt_u4_risc1_ptr ",8); -+ if (max < tm[8]) { max=tm[8]; max_idx=8; } -+#endif -+#ifdef PART4 -+ str[9]="16 r1 p"; -+ print_it("des_encrypt_u16_risc1_ptr",9); -+ if (max < tm[9]) { max=tm[9]; max_idx=9; } -+ str[10]=" 4 r2 p"; -+ print_it("des_encrypt_u4_risc2_ptr ",10); -+ if (max < tm[10]) { max=tm[10]; max_idx=10; } -+ str[11]="16 r2 p"; -+ print_it("des_encrypt_u16_risc2_ptr",11); -+ if (max < tm[11]) { max=tm[11]; max_idx=11; } 
-+#endif -+ printf("options des ecb/s\n"); -+ printf("%s %12.2f 100.0%%\n",str[max_idx],tm[max_idx]); -+ d=tm[max_idx]; -+ tm[max_idx]= -2.0; -+ max= -1.0; -+ for (;;) -+ { -+ for (i=0; i<12; i++) -+ { -+ if (max < tm[i]) { max=tm[i]; j=i; } -+ } -+ if (max < 0.0) break; -+ printf("%s %12.2f %4.1f%%\n",str[j],tm[j],tm[j]/d*100.0); -+ tm[j]= -2.0; -+ max= -1.0; -+ } -+ -+ switch (max_idx) -+ { -+ case 0: -+ printf("-DDES_DEFAULT_OPTIONS\n"); -+ break; -+ case 1: -+ printf("-DDES_UNROLL\n"); -+ break; -+ case 2: -+ printf("-DDES_RISC1\n"); -+ break; -+ case 3: -+ printf("-DDES_UNROLL -DDES_RISC1\n"); -+ break; -+ case 4: -+ printf("-DDES_RISC2\n"); -+ break; -+ case 5: -+ printf("-DDES_UNROLL -DDES_RISC2\n"); -+ break; -+ case 6: -+ printf("-DDES_PTR\n"); -+ break; -+ case 7: -+ printf("-DDES_UNROLL -DDES_PTR\n"); -+ break; -+ case 8: -+ printf("-DDES_RISC1 -DDES_PTR\n"); -+ break; -+ case 9: -+ printf("-DDES_UNROLL -DDES_RISC1 -DDES_PTR\n"); -+ break; -+ case 10: -+ printf("-DDES_RISC2 -DDES_PTR\n"); -+ break; -+ case 11: -+ printf("-DDES_UNROLL -DDES_RISC2 -DDES_PTR\n"); -+ break; -+ } -+ exit(0); -+#if defined(LINT) || defined(MSDOS) -+ return(0); -+#endif -+ } ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/dx86unix.S Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,3160 @@ -+/* -+ * This file was originally generated by Michael Richardson -+ * via the perl scripts found in the ASM subdir. It remains copyright of -+ * Eric Young, see the file COPYRIGHT. -+ * -+ * This was last done on October 9, 2002. -+ * -+ * While this file does not need to go through cpp, we pass it through -+ * CPP by naming it dx86unix.S instead of dx86unix.s because there is -+ * a bug in Rules.make for .s builds - specifically it references EXTRA_CFLAGS -+ * which may contain stuff that AS doesn't understand instead of -+ * referencing EXTRA_AFLAGS. 
-+ */ -+ -+ .file "dx86unix.S" -+ .version "01.01" -+.text -+ .align 16 -+.globl des_encrypt -+ .type des_encrypt , @function -+des_encrypt: -+ pushl %esi -+ pushl %edi -+ -+ -+ movl 12(%esp), %esi -+ xorl %ecx, %ecx -+ pushl %ebx -+ pushl %ebp -+ movl (%esi), %eax -+ movl 28(%esp), %ebx -+ movl 4(%esi), %edi -+ -+ -+ roll $4, %eax -+ movl %eax, %esi -+ xorl %edi, %eax -+ andl $0xf0f0f0f0, %eax -+ xorl %eax, %esi -+ xorl %eax, %edi -+ -+ roll $20, %edi -+ movl %edi, %eax -+ xorl %esi, %edi -+ andl $0xfff0000f, %edi -+ xorl %edi, %eax -+ xorl %edi, %esi -+ -+ roll $14, %eax -+ movl %eax, %edi -+ xorl %esi, %eax -+ andl $0x33333333, %eax -+ xorl %eax, %edi -+ xorl %eax, %esi -+ -+ roll $22, %esi -+ movl %esi, %eax -+ xorl %edi, %esi -+ andl $0x03fc03fc, %esi -+ xorl %esi, %eax -+ xorl %esi, %edi -+ -+ roll $9, %eax -+ movl %eax, %esi -+ xorl %edi, %eax -+ andl $0xaaaaaaaa, %eax -+ xorl %eax, %esi -+ xorl %eax, %edi -+ -+.byte 209 -+.byte 199 -+ movl 24(%esp), %ebp -+ cmpl $0, %ebx -+ je .L000start_decrypt -+ -+ -+ movl (%ebp), %eax -+ xorl %ebx, %ebx -+ movl 4(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 8(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 12(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ 
movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 16(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 20(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 24(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 28(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ 
movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 32(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 36(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 40(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 44(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 48(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 
52(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 56(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 60(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 64(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 68(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ 
xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 72(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 76(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 80(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 84(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ 
movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 88(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 92(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 96(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 100(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 104(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 108(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 
0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 112(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 116(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 120(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 124(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ 
xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ jmp .L001end -+.L000start_decrypt: -+ -+ -+ movl 120(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 124(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 112(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 116(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 104(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 108(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, 
%eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 96(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 100(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 88(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 92(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 
0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 80(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 84(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 72(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 76(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ 
-+ movl 64(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 68(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 56(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 60(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 48(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 52(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl 
$16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 40(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 44(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 32(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 36(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 
0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 24(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 28(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 16(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 20(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 8(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 12(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb 
%dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl (%ebp), %eax -+ xorl %ebx, %ebx -+ movl 4(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+.L001end: -+ -+ -+ movl 20(%esp), %edx -+.byte 209 -+.byte 206 -+ movl %edi, %eax -+ xorl %esi, %edi -+ andl $0xaaaaaaaa, %edi -+ xorl %edi, %eax -+ xorl %edi, %esi -+ -+ roll $23, %eax -+ movl %eax, %edi -+ xorl %esi, %eax -+ andl $0x03fc03fc, %eax -+ xorl %eax, %edi -+ xorl %eax, %esi -+ -+ roll $10, %edi -+ movl %edi, %eax -+ xorl %esi, %edi -+ andl $0x33333333, %edi -+ xorl %edi, %eax -+ xorl %edi, %esi -+ -+ roll $18, %esi -+ movl %esi, %edi -+ xorl %eax, %esi -+ andl $0xfff0000f, %esi -+ xorl %esi, %edi -+ xorl %esi, %eax -+ -+ roll $12, %edi -+ movl %edi, %esi -+ xorl %eax, %edi -+ andl 
$0xf0f0f0f0, %edi -+ xorl %edi, %esi -+ xorl %edi, %eax -+ -+ rorl $4, %eax -+ movl %eax, (%edx) -+ movl %esi, 4(%edx) -+ popl %ebp -+ popl %ebx -+ popl %edi -+ popl %esi -+ ret -+.des_encrypt_end: -+ .size des_encrypt , .des_encrypt_end-des_encrypt -+.ident "desasm.pl" -+.text -+ .align 16 -+.globl des_encrypt2 -+ .type des_encrypt2 , @function -+des_encrypt2: -+ pushl %esi -+ pushl %edi -+ -+ -+ movl 12(%esp), %eax -+ xorl %ecx, %ecx -+ pushl %ebx -+ pushl %ebp -+ movl (%eax), %esi -+ movl 28(%esp), %ebx -+ roll $3, %esi -+ movl 4(%eax), %edi -+ roll $3, %edi -+ movl 24(%esp), %ebp -+ cmpl $0, %ebx -+ je .L002start_decrypt -+ -+ -+ movl (%ebp), %eax -+ xorl %ebx, %ebx -+ movl 4(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 8(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 12(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb 
%dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 16(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 20(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 24(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 28(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 32(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 36(%ebp), %edx -+ xorl %esi, 
%eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 40(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 44(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 48(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 52(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl 
-+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 56(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 60(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 64(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 68(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 
0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 72(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 76(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 80(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 84(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 88(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 92(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 
0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 96(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 100(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 104(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 108(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl 
%ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 112(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 116(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 120(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 124(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ jmp .L003end -+.L002start_decrypt: -+ -+ -+ movl 120(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 124(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax 
-+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 112(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 116(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 104(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 108(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 
0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 96(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 100(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 88(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 92(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ 
-+ -+ movl 80(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 84(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 72(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 76(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 64(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 68(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl 
$16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 56(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 60(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 48(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 52(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 
0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 40(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 44(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 32(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 36(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 24(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 28(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb 
%dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl 16(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 20(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+ -+ -+ movl 8(%ebp), %eax -+ xorl %ebx, %ebx -+ movl 12(%ebp), %edx -+ xorl %esi, %eax -+ xorl %esi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %edi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %edi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %edi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 
0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %edi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %edi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %edi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %edi -+ -+ -+ movl (%ebp), %eax -+ xorl %ebx, %ebx -+ movl 4(%ebp), %edx -+ xorl %edi, %eax -+ xorl %edi, %edx -+ andl $0xfcfcfcfc, %eax -+ andl $0xcfcfcfcf, %edx -+ movb %al, %bl -+ movb %ah, %cl -+ rorl $4, %edx -+ movl des_SPtrans(%ebx),%ebp -+ movb %dl, %bl -+ xorl %ebp, %esi -+ movl 0x200+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movb %dh, %cl -+ shrl $16, %eax -+ movl 0x100+des_SPtrans(%ebx),%ebp -+ xorl %ebp, %esi -+ movb %ah, %bl -+ shrl $16, %edx -+ movl 0x300+des_SPtrans(%ecx),%ebp -+ xorl %ebp, %esi -+ movl 24(%esp), %ebp -+ movb %dh, %cl -+ andl $0xff, %eax -+ andl $0xff, %edx -+ movl 0x600+des_SPtrans(%ebx),%ebx -+ xorl %ebx, %esi -+ movl 0x700+des_SPtrans(%ecx),%ebx -+ xorl %ebx, %esi -+ movl 0x400+des_SPtrans(%eax),%ebx -+ xorl %ebx, %esi -+ movl 0x500+des_SPtrans(%edx),%ebx -+ xorl %ebx, %esi -+.L003end: -+ -+ -+ rorl $3, %edi -+ movl 20(%esp), %eax -+ rorl $3, %esi -+ movl %edi, (%eax) -+ movl %esi, 4(%eax) -+ popl %ebp -+ popl %ebx -+ popl %edi -+ popl %esi -+ ret -+.des_encrypt2_end: -+ .size des_encrypt2 , .des_encrypt2_end-des_encrypt2 -+.ident "desasm.pl" -+.text -+ .align 16 -+.globl des_encrypt3 -+ .type des_encrypt3 , @function -+des_encrypt3: -+ pushl %ebx -+ movl 8(%esp), %ebx -+ pushl %ebp -+ pushl %esi -+ pushl %edi -+ -+ -+ movl (%ebx), %edi -+ movl 4(%ebx), %esi -+ subl $12, %esp -+ -+ -+ roll $4, %edi -+ movl %edi, %edx -+ xorl %esi, %edi -+ andl $0xf0f0f0f0, %edi -+ xorl %edi, %edx -+ xorl %edi, %esi -+ -+ roll $20, %esi -+ movl %esi, %edi -+ xorl %edx, %esi -+ andl $0xfff0000f, %esi -+ xorl %esi, %edi -+ xorl %esi, %edx -+ -+ roll $14, %edi -+ movl %edi, %esi -+ xorl %edx, %edi -+ andl $0x33333333, %edi -+ xorl %edi, %esi -+ xorl %edi, %edx -+ -+ roll $22, %edx -+ movl %edx, %edi -+ xorl %esi, %edx -+ andl $0x03fc03fc, 
%edx -+ xorl %edx, %edi -+ xorl %edx, %esi -+ -+ roll $9, %edi -+ movl %edi, %edx -+ xorl %esi, %edi -+ andl $0xaaaaaaaa, %edi -+ xorl %edi, %edx -+ xorl %edi, %esi -+ -+ rorl $3, %edx -+ rorl $2, %esi -+ movl %esi, 4(%ebx) -+ movl 36(%esp), %eax -+ movl %edx, (%ebx) -+ movl 40(%esp), %edi -+ movl 44(%esp), %esi -+ movl $1, 8(%esp) -+ movl %eax, 4(%esp) -+ movl %ebx, (%esp) -+ call des_encrypt2 -+ movl $0, 8(%esp) -+ movl %edi, 4(%esp) -+ movl %ebx, (%esp) -+ call des_encrypt2 -+ movl $1, 8(%esp) -+ movl %esi, 4(%esp) -+ movl %ebx, (%esp) -+ call des_encrypt2 -+ addl $12, %esp -+ movl (%ebx), %edi -+ movl 4(%ebx), %esi -+ -+ -+ roll $2, %esi -+ roll $3, %edi -+ movl %edi, %eax -+ xorl %esi, %edi -+ andl $0xaaaaaaaa, %edi -+ xorl %edi, %eax -+ xorl %edi, %esi -+ -+ roll $23, %eax -+ movl %eax, %edi -+ xorl %esi, %eax -+ andl $0x03fc03fc, %eax -+ xorl %eax, %edi -+ xorl %eax, %esi -+ -+ roll $10, %edi -+ movl %edi, %eax -+ xorl %esi, %edi -+ andl $0x33333333, %edi -+ xorl %edi, %eax -+ xorl %edi, %esi -+ -+ roll $18, %esi -+ movl %esi, %edi -+ xorl %eax, %esi -+ andl $0xfff0000f, %esi -+ xorl %esi, %edi -+ xorl %esi, %eax -+ -+ roll $12, %edi -+ movl %edi, %esi -+ xorl %eax, %edi -+ andl $0xf0f0f0f0, %edi -+ xorl %edi, %esi -+ xorl %edi, %eax -+ -+ rorl $4, %eax -+ movl %eax, (%ebx) -+ movl %esi, 4(%ebx) -+ popl %edi -+ popl %esi -+ popl %ebp -+ popl %ebx -+ ret -+.des_encrypt3_end: -+ .size des_encrypt3 , .des_encrypt3_end-des_encrypt3 -+.ident "desasm.pl" -+.text -+ .align 16 -+.globl des_decrypt3 -+ .type des_decrypt3 , @function -+des_decrypt3: -+ pushl %ebx -+ movl 8(%esp), %ebx -+ pushl %ebp -+ pushl %esi -+ pushl %edi -+ -+ -+ movl (%ebx), %edi -+ movl 4(%ebx), %esi -+ subl $12, %esp -+ -+ -+ roll $4, %edi -+ movl %edi, %edx -+ xorl %esi, %edi -+ andl $0xf0f0f0f0, %edi -+ xorl %edi, %edx -+ xorl %edi, %esi -+ -+ roll $20, %esi -+ movl %esi, %edi -+ xorl %edx, %esi -+ andl $0xfff0000f, %esi -+ xorl %esi, %edi -+ xorl %esi, %edx -+ -+ roll $14, %edi -+ movl 
%edi, %esi -+ xorl %edx, %edi -+ andl $0x33333333, %edi -+ xorl %edi, %esi -+ xorl %edi, %edx -+ -+ roll $22, %edx -+ movl %edx, %edi -+ xorl %esi, %edx -+ andl $0x03fc03fc, %edx -+ xorl %edx, %edi -+ xorl %edx, %esi -+ -+ roll $9, %edi -+ movl %edi, %edx -+ xorl %esi, %edi -+ andl $0xaaaaaaaa, %edi -+ xorl %edi, %edx -+ xorl %edi, %esi -+ -+ rorl $3, %edx -+ rorl $2, %esi -+ movl %esi, 4(%ebx) -+ movl 36(%esp), %esi -+ movl %edx, (%ebx) -+ movl 40(%esp), %edi -+ movl 44(%esp), %eax -+ movl $0, 8(%esp) -+ movl %eax, 4(%esp) -+ movl %ebx, (%esp) -+ call des_encrypt2 -+ movl $1, 8(%esp) -+ movl %edi, 4(%esp) -+ movl %ebx, (%esp) -+ call des_encrypt2 -+ movl $0, 8(%esp) -+ movl %esi, 4(%esp) -+ movl %ebx, (%esp) -+ call des_encrypt2 -+ addl $12, %esp -+ movl (%ebx), %edi -+ movl 4(%ebx), %esi -+ -+ -+ roll $2, %esi -+ roll $3, %edi -+ movl %edi, %eax -+ xorl %esi, %edi -+ andl $0xaaaaaaaa, %edi -+ xorl %edi, %eax -+ xorl %edi, %esi -+ -+ roll $23, %eax -+ movl %eax, %edi -+ xorl %esi, %eax -+ andl $0x03fc03fc, %eax -+ xorl %eax, %edi -+ xorl %eax, %esi -+ -+ roll $10, %edi -+ movl %edi, %eax -+ xorl %esi, %edi -+ andl $0x33333333, %edi -+ xorl %edi, %eax -+ xorl %edi, %esi -+ -+ roll $18, %esi -+ movl %esi, %edi -+ xorl %eax, %esi -+ andl $0xfff0000f, %esi -+ xorl %esi, %edi -+ xorl %esi, %eax -+ -+ roll $12, %edi -+ movl %edi, %esi -+ xorl %eax, %edi -+ andl $0xf0f0f0f0, %edi -+ xorl %edi, %esi -+ xorl %edi, %eax -+ -+ rorl $4, %eax -+ movl %eax, (%ebx) -+ movl %esi, 4(%ebx) -+ popl %edi -+ popl %esi -+ popl %ebp -+ popl %ebx -+ ret -+.des_decrypt3_end: -+ .size des_decrypt3 , .des_decrypt3_end-des_decrypt3 -+.ident "desasm.pl" -+.text -+ .align 16 -+.globl des_ncbc_encrypt -+ .type des_ncbc_encrypt , @function -+des_ncbc_encrypt: -+ -+ pushl %ebp -+ pushl %ebx -+ pushl %esi -+ pushl %edi -+ movl 28(%esp), %ebp -+ -+ movl 36(%esp), %ebx -+ movl (%ebx), %esi -+ movl 4(%ebx), %edi -+ pushl %edi -+ pushl %esi -+ pushl %edi -+ pushl %esi -+ movl %esp, %ebx -+ movl 
36(%esp), %esi -+ movl 40(%esp), %edi -+ -+ movl 56(%esp), %ecx -+ -+ pushl %ecx -+ -+ movl 52(%esp), %eax -+ pushl %eax -+ pushl %ebx -+ cmpl $0, %ecx -+ jz .L004decrypt -+ andl $4294967288, %ebp -+ movl 12(%esp), %eax -+ movl 16(%esp), %ebx -+ jz .L005encrypt_finish -+.L006encrypt_loop: -+ movl (%esi), %ecx -+ movl 4(%esi), %edx -+ xorl %ecx, %eax -+ xorl %edx, %ebx -+ movl %eax, 12(%esp) -+ movl %ebx, 16(%esp) -+ call des_encrypt -+ movl 12(%esp), %eax -+ movl 16(%esp), %ebx -+ movl %eax, (%edi) -+ movl %ebx, 4(%edi) -+ addl $8, %esi -+ addl $8, %edi -+ subl $8, %ebp -+ jnz .L006encrypt_loop -+.L005encrypt_finish: -+ movl 56(%esp), %ebp -+ andl $7, %ebp -+ jz .L007finish -+ xorl %ecx, %ecx -+ xorl %edx, %edx -+ movl .L008cbc_enc_jmp_table(,%ebp,4),%ebp -+ jmp *%ebp -+.L009ej7: -+ movb 6(%esi), %dh -+ sall $8, %edx -+.L010ej6: -+ movb 5(%esi), %dh -+.L011ej5: -+ movb 4(%esi), %dl -+.L012ej4: -+ movl (%esi), %ecx -+ jmp .L013ejend -+.L014ej3: -+ movb 2(%esi), %ch -+ sall $8, %ecx -+.L015ej2: -+ movb 1(%esi), %ch -+.L016ej1: -+ movb (%esi), %cl -+.L013ejend: -+ xorl %ecx, %eax -+ xorl %edx, %ebx -+ movl %eax, 12(%esp) -+ movl %ebx, 16(%esp) -+ call des_encrypt -+ movl 12(%esp), %eax -+ movl 16(%esp), %ebx -+ movl %eax, (%edi) -+ movl %ebx, 4(%edi) -+ jmp .L007finish -+.align 16 -+.L004decrypt: -+ andl $4294967288, %ebp -+ movl 20(%esp), %eax -+ movl 24(%esp), %ebx -+ jz .L017decrypt_finish -+.L018decrypt_loop: -+ movl (%esi), %eax -+ movl 4(%esi), %ebx -+ movl %eax, 12(%esp) -+ movl %ebx, 16(%esp) -+ call des_encrypt -+ movl 12(%esp), %eax -+ movl 16(%esp), %ebx -+ movl 20(%esp), %ecx -+ movl 24(%esp), %edx -+ xorl %eax, %ecx -+ xorl %ebx, %edx -+ movl (%esi), %eax -+ movl 4(%esi), %ebx -+ movl %ecx, (%edi) -+ movl %edx, 4(%edi) -+ movl %eax, 20(%esp) -+ movl %ebx, 24(%esp) -+ addl $8, %esi -+ addl $8, %edi -+ subl $8, %ebp -+ jnz .L018decrypt_loop -+.L017decrypt_finish: -+ movl 56(%esp), %ebp -+ andl $7, %ebp -+ jz .L007finish -+ movl (%esi), %eax -+ movl 4(%esi), 
%ebx -+ movl %eax, 12(%esp) -+ movl %ebx, 16(%esp) -+ call des_encrypt -+ movl 12(%esp), %eax -+ movl 16(%esp), %ebx -+ movl 20(%esp), %ecx -+ movl 24(%esp), %edx -+ xorl %eax, %ecx -+ xorl %ebx, %edx -+ movl (%esi), %eax -+ movl 4(%esi), %ebx -+.L019dj7: -+ rorl $16, %edx -+ movb %dl, 6(%edi) -+ shrl $16, %edx -+.L020dj6: -+ movb %dh, 5(%edi) -+.L021dj5: -+ movb %dl, 4(%edi) -+.L022dj4: -+ movl %ecx, (%edi) -+ jmp .L023djend -+.L024dj3: -+ rorl $16, %ecx -+ movb %cl, 2(%edi) -+ sall $16, %ecx -+.L025dj2: -+ movb %ch, 1(%esi) -+.L026dj1: -+ movb %cl, (%esi) -+.L023djend: -+ jmp .L007finish -+.align 16 -+.L007finish: -+ movl 64(%esp), %ecx -+ addl $28, %esp -+ movl %eax, (%ecx) -+ movl %ebx, 4(%ecx) -+ popl %edi -+ popl %esi -+ popl %ebx -+ popl %ebp -+ ret -+.align 16 -+.L008cbc_enc_jmp_table: -+ .long 0 -+ .long .L016ej1 -+ .long .L015ej2 -+ .long .L014ej3 -+ .long .L012ej4 -+ .long .L011ej5 -+ .long .L010ej6 -+ .long .L009ej7 -+.align 16 -+.L027cbc_dec_jmp_table: -+ .long 0 -+ .long .L026dj1 -+ .long .L025dj2 -+ .long .L024dj3 -+ .long .L022dj4 -+ .long .L021dj5 -+ .long .L020dj6 -+ .long .L019dj7 -+.des_ncbc_encrypt_end: -+ .size des_ncbc_encrypt , .des_ncbc_encrypt_end-des_ncbc_encrypt -+.ident "desasm.pl" -+.text -+ .align 16 -+.globl des_ede3_cbc_encrypt -+ .type des_ede3_cbc_encrypt , @function -+des_ede3_cbc_encrypt: -+ -+ pushl %ebp -+ pushl %ebx -+ pushl %esi -+ pushl %edi -+ movl 28(%esp), %ebp -+ -+ movl 44(%esp), %ebx -+ movl (%ebx), %esi -+ movl 4(%ebx), %edi -+ pushl %edi -+ pushl %esi -+ pushl %edi -+ pushl %esi -+ movl %esp, %ebx -+ movl 36(%esp), %esi -+ movl 40(%esp), %edi -+ -+ movl 64(%esp), %ecx -+ -+ movl 56(%esp), %eax -+ pushl %eax -+ -+ movl 56(%esp), %eax -+ pushl %eax -+ -+ movl 56(%esp), %eax -+ pushl %eax -+ pushl %ebx -+ cmpl $0, %ecx -+ jz .L028decrypt -+ andl $4294967288, %ebp -+ movl 16(%esp), %eax -+ movl 20(%esp), %ebx -+ jz .L029encrypt_finish -+.L030encrypt_loop: -+ movl (%esi), %ecx -+ movl 4(%esi), %edx -+ xorl %ecx, %eax -+ 
xorl %edx, %ebx -+ movl %eax, 16(%esp) -+ movl %ebx, 20(%esp) -+ call des_encrypt3 -+ movl 16(%esp), %eax -+ movl 20(%esp), %ebx -+ movl %eax, (%edi) -+ movl %ebx, 4(%edi) -+ addl $8, %esi -+ addl $8, %edi -+ subl $8, %ebp -+ jnz .L030encrypt_loop -+.L029encrypt_finish: -+ movl 60(%esp), %ebp -+ andl $7, %ebp -+ jz .L031finish -+ xorl %ecx, %ecx -+ xorl %edx, %edx -+ movl .L032cbc_enc_jmp_table(,%ebp,4),%ebp -+ jmp *%ebp -+.L033ej7: -+ movb 6(%esi), %dh -+ sall $8, %edx -+.L034ej6: -+ movb 5(%esi), %dh -+.L035ej5: -+ movb 4(%esi), %dl -+.L036ej4: -+ movl (%esi), %ecx -+ jmp .L037ejend -+.L038ej3: -+ movb 2(%esi), %ch -+ sall $8, %ecx -+.L039ej2: -+ movb 1(%esi), %ch -+.L040ej1: -+ movb (%esi), %cl -+.L037ejend: -+ xorl %ecx, %eax -+ xorl %edx, %ebx -+ movl %eax, 16(%esp) -+ movl %ebx, 20(%esp) -+ call des_encrypt3 -+ movl 16(%esp), %eax -+ movl 20(%esp), %ebx -+ movl %eax, (%edi) -+ movl %ebx, 4(%edi) -+ jmp .L031finish -+.align 16 -+.L028decrypt: -+ andl $4294967288, %ebp -+ movl 24(%esp), %eax -+ movl 28(%esp), %ebx -+ jz .L041decrypt_finish -+.L042decrypt_loop: -+ movl (%esi), %eax -+ movl 4(%esi), %ebx -+ movl %eax, 16(%esp) -+ movl %ebx, 20(%esp) -+ call des_decrypt3 -+ movl 16(%esp), %eax -+ movl 20(%esp), %ebx -+ movl 24(%esp), %ecx -+ movl 28(%esp), %edx -+ xorl %eax, %ecx -+ xorl %ebx, %edx -+ movl (%esi), %eax -+ movl 4(%esi), %ebx -+ movl %ecx, (%edi) -+ movl %edx, 4(%edi) -+ movl %eax, 24(%esp) -+ movl %ebx, 28(%esp) -+ addl $8, %esi -+ addl $8, %edi -+ subl $8, %ebp -+ jnz .L042decrypt_loop -+.L041decrypt_finish: -+ movl 60(%esp), %ebp -+ andl $7, %ebp -+ jz .L031finish -+ movl (%esi), %eax -+ movl 4(%esi), %ebx -+ movl %eax, 16(%esp) -+ movl %ebx, 20(%esp) -+ call des_decrypt3 -+ movl 16(%esp), %eax -+ movl 20(%esp), %ebx -+ movl 24(%esp), %ecx -+ movl 28(%esp), %edx -+ xorl %eax, %ecx -+ xorl %ebx, %edx -+ movl (%esi), %eax -+ movl 4(%esi), %ebx -+.L043dj7: -+ rorl $16, %edx -+ movb %dl, 6(%edi) -+ shrl $16, %edx -+.L044dj6: -+ movb %dh, 5(%edi) 
-+.L045dj5: -+ movb %dl, 4(%edi) -+.L046dj4: -+ movl %ecx, (%edi) -+ jmp .L047djend -+.L048dj3: -+ rorl $16, %ecx -+ movb %cl, 2(%edi) -+ sall $16, %ecx -+.L049dj2: -+ movb %ch, 1(%esi) -+.L050dj1: -+ movb %cl, (%esi) -+.L047djend: -+ jmp .L031finish -+.align 16 -+.L031finish: -+ movl 76(%esp), %ecx -+ addl $32, %esp -+ movl %eax, (%ecx) -+ movl %ebx, 4(%ecx) -+ popl %edi -+ popl %esi -+ popl %ebx -+ popl %ebp -+ ret -+.align 16 -+.L032cbc_enc_jmp_table: -+ .long 0 -+ .long .L040ej1 -+ .long .L039ej2 -+ .long .L038ej3 -+ .long .L036ej4 -+ .long .L035ej5 -+ .long .L034ej6 -+ .long .L033ej7 -+.align 16 -+.L051cbc_dec_jmp_table: -+ .long 0 -+ .long .L050dj1 -+ .long .L049dj2 -+ .long .L048dj3 -+ .long .L046dj4 -+ .long .L045dj5 -+ .long .L044dj6 -+ .long .L043dj7 -+.des_ede3_cbc_encrypt_end: -+ .size des_ede3_cbc_encrypt , .des_ede3_cbc_encrypt_end-des_ede3_cbc_encrypt -+.ident "desasm.pl" ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/ecb_enc.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,135 @@ -+/* crypto/des/ecb_enc.c */ -+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) -+ * All rights reserved. -+ * -+ * This package is an SSL implementation written -+ * by Eric Young (eay@cryptsoft.com). -+ * The implementation was written so as to conform with Netscapes SSL. -+ * -+ * This library is free for commercial and non-commercial use as long as -+ * the following conditions are aheared to. The following conditions -+ * apply to all code found in this distribution, be it the RC4, RSA, -+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation -+ * included with this distribution is covered by the same copyright terms -+ * except that the holder is Tim Hudson (tjh@cryptsoft.com). -+ * -+ * Copyright remains Eric Young's, and as such any Copyright notices in -+ * the code are not to be removed. -+ * If this package is used in a product, Eric Young should be given attribution -+ * as the author of the parts of the library used. 
-+ * This can be in the form of a textual message at program startup or -+ * in documentation (online or textual) provided with the package. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. All advertising materials mentioning features or use of this software -+ * must display the following acknowledgement: -+ * "This product includes cryptographic software written by -+ * Eric Young (eay@cryptsoft.com)" -+ * The word 'cryptographic' can be left out if the rouines from the library -+ * being used are not cryptographic related :-). -+ * 4. If you include any Windows specific code (or a derivative thereof) from -+ * the apps directory (application code) you must include an acknowledgement: -+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * The licence and distribution terms for any publically available version or -+ * derivative of this code cannot be changed. i.e. this code cannot simply be -+ * copied and put under another distribution licence -+ * [including the GNU Public Licence.] -+ */ -+ -+#include "des/des_locl.h" -+#include "des/spr.h" -+ -+char *libdes_version="libdes v 3.24 - 20-Apr-1996 - eay"; -+char *DES_version="DES part of SSLeay 0.8.2b 08-Jan-1998"; -+ -+/* RCSID $Id: ecb_enc.c,v 1.8 2004/08/04 15:57:22 mcr Exp $ */ -+/* This function ifdef'ed out for FreeS/WAN project. 
*/ -+#ifdef notdef -+char *des_options() -+ { -+ static int init=1; -+ static char buf[32]; -+ -+ if (init) -+ { -+ char *ptr,*unroll,*risc,*size; -+ -+ init=0; -+#ifdef DES_PTR -+ ptr="ptr"; -+#else -+ ptr="idx"; -+#endif -+#if defined(DES_RISC1) || defined(DES_RISC2) -+#ifdef DES_RISC1 -+ risc="risc1"; -+#endif -+#ifdef DES_RISC2 -+ risc="risc2"; -+#endif -+#else -+ risc="cisc"; -+#endif -+#ifdef DES_UNROLL -+ unroll="16"; -+#else -+ unroll="4"; -+#endif -+ if (sizeof(DES_LONG) != sizeof(long)) -+ size="int"; -+ else -+ size="long"; -+ sprintf(buf,"des(%s,%s,%s,%s)",ptr,risc,unroll,size); -+ } -+ return(buf); -+ } -+#endif -+ -+ -+void des_ecb_encrypt(input, output, ks, enc) -+des_cblock (*input); -+des_cblock (*output); -+des_key_schedule ks; -+int enc; -+ { -+ register DES_LONG l; -+ register unsigned char *in,*out; -+ DES_LONG ll[2]; -+ -+#ifdef OCF_ASSIST -+ if (ocf_des_assist() & OCF_PROVIDES_DES_3DES) { -+ ocf_des_ecb_encrypt(input, output, ks, enc); -+ return; -+ } -+#endif -+ -+ in=(unsigned char *)input; -+ out=(unsigned char *)output; -+ c2l(in,l); ll[0]=l; -+ c2l(in,l); ll[1]=l; -+ des_encrypt(ll,ks,enc); -+ l=ll[0]; l2c(l,out); -+ l=ll[1]; l2c(l,out); -+ l=ll[0]=ll[1]=0; -+ } -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/ipsec_alg_3des.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,182 @@ -+/* -+ * ipsec_alg 3DES cipher stubs -+ * -+ * Copyright (C) 2005 Michael Richardson -+ * -+ * Adapted from ipsec_alg_aes.c by JuanJo Ciarlante -+ * -+ * ipsec_alg_aes.c,v 1.1.2.1 2003/11/21 18:12:23 jjo Exp -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . 
-+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ */ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+ -+/* -+ * special case: ipsec core modular with this static algo inside: -+ * must avoid MODULE magic for this file -+ */ -+#if defined(CONFIG_KLIPS_MODULE) && defined(CONFIG_KLIPS_ENC_3DES) -+#undef MODULE -+#endif -+ -+#include -+#include -+ -+#include /* printk() */ -+#include /* error codes */ -+#include /* size_t */ -+#include -+ -+/* Low freeswan header coupling */ -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_alg.h" -+#include "klips-crypto/des.h" -+#include "openswan/ipsec_alg_3des.h" -+ -+#define AES_CONTEXT_T aes_context -+static int debug_3des=0; -+static int test_3des=0; -+static int excl_3des=0; -+ -+#if defined(CONFIG_KLIPS_ENC_3DES_MODULE) -+MODULE_AUTHOR("Michael Richardson "); -+#ifdef module_param -+module_param(debug_3des, int, 0664); -+module_param(test_des, int, 0664); -+module_param(excl_des, int, 0664); -+#else -+MODULE_PARM(debug_3des, "i"); -+MODULE_PARM(test_des, "i"); -+MODULE_PARM(excl_des, "i"); -+#endif -+#endif -+ -+#define ESP_AES_MAC_KEY_SZ 16 /* 128 bit MAC key */ -+#define ESP_AES_MAC_BLK_LEN 16 /* 128 bit block */ -+ -+static int _3des_set_key(struct ipsec_alg_enc *alg, -+ __u8 * key_e, const __u8 * key, -+ size_t keysize) -+{ -+ int ret = 0; -+ TripleDES_context *ctx = (TripleDES_context*)key_e; -+ -+ if(keysize != 192/8) { -+ return EINVAL; -+ } -+ -+ des_set_key((des_cblock *)(key + DES_KEY_SZ*0), ctx->s1); -+ des_set_key((des_cblock *)(key + DES_KEY_SZ*1), ctx->s2); -+ des_set_key((des_cblock *)(key + DES_KEY_SZ*2), ctx->s3); -+ -+ if (debug_3des > 0) -+ printk(KERN_DEBUG "klips_debug:_3des_set_key:" -+ "ret=%d key_e=%p key=%p keysize=%ld\n", -+ ret, key_e, key, (unsigned long int) 
keysize); -+ return ret; -+} -+ -+static int _3des_cbc_encrypt(struct ipsec_alg_enc *alg, -+ __u8 * key_e, -+ const __u8 * in, -+ int ilen, const __u8 * iv, -+ int encrypt) -+{ -+ TripleDES_context *ctx=(TripleDES_context*)key_e; -+ des_cblock miv; -+ -+ memcpy(&miv, iv, sizeof(miv)); -+ -+ if (debug_3des > 0) -+ printk(KERN_DEBUG "klips_debug:_3des_cbc_encrypt:" -+ "key_e=%p in=%p ilen=%d iv=%p encrypt=%d\n", -+ key_e, in, ilen, iv, encrypt); -+ -+ des_ede3_cbc_encrypt((des_cblock *)in, -+ (des_cblock *)in, -+ ilen, -+ ctx->s1, -+ ctx->s2, -+ ctx->s3, -+ &miv, encrypt); -+ return 1; -+} -+ -+static struct ipsec_alg_enc ipsec_alg_3DES = { -+ ixt_common: { ixt_version: IPSEC_ALG_VERSION, -+ ixt_refcnt: ATOMIC_INIT(0), -+ ixt_name: "3des", -+ ixt_blocksize: ESP_3DES_CBC_BLK_LEN, -+ ixt_support: { -+ ias_exttype: IPSEC_ALG_TYPE_ENCRYPT, -+ ias_id: ESP_3DES, -+ //ias_ivlen: 64, -+ ias_keyminbits: ESP_3DES_KEY_SZ*8, -+ ias_keymaxbits: ESP_3DES_KEY_SZ*8, -+ }, -+ }, -+#if defined(MODULE_KLIPS_ENC_3DES_MODULE) -+ ixt_module: THIS_MODULE, -+#endif -+ ixt_e_keylen: ESP_3DES_KEY_SZ*8, -+ ixt_e_ctx_size: sizeof(TripleDES_context), -+ ixt_e_set_key: _3des_set_key, -+ ixt_e_cbc_encrypt:_3des_cbc_encrypt, -+}; -+ -+#if defined(CONFIG_KLIPS_ENC_3DES_MODULE) -+IPSEC_ALG_MODULE_INIT_MOD( ipsec_3des_init ) -+#else -+IPSEC_ALG_MODULE_INIT_STATIC( ipsec_3des_init ) -+#endif -+{ -+ int ret, test_ret; -+ -+ if (excl_3des) ipsec_alg_3DES.ixt_common.ixt_state |= IPSEC_ALG_ST_EXCL; -+ ret=register_ipsec_alg_enc(&ipsec_alg_3DES); -+ printk("ipsec_3des_init(alg_type=%d alg_id=%d name=%s): ret=%d\n", -+ ipsec_alg_3DES.ixt_common.ixt_support.ias_exttype, -+ ipsec_alg_3DES.ixt_common.ixt_support.ias_id, -+ ipsec_alg_3DES.ixt_common.ixt_name, -+ ret); -+ if (ret==0 && test_3des) { -+ test_ret=ipsec_alg_test( -+ ipsec_alg_3DES.ixt_common.ixt_support.ias_exttype, -+ ipsec_alg_3DES.ixt_common.ixt_support.ias_id, -+ test_3des); -+ printk("ipsec_3des_init(alg_type=%d alg_id=%d): test_ret=%d\n", -+ 
ipsec_alg_3DES.ixt_common.ixt_support.ias_exttype, -+ ipsec_alg_3DES.ixt_common.ixt_support.ias_id, -+ test_ret); -+ } -+ return ret; -+} -+ -+#if defined(CONFIG_KLIPS_ENC_3DES_MODULE) -+IPSEC_ALG_MODULE_EXIT_MOD( ipsec_3des_fini ) -+#else -+IPSEC_ALG_MODULE_EXIT_STATIC( ipsec_3des_fini ) -+#endif -+{ -+ unregister_ipsec_alg_enc(&ipsec_alg_3DES); -+ return; -+} -+ -+/* Dual, because 3des code is 4-clause BSD licensed */ -+#ifdef MODULE_LICENSE -+MODULE_LICENSE("Dual BSD/GPL"); -+#endif -+ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/des/set_key.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,251 @@ -+/* crypto/des/set_key.c */ -+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) -+ * All rights reserved. -+ * -+ * This package is an SSL implementation written -+ * by Eric Young (eay@cryptsoft.com). -+ * The implementation was written so as to conform with Netscapes SSL. -+ * -+ * This library is free for commercial and non-commercial use as long as -+ * the following conditions are aheared to. The following conditions -+ * apply to all code found in this distribution, be it the RC4, RSA, -+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation -+ * included with this distribution is covered by the same copyright terms -+ * except that the holder is Tim Hudson (tjh@cryptsoft.com). -+ * -+ * Copyright remains Eric Young's, and as such any Copyright notices in -+ * the code are not to be removed. -+ * If this package is used in a product, Eric Young should be given attribution -+ * as the author of the parts of the library used. -+ * This can be in the form of a textual message at program startup or -+ * in documentation (online or textual) provided with the package. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. 
Redistributions of source code must retain the copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. All advertising materials mentioning features or use of this software -+ * must display the following acknowledgement: -+ * "This product includes cryptographic software written by -+ * Eric Young (eay@cryptsoft.com)" -+ * The word 'cryptographic' can be left out if the rouines from the library -+ * being used are not cryptographic related :-). -+ * 4. If you include any Windows specific code (or a derivative thereof) from -+ * the apps directory (application code) you must include an acknowledgement: -+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" -+ * -+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * The licence and distribution terms for any publically available version or -+ * derivative of this code cannot be changed. i.e. this code cannot simply be -+ * copied and put under another distribution licence -+ * [including the GNU Public Licence.] 
-+ */ -+ -+/* set_key.c v 1.4 eay 24/9/91 -+ * 1.4 Speed up by 400% :-) -+ * 1.3 added register declarations. -+ * 1.2 unrolled make_key_sched a bit more -+ * 1.1 added norm_expand_bits -+ * 1.0 First working version -+ */ -+#include "des/des_locl.h" -+#include "des/podd.h" -+#include "des/sk.h" -+ -+#ifndef NOPROTO -+static int check_parity(des_cblock (*key)); -+#else -+static int check_parity(); -+#endif -+ -+int des_check_key=0; -+ -+void des_set_odd_parity(key) -+des_cblock (*key); -+ { -+ int i; -+ -+ for (i=0; i>(n))^(b))&(m)),\ -+ * (b)^=(t),\ -+ * (a)=((a)^((t)<<(n)))) -+ */ -+ -+#define HPERM_OP(a,t,n,m) ((t)=((((a)<<(16-(n)))^(a))&(m)),\ -+ (a)=(a)^(t)^(t>>(16-(n)))) -+ -+/* return 0 if key parity is odd (correct), -+ * return -1 if key parity error, -+ * return -2 if illegal weak key. -+ */ -+int des_set_key(key, schedule) -+des_cblock (*key); -+des_key_schedule schedule; -+ { -+ static int shifts2[16]={0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0}; -+ register DES_LONG c,d,t,s,t2; -+ register unsigned char *in; -+ register DES_LONG *k; -+ register int i; -+ -+ if (des_check_key) -+ { -+ if (!check_parity(key)) -+ return(-1); -+ -+ if (des_is_weak_key(key)) -+ return(-2); -+ } -+ -+#ifdef OCF_ASSIST -+ if (ocf_des_assist() & OCF_PROVIDES_DES_3DES) -+ return(ocf_des_set_key(key, schedule)); -+#endif -+ -+ k=(DES_LONG *)schedule; -+ in=(unsigned char *)key; -+ -+ c2l(in,c); -+ c2l(in,d); -+ -+ /* do PC1 in 60 simple operations */ -+/* PERM_OP(d,c,t,4,0x0f0f0f0fL); -+ HPERM_OP(c,t,-2, 0xcccc0000L); -+ HPERM_OP(c,t,-1, 0xaaaa0000L); -+ HPERM_OP(c,t, 8, 0x00ff0000L); -+ HPERM_OP(c,t,-1, 0xaaaa0000L); -+ HPERM_OP(d,t,-8, 0xff000000L); -+ HPERM_OP(d,t, 8, 0x00ff0000L); -+ HPERM_OP(d,t, 2, 0x33330000L); -+ d=((d&0x00aa00aaL)<<7L)|((d&0x55005500L)>>7L)|(d&0xaa55aa55L); -+ d=(d>>8)|((c&0xf0000000L)>>4); -+ c&=0x0fffffffL; */ -+ -+ /* I now do it in 47 simple operations :-) -+ * Thanks to John Fletcher (john_fletcher@lccmail.ocf.llnl.gov) -+ * for the inspiration. 
:-) */ -+ PERM_OP (d,c,t,4,0x0f0f0f0fL); -+ HPERM_OP(c,t,-2,0xcccc0000L); -+ HPERM_OP(d,t,-2,0xcccc0000L); -+ PERM_OP (d,c,t,1,0x55555555L); -+ PERM_OP (c,d,t,8,0x00ff00ffL); -+ PERM_OP (d,c,t,1,0x55555555L); -+ d= (((d&0x000000ffL)<<16L)| (d&0x0000ff00L) | -+ ((d&0x00ff0000L)>>16L)|((c&0xf0000000L)>>4L)); -+ c&=0x0fffffffL; -+ -+ for (i=0; i>2L)|(c<<26L)); d=((d>>2L)|(d<<26L)); } -+ else -+ { c=((c>>1L)|(c<<27L)); d=((d>>1L)|(d<<27L)); } -+ c&=0x0fffffffL; -+ d&=0x0fffffffL; -+ /* could be a few less shifts but I am to lazy at this -+ * point in time to investigate */ -+ s= des_skb[0][ (c )&0x3f ]| -+ des_skb[1][((c>> 6)&0x03)|((c>> 7L)&0x3c)]| -+ des_skb[2][((c>>13)&0x0f)|((c>>14L)&0x30)]| -+ des_skb[3][((c>>20)&0x01)|((c>>21L)&0x06) | -+ ((c>>22L)&0x38)]; -+ t= des_skb[4][ (d )&0x3f ]| -+ des_skb[5][((d>> 7L)&0x03)|((d>> 8L)&0x3c)]| -+ des_skb[6][ (d>>15L)&0x3f ]| -+ des_skb[7][((d>>21L)&0x0f)|((d>>22L)&0x30)]; -+ -+ /* table contained 0213 4657 */ -+ t2=((t<<16L)|(s&0x0000ffffL))&0xffffffffL; -+ *(k++)=ROTATE(t2,30)&0xffffffffL; -+ -+ t2=((s>>16L)|(t&0xffff0000L)); -+ *(k++)=ROTATE(t2,26)&0xffffffffL; -+ } -+ return(0); -+ } -+ -+int des_key_sched(key, schedule) -+des_cblock (*key); -+des_key_schedule schedule; -+ { -+ return(des_set_key(key,schedule)); -+ } ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/goodmask.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,150 @@ -+/* -+ * minor utilities for subnet-mask manipulation -+ * Copyright (C) 1998, 1999 Henry Spencer. -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: goodmask.c,v 1.12 2004/07/10 07:43:47 mcr Exp $ -+ */ -+#include "openswan.h" -+ -+#ifndef ABITS -+#define ABITS 32 /* bits in an IPv4 address */ -+#endif -+ -+/* This file does not use sysdep.h, otherwise this should go into -+ * ports/darwin/include/sysdep.h -+ */ -+#ifndef s6_addr32 -+#define s6_addr32 __u6_addr.__u6_addr32 -+#endif -+ -+/* -+ - goodmask - is this a good (^1*0*$) subnet mask? -+ * You are not expected to understand this. See Henry S. Warren Jr, -+ * "Functions realizable with word-parallel logical and two's-complement -+ * addition instructions", CACM 20.6 (June 1977), p.439. -+ */ -+int /* predicate */ -+goodmask(mask) -+struct in_addr mask; -+{ -+ unsigned long x = ntohl(mask.s_addr); -+ /* clear rightmost contiguous string of 1-bits */ -+# define CRCS1B(x) (((x|(x-1))+1)&x) -+# define TOPBIT (1UL << 31) -+ -+ /* either zero, or has one string of 1-bits which is left-justified */ -+ if (x == 0 || (CRCS1B(x) == 0 && (x&TOPBIT))) -+ return 1; -+ return 0; -+} -+ -+/* -+ - masktobits - how many bits in this mask? -+ * The algorithm is essentially a binary search, but highly optimized -+ * for this particular task. 
-+ */ -+int /* -1 means !goodmask() */ -+masktobits(mask) -+struct in_addr mask; -+{ -+ unsigned long m = ntohl(mask.s_addr); -+ int masklen; -+ -+ if (!goodmask(mask)) -+ return -1; -+ -+ if (m&0x00000001UL) -+ return 32; -+ masklen = 0; -+ if (m&(0x0000ffffUL<<1)) { /* <<1 for 1-origin numbering */ -+ masklen |= 0x10; -+ m <<= 16; -+ } -+ if (m&(0x00ff0000UL<<1)) { -+ masklen |= 0x08; -+ m <<= 8; -+ } -+ if (m&(0x0f000000UL<<1)) { -+ masklen |= 0x04; -+ m <<= 4; -+ } -+ if (m&(0x30000000UL<<1)) { -+ masklen |= 0x02; -+ m <<= 2; -+ } -+ if (m&(0x40000000UL<<1)) -+ masklen |= 0x01; -+ -+ return masklen; -+} -+ -+/* -+ - bitstomask - return a mask with this many high bits on -+ */ -+struct in_addr -+bitstomask(n) -+int n; -+{ -+ struct in_addr result; -+ -+ if (n > 0 && n <= ABITS) -+ result.s_addr = htonl(~((1UL << (ABITS - n)) - 1)); -+ else if (n == 0) -+ result.s_addr = 0; -+ else -+ result.s_addr = 0; /* best error report we can do */ -+ return result; -+} -+ -+/* -+ - bitstomask6 - return a mask with this many high bits on -+ */ -+struct in6_addr -+bitstomask6(n) -+int n; -+{ -+ struct in6_addr result; -+ -+ if (n > 0 && n <= 32) { -+ result.s6_addr32[0] = htonl(~((1UL << (32 - n)) - 1)); -+ result.s6_addr32[1]=0; -+ result.s6_addr32[2]=0; -+ result.s6_addr32[3]=0; -+ } -+ else if (n > 32 && n <= 64) { -+ result.s6_addr32[0]=0xffffffffUL; -+ result.s6_addr32[1] = htonl(~((1UL << (64 - n)) - 1)); -+ result.s6_addr32[2]=0; -+ result.s6_addr32[3]=0; -+ } -+ else if (n > 64 && n <= 96) { -+ result.s6_addr32[0]=0xffffffffUL; -+ result.s6_addr32[1]=0xffffffffUL; -+ result.s6_addr32[2] = htonl(~((1UL << (96 - n)) - 1)); -+ result.s6_addr32[3]=0; -+ } -+ else if (n > 96 && n <= 128) { -+ result.s6_addr32[0]=0xffffffff; -+ result.s6_addr32[1]=0xffffffff; -+ result.s6_addr32[2]=0xffffffff; -+ result.s6_addr32[3] = htonl(~((1UL << (128 - n)) - 1)); -+ } -+ else { -+ result.s6_addr32[0] = 0; -+ result.s6_addr32[0] = 0; -+ result.s6_addr32[0] = 0; -+ result.s6_addr32[0] = 
0; -+ } -+ -+ return result; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/infblock.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,403 @@ -+/* infblock.c -- interpret and process block types to last block -+ * Copyright (C) 1995-2002 Mark Adler -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+#include -+#include "infblock.h" -+#include "inftrees.h" -+#include "infcodes.h" -+#include "infutil.h" -+ -+struct inflate_codes_state {int dummy;}; /* for buggy compilers */ -+ -+/* simplify the use of the inflate_huft type with some defines */ -+#define exop word.what.Exop -+#define bits word.what.Bits -+ -+/* Table for deflate from PKZIP's appnote.txt. */ -+local const uInt border[] = { /* Order of the bit length code lengths */ -+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; -+ -+/* -+ Notes beyond the 1.93a appnote.txt: -+ -+ 1. Distance pointers never point before the beginning of the output -+ stream. -+ 2. Distance pointers can point back across blocks, up to 32k away. -+ 3. There is an implied maximum of 7 bits for the bit length table and -+ 15 bits for the actual data. -+ 4. If only one code exists, then it is encoded using one bit. (Zero -+ would be more efficient, but perhaps a little confusing.) If two -+ codes exist, they are coded using one bit each (0 and 1). -+ 5. There is no way of sending zero distance codes--a dummy must be -+ sent if there are none. (History: a pre 2.0 version of PKZIP would -+ store blocks with no distance codes, but this was discovered to be -+ too harsh a criterion.) Valid only for 1.93a. 2.04c does allow -+ zero distance codes, which is sent as one code of zero bits in -+ length. -+ 6. There are up to 286 literal/length codes. Code 256 represents the -+ end-of-block. Note however that the static length tree defines -+ 288 codes just to fill out the Huffman codes. 
Codes 286 and 287 -+ cannot be used though, since there is no length base or extra bits -+ defined for them. Similarily, there are up to 30 distance codes. -+ However, static trees define 32 codes (all 5 bits) to fill out the -+ Huffman codes, but the last two had better not show up in the data. -+ 7. Unzip can check dynamic Huffman blocks for complete code sets. -+ The exception is that a single code would not be complete (see #4). -+ 8. The five bits following the block type is really the number of -+ literal codes sent minus 257. -+ 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits -+ (1+6+6). Therefore, to output three times the length, you output -+ three codes (1+1+1), whereas to output four times the same length, -+ you only need two codes (1+3). Hmm. -+ 10. In the tree reconstruction algorithm, Code = Code + Increment -+ only if BitLength(i) is not zero. (Pretty obvious.) -+ 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19) -+ 12. Note: length code 284 can represent 227-258, but length code 285 -+ really is 258. The last length deserves its own, short code -+ since it gets used a lot in very redundant files. The length -+ 258 is special since 258 - 3 (the min match length) is 255. -+ 13. The literal/length and distance code bit lengths are read as a -+ single stream of lengths. It is possible (and advantageous) for -+ a repeat code (16, 17, or 18) to go across the boundary between -+ the two sets of lengths. 
-+ */ -+ -+ -+void inflate_blocks_reset(s, z, c) -+inflate_blocks_statef *s; -+z_streamp z; -+uLongf *c; -+{ -+ if (c != Z_NULL) -+ *c = s->check; -+ if (s->mode == BTREE || s->mode == DTREE) -+ ZFREE(z, s->sub.trees.blens); -+ if (s->mode == CODES) -+ inflate_codes_free(s->sub.decode.codes, z); -+ s->mode = TYPE; -+ s->bitk = 0; -+ s->bitb = 0; -+ s->read = s->write = s->window; -+ if (s->checkfn != Z_NULL) -+ z->adler = s->check = (*s->checkfn)(0L, (const Bytef *)Z_NULL, 0); -+ Tracev((stderr, "inflate: blocks reset\n")); -+} -+ -+ -+inflate_blocks_statef *inflate_blocks_new(z, c, w) -+z_streamp z; -+check_func c; -+uInt w; -+{ -+ inflate_blocks_statef *s; -+ -+ if ((s = (inflate_blocks_statef *)ZALLOC -+ (z,1,sizeof(struct inflate_blocks_state))) == Z_NULL) -+ return s; -+ if ((s->hufts = -+ (inflate_huft *)ZALLOC(z, sizeof(inflate_huft), MANY)) == Z_NULL) -+ { -+ ZFREE(z, s); -+ return Z_NULL; -+ } -+ if ((s->window = (Bytef *)ZALLOC(z, 1, w)) == Z_NULL) -+ { -+ ZFREE(z, s->hufts); -+ ZFREE(z, s); -+ return Z_NULL; -+ } -+ s->end = s->window + w; -+ s->checkfn = c; -+ s->mode = TYPE; -+ Tracev((stderr, "inflate: blocks allocated\n")); -+ inflate_blocks_reset(s, z, Z_NULL); -+ return s; -+} -+ -+ -+int inflate_blocks(s, z, r) -+inflate_blocks_statef *s; -+z_streamp z; -+int r; -+{ -+ uInt t; /* temporary storage */ -+ uLong b; /* bit buffer */ -+ uInt k; /* bits in bit buffer */ -+ Bytef *p; /* input data pointer */ -+ uInt n; /* bytes available there */ -+ Bytef *q; /* output window write pointer */ -+ uInt m; /* bytes to end of window or read pointer */ -+ -+ /* copy input/output information to locals (UPDATE macro restores) */ -+ LOAD -+ -+ /* process input based on current state */ -+ while (1) switch (s->mode) -+ { -+ case TYPE: -+ NEEDBITS(3) -+ t = (uInt)b & 7; -+ s->last = t & 1; -+ switch (t >> 1) -+ { -+ case 0: /* stored */ -+ Tracev((stderr, "inflate: stored block%s\n", -+ s->last ? 
" (last)" : "")); -+ DUMPBITS(3) -+ t = k & 7; /* go to byte boundary */ -+ DUMPBITS(t) -+ s->mode = LENS; /* get length of stored block */ -+ break; -+ case 1: /* fixed */ -+ Tracev((stderr, "inflate: fixed codes block%s\n", -+ s->last ? " (last)" : "")); -+ { -+ uInt bl, bd; -+ inflate_huft *tl, *td; -+ -+ inflate_trees_fixed(&bl, &bd, &tl, &td, z); -+ s->sub.decode.codes = inflate_codes_new(bl, bd, tl, td, z); -+ if (s->sub.decode.codes == Z_NULL) -+ { -+ r = Z_MEM_ERROR; -+ LEAVE -+ } -+ } -+ DUMPBITS(3) -+ s->mode = CODES; -+ break; -+ case 2: /* dynamic */ -+ Tracev((stderr, "inflate: dynamic codes block%s\n", -+ s->last ? " (last)" : "")); -+ DUMPBITS(3) -+ s->mode = TABLE; -+ break; -+ case 3: /* illegal */ -+ DUMPBITS(3) -+ s->mode = BAD; -+ z->msg = (char*)"invalid block type"; -+ r = Z_DATA_ERROR; -+ LEAVE -+ } -+ break; -+ case LENS: -+ NEEDBITS(32) -+ if ((((~b) >> 16) & 0xffff) != (b & 0xffff)) -+ { -+ s->mode = BAD; -+ z->msg = (char*)"invalid stored block lengths"; -+ r = Z_DATA_ERROR; -+ LEAVE -+ } -+ s->sub.left = (uInt)b & 0xffff; -+ b = k = 0; /* dump bits */ -+ Tracev((stderr, "inflate: stored length %u\n", s->sub.left)); -+ s->mode = s->sub.left ? STORED : (s->last ? DRY : TYPE); -+ break; -+ case STORED: -+ if (n == 0) -+ LEAVE -+ NEEDOUT -+ t = s->sub.left; -+ if (t > n) t = n; -+ if (t > m) t = m; -+ zmemcpy(q, p, t); -+ p += t; n -= t; -+ q += t; m -= t; -+ if ((s->sub.left -= t) != 0) -+ break; -+ Tracev((stderr, "inflate: stored end, %lu total out\n", -+ z->total_out + (q >= s->read ? q - s->read : -+ (s->end - s->read) + (q - s->window)))); -+ s->mode = s->last ? 
DRY : TYPE; -+ break; -+ case TABLE: -+ NEEDBITS(14) -+ s->sub.trees.table = t = (uInt)b & 0x3fff; -+#ifndef PKZIP_BUG_WORKAROUND -+ if ((t & 0x1f) > 29 || ((t >> 5) & 0x1f) > 29) -+ { -+ s->mode = BAD; -+ z->msg = (char*)"too many length or distance symbols"; -+ r = Z_DATA_ERROR; -+ LEAVE -+ } -+#endif -+ t = 258 + (t & 0x1f) + ((t >> 5) & 0x1f); -+ if ((s->sub.trees.blens = (uIntf*)ZALLOC(z, t, sizeof(uInt))) == Z_NULL) -+ { -+ r = Z_MEM_ERROR; -+ LEAVE -+ } -+ DUMPBITS(14) -+ s->sub.trees.index = 0; -+ Tracev((stderr, "inflate: table sizes ok\n")); -+ s->mode = BTREE; -+ case BTREE: -+ while (s->sub.trees.index < 4 + (s->sub.trees.table >> 10)) -+ { -+ NEEDBITS(3) -+ s->sub.trees.blens[border[s->sub.trees.index++]] = (uInt)b & 7; -+ DUMPBITS(3) -+ } -+ while (s->sub.trees.index < 19) -+ s->sub.trees.blens[border[s->sub.trees.index++]] = 0; -+ s->sub.trees.bb = 7; -+ t = inflate_trees_bits(s->sub.trees.blens, &s->sub.trees.bb, -+ &s->sub.trees.tb, s->hufts, z); -+ if (t != Z_OK) -+ { -+ r = t; -+ if (r == Z_DATA_ERROR) -+ { -+ ZFREE(z, s->sub.trees.blens); -+ s->mode = BAD; -+ } -+ LEAVE -+ } -+ s->sub.trees.index = 0; -+ Tracev((stderr, "inflate: bits tree ok\n")); -+ s->mode = DTREE; -+ case DTREE: -+ while (t = s->sub.trees.table, -+ s->sub.trees.index < 258 + (t & 0x1f) + ((t >> 5) & 0x1f)) -+ { -+ inflate_huft *h; -+ uInt i, j, c; -+ -+ t = s->sub.trees.bb; -+ NEEDBITS(t) -+ h = s->sub.trees.tb + ((uInt)b & inflate_mask[t]); -+ t = h->bits; -+ c = h->base; -+ if (c < 16) -+ { -+ DUMPBITS(t) -+ s->sub.trees.blens[s->sub.trees.index++] = c; -+ } -+ else /* c == 16..18 */ -+ { -+ i = c == 18 ? 7 : c - 14; -+ j = c == 18 ? 
11 : 3; -+ NEEDBITS(t + i) -+ DUMPBITS(t) -+ j += (uInt)b & inflate_mask[i]; -+ DUMPBITS(i) -+ i = s->sub.trees.index; -+ t = s->sub.trees.table; -+ if (i + j > 258 + (t & 0x1f) + ((t >> 5) & 0x1f) || -+ (c == 16 && i < 1)) -+ { -+ ZFREE(z, s->sub.trees.blens); -+ s->mode = BAD; -+ z->msg = (char*)"invalid bit length repeat"; -+ r = Z_DATA_ERROR; -+ LEAVE -+ } -+ c = c == 16 ? s->sub.trees.blens[i - 1] : 0; -+ do { -+ s->sub.trees.blens[i++] = c; -+ } while (--j); -+ s->sub.trees.index = i; -+ } -+ } -+ s->sub.trees.tb = Z_NULL; -+ { -+ uInt bl, bd; -+ inflate_huft *tl, *td; -+ inflate_codes_statef *c; -+ -+ bl = 9; /* must be <= 9 for lookahead assumptions */ -+ bd = 6; /* must be <= 9 for lookahead assumptions */ -+ t = s->sub.trees.table; -+ t = inflate_trees_dynamic(257 + (t & 0x1f), 1 + ((t >> 5) & 0x1f), -+ s->sub.trees.blens, &bl, &bd, &tl, &td, -+ s->hufts, z); -+ if (t != Z_OK) -+ { -+ if (t == (uInt)Z_DATA_ERROR) -+ { -+ ZFREE(z, s->sub.trees.blens); -+ s->mode = BAD; -+ } -+ r = t; -+ LEAVE -+ } -+ Tracev((stderr, "inflate: trees ok\n")); -+ if ((c = inflate_codes_new(bl, bd, tl, td, z)) == Z_NULL) -+ { -+ r = Z_MEM_ERROR; -+ LEAVE -+ } -+ s->sub.decode.codes = c; -+ } -+ ZFREE(z, s->sub.trees.blens); -+ s->mode = CODES; -+ case CODES: -+ UPDATE -+ if ((r = inflate_codes(s, z, r)) != Z_STREAM_END) -+ return inflate_flush(s, z, r); -+ r = Z_OK; -+ inflate_codes_free(s->sub.decode.codes, z); -+ LOAD -+ Tracev((stderr, "inflate: codes end, %lu total out\n", -+ z->total_out + (q >= s->read ? 
q - s->read : -+ (s->end - s->read) + (q - s->window)))); -+ if (!s->last) -+ { -+ s->mode = TYPE; -+ break; -+ } -+ s->mode = DRY; -+ case DRY: -+ FLUSH -+ if (s->read != s->write) -+ LEAVE -+ s->mode = DONE; -+ case DONE: -+ r = Z_STREAM_END; -+ LEAVE -+ case BAD: -+ r = Z_DATA_ERROR; -+ LEAVE -+ default: -+ r = Z_STREAM_ERROR; -+ LEAVE -+ } -+} -+ -+ -+int inflate_blocks_free(s, z) -+inflate_blocks_statef *s; -+z_streamp z; -+{ -+ inflate_blocks_reset(s, z, Z_NULL); -+ ZFREE(z, s->window); -+ ZFREE(z, s->hufts); -+ ZFREE(z, s); -+ Tracev((stderr, "inflate: blocks freed\n")); -+ return Z_OK; -+} -+ -+ -+void inflate_set_dictionary(s, d, n) -+inflate_blocks_statef *s; -+const Bytef *d; -+uInt n; -+{ -+ zmemcpy(s->window, d, n); -+ s->read = s->write = s->window + n; -+} -+ -+ -+/* Returns true if inflate is currently at the end of a block generated -+ * by Z_SYNC_FLUSH or Z_FULL_FLUSH. -+ * IN assertion: s != Z_NULL -+ */ -+int inflate_blocks_sync_point(s) -+inflate_blocks_statef *s; -+{ -+ return s->mode == LENS; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/infblock.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,39 @@ -+/* infblock.h -- header to use infblock.c -+ * Copyright (C) 1995-2002 Mark Adler -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* WARNING: this file should *not* be used by applications. It is -+ part of the implementation of the compression library and is -+ subject to change. Applications should only use zlib.h. 
-+ */ -+ -+struct inflate_blocks_state; -+typedef struct inflate_blocks_state FAR inflate_blocks_statef; -+ -+extern inflate_blocks_statef * inflate_blocks_new OF(( -+ z_streamp z, -+ check_func c, /* check function */ -+ uInt w)); /* window size */ -+ -+extern int inflate_blocks OF(( -+ inflate_blocks_statef *, -+ z_streamp , -+ int)); /* initial return code */ -+ -+extern void inflate_blocks_reset OF(( -+ inflate_blocks_statef *, -+ z_streamp , -+ uLongf *)); /* check value on output */ -+ -+extern int inflate_blocks_free OF(( -+ inflate_blocks_statef *, -+ z_streamp)); -+ -+extern void inflate_set_dictionary OF(( -+ inflate_blocks_statef *s, -+ const Bytef *d, /* dictionary */ -+ uInt n)); /* dictionary length */ -+ -+extern int inflate_blocks_sync_point OF(( -+ inflate_blocks_statef *s)); ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/infcodes.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,251 @@ -+/* infcodes.c -- process literals and length/distance pairs -+ * Copyright (C) 1995-2002 Mark Adler -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+#include -+#include "inftrees.h" -+#include "infblock.h" -+#include "infcodes.h" -+#include "infutil.h" -+#include "inffast.h" -+ -+/* simplify the use of the inflate_huft type with some defines */ -+#define exop word.what.Exop -+#define bits word.what.Bits -+ -+typedef enum { /* waiting for "i:"=input, "o:"=output, "x:"=nothing */ -+ START, /* x: set up for LEN */ -+ LEN, /* i: get length/literal/eob next */ -+ LENEXT, /* i: getting length extra (have base) */ -+ DIST, /* i: get distance next */ -+ DISTEXT, /* i: getting distance extra */ -+ COPY, /* o: copying bytes in window, waiting for space */ -+ LIT, /* o: got literal, waiting for output space */ -+ WASH, /* o: got eob, possibly still output waiting */ -+ END, /* x: got eob and all data flushed */ -+ BADCODE} /* x: got error */ -+inflate_codes_mode; -+ -+/* inflate codes private state */ -+struct inflate_codes_state { -+ 
-+ /* mode */ -+ inflate_codes_mode mode; /* current inflate_codes mode */ -+ -+ /* mode dependent information */ -+ uInt len; -+ union { -+ struct { -+ inflate_huft *tree; /* pointer into tree */ -+ uInt need; /* bits needed */ -+ } code; /* if LEN or DIST, where in tree */ -+ uInt lit; /* if LIT, literal */ -+ struct { -+ uInt get; /* bits to get for extra */ -+ uInt dist; /* distance back to copy from */ -+ } copy; /* if EXT or COPY, where and how much */ -+ } sub; /* submode */ -+ -+ /* mode independent information */ -+ Byte lbits; /* ltree bits decoded per branch */ -+ Byte dbits; /* dtree bits decoder per branch */ -+ inflate_huft *ltree; /* literal/length/eob tree */ -+ inflate_huft *dtree; /* distance tree */ -+ -+}; -+ -+ -+inflate_codes_statef *inflate_codes_new(bl, bd, tl, td, z) -+uInt bl, bd; -+inflate_huft *tl; -+inflate_huft *td; /* need separate declaration for Borland C++ */ -+z_streamp z; -+{ -+ inflate_codes_statef *c; -+ -+ if ((c = (inflate_codes_statef *) -+ ZALLOC(z,1,sizeof(struct inflate_codes_state))) != Z_NULL) -+ { -+ c->mode = START; -+ c->lbits = (Byte)bl; -+ c->dbits = (Byte)bd; -+ c->ltree = tl; -+ c->dtree = td; -+ Tracev((stderr, "inflate: codes new\n")); -+ } -+ return c; -+} -+ -+ -+int inflate_codes(s, z, r) -+inflate_blocks_statef *s; -+z_streamp z; -+int r; -+{ -+ uInt j; /* temporary storage */ -+ inflate_huft *t; /* temporary pointer */ -+ uInt e; /* extra bits or operation */ -+ uLong b; /* bit buffer */ -+ uInt k; /* bits in bit buffer */ -+ Bytef *p; /* input data pointer */ -+ uInt n; /* bytes available there */ -+ Bytef *q; /* output window write pointer */ -+ uInt m; /* bytes to end of window or read pointer */ -+ Bytef *f; /* pointer to copy strings from */ -+ inflate_codes_statef *c = s->sub.decode.codes; /* codes state */ -+ -+ /* copy input/output information to locals (UPDATE macro restores) */ -+ LOAD -+ -+ /* process input and output based on current state */ -+ while (1) switch (c->mode) -+ { /* waiting for 
"i:"=input, "o:"=output, "x:"=nothing */ -+ case START: /* x: set up for LEN */ -+#ifndef SLOW -+ if (m >= 258 && n >= 10) -+ { -+ UPDATE -+ r = inflate_fast(c->lbits, c->dbits, c->ltree, c->dtree, s, z); -+ LOAD -+ if (r != Z_OK) -+ { -+ c->mode = r == Z_STREAM_END ? WASH : BADCODE; -+ break; -+ } -+ } -+#endif /* !SLOW */ -+ c->sub.code.need = c->lbits; -+ c->sub.code.tree = c->ltree; -+ c->mode = LEN; -+ case LEN: /* i: get length/literal/eob next */ -+ j = c->sub.code.need; -+ NEEDBITS(j) -+ t = c->sub.code.tree + ((uInt)b & inflate_mask[j]); -+ DUMPBITS(t->bits) -+ e = (uInt)(t->exop); -+ if (e == 0) /* literal */ -+ { -+ c->sub.lit = t->base; -+ Tracevv((stderr, t->base >= 0x20 && t->base < 0x7f ? -+ "inflate: literal '%c'\n" : -+ "inflate: literal 0x%02x\n", t->base)); -+ c->mode = LIT; -+ break; -+ } -+ if (e & 16) /* length */ -+ { -+ c->sub.copy.get = e & 15; -+ c->len = t->base; -+ c->mode = LENEXT; -+ break; -+ } -+ if ((e & 64) == 0) /* next table */ -+ { -+ c->sub.code.need = e; -+ c->sub.code.tree = t + t->base; -+ break; -+ } -+ if (e & 32) /* end of block */ -+ { -+ Tracevv((stderr, "inflate: end of block\n")); -+ c->mode = WASH; -+ break; -+ } -+ c->mode = BADCODE; /* invalid code */ -+ z->msg = (char*)"invalid literal/length code"; -+ r = Z_DATA_ERROR; -+ LEAVE -+ case LENEXT: /* i: getting length extra (have base) */ -+ j = c->sub.copy.get; -+ NEEDBITS(j) -+ c->len += (uInt)b & inflate_mask[j]; -+ DUMPBITS(j) -+ c->sub.code.need = c->dbits; -+ c->sub.code.tree = c->dtree; -+ Tracevv((stderr, "inflate: length %u\n", c->len)); -+ c->mode = DIST; -+ case DIST: /* i: get distance next */ -+ j = c->sub.code.need; -+ NEEDBITS(j) -+ t = c->sub.code.tree + ((uInt)b & inflate_mask[j]); -+ DUMPBITS(t->bits) -+ e = (uInt)(t->exop); -+ if (e & 16) /* distance */ -+ { -+ c->sub.copy.get = e & 15; -+ c->sub.copy.dist = t->base; -+ c->mode = DISTEXT; -+ break; -+ } -+ if ((e & 64) == 0) /* next table */ -+ { -+ c->sub.code.need = e; -+ c->sub.code.tree = t + 
t->base; -+ break; -+ } -+ c->mode = BADCODE; /* invalid code */ -+ z->msg = (char*)"invalid distance code"; -+ r = Z_DATA_ERROR; -+ LEAVE -+ case DISTEXT: /* i: getting distance extra */ -+ j = c->sub.copy.get; -+ NEEDBITS(j) -+ c->sub.copy.dist += (uInt)b & inflate_mask[j]; -+ DUMPBITS(j) -+ Tracevv((stderr, "inflate: distance %u\n", c->sub.copy.dist)); -+ c->mode = COPY; -+ case COPY: /* o: copying bytes in window, waiting for space */ -+ f = q - c->sub.copy.dist; -+ while (f < s->window) /* modulo window size-"while" instead */ -+ f += s->end - s->window; /* of "if" handles invalid distances */ -+ while (c->len) -+ { -+ NEEDOUT -+ OUTBYTE(*f++) -+ if (f == s->end) -+ f = s->window; -+ c->len--; -+ } -+ c->mode = START; -+ break; -+ case LIT: /* o: got literal, waiting for output space */ -+ NEEDOUT -+ OUTBYTE(c->sub.lit) -+ c->mode = START; -+ break; -+ case WASH: /* o: got eob, possibly more output */ -+ if (k > 7) /* return unused byte, if any */ -+ { -+ Assert(k < 16, "inflate_codes grabbed too many bytes") -+ k -= 8; -+ n++; -+ p--; /* can always return one */ -+ } -+ FLUSH -+ if (s->read != s->write) -+ LEAVE -+ c->mode = END; -+ case END: -+ r = Z_STREAM_END; -+ LEAVE -+ case BADCODE: /* x: got error */ -+ r = Z_DATA_ERROR; -+ LEAVE -+ default: -+ r = Z_STREAM_ERROR; -+ LEAVE -+ } -+#ifdef NEED_DUMMY_RETURN -+ return Z_STREAM_ERROR; /* Some dumb compilers complain without this */ -+#endif -+} -+ -+ -+void inflate_codes_free(c, z) -+inflate_codes_statef *c; -+z_streamp z; -+{ -+ ZFREE(z, c); -+ Tracev((stderr, "inflate: codes free\n")); -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/infcodes.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,31 @@ -+/* infcodes.h -- header to use infcodes.c -+ * Copyright (C) 1995-2002 Mark Adler -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* WARNING: this file should *not* be used by applications. 
It is -+ part of the implementation of the compression library and is -+ subject to change. Applications should only use zlib.h. -+ */ -+ -+#ifndef _INFCODES_H -+#define _INFCODES_H -+ -+struct inflate_codes_state; -+typedef struct inflate_codes_state FAR inflate_codes_statef; -+ -+extern inflate_codes_statef *inflate_codes_new OF(( -+ uInt, uInt, -+ inflate_huft *, inflate_huft *, -+ z_streamp )); -+ -+extern int inflate_codes OF(( -+ inflate_blocks_statef *, -+ z_streamp , -+ int)); -+ -+extern void inflate_codes_free OF(( -+ inflate_codes_statef *, -+ z_streamp )); -+ -+#endif /* _INFCODES_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/inffast.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,183 @@ -+/* inffast.c -- process literals and length/distance pairs fast -+ * Copyright (C) 1995-2002 Mark Adler -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+#include -+#include "inftrees.h" -+#include "infblock.h" -+#include "infcodes.h" -+#include "infutil.h" -+#include "inffast.h" -+ -+struct inflate_codes_state {int dummy;}; /* for buggy compilers */ -+ -+/* simplify the use of the inflate_huft type with some defines */ -+#define exop word.what.Exop -+#define bits word.what.Bits -+ -+/* macros for bit input with no checking and for returning unused bytes */ -+#define GRABBITS(j) {while(k<(j)){b|=((uLong)NEXTBYTE)<avail_in-n;c=(k>>3)>3:c;n+=c;p-=c;k-=c<<3;} -+ -+/* Called with number of bytes left to write in window at least 258 -+ (the maximum string length) and number of input bytes available -+ at least ten. The ten bytes are six bytes for the longest length/ -+ distance pair plus four bytes for overloading the bit buffer. 
*/ -+ -+int inflate_fast(bl, bd, tl, td, s, z) -+uInt bl, bd; -+inflate_huft *tl; -+inflate_huft *td; /* need separate declaration for Borland C++ */ -+inflate_blocks_statef *s; -+z_streamp z; -+{ -+ inflate_huft *t; /* temporary pointer */ -+ uInt e; /* extra bits or operation */ -+ uLong b; /* bit buffer */ -+ uInt k; /* bits in bit buffer */ -+ Bytef *p; /* input data pointer */ -+ uInt n; /* bytes available there */ -+ Bytef *q; /* output window write pointer */ -+ uInt m; /* bytes to end of window or read pointer */ -+ uInt ml; /* mask for literal/length tree */ -+ uInt md; /* mask for distance tree */ -+ uInt c; /* bytes to copy */ -+ uInt d; /* distance back to copy from */ -+ Bytef *r; /* copy source pointer */ -+ -+ /* load input, output, bit values */ -+ LOAD -+ -+ /* initialize masks */ -+ ml = inflate_mask[bl]; -+ md = inflate_mask[bd]; -+ -+ /* do until not enough input or output space for fast loop */ -+ do { /* assume called with m >= 258 && n >= 10 */ -+ /* get literal/length code */ -+ GRABBITS(20) /* max bits for literal/length code */ -+ if ((e = (t = tl + ((uInt)b & ml))->exop) == 0) -+ { -+ DUMPBITS(t->bits) -+ Tracevv((stderr, t->base >= 0x20 && t->base < 0x7f ? 
-+ "inflate: * literal '%c'\n" : -+ "inflate: * literal 0x%02x\n", t->base)); -+ *q++ = (Byte)t->base; -+ m--; -+ continue; -+ } -+ do { -+ DUMPBITS(t->bits) -+ if (e & 16) -+ { -+ /* get extra bits for length */ -+ e &= 15; -+ c = t->base + ((uInt)b & inflate_mask[e]); -+ DUMPBITS(e) -+ Tracevv((stderr, "inflate: * length %u\n", c)); -+ -+ /* decode distance base of block to copy */ -+ GRABBITS(15); /* max bits for distance code */ -+ e = (t = td + ((uInt)b & md))->exop; -+ do { -+ DUMPBITS(t->bits) -+ if (e & 16) -+ { -+ /* get extra bits to add to distance base */ -+ e &= 15; -+ GRABBITS(e) /* get extra bits (up to 13) */ -+ d = t->base + ((uInt)b & inflate_mask[e]); -+ DUMPBITS(e) -+ Tracevv((stderr, "inflate: * distance %u\n", d)); -+ -+ /* do the copy */ -+ m -= c; -+ r = q - d; -+ if (r < s->window) /* wrap if needed */ -+ { -+ do { -+ r += s->end - s->window; /* force pointer in window */ -+ } while (r < s->window); /* covers invalid distances */ -+ e = s->end - r; -+ if (c > e) -+ { -+ c -= e; /* wrapped copy */ -+ do { -+ *q++ = *r++; -+ } while (--e); -+ r = s->window; -+ do { -+ *q++ = *r++; -+ } while (--c); -+ } -+ else /* normal copy */ -+ { -+ *q++ = *r++; c--; -+ *q++ = *r++; c--; -+ do { -+ *q++ = *r++; -+ } while (--c); -+ } -+ } -+ else /* normal copy */ -+ { -+ *q++ = *r++; c--; -+ *q++ = *r++; c--; -+ do { -+ *q++ = *r++; -+ } while (--c); -+ } -+ break; -+ } -+ else if ((e & 64) == 0) -+ { -+ t += t->base; -+ e = (t += ((uInt)b & inflate_mask[e]))->exop; -+ } -+ else -+ { -+ z->msg = (char*)"invalid distance code"; -+ UNGRAB -+ UPDATE -+ return Z_DATA_ERROR; -+ } -+ } while (1); -+ break; -+ } -+ if ((e & 64) == 0) -+ { -+ t += t->base; -+ if ((e = (t += ((uInt)b & inflate_mask[e]))->exop) == 0) -+ { -+ DUMPBITS(t->bits) -+ Tracevv((stderr, t->base >= 0x20 && t->base < 0x7f ? 
-+ "inflate: * literal '%c'\n" : -+ "inflate: * literal 0x%02x\n", t->base)); -+ *q++ = (Byte)t->base; -+ m--; -+ break; -+ } -+ } -+ else if (e & 32) -+ { -+ Tracevv((stderr, "inflate: * end of block\n")); -+ UNGRAB -+ UPDATE -+ return Z_STREAM_END; -+ } -+ else -+ { -+ z->msg = (char*)"invalid literal/length code"; -+ UNGRAB -+ UPDATE -+ return Z_DATA_ERROR; -+ } -+ } while (1); -+ } while (m >= 258 && n >= 10); -+ -+ /* not enough input or output--restore pointers and return */ -+ UNGRAB -+ UPDATE -+ return Z_OK; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/inffast.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,22 @@ -+/* inffast.h -- header to use inffast.c -+ * Copyright (C) 1995-2002 Mark Adler -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* WARNING: this file should *not* be used by applications. It is -+ part of the implementation of the compression library and is -+ subject to change. Applications should only use zlib.h. -+ */ -+ -+#ifndef _INFFAST_H -+#define _INFFAST_H -+ -+extern int inflate_fast OF(( -+ uInt, -+ uInt, -+ inflate_huft *, -+ inflate_huft *, -+ inflate_blocks_statef *, -+ z_streamp )); -+ -+#endif /* _INFFAST_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/inffixed.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,151 @@ -+/* inffixed.h -- table for decoding fixed codes -+ * Generated automatically by the maketree.c program -+ */ -+ -+/* WARNING: this file should *not* be used by applications. It is -+ part of the implementation of the compression library and is -+ subject to change. Applications should only use zlib.h. 
-+ */ -+ -+local uInt fixed_bl = 9; -+local uInt fixed_bd = 5; -+local inflate_huft fixed_tl[] = { -+ {{{96,7}},256}, {{{0,8}},80}, {{{0,8}},16}, {{{84,8}},115}, -+ {{{82,7}},31}, {{{0,8}},112}, {{{0,8}},48}, {{{0,9}},192}, -+ {{{80,7}},10}, {{{0,8}},96}, {{{0,8}},32}, {{{0,9}},160}, -+ {{{0,8}},0}, {{{0,8}},128}, {{{0,8}},64}, {{{0,9}},224}, -+ {{{80,7}},6}, {{{0,8}},88}, {{{0,8}},24}, {{{0,9}},144}, -+ {{{83,7}},59}, {{{0,8}},120}, {{{0,8}},56}, {{{0,9}},208}, -+ {{{81,7}},17}, {{{0,8}},104}, {{{0,8}},40}, {{{0,9}},176}, -+ {{{0,8}},8}, {{{0,8}},136}, {{{0,8}},72}, {{{0,9}},240}, -+ {{{80,7}},4}, {{{0,8}},84}, {{{0,8}},20}, {{{85,8}},227}, -+ {{{83,7}},43}, {{{0,8}},116}, {{{0,8}},52}, {{{0,9}},200}, -+ {{{81,7}},13}, {{{0,8}},100}, {{{0,8}},36}, {{{0,9}},168}, -+ {{{0,8}},4}, {{{0,8}},132}, {{{0,8}},68}, {{{0,9}},232}, -+ {{{80,7}},8}, {{{0,8}},92}, {{{0,8}},28}, {{{0,9}},152}, -+ {{{84,7}},83}, {{{0,8}},124}, {{{0,8}},60}, {{{0,9}},216}, -+ {{{82,7}},23}, {{{0,8}},108}, {{{0,8}},44}, {{{0,9}},184}, -+ {{{0,8}},12}, {{{0,8}},140}, {{{0,8}},76}, {{{0,9}},248}, -+ {{{80,7}},3}, {{{0,8}},82}, {{{0,8}},18}, {{{85,8}},163}, -+ {{{83,7}},35}, {{{0,8}},114}, {{{0,8}},50}, {{{0,9}},196}, -+ {{{81,7}},11}, {{{0,8}},98}, {{{0,8}},34}, {{{0,9}},164}, -+ {{{0,8}},2}, {{{0,8}},130}, {{{0,8}},66}, {{{0,9}},228}, -+ {{{80,7}},7}, {{{0,8}},90}, {{{0,8}},26}, {{{0,9}},148}, -+ {{{84,7}},67}, {{{0,8}},122}, {{{0,8}},58}, {{{0,9}},212}, -+ {{{82,7}},19}, {{{0,8}},106}, {{{0,8}},42}, {{{0,9}},180}, -+ {{{0,8}},10}, {{{0,8}},138}, {{{0,8}},74}, {{{0,9}},244}, -+ {{{80,7}},5}, {{{0,8}},86}, {{{0,8}},22}, {{{192,8}},0}, -+ {{{83,7}},51}, {{{0,8}},118}, {{{0,8}},54}, {{{0,9}},204}, -+ {{{81,7}},15}, {{{0,8}},102}, {{{0,8}},38}, {{{0,9}},172}, -+ {{{0,8}},6}, {{{0,8}},134}, {{{0,8}},70}, {{{0,9}},236}, -+ {{{80,7}},9}, {{{0,8}},94}, {{{0,8}},30}, {{{0,9}},156}, -+ {{{84,7}},99}, {{{0,8}},126}, {{{0,8}},62}, {{{0,9}},220}, -+ {{{82,7}},27}, {{{0,8}},110}, {{{0,8}},46}, {{{0,9}},188}, -+ 
{{{0,8}},14}, {{{0,8}},142}, {{{0,8}},78}, {{{0,9}},252}, -+ {{{96,7}},256}, {{{0,8}},81}, {{{0,8}},17}, {{{85,8}},131}, -+ {{{82,7}},31}, {{{0,8}},113}, {{{0,8}},49}, {{{0,9}},194}, -+ {{{80,7}},10}, {{{0,8}},97}, {{{0,8}},33}, {{{0,9}},162}, -+ {{{0,8}},1}, {{{0,8}},129}, {{{0,8}},65}, {{{0,9}},226}, -+ {{{80,7}},6}, {{{0,8}},89}, {{{0,8}},25}, {{{0,9}},146}, -+ {{{83,7}},59}, {{{0,8}},121}, {{{0,8}},57}, {{{0,9}},210}, -+ {{{81,7}},17}, {{{0,8}},105}, {{{0,8}},41}, {{{0,9}},178}, -+ {{{0,8}},9}, {{{0,8}},137}, {{{0,8}},73}, {{{0,9}},242}, -+ {{{80,7}},4}, {{{0,8}},85}, {{{0,8}},21}, {{{80,8}},258}, -+ {{{83,7}},43}, {{{0,8}},117}, {{{0,8}},53}, {{{0,9}},202}, -+ {{{81,7}},13}, {{{0,8}},101}, {{{0,8}},37}, {{{0,9}},170}, -+ {{{0,8}},5}, {{{0,8}},133}, {{{0,8}},69}, {{{0,9}},234}, -+ {{{80,7}},8}, {{{0,8}},93}, {{{0,8}},29}, {{{0,9}},154}, -+ {{{84,7}},83}, {{{0,8}},125}, {{{0,8}},61}, {{{0,9}},218}, -+ {{{82,7}},23}, {{{0,8}},109}, {{{0,8}},45}, {{{0,9}},186}, -+ {{{0,8}},13}, {{{0,8}},141}, {{{0,8}},77}, {{{0,9}},250}, -+ {{{80,7}},3}, {{{0,8}},83}, {{{0,8}},19}, {{{85,8}},195}, -+ {{{83,7}},35}, {{{0,8}},115}, {{{0,8}},51}, {{{0,9}},198}, -+ {{{81,7}},11}, {{{0,8}},99}, {{{0,8}},35}, {{{0,9}},166}, -+ {{{0,8}},3}, {{{0,8}},131}, {{{0,8}},67}, {{{0,9}},230}, -+ {{{80,7}},7}, {{{0,8}},91}, {{{0,8}},27}, {{{0,9}},150}, -+ {{{84,7}},67}, {{{0,8}},123}, {{{0,8}},59}, {{{0,9}},214}, -+ {{{82,7}},19}, {{{0,8}},107}, {{{0,8}},43}, {{{0,9}},182}, -+ {{{0,8}},11}, {{{0,8}},139}, {{{0,8}},75}, {{{0,9}},246}, -+ {{{80,7}},5}, {{{0,8}},87}, {{{0,8}},23}, {{{192,8}},0}, -+ {{{83,7}},51}, {{{0,8}},119}, {{{0,8}},55}, {{{0,9}},206}, -+ {{{81,7}},15}, {{{0,8}},103}, {{{0,8}},39}, {{{0,9}},174}, -+ {{{0,8}},7}, {{{0,8}},135}, {{{0,8}},71}, {{{0,9}},238}, -+ {{{80,7}},9}, {{{0,8}},95}, {{{0,8}},31}, {{{0,9}},158}, -+ {{{84,7}},99}, {{{0,8}},127}, {{{0,8}},63}, {{{0,9}},222}, -+ {{{82,7}},27}, {{{0,8}},111}, {{{0,8}},47}, {{{0,9}},190}, -+ {{{0,8}},15}, {{{0,8}},143}, 
{{{0,8}},79}, {{{0,9}},254}, -+ {{{96,7}},256}, {{{0,8}},80}, {{{0,8}},16}, {{{84,8}},115}, -+ {{{82,7}},31}, {{{0,8}},112}, {{{0,8}},48}, {{{0,9}},193}, -+ {{{80,7}},10}, {{{0,8}},96}, {{{0,8}},32}, {{{0,9}},161}, -+ {{{0,8}},0}, {{{0,8}},128}, {{{0,8}},64}, {{{0,9}},225}, -+ {{{80,7}},6}, {{{0,8}},88}, {{{0,8}},24}, {{{0,9}},145}, -+ {{{83,7}},59}, {{{0,8}},120}, {{{0,8}},56}, {{{0,9}},209}, -+ {{{81,7}},17}, {{{0,8}},104}, {{{0,8}},40}, {{{0,9}},177}, -+ {{{0,8}},8}, {{{0,8}},136}, {{{0,8}},72}, {{{0,9}},241}, -+ {{{80,7}},4}, {{{0,8}},84}, {{{0,8}},20}, {{{85,8}},227}, -+ {{{83,7}},43}, {{{0,8}},116}, {{{0,8}},52}, {{{0,9}},201}, -+ {{{81,7}},13}, {{{0,8}},100}, {{{0,8}},36}, {{{0,9}},169}, -+ {{{0,8}},4}, {{{0,8}},132}, {{{0,8}},68}, {{{0,9}},233}, -+ {{{80,7}},8}, {{{0,8}},92}, {{{0,8}},28}, {{{0,9}},153}, -+ {{{84,7}},83}, {{{0,8}},124}, {{{0,8}},60}, {{{0,9}},217}, -+ {{{82,7}},23}, {{{0,8}},108}, {{{0,8}},44}, {{{0,9}},185}, -+ {{{0,8}},12}, {{{0,8}},140}, {{{0,8}},76}, {{{0,9}},249}, -+ {{{80,7}},3}, {{{0,8}},82}, {{{0,8}},18}, {{{85,8}},163}, -+ {{{83,7}},35}, {{{0,8}},114}, {{{0,8}},50}, {{{0,9}},197}, -+ {{{81,7}},11}, {{{0,8}},98}, {{{0,8}},34}, {{{0,9}},165}, -+ {{{0,8}},2}, {{{0,8}},130}, {{{0,8}},66}, {{{0,9}},229}, -+ {{{80,7}},7}, {{{0,8}},90}, {{{0,8}},26}, {{{0,9}},149}, -+ {{{84,7}},67}, {{{0,8}},122}, {{{0,8}},58}, {{{0,9}},213}, -+ {{{82,7}},19}, {{{0,8}},106}, {{{0,8}},42}, {{{0,9}},181}, -+ {{{0,8}},10}, {{{0,8}},138}, {{{0,8}},74}, {{{0,9}},245}, -+ {{{80,7}},5}, {{{0,8}},86}, {{{0,8}},22}, {{{192,8}},0}, -+ {{{83,7}},51}, {{{0,8}},118}, {{{0,8}},54}, {{{0,9}},205}, -+ {{{81,7}},15}, {{{0,8}},102}, {{{0,8}},38}, {{{0,9}},173}, -+ {{{0,8}},6}, {{{0,8}},134}, {{{0,8}},70}, {{{0,9}},237}, -+ {{{80,7}},9}, {{{0,8}},94}, {{{0,8}},30}, {{{0,9}},157}, -+ {{{84,7}},99}, {{{0,8}},126}, {{{0,8}},62}, {{{0,9}},221}, -+ {{{82,7}},27}, {{{0,8}},110}, {{{0,8}},46}, {{{0,9}},189}, -+ {{{0,8}},14}, {{{0,8}},142}, {{{0,8}},78}, {{{0,9}},253}, -+ 
{{{96,7}},256}, {{{0,8}},81}, {{{0,8}},17}, {{{85,8}},131}, -+ {{{82,7}},31}, {{{0,8}},113}, {{{0,8}},49}, {{{0,9}},195}, -+ {{{80,7}},10}, {{{0,8}},97}, {{{0,8}},33}, {{{0,9}},163}, -+ {{{0,8}},1}, {{{0,8}},129}, {{{0,8}},65}, {{{0,9}},227}, -+ {{{80,7}},6}, {{{0,8}},89}, {{{0,8}},25}, {{{0,9}},147}, -+ {{{83,7}},59}, {{{0,8}},121}, {{{0,8}},57}, {{{0,9}},211}, -+ {{{81,7}},17}, {{{0,8}},105}, {{{0,8}},41}, {{{0,9}},179}, -+ {{{0,8}},9}, {{{0,8}},137}, {{{0,8}},73}, {{{0,9}},243}, -+ {{{80,7}},4}, {{{0,8}},85}, {{{0,8}},21}, {{{80,8}},258}, -+ {{{83,7}},43}, {{{0,8}},117}, {{{0,8}},53}, {{{0,9}},203}, -+ {{{81,7}},13}, {{{0,8}},101}, {{{0,8}},37}, {{{0,9}},171}, -+ {{{0,8}},5}, {{{0,8}},133}, {{{0,8}},69}, {{{0,9}},235}, -+ {{{80,7}},8}, {{{0,8}},93}, {{{0,8}},29}, {{{0,9}},155}, -+ {{{84,7}},83}, {{{0,8}},125}, {{{0,8}},61}, {{{0,9}},219}, -+ {{{82,7}},23}, {{{0,8}},109}, {{{0,8}},45}, {{{0,9}},187}, -+ {{{0,8}},13}, {{{0,8}},141}, {{{0,8}},77}, {{{0,9}},251}, -+ {{{80,7}},3}, {{{0,8}},83}, {{{0,8}},19}, {{{85,8}},195}, -+ {{{83,7}},35}, {{{0,8}},115}, {{{0,8}},51}, {{{0,9}},199}, -+ {{{81,7}},11}, {{{0,8}},99}, {{{0,8}},35}, {{{0,9}},167}, -+ {{{0,8}},3}, {{{0,8}},131}, {{{0,8}},67}, {{{0,9}},231}, -+ {{{80,7}},7}, {{{0,8}},91}, {{{0,8}},27}, {{{0,9}},151}, -+ {{{84,7}},67}, {{{0,8}},123}, {{{0,8}},59}, {{{0,9}},215}, -+ {{{82,7}},19}, {{{0,8}},107}, {{{0,8}},43}, {{{0,9}},183}, -+ {{{0,8}},11}, {{{0,8}},139}, {{{0,8}},75}, {{{0,9}},247}, -+ {{{80,7}},5}, {{{0,8}},87}, {{{0,8}},23}, {{{192,8}},0}, -+ {{{83,7}},51}, {{{0,8}},119}, {{{0,8}},55}, {{{0,9}},207}, -+ {{{81,7}},15}, {{{0,8}},103}, {{{0,8}},39}, {{{0,9}},175}, -+ {{{0,8}},7}, {{{0,8}},135}, {{{0,8}},71}, {{{0,9}},239}, -+ {{{80,7}},9}, {{{0,8}},95}, {{{0,8}},31}, {{{0,9}},159}, -+ {{{84,7}},99}, {{{0,8}},127}, {{{0,8}},63}, {{{0,9}},223}, -+ {{{82,7}},27}, {{{0,8}},111}, {{{0,8}},47}, {{{0,9}},191}, -+ {{{0,8}},15}, {{{0,8}},143}, {{{0,8}},79}, {{{0,9}},255} -+ }; -+local inflate_huft fixed_td[] = { -+ 
{{{80,5}},1}, {{{87,5}},257}, {{{83,5}},17}, {{{91,5}},4097}, -+ {{{81,5}},5}, {{{89,5}},1025}, {{{85,5}},65}, {{{93,5}},16385}, -+ {{{80,5}},3}, {{{88,5}},513}, {{{84,5}},33}, {{{92,5}},8193}, -+ {{{82,5}},9}, {{{90,5}},2049}, {{{86,5}},129}, {{{192,5}},24577}, -+ {{{80,5}},2}, {{{87,5}},385}, {{{83,5}},25}, {{{91,5}},6145}, -+ {{{81,5}},7}, {{{89,5}},1537}, {{{85,5}},97}, {{{93,5}},24577}, -+ {{{80,5}},4}, {{{88,5}},769}, {{{84,5}},49}, {{{92,5}},12289}, -+ {{{82,5}},13}, {{{90,5}},3073}, {{{86,5}},193}, {{{192,5}},24577} -+ }; ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/inflate.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,368 @@ -+/* inflate.c -- zlib interface to inflate modules -+ * Copyright (C) 1995-2002 Mark Adler -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+#include -+#include "infblock.h" -+ -+struct inflate_blocks_state {int dummy;}; /* for buggy compilers */ -+ -+typedef enum { -+ METHOD, /* waiting for method byte */ -+ FLAG, /* waiting for flag byte */ -+ DICT4, /* four dictionary check bytes to go */ -+ DICT3, /* three dictionary check bytes to go */ -+ DICT2, /* two dictionary check bytes to go */ -+ DICT1, /* one dictionary check byte to go */ -+ DICT0, /* waiting for inflateSetDictionary */ -+ BLOCKS, /* decompressing blocks */ -+ CHECK4, /* four check bytes to go */ -+ CHECK3, /* three check bytes to go */ -+ CHECK2, /* two check bytes to go */ -+ CHECK1, /* one check byte to go */ -+ DONE, /* finished check, done */ -+ BAD} /* got an error--stay here */ -+inflate_mode; -+ -+/* inflate private state */ -+struct internal_state { -+ -+ /* mode */ -+ inflate_mode mode; /* current inflate mode */ -+ -+ /* mode dependent information */ -+ union { -+ uInt method; /* if FLAGS, method byte */ -+ struct { -+ uLong was; /* computed check value */ -+ uLong need; /* stream check value */ -+ } check; /* if CHECK, check values to compare */ -+ uInt marker; /* if BAD, inflateSync's marker bytes count */ -+ } 
sub; /* submode */ -+ -+ /* mode independent information */ -+ int nowrap; /* flag for no wrapper */ -+ uInt wbits; /* log2(window size) (8..15, defaults to 15) */ -+ inflate_blocks_statef -+ *blocks; /* current inflate_blocks state */ -+ -+}; -+ -+ -+int ZEXPORT inflateReset(z) -+z_streamp z; -+{ -+ if (z == Z_NULL || z->state == Z_NULL) -+ return Z_STREAM_ERROR; -+ z->total_in = z->total_out = 0; -+ z->msg = Z_NULL; -+ z->state->mode = z->state->nowrap ? BLOCKS : METHOD; -+ inflate_blocks_reset(z->state->blocks, z, Z_NULL); -+ Tracev((stderr, "inflate: reset\n")); -+ return Z_OK; -+} -+ -+ -+int ZEXPORT inflateEnd(z) -+z_streamp z; -+{ -+ if (z == Z_NULL || z->state == Z_NULL || z->zfree == Z_NULL) -+ return Z_STREAM_ERROR; -+ if (z->state->blocks != Z_NULL) -+ inflate_blocks_free(z->state->blocks, z); -+ ZFREE(z, z->state); -+ z->state = Z_NULL; -+ Tracev((stderr, "inflate: end\n")); -+ return Z_OK; -+} -+ -+ -+int ZEXPORT inflateInit2_(z, w, version, stream_size) -+z_streamp z; -+int w; -+const char *version; -+int stream_size; -+{ -+ if (version == Z_NULL || version[0] != ZLIB_VERSION[0] || -+ stream_size != sizeof(z_stream)) -+ return Z_VERSION_ERROR; -+ -+ /* initialize state */ -+ if (z == Z_NULL) -+ return Z_STREAM_ERROR; -+ z->msg = Z_NULL; -+ if (z->zalloc == Z_NULL) -+ { -+ return Z_STREAM_ERROR; -+/* z->zalloc = zcalloc; -+ z->opaque = (voidpf)0; -+*/ -+ } -+ if (z->zfree == Z_NULL) return Z_STREAM_ERROR; /* z->zfree = zcfree; */ -+ if ((z->state = (struct internal_state FAR *) -+ ZALLOC(z,1,sizeof(struct internal_state))) == Z_NULL) -+ return Z_MEM_ERROR; -+ z->state->blocks = Z_NULL; -+ -+ /* handle undocumented nowrap option (no zlib header or check) */ -+ z->state->nowrap = 0; -+ if (w < 0) -+ { -+ w = - w; -+ z->state->nowrap = 1; -+ } -+ -+ /* set window size */ -+ if (w < 8 || w > 15) -+ { -+ inflateEnd(z); -+ return Z_STREAM_ERROR; -+ } -+ z->state->wbits = (uInt)w; -+ -+ /* create inflate_blocks state */ -+ if ((z->state->blocks = -+ 
inflate_blocks_new(z, z->state->nowrap ? Z_NULL : adler32, (uInt)1 << w)) -+ == Z_NULL) -+ { -+ inflateEnd(z); -+ return Z_MEM_ERROR; -+ } -+ Tracev((stderr, "inflate: allocated\n")); -+ -+ /* reset state */ -+ inflateReset(z); -+ return Z_OK; -+} -+ -+ -+int ZEXPORT inflateInit_(z, version, stream_size) -+z_streamp z; -+const char *version; -+int stream_size; -+{ -+ return inflateInit2_(z, DEF_WBITS, version, stream_size); -+} -+ -+ -+#define NEEDBYTE {if(z->avail_in==0)return r;r=f;} -+#define NEXTBYTE (z->avail_in--,z->total_in++,*z->next_in++) -+ -+int ZEXPORT inflate(z, f) -+z_streamp z; -+int f; -+{ -+ int r; -+ uInt b; -+ -+ if (z == Z_NULL || z->state == Z_NULL || z->next_in == Z_NULL) -+ return Z_STREAM_ERROR; -+ f = f == Z_FINISH ? Z_BUF_ERROR : Z_OK; -+ r = Z_BUF_ERROR; -+ while (1) switch (z->state->mode) -+ { -+ case METHOD: -+ NEEDBYTE -+ if (((z->state->sub.method = NEXTBYTE) & 0xf) != Z_DEFLATED) -+ { -+ z->state->mode = BAD; -+ z->msg = (char*)"unknown compression method"; -+ z->state->sub.marker = 5; /* can't try inflateSync */ -+ break; -+ } -+ if ((z->state->sub.method >> 4) + 8 > z->state->wbits) -+ { -+ z->state->mode = BAD; -+ z->msg = (char*)"invalid window size"; -+ z->state->sub.marker = 5; /* can't try inflateSync */ -+ break; -+ } -+ z->state->mode = FLAG; -+ case FLAG: -+ NEEDBYTE -+ b = NEXTBYTE; -+ if (((z->state->sub.method << 8) + b) % 31) -+ { -+ z->state->mode = BAD; -+ z->msg = (char*)"incorrect header check"; -+ z->state->sub.marker = 5; /* can't try inflateSync */ -+ break; -+ } -+ Tracev((stderr, "inflate: zlib header ok\n")); -+ if (!(b & PRESET_DICT)) -+ { -+ z->state->mode = BLOCKS; -+ break; -+ } -+ z->state->mode = DICT4; -+ case DICT4: -+ NEEDBYTE -+ z->state->sub.check.need = (uLong)NEXTBYTE << 24; -+ z->state->mode = DICT3; -+ case DICT3: -+ NEEDBYTE -+ z->state->sub.check.need += (uLong)NEXTBYTE << 16; -+ z->state->mode = DICT2; -+ case DICT2: -+ NEEDBYTE -+ z->state->sub.check.need += (uLong)NEXTBYTE << 8; -+ 
z->state->mode = DICT1; -+ case DICT1: -+ NEEDBYTE -+ z->state->sub.check.need += (uLong)NEXTBYTE; -+ z->adler = z->state->sub.check.need; -+ z->state->mode = DICT0; -+ return Z_NEED_DICT; -+ case DICT0: -+ z->state->mode = BAD; -+ z->msg = (char*)"need dictionary"; -+ z->state->sub.marker = 0; /* can try inflateSync */ -+ return Z_STREAM_ERROR; -+ case BLOCKS: -+ r = inflate_blocks(z->state->blocks, z, r); -+ if (r == Z_DATA_ERROR) -+ { -+ z->state->mode = BAD; -+ z->state->sub.marker = 0; /* can try inflateSync */ -+ break; -+ } -+ if (r == Z_OK) -+ r = f; -+ if (r != Z_STREAM_END) -+ return r; -+ r = f; -+ inflate_blocks_reset(z->state->blocks, z, &z->state->sub.check.was); -+ if (z->state->nowrap) -+ { -+ z->state->mode = DONE; -+ break; -+ } -+ z->state->mode = CHECK4; -+ case CHECK4: -+ NEEDBYTE -+ z->state->sub.check.need = (uLong)NEXTBYTE << 24; -+ z->state->mode = CHECK3; -+ case CHECK3: -+ NEEDBYTE -+ z->state->sub.check.need += (uLong)NEXTBYTE << 16; -+ z->state->mode = CHECK2; -+ case CHECK2: -+ NEEDBYTE -+ z->state->sub.check.need += (uLong)NEXTBYTE << 8; -+ z->state->mode = CHECK1; -+ case CHECK1: -+ NEEDBYTE -+ z->state->sub.check.need += (uLong)NEXTBYTE; -+ -+ if (z->state->sub.check.was != z->state->sub.check.need) -+ { -+ z->state->mode = BAD; -+ z->msg = (char*)"incorrect data check"; -+ z->state->sub.marker = 5; /* can't try inflateSync */ -+ break; -+ } -+ Tracev((stderr, "inflate: zlib check ok\n")); -+ z->state->mode = DONE; -+ case DONE: -+ return Z_STREAM_END; -+ case BAD: -+ return Z_DATA_ERROR; -+ default: -+ return Z_STREAM_ERROR; -+ } -+#ifdef NEED_DUMMY_RETURN -+ return Z_STREAM_ERROR; /* Some dumb compilers complain without this */ -+#endif -+} -+ -+ -+int ZEXPORT inflateSetDictionary(z, dictionary, dictLength) -+z_streamp z; -+const Bytef *dictionary; -+uInt dictLength; -+{ -+ uInt length = dictLength; -+ -+ if (z == Z_NULL || z->state == Z_NULL || z->state->mode != DICT0) -+ return Z_STREAM_ERROR; -+ -+ if (adler32(1L, dictionary, 
dictLength) != z->adler) return Z_DATA_ERROR; -+ z->adler = 1L; -+ -+ if (length >= ((uInt)1<state->wbits)) -+ { -+ length = (1<state->wbits)-1; -+ dictionary += dictLength - length; -+ } -+ inflate_set_dictionary(z->state->blocks, dictionary, length); -+ z->state->mode = BLOCKS; -+ return Z_OK; -+} -+ -+ -+int ZEXPORT inflateSync(z) -+z_streamp z; -+{ -+ uInt n; /* number of bytes to look at */ -+ Bytef *p; /* pointer to bytes */ -+ uInt m; /* number of marker bytes found in a row */ -+ uLong r, w; /* temporaries to save total_in and total_out */ -+ -+ /* set up */ -+ if (z == Z_NULL || z->state == Z_NULL) -+ return Z_STREAM_ERROR; -+ if (z->state->mode != BAD) -+ { -+ z->state->mode = BAD; -+ z->state->sub.marker = 0; -+ } -+ if ((n = z->avail_in) == 0) -+ return Z_BUF_ERROR; -+ p = z->next_in; -+ m = z->state->sub.marker; -+ -+ /* search */ -+ while (n && m < 4) -+ { -+ static const Byte mark[4] = {0, 0, 0xff, 0xff}; -+ if (*p == mark[m]) -+ m++; -+ else if (*p) -+ m = 0; -+ else -+ m = 4 - m; -+ p++, n--; -+ } -+ -+ /* restore */ -+ z->total_in += p - z->next_in; -+ z->next_in = p; -+ z->avail_in = n; -+ z->state->sub.marker = m; -+ -+ /* return no joy or set up to restart on a new block */ -+ if (m != 4) -+ return Z_DATA_ERROR; -+ r = z->total_in; w = z->total_out; -+ inflateReset(z); -+ z->total_in = r; z->total_out = w; -+ z->state->mode = BLOCKS; -+ return Z_OK; -+} -+ -+ -+/* Returns true if inflate is currently at the end of a block generated -+ * by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP -+ * implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH -+ * but removes the length bytes of the resulting empty stored block. When -+ * decompressing, PPP checks that at the end of input packet, inflate is -+ * waiting for these length bytes. 
-+ */ -+int ZEXPORT inflateSyncPoint(z) -+z_streamp z; -+{ -+ if (z == Z_NULL || z->state == Z_NULL || z->state->blocks == Z_NULL) -+ return Z_STREAM_ERROR; -+ return inflate_blocks_sync_point(z->state->blocks); -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/inftrees.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,454 @@ -+/* inftrees.c -- generate Huffman trees for efficient decoding -+ * Copyright (C) 1995-2002 Mark Adler -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+#include -+#include "inftrees.h" -+ -+#if !defined(BUILDFIXED) && !defined(STDC) -+# define BUILDFIXED /* non ANSI compilers may not accept inffixed.h */ -+#endif -+ -+local const char inflate_copyright[] = -+ " inflate 1.1.4 Copyright 1995-2002 Mark Adler "; -+/* -+ If you use the zlib library in a product, an acknowledgment is welcome -+ in the documentation of your product. If for some reason you cannot -+ include such an acknowledgment, I would appreciate that you keep this -+ copyright string in the executable of your product. -+ */ -+struct internal_state {int dummy;}; /* for buggy compilers */ -+ -+/* simplify the use of the inflate_huft type with some defines */ -+#define exop word.what.Exop -+#define bits word.what.Bits -+ -+ -+local int huft_build OF(( -+ uIntf *, /* code lengths in bits */ -+ uInt, /* number of codes */ -+ uInt, /* number of "simple" codes */ -+ const uIntf *, /* list of base values for non-simple codes */ -+ const uIntf *, /* list of extra bits for non-simple codes */ -+ inflate_huft * FAR*,/* result: starting table */ -+ uIntf *, /* maximum lookup bits (returns actual) */ -+ inflate_huft *, /* space for trees */ -+ uInt *, /* hufts used in space */ -+ uIntf * )); /* space for values */ -+ -+/* Tables for deflate from PKZIP's appnote.txt. 
*/ -+local const uInt cplens[31] = { /* Copy lengths for literal codes 257..285 */ -+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, -+ 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; -+ /* see note #13 above about 258 */ -+local const uInt cplext[31] = { /* Extra bits for literal codes 257..285 */ -+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, -+ 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 112, 112}; /* 112==invalid */ -+local const uInt cpdist[30] = { /* Copy offsets for distance codes 0..29 */ -+ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, -+ 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, -+ 8193, 12289, 16385, 24577}; -+local const uInt cpdext[30] = { /* Extra bits for distance codes */ -+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, -+ 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, -+ 12, 12, 13, 13}; -+ -+/* -+ Huffman code decoding is performed using a multi-level table lookup. -+ The fastest way to decode is to simply build a lookup table whose -+ size is determined by the longest code. However, the time it takes -+ to build this table can also be a factor if the data being decoded -+ is not very long. The most common codes are necessarily the -+ shortest codes, so those codes dominate the decoding time, and hence -+ the speed. The idea is you can have a shorter table that decodes the -+ shorter, more probable codes, and then point to subsidiary tables for -+ the longer codes. The time it costs to decode the longer codes is -+ then traded against the time it takes to make longer tables. -+ -+ This results of this trade are in the variables lbits and dbits -+ below. lbits is the number of bits the first level table for literal/ -+ length codes can decode in one step, and dbits is the same thing for -+ the distance codes. Subsequent tables are also less than or equal to -+ those sizes. 
These values may be adjusted either when all of the -+ codes are shorter than that, in which case the longest code length in -+ bits is used, or when the shortest code is *longer* than the requested -+ table size, in which case the length of the shortest code in bits is -+ used. -+ -+ There are two different values for the two tables, since they code a -+ different number of possibilities each. The literal/length table -+ codes 286 possible values, or in a flat code, a little over eight -+ bits. The distance table codes 30 possible values, or a little less -+ than five bits, flat. The optimum values for speed end up being -+ about one bit more than those, so lbits is 8+1 and dbits is 5+1. -+ The optimum values may differ though from machine to machine, and -+ possibly even between compilers. Your mileage may vary. -+ */ -+ -+ -+/* If BMAX needs to be larger than 16, then h and x[] should be uLong. */ -+#define BMAX 15 /* maximum bit length of any code */ -+ -+local int huft_build(b, n, s, d, e, t, m, hp, hn, v) -+uIntf *b; /* code lengths in bits (all assumed <= BMAX) */ -+uInt n; /* number of codes (assumed <= 288) */ -+uInt s; /* number of simple-valued codes (0..s-1) */ -+const uIntf *d; /* list of base values for non-simple codes */ -+const uIntf *e; /* list of extra bits for non-simple codes */ -+inflate_huft * FAR *t; /* result: starting table */ -+uIntf *m; /* maximum lookup bits, returns actual */ -+inflate_huft *hp; /* space for trees */ -+uInt *hn; /* hufts used in space */ -+uIntf *v; /* working area: values in order of bit length */ -+/* Given a list of code lengths and a maximum table size, make a set of -+ tables to decode that set of codes. Return Z_OK on success, Z_BUF_ERROR -+ if the given code set is incomplete (the tables are still built in this -+ case), or Z_DATA_ERROR if the input is invalid. 
*/ -+{ -+ -+ uInt a; /* counter for codes of length k */ -+ uInt c[BMAX+1]; /* bit length count table */ -+ uInt f; /* i repeats in table every f entries */ -+ int g; /* maximum code length */ -+ int h; /* table level */ -+ register uInt i; /* counter, current code */ -+ register uInt j; /* counter */ -+ register int k; /* number of bits in current code */ -+ int l; /* bits per table (returned in m) */ -+ uInt mask; /* (1 << w) - 1, to avoid cc -O bug on HP */ -+ register uIntf *p; /* pointer into c[], b[], or v[] */ -+ inflate_huft *q; /* points to current table */ -+ struct inflate_huft_s r; /* table entry for structure assignment */ -+ inflate_huft *u[BMAX]; /* table stack */ -+ register int w; /* bits before this table == (l * h) */ -+ uInt x[BMAX+1]; /* bit offsets, then code stack */ -+ uIntf *xp; /* pointer into x */ -+ int y; /* number of dummy codes added */ -+ uInt z; /* number of entries in current table */ -+ -+ -+ /* Generate counts for each bit length */ -+ p = c; -+#define C0 *p++ = 0; -+#define C2 C0 C0 C0 C0 -+#define C4 C2 C2 C2 C2 -+ C4 /* clear c[]--assume BMAX+1 is 16 */ -+ p = b; i = n; -+ do { -+ c[*p++]++; /* assume all entries <= BMAX */ -+ } while (--i); -+ if (c[0] == n) /* null input--all zero length codes */ -+ { -+ *t = (inflate_huft *)Z_NULL; -+ *m = 0; -+ return Z_OK; -+ } -+ -+ -+ /* Find minimum and maximum length, bound *m by those */ -+ l = *m; -+ for (j = 1; j <= BMAX; j++) -+ if (c[j]) -+ break; -+ k = j; /* minimum code length */ -+ if ((uInt)l < j) -+ l = j; -+ for (i = BMAX; i; i--) -+ if (c[i]) -+ break; -+ g = i; /* maximum code length */ -+ if ((uInt)l > i) -+ l = i; -+ *m = l; -+ -+ -+ /* Adjust last length count to fill out codes, if needed */ -+ for (y = 1 << j; j < i; j++, y <<= 1) -+ if ((y -= c[j]) < 0) -+ return Z_DATA_ERROR; -+ if ((y -= c[i]) < 0) -+ return Z_DATA_ERROR; -+ c[i] += y; -+ -+ -+ /* Generate starting offsets into the value table for each length */ -+ x[1] = j = 0; -+ p = c + 1; xp = x + 2; -+ while 
(--i) { /* note that i == g from above */ -+ *xp++ = (j += *p++); -+ } -+ -+ -+ /* Make a table of values in order of bit lengths */ -+ p = b; i = 0; -+ do { -+ if ((j = *p++) != 0) -+ v[x[j]++] = i; -+ } while (++i < n); -+ n = x[g]; /* set n to length of v */ -+ -+ -+ /* Generate the Huffman codes and for each, make the table entries */ -+ x[0] = i = 0; /* first Huffman code is zero */ -+ p = v; /* grab values in bit order */ -+ h = -1; /* no tables yet--level -1 */ -+ w = -l; /* bits decoded == (l * h) */ -+ u[0] = (inflate_huft *)Z_NULL; /* just to keep compilers happy */ -+ q = (inflate_huft *)Z_NULL; /* ditto */ -+ z = 0; /* ditto */ -+ -+ /* go through the bit lengths (k already is bits in shortest code) */ -+ for (; k <= g; k++) -+ { -+ a = c[k]; -+ while (a--) -+ { -+ /* here i is the Huffman code of length k bits for value *p */ -+ /* make tables up to required level */ -+ while (k > w + l) -+ { -+ h++; -+ w += l; /* previous table always l bits */ -+ -+ /* compute minimum size table less than or equal to l bits */ -+ z = g - w; -+ z = z > (uInt)l ? 
l : z; /* table size upper limit */ -+ if ((f = 1 << (j = k - w)) > a + 1) /* try a k-w bit table */ -+ { /* too few codes for k-w bit table */ -+ f -= a + 1; /* deduct codes from patterns left */ -+ xp = c + k; -+ if (j < z) -+ while (++j < z) /* try smaller tables up to z bits */ -+ { -+ if ((f <<= 1) <= *++xp) -+ break; /* enough codes to use up j bits */ -+ f -= *xp; /* else deduct codes from patterns */ -+ } -+ } -+ z = 1 << j; /* table entries for j-bit table */ -+ -+ /* allocate new table */ -+ if (*hn + z > MANY) /* (note: doesn't matter for fixed) */ -+ return Z_DATA_ERROR; /* overflow of MANY */ -+ u[h] = q = hp + *hn; -+ *hn += z; -+ -+ /* connect to last table, if there is one */ -+ if (h) -+ { -+ x[h] = i; /* save pattern for backing up */ -+ r.bits = (Byte)l; /* bits to dump before this table */ -+ r.exop = (Byte)j; /* bits in this table */ -+ j = i >> (w - l); -+ r.base = (uInt)(q - u[h-1] - j); /* offset to this table */ -+ u[h-1][j] = r; /* connect to last table */ -+ } -+ else -+ *t = q; /* first table is returned result */ -+ } -+ -+ /* set up table entry in r */ -+ r.bits = (Byte)(k - w); -+ if (p >= v + n) -+ r.exop = 128 + 64; /* out of values--invalid code */ -+ else if (*p < s) -+ { -+ r.exop = (Byte)(*p < 256 ? 
0 : 32 + 64); /* 256 is end-of-block */ -+ r.base = *p++; /* simple code is just the value */ -+ } -+ else -+ { -+ r.exop = (Byte)(e[*p - s] + 16 + 64);/* non-simple--look up in lists */ -+ r.base = d[*p++ - s]; -+ } -+ -+ /* fill code-like entries with r */ -+ f = 1 << (k - w); -+ for (j = i >> w; j < z; j += f) -+ q[j] = r; -+ -+ /* backwards increment the k-bit code i */ -+ for (j = 1 << (k - 1); i & j; j >>= 1) -+ i ^= j; -+ i ^= j; -+ -+ /* backup over finished tables */ -+ mask = (1 << w) - 1; /* needed on HP, cc -O bug */ -+ while ((i & mask) != x[h]) -+ { -+ h--; /* don't need to update q */ -+ w -= l; -+ mask = (1 << w) - 1; -+ } -+ } -+ } -+ -+ -+ /* Return Z_BUF_ERROR if we were given an incomplete table */ -+ return y != 0 && g != 1 ? Z_BUF_ERROR : Z_OK; -+} -+ -+ -+int inflate_trees_bits(c, bb, tb, hp, z) -+uIntf *c; /* 19 code lengths */ -+uIntf *bb; /* bits tree desired/actual depth */ -+inflate_huft * FAR *tb; /* bits tree result */ -+inflate_huft *hp; /* space for trees */ -+z_streamp z; /* for messages */ -+{ -+ int r; -+ uInt hn = 0; /* hufts used in space */ -+ uIntf *v; /* work area for huft_build */ -+ -+ if ((v = (uIntf*)ZALLOC(z, 19, sizeof(uInt))) == Z_NULL) -+ return Z_MEM_ERROR; -+ r = huft_build(c, 19, 19, (uIntf*)Z_NULL, (uIntf*)Z_NULL, -+ tb, bb, hp, &hn, v); -+ if (r == Z_DATA_ERROR) -+ z->msg = (char*)"oversubscribed dynamic bit lengths tree"; -+ else if (r == Z_BUF_ERROR || *bb == 0) -+ { -+ z->msg = (char*)"incomplete dynamic bit lengths tree"; -+ r = Z_DATA_ERROR; -+ } -+ ZFREE(z, v); -+ return r; -+} -+ -+ -+int inflate_trees_dynamic(nl, nd, c, bl, bd, tl, td, hp, z) -+uInt nl; /* number of literal/length codes */ -+uInt nd; /* number of distance codes */ -+uIntf *c; /* that many (total) code lengths */ -+uIntf *bl; /* literal desired/actual bit depth */ -+uIntf *bd; /* distance desired/actual bit depth */ -+inflate_huft * FAR *tl; /* literal/length tree result */ -+inflate_huft * FAR *td; /* distance tree result */ 
-+inflate_huft *hp; /* space for trees */ -+z_streamp z; /* for messages */ -+{ -+ int r; -+ uInt hn = 0; /* hufts used in space */ -+ uIntf *v; /* work area for huft_build */ -+ -+ /* allocate work area */ -+ if ((v = (uIntf*)ZALLOC(z, 288, sizeof(uInt))) == Z_NULL) -+ return Z_MEM_ERROR; -+ -+ /* build literal/length tree */ -+ r = huft_build(c, nl, 257, cplens, cplext, tl, bl, hp, &hn, v); -+ if (r != Z_OK || *bl == 0) -+ { -+ if (r == Z_DATA_ERROR) -+ z->msg = (char*)"oversubscribed literal/length tree"; -+ else if (r != Z_MEM_ERROR) -+ { -+ z->msg = (char*)"incomplete literal/length tree"; -+ r = Z_DATA_ERROR; -+ } -+ ZFREE(z, v); -+ return r; -+ } -+ -+ /* build distance tree */ -+ r = huft_build(c + nl, nd, 0, cpdist, cpdext, td, bd, hp, &hn, v); -+ if (r != Z_OK || (*bd == 0 && nl > 257)) -+ { -+ if (r == Z_DATA_ERROR) -+ z->msg = (char*)"oversubscribed distance tree"; -+ else if (r == Z_BUF_ERROR) { -+#ifdef PKZIP_BUG_WORKAROUND -+ r = Z_OK; -+ } -+#else -+ z->msg = (char*)"incomplete distance tree"; -+ r = Z_DATA_ERROR; -+ } -+ else if (r != Z_MEM_ERROR) -+ { -+ z->msg = (char*)"empty distance tree with lengths"; -+ r = Z_DATA_ERROR; -+ } -+ ZFREE(z, v); -+ return r; -+#endif -+ } -+ -+ /* done */ -+ ZFREE(z, v); -+ return Z_OK; -+} -+ -+ -+/* build fixed tables only once--keep them here */ -+#ifdef BUILDFIXED -+local int fixed_built = 0; -+#define FIXEDH 544 /* number of hufts used by fixed tables */ -+local inflate_huft fixed_mem[FIXEDH]; -+local uInt fixed_bl; -+local uInt fixed_bd; -+local inflate_huft *fixed_tl; -+local inflate_huft *fixed_td; -+#else -+#include "inffixed.h" -+#endif -+ -+ -+int inflate_trees_fixed(bl, bd, tl, td, z) -+uIntf *bl; /* literal desired/actual bit depth */ -+uIntf *bd; /* distance desired/actual bit depth */ -+inflate_huft * FAR *tl; /* literal/length tree result */ -+inflate_huft * FAR *td; /* distance tree result */ -+z_streamp z; /* for memory allocation */ -+{ -+#ifdef BUILDFIXED -+ /* build fixed tables if not 
already */ -+ if (!fixed_built) -+ { -+ int k; /* temporary variable */ -+ uInt f = 0; /* number of hufts used in fixed_mem */ -+ uIntf *c; /* length list for huft_build */ -+ uIntf *v; /* work area for huft_build */ -+ -+ /* allocate memory */ -+ if ((c = (uIntf*)ZALLOC(z, 288, sizeof(uInt))) == Z_NULL) -+ return Z_MEM_ERROR; -+ if ((v = (uIntf*)ZALLOC(z, 288, sizeof(uInt))) == Z_NULL) -+ { -+ ZFREE(z, c); -+ return Z_MEM_ERROR; -+ } -+ -+ /* literal table */ -+ for (k = 0; k < 144; k++) -+ c[k] = 8; -+ for (; k < 256; k++) -+ c[k] = 9; -+ for (; k < 280; k++) -+ c[k] = 7; -+ for (; k < 288; k++) -+ c[k] = 8; -+ fixed_bl = 9; -+ huft_build(c, 288, 257, cplens, cplext, &fixed_tl, &fixed_bl, -+ fixed_mem, &f, v); -+ -+ /* distance table */ -+ for (k = 0; k < 30; k++) -+ c[k] = 5; -+ fixed_bd = 5; -+ huft_build(c, 30, 0, cpdist, cpdext, &fixed_td, &fixed_bd, -+ fixed_mem, &f, v); -+ -+ /* done */ -+ ZFREE(z, v); -+ ZFREE(z, c); -+ fixed_built = 1; -+ } -+#endif -+ *bl = fixed_bl; -+ *bd = fixed_bd; -+ *tl = fixed_tl; -+ *td = fixed_td; -+ return Z_OK; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/inftrees.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,63 @@ -+/* inftrees.h -- header to use inftrees.c -+ * Copyright (C) 1995-2002 Mark Adler -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* WARNING: this file should *not* be used by applications. It is -+ part of the implementation of the compression library and is -+ subject to change. Applications should only use zlib.h. -+ */ -+ -+/* Huffman code lookup table entry--this entry is four bytes for machines -+ that have 16-bit pointers (e.g. PC's in the small or medium model). 
*/ -+ -+#ifndef _INFTREES_H -+#define _INFTREES_H -+ -+typedef struct inflate_huft_s FAR inflate_huft; -+ -+struct inflate_huft_s { -+ union { -+ struct { -+ Byte Exop; /* number of extra bits or operation */ -+ Byte Bits; /* number of bits in this code or subcode */ -+ } what; -+ uInt pad; /* pad structure to a power of 2 (4 bytes for */ -+ } word; /* 16-bit, 8 bytes for 32-bit int's) */ -+ uInt base; /* literal, length base, distance base, -+ or table offset */ -+}; -+ -+/* Maximum size of dynamic tree. The maximum found in a long but non- -+ exhaustive search was 1004 huft structures (850 for length/literals -+ and 154 for distances, the latter actually the result of an -+ exhaustive search). The actual maximum is not known, but the -+ value below is more than safe. */ -+#define MANY 1440 -+ -+extern int inflate_trees_bits OF(( -+ uIntf *, /* 19 code lengths */ -+ uIntf *, /* bits tree desired/actual depth */ -+ inflate_huft * FAR *, /* bits tree result */ -+ inflate_huft *, /* space for trees */ -+ z_streamp)); /* for messages */ -+ -+extern int inflate_trees_dynamic OF(( -+ uInt, /* number of literal/length codes */ -+ uInt, /* number of distance codes */ -+ uIntf *, /* that many (total) code lengths */ -+ uIntf *, /* literal desired/actual bit depth */ -+ uIntf *, /* distance desired/actual bit depth */ -+ inflate_huft * FAR *, /* literal/length tree result */ -+ inflate_huft * FAR *, /* distance tree result */ -+ inflate_huft *, /* space for trees */ -+ z_streamp)); /* for messages */ -+ -+extern int inflate_trees_fixed OF(( -+ uIntf *, /* literal desired/actual bit depth */ -+ uIntf *, /* distance desired/actual bit depth */ -+ inflate_huft * FAR *, /* literal/length tree result */ -+ inflate_huft * FAR *, /* distance tree result */ -+ z_streamp)); /* for memory allocation */ -+ -+#endif /* _INFTREES_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/infutil.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,87 @@ -+/* inflate_util.c -- data and routines 
common to blocks and codes -+ * Copyright (C) 1995-2002 Mark Adler -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+#include -+#include "infblock.h" -+#include "inftrees.h" -+#include "infcodes.h" -+#include "infutil.h" -+ -+struct inflate_codes_state {int dummy;}; /* for buggy compilers */ -+ -+/* And'ing with mask[n] masks the lower n bits */ -+uInt inflate_mask[17] = { -+ 0x0000, -+ 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, -+ 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff -+}; -+ -+ -+/* copy as much as possible from the sliding window to the output area */ -+int inflate_flush(s, z, r) -+inflate_blocks_statef *s; -+z_streamp z; -+int r; -+{ -+ uInt n; -+ Bytef *p; -+ Bytef *q; -+ -+ /* local copies of source and destination pointers */ -+ p = z->next_out; -+ q = s->read; -+ -+ /* compute number of bytes to copy as far as end of window */ -+ n = (uInt)((q <= s->write ? s->write : s->end) - q); -+ if (n > z->avail_out) n = z->avail_out; -+ if (n && r == Z_BUF_ERROR) r = Z_OK; -+ -+ /* update counters */ -+ z->avail_out -= n; -+ z->total_out += n; -+ -+ /* update check information */ -+ if (s->checkfn != Z_NULL) -+ z->adler = s->check = (*s->checkfn)(s->check, q, n); -+ -+ /* copy as far as end of window */ -+ zmemcpy(p, q, n); -+ p += n; -+ q += n; -+ -+ /* see if more to copy at beginning of window */ -+ if (q == s->end) -+ { -+ /* wrap pointers */ -+ q = s->window; -+ if (s->write == s->end) -+ s->write = s->window; -+ -+ /* compute bytes to copy */ -+ n = (uInt)(s->write - q); -+ if (n > z->avail_out) n = z->avail_out; -+ if (n && r == Z_BUF_ERROR) r = Z_OK; -+ -+ /* update counters */ -+ z->avail_out -= n; -+ z->total_out += n; -+ -+ /* update check information */ -+ if (s->checkfn != Z_NULL) -+ z->adler = s->check = (*s->checkfn)(s->check, q, n); -+ -+ /* copy */ -+ zmemcpy(p, q, n); -+ p += n; -+ q += n; -+ } -+ -+ /* update pointers */ -+ z->next_out = p; -+ s->read = q; -+ -+ /* 
done */ -+ return r; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/infutil.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,98 @@ -+/* infutil.h -- types and macros common to blocks and codes -+ * Copyright (C) 1995-2002 Mark Adler -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* WARNING: this file should *not* be used by applications. It is -+ part of the implementation of the compression library and is -+ subject to change. Applications should only use zlib.h. -+ */ -+ -+#ifndef _INFUTIL_H -+#define _INFUTIL_H -+ -+typedef enum { -+ TYPE, /* get type bits (3, including end bit) */ -+ LENS, /* get lengths for stored */ -+ STORED, /* processing stored block */ -+ TABLE, /* get table lengths */ -+ BTREE, /* get bit lengths tree for a dynamic block */ -+ DTREE, /* get length, distance trees for a dynamic block */ -+ CODES, /* processing fixed or dynamic block */ -+ DRY, /* output remaining window bytes */ -+ DONE, /* finished last block, done */ -+ BAD} /* got a data error--stuck here */ -+inflate_block_mode; -+ -+/* inflate blocks semi-private state */ -+struct inflate_blocks_state { -+ -+ /* mode */ -+ inflate_block_mode mode; /* current inflate_block mode */ -+ -+ /* mode dependent information */ -+ union { -+ uInt left; /* if STORED, bytes left to copy */ -+ struct { -+ uInt table; /* table lengths (14 bits) */ -+ uInt index; /* index into blens (or border) */ -+ uIntf *blens; /* bit lengths of codes */ -+ uInt bb; /* bit length tree depth */ -+ inflate_huft *tb; /* bit length decoding tree */ -+ } trees; /* if DTREE, decoding info for trees */ -+ struct { -+ inflate_codes_statef -+ *codes; -+ } decode; /* if CODES, current state */ -+ } sub; /* submode */ -+ uInt last; /* true if this block is the last block */ -+ -+ /* mode independent information */ -+ uInt bitk; /* bits in bit buffer */ -+ uLong bitb; /* bit buffer */ -+ inflate_huft *hufts; /* single malloc for tree space */ -+ Bytef *window; /* sliding window 
*/ -+ Bytef *end; /* one byte after sliding window */ -+ Bytef *read; /* window read pointer */ -+ Bytef *write; /* window write pointer */ -+ check_func checkfn; /* check function */ -+ uLong check; /* check on output */ -+ -+}; -+ -+ -+/* defines for inflate input/output */ -+/* update pointers and return */ -+#define UPDBITS {s->bitb=b;s->bitk=k;} -+#define UPDIN {z->avail_in=n;z->total_in+=p-z->next_in;z->next_in=p;} -+#define UPDOUT {s->write=q;} -+#define UPDATE {UPDBITS UPDIN UPDOUT} -+#define LEAVE {UPDATE return inflate_flush(s,z,r);} -+/* get bytes and bits */ -+#define LOADIN {p=z->next_in;n=z->avail_in;b=s->bitb;k=s->bitk;} -+#define NEEDBYTE {if(n)r=Z_OK;else LEAVE} -+#define NEXTBYTE (n--,*p++) -+#define NEEDBITS(j) {while(k<(j)){NEEDBYTE;b|=((uLong)NEXTBYTE)<>=(j);k-=(j);} -+/* output bytes */ -+#define WAVAIL (uInt)(qread?s->read-q-1:s->end-q) -+#define LOADOUT {q=s->write;m=(uInt)WAVAIL;} -+#define WRAP {if(q==s->end&&s->read!=s->window){q=s->window;m=(uInt)WAVAIL;}} -+#define FLUSH {UPDOUT r=inflate_flush(s,z,r); LOADOUT} -+#define NEEDOUT {if(m==0){WRAP if(m==0){FLUSH WRAP if(m==0) LEAVE}}r=Z_OK;} -+#define OUTBYTE(a) {*q++=(Byte)(a);m--;} -+/* load local pointers */ -+#define LOAD {LOADIN LOADOUT} -+ -+/* masks for lower bits (size given to avoid silly warnings with Visual C++) */ -+extern uInt inflate_mask[17]; -+ -+/* copy as much as possible from the sliding window to the output area */ -+extern int inflate_flush OF(( -+ inflate_blocks_statef *, -+ z_streamp , -+ int)); -+ -+struct internal_state {int dummy;}; /* for buggy compilers */ -+ -+#endif /* _INFUTIL_H */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/initaddr.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,85 @@ -+/* -+ * initialize address structure -+ * Copyright (C) 2000 Henry Spencer. 
-+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: initaddr.c,v 1.6 2004/07/10 07:43:47 mcr Exp $ -+ */ -+#include "openswan.h" -+ -+err_t -+add_port(af, addr, port) -+int af; -+ip_address *addr; -+unsigned short port; -+{ -+ switch (af) { -+ case AF_INET: -+ addr->u.v4.sin_port = port; -+ break; -+ case AF_INET6: -+ addr->u.v6.sin6_port = port; -+ break; -+ default: -+ return "unknown address family in add_port"; -+ break; -+ } -+ return NULL; -+} -+ -+/* -+ - initaddr - initialize ip_address from bytes -+ */ -+err_t /* NULL for success, else string literal */ -+initaddr(src, srclen, af, dst) -+const unsigned char *src; -+size_t srclen; -+int af; /* address family */ -+ip_address *dst; -+{ -+ switch (af) { -+ case AF_INET: -+ if (srclen != 4) -+ return "IPv4 address must be exactly 4 bytes"; -+#if !defined(__KERNEL__) -+ /* On BSD, the kernel compares the entire struct sockaddr when -+ * using bind(). However, this is as large as the largest -+ * address family, so the 'remainder' has to be 0. Linux -+ * compares interface addresses with the length of sa_len, -+ * instead of sizeof(struct sockaddr), so in that case padding -+ * is not needed. 
-+ * -+ * Patch by Stefan Arentz -+ */ -+ bzero(&dst->u.v4, sizeof(dst->u.v4)); -+#endif -+ dst->u.v4.sin_family = af; -+ dst->u.v4.sin_port = 0; -+ memcpy((char *)&dst->u.v4.sin_addr.s_addr, src, srclen); -+ break; -+ case AF_INET6: -+ if (srclen != 16) -+ return "IPv6 address must be exactly 16 bytes"; -+#if !defined(__KERNEL__) -+ bzero(&dst->u.v6, sizeof(dst->u.v6)); -+#endif -+ dst->u.v6.sin6_family = af; -+ dst->u.v6.sin6_flowinfo = 0; /* unused */ -+ dst->u.v6.sin6_port = 0; -+ memcpy((char *)&dst->u.v6.sin6_addr, src, srclen); -+ break; -+ default: -+ return "unknown address family in initaddr"; -+ break; -+ } -+ return NULL; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipcomp.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,694 @@ -+/* -+ * IPCOMP zlib interface code. -+ * implementation of RFC 3173. -+ * -+ * Copyright (C) 2000 Svenning Soerensen -+ * Copyright (C) 2000, 2001 Richard Guy Briggs -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ */ -+ -+/* SSS */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+ -+#define __NO_VERSION__ -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include -+#include -+#include -+#include -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+#include -+#include -+ -+#include -+ -+#include -+ -+#include "openswan/ipsec_kern24.h" -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_sa.h" -+ -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_rcv.h" /* sysctl_ipsec_inbound_policy_check */ -+#include "openswan/ipsec_proto.h" -+#include "openswan/ipcomp.h" -+#include "zlib/zlib.h" -+#include "zlib/zutil.h" -+ -+#include /* SADB_X_CALG_DEFLATE */ -+ -+static -+struct sk_buff *skb_copy_ipcomp(struct sk_buff *skb, int data_growth, int gfp_mask); -+ -+static -+voidpf my_zcalloc(voidpf opaque, uInt items, uInt size) -+{ -+ return (voidpf) kmalloc(items*size, GFP_ATOMIC); -+} -+ -+static -+void my_zfree(voidpf opaque, voidpf address) -+{ -+ kfree(address); -+} -+ -+/* -+ * We use this function because sometimes we want to pass a negative offset -+ * into skb_put(), this does not work on 64bit platforms because long to -+ * unsigned int casting. 
-+ */ -+static inline unsigned char * -+safe_skb_put(struct sk_buff *skb, int extend) -+{ -+ unsigned char *ptr; -+ -+ if (extend>0) { -+ // increase the size of the packet -+ ptr = skb_put(skb, extend); -+ } else { -+ // shrink the size of the packet -+ ptr = skb_tail_pointer(skb); -+ skb_trim (skb, skb->len + extend); -+ } -+ -+ return ptr; -+} -+ -+struct sk_buff *skb_compress(struct sk_buff *skb, struct ipsec_sa *ips, unsigned int *flags) -+{ -+ struct iphdr *iph; -+ unsigned int iphlen, pyldsz, cpyldsz; -+ unsigned char *buffer; -+ z_stream zs; -+ int zresult; -+ -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_compress: .\n"); -+ -+ if(skb == NULL) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_compress: " -+ "passed in NULL skb, returning ERROR.\n"); -+ if(flags != NULL) { -+ *flags |= IPCOMP_PARMERROR; -+ } -+ return skb; -+ } -+ -+ if(ips == NULL) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_compress: " -+ "passed in NULL ipsec_sa needed for cpi, returning ERROR.\n"); -+ if(flags) { -+ *flags |= IPCOMP_PARMERROR; -+ } -+ return skb; -+ } -+ -+ if (flags == NULL) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_compress: " -+ "passed in NULL flags, returning ERROR.\n"); -+ ipsec_kfree_skb(skb); -+ return NULL; -+ } -+ -+#ifdef NET_21 -+ iph = ip_hdr(skb); -+#else /* NET_21 */ -+ iph = skb->ip_hdr; -+#endif /* NET_21 */ -+ -+ switch (iph->protocol) { -+ case IPPROTO_COMP: -+ case IPPROTO_AH: -+ case IPPROTO_ESP: -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_compress: " -+ "skipping compression of packet with ip protocol %d.\n", -+ iph->protocol); -+ *flags |= IPCOMP_UNCOMPRESSABLE; -+ return skb; -+ } -+ -+ /* Don't compress packets already fragmented */ -+ if (iph->frag_off & __constant_htons(IP_MF | IP_OFFSET)) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_compress: " -+ "skipping compression of fragmented packet.\n"); -+ *flags |= IPCOMP_UNCOMPRESSABLE; -+ 
return skb; -+ } -+ -+ iphlen = iph->ihl << 2; -+ pyldsz = ntohs(iph->tot_len) - iphlen; -+ -+ /* Don't compress less than 90 bytes (rfc 2394) */ -+ if (pyldsz < 90) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_compress: " -+ "skipping compression of tiny packet, len=%d.\n", -+ pyldsz); -+ *flags |= IPCOMP_UNCOMPRESSABLE; -+ return skb; -+ } -+ -+ /* Adaptive decision */ -+ if (ips->ips_comp_adapt_skip) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_compress: " -+ "skipping compression: ips_comp_adapt_skip=%d.\n", -+ ips->ips_comp_adapt_skip); -+ ips->ips_comp_adapt_skip--; -+ *flags |= IPCOMP_UNCOMPRESSABLE; -+ return skb; -+ } -+ -+ zs.zalloc = my_zcalloc; -+ zs.zfree = my_zfree; -+ zs.opaque = 0; -+ -+ /* We want to use deflateInit2 because we don't want the adler -+ header. */ -+ zresult = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -11, -+ DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY); -+ if (zresult != Z_OK) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_error:skb_compress: " -+ "deflateInit2() returned error %d (%s), " -+ "skipping compression.\n", -+ zresult, -+ zs.msg ? zs.msg : zError(zresult)); -+ *flags |= IPCOMP_COMPRESSIONERROR; -+ return skb; -+ } -+ -+ -+ /* Max output size. Result should be max this size. -+ * Implementation specific tweak: -+ * If it's not at least 32 bytes and 6.25% smaller than -+ * the original packet, it's probably not worth wasting -+ * the receiver's CPU cycles decompressing it. -+ * Your mileage may vary. -+ */ -+ cpyldsz = pyldsz - sizeof(struct ipcomphdr) - (pyldsz <= 512 ? 
32 : pyldsz >> 4); -+ -+ buffer = kmalloc(cpyldsz, GFP_ATOMIC); -+ if (!buffer) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_error:skb_compress: " -+ "unable to kmalloc(%d, GFP_ATOMIC), " -+ "skipping compression.\n", -+ cpyldsz); -+ *flags |= IPCOMP_COMPRESSIONERROR; -+ deflateEnd(&zs); -+ return skb; -+ } -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if(sysctl_ipsec_debug_ipcomp && sysctl_ipsec_debug_verbose) { -+ __u8 *c; -+ -+ c = (__u8*)iph + iphlen; -+ ipsec_dmp_block("compress before", c, pyldsz); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ zs.next_in = (char *) iph + iphlen; /* start of payload */ -+ zs.avail_in = pyldsz; -+ zs.next_out = buffer; /* start of compressed payload */ -+ zs.avail_out = cpyldsz; -+ -+ /* Finish compression in one step */ -+ zresult = deflate(&zs, Z_FINISH); -+ -+ /* Free all dynamically allocated buffers */ -+ deflateEnd(&zs); -+ if (zresult != Z_STREAM_END) { -+ *flags |= IPCOMP_UNCOMPRESSABLE; -+ kfree(buffer); -+ -+ /* Adjust adaptive counters */ -+ if (++(ips->ips_comp_adapt_tries) == IPCOMP_ADAPT_INITIAL_TRIES) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_compress: " -+ "first %d packets didn't compress, " -+ "skipping next %d\n", -+ IPCOMP_ADAPT_INITIAL_TRIES, -+ IPCOMP_ADAPT_INITIAL_SKIP); -+ ips->ips_comp_adapt_skip = IPCOMP_ADAPT_INITIAL_SKIP; -+ } -+ else if (ips->ips_comp_adapt_tries == IPCOMP_ADAPT_INITIAL_TRIES + IPCOMP_ADAPT_SUBSEQ_TRIES) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_compress: " -+ "next %d packets didn't compress, " -+ "skipping next %d\n", -+ IPCOMP_ADAPT_SUBSEQ_TRIES, -+ IPCOMP_ADAPT_SUBSEQ_SKIP); -+ ips->ips_comp_adapt_skip = IPCOMP_ADAPT_SUBSEQ_SKIP; -+ ips->ips_comp_adapt_tries = IPCOMP_ADAPT_INITIAL_TRIES; -+ } -+ -+ return skb; -+ } -+ -+ /* resulting compressed size */ -+ cpyldsz -= zs.avail_out; -+ -+ /* Insert IPCOMP header */ -+ ((struct ipcomphdr*) ((char*) iph + iphlen))->ipcomp_nh = iph->protocol; -+ ((struct ipcomphdr*) ((char*) iph + 
iphlen))->ipcomp_flags = 0; -+ /* use the bottom 16 bits of the spi for the cpi. The top 16 bits are -+ for internal reference only. */ -+ ((struct ipcomphdr*) (((char*)iph) + iphlen))->ipcomp_cpi = htons((__u16)(ntohl(ips->ips_said.spi) & 0x0000ffff)); -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_compress: " -+ "spi=%08x, spi&0xffff=%04x, cpi=%04x, payload size: raw=%d, comp=%d.\n", -+ ntohl(ips->ips_said.spi), -+ ntohl(ips->ips_said.spi) & 0x0000ffff, -+ ntohs(((struct ipcomphdr*)(((char*)iph)+iphlen))->ipcomp_cpi), -+ pyldsz, -+ cpyldsz); -+ -+ /* Update IP header */ -+ iph->protocol = IPPROTO_COMP; -+ iph->tot_len = htons(iphlen + sizeof(struct ipcomphdr) + cpyldsz); -+#if 1 /* XXX checksum is done by ipsec_tunnel ? */ -+ iph->check = 0; -+ iph->check = ip_fast_csum((char *) iph, iph->ihl); -+#endif -+ -+ /* Copy compressed payload */ -+ memcpy((char *) iph + iphlen + sizeof(struct ipcomphdr), -+ buffer, -+ cpyldsz); -+ kfree(buffer); -+ -+ /* Update skb length/tail by "unputting" the shrinkage */ -+ safe_skb_put (skb, cpyldsz + sizeof(struct ipcomphdr) - pyldsz); -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if(sysctl_ipsec_debug_ipcomp && sysctl_ipsec_debug_verbose) { -+ __u8 *c; -+ -+ c = (__u8*)iph + iphlen + sizeof(struct ipcomphdr); -+ ipsec_dmp_block("compress result", c, cpyldsz); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ ips->ips_comp_adapt_skip = 0; -+ ips->ips_comp_adapt_tries = 0; -+ -+ return skb; -+} -+ -+struct sk_buff *skb_decompress(struct sk_buff *skb, struct ipsec_sa *ips, unsigned int *flags) -+{ -+ struct sk_buff *nskb = NULL; -+ -+ /* original ip header */ -+ struct iphdr *oiph, *iph; -+ unsigned int iphlen, pyldsz, cpyldsz; -+ z_stream zs; -+ int zresult; -+ -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_decompress: .\n"); -+ -+ if(!skb) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_error:skb_decompress: " -+ "passed in NULL skb, returning ERROR.\n"); -+ if (flags) *flags |= IPCOMP_PARMERROR; -+ return skb; 
-+ } -+ -+ if(!ips && sysctl_ipsec_inbound_policy_check) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_error:skb_decompress: " -+ "passed in NULL ipsec_sa needed for comp alg, returning ERROR.\n"); -+ if (flags) *flags |= IPCOMP_PARMERROR; -+ return skb; -+ } -+ -+ if (!flags) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_error:skb_decompress: " -+ "passed in NULL flags, returning ERROR.\n"); -+ ipsec_kfree_skb(skb); -+ return NULL; -+ } -+ -+#ifdef NET_21 -+ oiph = ip_hdr(skb); -+#else /* NET_21 */ -+ oiph = skb->ip_hdr; -+#endif /* NET_21 */ -+ -+ iphlen = oiph->ihl << 2; -+ -+ if (oiph->protocol != IPPROTO_COMP) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_error:skb_decompress: " -+ "called with non-IPCOMP packet (protocol=%d)," -+ "skipping decompression.\n", -+ oiph->protocol); -+ *flags |= IPCOMP_PARMERROR; -+ return skb; -+ } -+ -+ if ( (((struct ipcomphdr*)((char*) oiph + iphlen))->ipcomp_flags != 0) -+ || ((((struct ipcomphdr*) ((char*) oiph + iphlen))->ipcomp_cpi -+ != htons(SADB_X_CALG_DEFLATE)) -+ && sysctl_ipsec_inbound_policy_check -+ && (!ips || (ips && (ips->ips_encalg != SADB_X_CALG_DEFLATE)))) ) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_error:skb_decompress: " -+ "called with incompatible IPCOMP packet (flags=%d, " -+ "cpi=%d), ips-compalg=%d, skipping decompression.\n", -+ ntohs(((struct ipcomphdr*) ((char*) oiph + iphlen))->ipcomp_flags), -+ ntohs(((struct ipcomphdr*) ((char*) oiph + iphlen))->ipcomp_cpi), -+ ips ? 
ips->ips_encalg : 0); -+ *flags |= IPCOMP_PARMERROR; -+ -+ return skb; -+ } -+ -+ if (ntohs(oiph->frag_off) & ~0x4000) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_error:skb_decompress: " -+ "called with fragmented IPCOMP packet, " -+ "skipping decompression.\n"); -+ *flags |= IPCOMP_PARMERROR; -+ return skb; -+ } -+ -+ /* original compressed payload size */ -+ cpyldsz = ntohs(oiph->tot_len) - iphlen - sizeof(struct ipcomphdr); -+ -+ zs.zalloc = my_zcalloc; -+ zs.zfree = my_zfree; -+ zs.opaque = 0; -+ -+ zs.next_in = (char *) oiph + iphlen + sizeof(struct ipcomphdr); -+ zs.avail_in = cpyldsz; -+ -+ /* Maybe we should be a bit conservative about memory -+ requirements and use inflateInit2 */ -+ /* Beware, that this might make us unable to decompress packets -+ from other implementations - HINT: check PGPnet source code */ -+ /* We want to use inflateInit2 because we don't want the adler -+ header. */ -+ zresult = inflateInit2(&zs, -15); -+ if (zresult != Z_OK) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_error:skb_decompress: " -+ "inflateInit2() returned error %d (%s), " -+ "skipping decompression.\n", -+ zresult, -+ zs.msg ? zs.msg : zError(zresult)); -+ *flags |= IPCOMP_DECOMPRESSIONERROR; -+ -+ return skb; -+ } -+ -+ /* We have no way of knowing the exact length of the resulting -+ decompressed output before we have actually done the decompression. -+ For now, we guess that the packet will not be bigger than the -+ attached ipsec device's mtu or 16260, whichever is biggest. -+ This may be wrong, since the sender's mtu may be bigger yet. -+ XXX This must be dealt with later XXX -+ */ -+ -+ /* max payload size */ -+ pyldsz = skb->dev ? (skb->dev->mtu < 16260 ? 
16260 : skb->dev->mtu) -+ : (65520 - iphlen); -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_decompress: " -+ "max payload size: %d\n", pyldsz); -+ -+ while (pyldsz > (cpyldsz + sizeof(struct ipcomphdr)) && -+ (nskb = skb_copy_ipcomp(skb, -+ pyldsz - cpyldsz - sizeof(struct ipcomphdr), -+ GFP_ATOMIC)) == NULL) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_error:skb_decompress: " -+ "unable to skb_copy_ipcomp(skb, %d, GFP_ATOMIC), " -+ "trying with less payload size.\n", -+ (int)(pyldsz - cpyldsz - sizeof(struct ipcomphdr))); -+ pyldsz >>=1; -+ } -+ -+ if (!nskb) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_error:skb_decompress: " -+ "unable to allocate memory, dropping packet.\n"); -+ *flags |= IPCOMP_DECOMPRESSIONERROR; -+ inflateEnd(&zs); -+ -+ return skb; -+ } -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if(sysctl_ipsec_debug_ipcomp && sysctl_ipsec_debug_verbose) { -+ __u8 *c; -+ -+ c = (__u8*)oiph + iphlen + sizeof(struct ipcomphdr); -+ ipsec_dmp_block("decompress before", c, cpyldsz); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+#ifdef NET_21 -+ iph = ip_hdr(nskb); -+#else /* NET_21 */ -+ iph = nskb->ip_hdr; -+#endif /* NET_21 */ -+ zs.next_out = (char *)iph + iphlen; -+ zs.avail_out = pyldsz; -+ -+ zresult = inflate(&zs, Z_SYNC_FLUSH); -+ -+ /* work around a bug in zlib, which sometimes wants to taste an extra -+ * byte when being used in the (undocumented) raw deflate mode. -+ */ -+ if (zresult == Z_OK && !zs.avail_in && zs.avail_out) { -+ __u8 zerostuff = 0; -+ -+ zs.next_in = &zerostuff; -+ zs.avail_in = 1; -+ zresult = inflate(&zs, Z_FINISH); -+ } -+ -+ inflateEnd(&zs); -+ if (zresult != Z_STREAM_END) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_error:skb_decompress: " -+ "inflate() returned error %d (%s), " -+ "skipping decompression.\n", -+ zresult, -+ zs.msg ? 
zs.msg : zError(zresult)); -+ *flags |= IPCOMP_DECOMPRESSIONERROR; -+ ipsec_kfree_skb(nskb); -+ -+ return skb; -+ } -+ -+ /* Update IP header */ -+ /* resulting decompressed size */ -+ pyldsz -= zs.avail_out; -+ iph->tot_len = htons(iphlen + pyldsz); -+ iph->protocol = ((struct ipcomphdr*) ((char*) oiph + iphlen))->ipcomp_nh; -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_decompress: " -+ "spi=%08x, spi&0xffff=%04x, cpi=%04x, payload size: comp=%d, raw=%d, nh=%d.\n", -+ ips ? ntohl(ips->ips_said.spi) : 0, -+ ips ? ntohl(ips->ips_said.spi) & 0x0000ffff : 0, -+ ntohs(((struct ipcomphdr*)(((char*)oiph)+iphlen))->ipcomp_cpi), -+ cpyldsz, -+ pyldsz, -+ iph->protocol); -+ -+#if 1 /* XXX checksum is done by ipsec_rcv ? */ -+ iph->check = 0; -+ iph->check = ip_fast_csum((char*) iph, iph->ihl); -+#endif -+ -+ /* Update skb length/tail by "unputting" the unused data area */ -+ safe_skb_put(nskb, -zs.avail_out); -+ -+ ipsec_kfree_skb(skb); -+ -+ if (iph->protocol == IPPROTO_COMP) -+ { -+#ifdef CONFIG_KLIPS_DEBUG -+ if(sysctl_ipsec_debug_ipcomp) -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_decompress: " -+ "Eh? inner packet is also compressed, dropping.\n"); -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ ipsec_kfree_skb(nskb); -+ return NULL; -+ } -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if(sysctl_ipsec_debug_ipcomp && sysctl_ipsec_debug_verbose) { -+ __u8 *c; -+ -+ c = (__u8*)iph + iphlen; -+ ipsec_dmp_block("decompress result", c, pyldsz); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ return nskb; -+} -+ -+ -+/* this is derived from skb_copy() in linux 2.2.14 */ -+/* May be incompatible with other kernel versions!! 
*/ -+static -+struct sk_buff *skb_copy_ipcomp(struct sk_buff *skb, int data_growth, int gfp_mask) -+{ -+ struct sk_buff *n; -+ struct iphdr *iph; -+ unsigned long offset; -+ unsigned int iphlen; -+ -+ if(!skb) { -+ KLIPS_PRINT(sysctl_ipsec_debug_ipcomp, -+ "klips_debug:skb_copy_ipcomp: " -+ "passed in NULL skb, returning NULL.\n"); -+ return NULL; -+ } -+ -+ /* -+ * Allocate the copy buffer -+ */ -+ -+#ifdef NET_21 -+ iph = ip_hdr(skb); -+#else /* NET_21 */ -+ iph = skb->ip_hdr; -+#endif /* NET_21 */ -+ if (!iph) return NULL; -+ iphlen = iph->ihl << 2; -+ -+ n=alloc_skb(skb_end_pointer(skb) - skb->head + data_growth, gfp_mask); -+ if(n==NULL) -+ return NULL; -+ -+ /* -+ * Shift between the two data areas in bytes -+ */ -+ -+ offset=n->head-skb->head; -+ -+ /* Set the data pointer */ -+ skb_reserve(n,skb->data-skb->head); -+ /* Set the tail pointer and length */ -+ safe_skb_put(n,skb->len+data_growth); -+ /* Copy the bytes up to and including the ip header */ -+ memcpy(n->head, -+ skb->head, -+ ((char *)iph - (char *)skb->head) + iphlen); -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) -+ n->list=NULL; -+#endif -+ n->next=NULL; -+ n->prev=NULL; -+ n->sk=NULL; -+ n->dev=skb->dev; -+ if (skb_transport_header(skb)) -+ skb_set_transport_header(n, offset); -+ n->protocol=skb->protocol; -+#ifdef NET_21 -+ n->csum = 0; -+ n->priority=skb->priority; -+ n->dst=dst_clone(skb->dst); -+ skb_set_network_header(n, offset); -+#ifndef NETDEV_23 -+ n->is_clone=0; -+#endif /* NETDEV_23 */ -+ atomic_set(&n->users, 1); -+ n->destructor = NULL; -+#ifdef HAVE_SOCK_SECURITY -+ n->security=skb->security; -+#endif -+ memcpy(n->cb, skb->cb, sizeof(skb->cb)); -+#ifdef CONFIG_IP_FIREWALL -+ n->fwmark = skb->fwmark; -+#endif -+#else /* NET_21 */ -+ n->link3=NULL; -+ n->when=skb->when; -+ n->ip_hdr=(struct iphdr *)(((char *)skb->ip_hdr)+offset); -+ n->saddr=skb->saddr; -+ n->daddr=skb->daddr; -+ n->raddr=skb->raddr; -+ n->seq=skb->seq; -+ n->end_seq=skb->end_seq; -+ n->ack_seq=skb->ack_seq; -+ 
n->acked=skb->acked; -+ n->free=1; -+ n->arp=skb->arp; -+ n->tries=0; -+ n->lock=0; -+ n->users=0; -+ memcpy(n->proto_priv, skb->proto_priv, sizeof(skb->proto_priv)); -+#endif /* NET_21 */ -+ if (skb_mac_header(skb)) -+ skb_set_mac_header(n, offset); -+#ifndef NETDEV_23 -+ n->used=skb->used; -+#endif /* !NETDEV_23 */ -+ n->pkt_type=skb->pkt_type; -+#ifndef NETDEV_23 -+ n->pkt_bridged=skb->pkt_bridged; -+#endif /* NETDEV_23 */ -+ n->ip_summed=0; -+#ifdef HAVE_TSTAMP -+ n->tstamp = skb->tstamp; -+#else -+ n->stamp=skb->stamp; -+#endif -+#ifndef NETDEV_23 /* this seems to have been removed in 2.4 */ -+#if defined(CONFIG_SHAPER) || defined(CONFIG_SHAPER_MODULE) -+ n->shapelatency=skb->shapelatency; /* Latency on frame */ -+ n->shapeclock=skb->shapeclock; /* Time it should go out */ -+ n->shapelen=skb->shapelen; /* Frame length in clocks */ -+ n->shapestamp=skb->shapestamp; /* Stamp for shaper */ -+ n->shapepend=skb->shapepend; /* Pending */ -+#endif /* defined(CONFIG_SHAPER) || defined(CONFIG_SHAPER_MODULE) */ -+#endif /* NETDEV_23 */ -+ -+ return n; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_ah.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,369 @@ -+/* -+ * processing code for AH -+ * Copyright (C) 2003-2004 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+ -+#define __NO_VERSION__ -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+#include -+#ifdef SPINLOCK -+# ifdef SPINLOCK_23 -+# include /* *lock* */ -+# else /* SPINLOCK_23 */ -+# include /* *lock* */ -+# endif /* SPINLOCK_23 */ -+#endif /* SPINLOCK */ -+ -+#include -+#include -+ -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_sa.h" -+ -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_rcv.h" -+#include "openswan/ipsec_xmit.h" -+ -+#include "openswan/ipsec_auth.h" -+#include "openswan/ipsec_ah.h" -+#include "openswan/ipsec_proto.h" -+ -+#include "ipsec_ocf.h" -+ -+__u32 zeroes[AH_AMAX]; -+ -+enum ipsec_rcv_value -+ipsec_rcv_ah_checks(struct ipsec_rcv_state *irs, -+ struct sk_buff *skb) -+{ -+ int ahminlen; -+ -+ ahminlen = irs->hard_header_len + sizeof(struct iphdr); -+ -+ /* take care not to deref this pointer until we check the minlen though */ -+ irs->protostuff.ahstuff.ahp = (struct ahhdr *)skb_transport_header(skb); -+ -+ if((skb->len < ahminlen+sizeof(struct ahhdr)) || -+ (skb->len < ahminlen+(irs->protostuff.ahstuff.ahp->ah_hl << 2))) { -+ KLIPS_PRINT(debug_rcv & DB_RX_INAU, -+ "klips_debug:ipsec_rcv: " -+ "runt ah packet of skb->len=%d received from %s, dropped.\n", -+ skb->len, -+ irs->ipsaddr_txt); -+ if(irs->stats) { -+ irs->stats->rx_errors++; -+ } -+ return IPSEC_RCV_BADLEN; -+ } -+ -+ irs->said.spi = irs->protostuff.ahstuff.ahp->ah_spi; -+ -+ /* XXX we only support the one 12-byte authenticator 
for now */ -+ if(irs->protostuff.ahstuff.ahp->ah_hl != ((AHHMAC_HASHLEN+AHHMAC_RPLLEN) >> 2)) { -+ KLIPS_PRINT(debug_rcv & DB_RX_INAU, -+ "klips_debug:ipsec_rcv: " -+ "bad authenticator length %ld, expected %lu from %s.\n", -+ (long)(irs->protostuff.ahstuff.ahp->ah_hl << 2), -+ (unsigned long) sizeof(struct ahhdr), -+ irs->ipsaddr_txt); -+ if(irs->stats) { -+ irs->stats->rx_errors++; -+ } -+ return IPSEC_RCV_BADLEN; -+ } -+ -+ return IPSEC_RCV_OK; -+} -+ -+ -+enum ipsec_rcv_value -+ipsec_rcv_ah_setup_auth(struct ipsec_rcv_state *irs, -+ struct sk_buff *skb, -+ __u32 *replay, -+ unsigned char **authenticator) -+{ -+ struct ahhdr *ahp = irs->protostuff.ahstuff.ahp; -+ -+ *replay = ntohl(ahp->ah_rpl); -+ *authenticator = ahp->ah_data; -+ -+ return IPSEC_RCV_OK; -+} -+ -+enum ipsec_rcv_value -+ipsec_rcv_ah_authcalc(struct ipsec_rcv_state *irs, -+ struct sk_buff *skb) -+{ -+ struct auth_alg *aa; -+ struct ahhdr *ahp = irs->protostuff.ahstuff.ahp; -+ union { -+ MD5_CTX md5; -+ SHA1_CTX sha1; -+ } tctx; -+ struct iphdr ipo; -+ int ahhlen; -+ -+#ifdef CONFIG_KLIPS_OCF -+ if (irs->ipsp->ocf_in_use) -+ return(ipsec_ocf_rcv(irs)); -+#endif -+ -+ aa = irs->authfuncs; -+ -+ /* copy the initialized keying material */ -+ memcpy(&tctx, irs->ictx, irs->ictx_len); -+ -+ ipo = *irs->ipp; -+ ipo.tos = 0; /* mutable RFC 2402 3.3.3.1.1.1 */ -+ ipo.frag_off = 0; -+ ipo.ttl = 0; -+ ipo.check = 0; -+ -+ -+ /* do the sanitized header */ -+ (*aa->update)((void*)&tctx, (caddr_t)&ipo, sizeof(struct iphdr)); -+ -+ /* XXX we didn't do the options here! 
*/ -+ -+ /* now do the AH header itself */ -+ ahhlen = AH_BASIC_LEN + (ahp->ah_hl << 2); -+ (*aa->update)((void*)&tctx, (caddr_t)ahp, ahhlen - AHHMAC_HASHLEN); -+ -+ /* now, do some zeroes */ -+ (*aa->update)((void*)&tctx, (caddr_t)zeroes, AHHMAC_HASHLEN); -+ -+ /* finally, do the packet contents themselves */ -+ (*aa->update)((void*)&tctx, -+ (caddr_t)skb_transport_header(skb) + ahhlen, -+ skb->len - ahhlen); -+ -+ (*aa->final)(irs->hash, (void *)&tctx); -+ -+ memcpy(&tctx, irs->octx, irs->octx_len); -+ -+ (*aa->update)((void *)&tctx, irs->hash, aa->hashlen); -+ (*aa->final)(irs->hash, (void *)&tctx); -+ -+ return IPSEC_RCV_OK; -+} -+ -+enum ipsec_rcv_value -+ipsec_rcv_ah_decap(struct ipsec_rcv_state *irs) -+{ -+ struct ahhdr *ahp = irs->protostuff.ahstuff.ahp; -+ struct sk_buff *skb; -+ int ahhlen; -+ -+ skb=irs->skb; -+ -+ ahhlen = AH_BASIC_LEN + (ahp->ah_hl << 2); -+ -+ irs->ipp->tot_len = htons(ntohs(irs->ipp->tot_len) - ahhlen); -+ irs->next_header = ahp->ah_nh; -+ -+ /* -+ * move the IP header forward by the size of the AH header, which -+ * will remove the the AH header from the packet. -+ */ -+ memmove((void *)(skb_network_header(skb) + ahhlen), -+ (void *)(skb_network_header(skb)), irs->iphlen); -+ -+ ipsec_rcv_dmp("ah postmove", skb->data, skb->len); -+ -+ /* skb_pull below, will move up by ahhlen */ -+ -+ /* XXX not clear how this can happen, as the message indicates */ -+ if(skb->len < ahhlen) { -+ printk(KERN_WARNING -+ "klips_error:ipsec_rcv: " -+ "tried to skb_pull ahhlen=%d, %d available. 
This should never happen, please report.\n", -+ ahhlen, -+ (int)(skb->len)); -+ return IPSEC_RCV_DECAPFAIL; -+ } -+ skb_pull(skb, ahhlen); -+ -+ skb_set_network_header(skb, ahhlen); -+ irs->ipp = ip_hdr(skb); -+ -+ ipsec_rcv_dmp("ah postpull", (void *)ip_hdr(skb), skb->len); -+ -+ return IPSEC_RCV_OK; -+} -+ -+enum ipsec_xmit_value -+ipsec_xmit_ah_setup(struct ipsec_xmit_state *ixs) -+{ -+ struct iphdr ipo; -+ struct ahhdr *ahp; -+#if defined(CONFIG_KLIPS_AUTH_HMAC_MD5) || defined(CONFIG_KLIPS_AUTH_HMAC_SHA1) -+ __u8 hash[AH_AMAX]; -+ union { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ MD5_CTX md5; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ SHA1_CTX sha1; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ } tctx; -+#endif -+ unsigned char *dat = (unsigned char *)ixs->iph; -+ -+ ahp = (struct ahhdr *)(dat + ixs->iphlen); -+ ahp->ah_spi = ixs->ipsp->ips_said.spi; -+ ahp->ah_rpl = htonl(++(ixs->ipsp->ips_replaywin_lastseq)); -+ ahp->ah_rv = 0; -+ ahp->ah_nh = ixs->iph->protocol; -+ ahp->ah_hl = (sizeof(struct ahhdr) >> 2) - sizeof(__u64)/sizeof(__u32); -+ ixs->iph->protocol = IPPROTO_AH; -+ ipsec_xmit_dmp("ahp", (char*)ahp, sizeof(*ahp)); -+ -+ ipo = *ixs->iph; -+ ipo.tos = 0; -+ ipo.frag_off = 0; -+ ipo.ttl = 0; -+ ipo.check = 0; -+ ipsec_xmit_dmp("ipo", (char*)&ipo, sizeof(ipo)); -+ -+ switch(ixs->ipsp->ips_authalg) { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ case AH_MD5: -+ tctx.md5 = ((struct md5_ctx*)(ixs->ipsp->ips_key_a))->ictx; -+ ipsec_xmit_dmp("ictx", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, (unsigned char *)&ipo, sizeof (struct iphdr)); -+ ipsec_xmit_dmp("ictx+ipo", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, (unsigned char *)ahp, -+ sizeof(struct ahhdr) - sizeof(ahp->ah_data)); -+ ipsec_xmit_dmp("ictx+ahp", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, (unsigned char *)zeroes, AHHMAC_HASHLEN); -+ ipsec_xmit_dmp("ictx+zeroes", (char*)&tctx.md5, sizeof(tctx.md5)); -+ 
osMD5Update(&tctx.md5, dat + ixs->iphlen + sizeof(struct ahhdr), -+ ixs->skb->len - ixs->iphlen - sizeof(struct ahhdr)); -+ ipsec_xmit_dmp("ictx+dat", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Final(hash, &tctx.md5); -+ ipsec_xmit_dmp("ictx hash", (char*)&hash, sizeof(hash)); -+ tctx.md5 = ((struct md5_ctx*)(ixs->ipsp->ips_key_a))->octx; -+ ipsec_xmit_dmp("octx", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, hash, AHMD596_ALEN); -+ ipsec_xmit_dmp("octx+hash", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Final(hash, &tctx.md5); -+ ipsec_xmit_dmp("octx hash", (char*)&hash, sizeof(hash)); -+ -+ memcpy(ahp->ah_data, hash, AHHMAC_HASHLEN); -+ -+ /* paranoid */ -+ memset((caddr_t)&tctx.md5, 0, sizeof(tctx.md5)); -+ memset((caddr_t)hash, 0, sizeof(*hash)); -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ case AH_SHA: -+ tctx.sha1 = ((struct sha1_ctx*)(ixs->ipsp->ips_key_a))->ictx; -+ SHA1Update(&tctx.sha1, (unsigned char *)&ipo, sizeof (struct iphdr)); -+ SHA1Update(&tctx.sha1, (unsigned char *)ahp, sizeof(struct ahhdr) - sizeof(ahp->ah_data)); -+ SHA1Update(&tctx.sha1, (unsigned char *)zeroes, AHHMAC_HASHLEN); -+ SHA1Update(&tctx.sha1, dat + ixs->iphlen + sizeof(struct ahhdr), -+ ixs->skb->len - ixs->iphlen - sizeof(struct ahhdr)); -+ SHA1Final(hash, &tctx.sha1); -+ tctx.sha1 = ((struct sha1_ctx*)(ixs->ipsp->ips_key_a))->octx; -+ SHA1Update(&tctx.sha1, hash, AHSHA196_ALEN); -+ SHA1Final(hash, &tctx.sha1); -+ -+ memcpy(ahp->ah_data, hash, AHHMAC_HASHLEN); -+ -+ /* paranoid */ -+ memset((caddr_t)&tctx.sha1, 0, sizeof(tctx.sha1)); -+ memset((caddr_t)hash, 0, sizeof(*hash)); -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ default: -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_AH_BADALG; -+ } -+#ifdef NET_21 -+ skb_set_transport_header(ixs->skb, ipsec_skb_offset(ixs->skb, ahp)); -+#endif /* NET_21 */ -+ -+ return IPSEC_XMIT_OK; -+} -+ -+struct xform_functions ah_xform_funcs[]={ -+ { -+ protocol: 
IPPROTO_AH, -+ rcv_checks: ipsec_rcv_ah_checks, -+ rcv_setup_auth: ipsec_rcv_ah_setup_auth, -+ rcv_calc_auth: ipsec_rcv_ah_authcalc, -+ rcv_decrypt: ipsec_rcv_ah_decap, -+ -+ xmit_setup: ipsec_xmit_ah_setup, -+ xmit_headroom: sizeof(struct ahhdr), -+ xmit_needtailroom: 0, -+ }, -+}; -+ -+ -+#ifndef CONFIG_XFRM_ALTERNATE_STACK -+#ifdef NET_26 -+struct inet_protocol ah_protocol = { -+ .handler = ipsec_rcv, -+ .no_policy = 1, -+}; -+#else -+struct inet_protocol ah_protocol = -+{ -+ ipsec_rcv, /* AH handler */ -+ NULL, /* TUNNEL error control */ -+#ifdef NETDEV_25 -+ 1, /* no policy */ -+#else -+ 0, /* next */ -+ IPPROTO_AH, /* protocol ID */ -+ 0, /* copy */ -+ NULL, /* data */ -+ "AH" /* name */ -+#endif -+}; -+#endif /* NET_26 */ -+#endif /* CONFIG_XFRM_ALTERNATE_STACK */ -+ -+/* -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_alg.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,1045 @@ -+/* -+ * Modular extensions service and registration functions -+ * -+ * Author: JuanJo Ciarlante -+ * -+ * Version: 0.8.1 -+ * -+ * ipsec_alg.c,v 1.1.2.1 2003/11/21 18:12:23 jjo Exp -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ */ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+ -+#define __NO_VERSION__ -+#include -+#include /* printk() */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+#include -+#include -+#include -+#include /* memcmp() */ -+#include /* get_random_bytes() */ -+#include /* error codes */ -+#ifdef SPINLOCK -+# ifdef SPINLOCK_23 -+# include /* *lock* */ -+# else /* SPINLOCK_23 */ -+# include /* *lock* */ -+# endif /* SPINLOCK_23 */ -+#endif /* SPINLOCK */ -+ -+#include "openswan/ipsec_param.h" -+#include -+#include "openswan/ipsec_sa.h" -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_rcv.h" -+#if defined(CONFIG_KLIPS_ESP) || defined(CONFIG_KLIPS_AH) -+# include "openswan/ipsec_ah.h" -+#endif /* defined(CONFIG_KLIPS_ESP) || defined(CONFIG_KLIPS_AH) */ -+#ifdef CONFIG_KLIPS_ESP -+# include "openswan/ipsec_esp.h" -+#endif /* !CONFIG_KLIPS_ESP */ -+#ifdef CONFIG_KLIPS_IPCOMP -+# include "openswan/ipcomp.h" -+#endif /* CONFIG_KLIPS_COMP */ -+ -+#include -+#include -+ -+#include "openswan/ipsec_alg.h" -+#include "openswan/ipsec_proto.h" -+ -+#if K_SADB_EALG_MAX < 255 -+#warning Compiling with limited ESP support ( K_SADB_EALG_MAX < 256 ) -+#endif -+ -+static rwlock_t ipsec_alg_lock = RW_LOCK_UNLOCKED; -+#define IPSEC_ALG_HASHSZ 16 /* must be power of 2, even 2^0=1 */ -+static struct list_head ipsec_alg_hash_table[IPSEC_ALG_HASHSZ]; -+ -+/* Old gcc's will fail here */ -+#define barf_out(fmt, args...) 
do { struct ipsec_alg *ixtc = (struct ipsec_alg *)ixt; printk(KERN_ERR "%s: (%s) " fmt, __FUNCTION__, ixtc->ixt_name , ## args) \ -+ ; goto out; } while(0) -+ -+#ifdef NET_26 -+/* -+ * Must be already protected by lock -+ */ -+static void __ipsec_alg_usage_inc(struct ipsec_alg *ixt) -+{ -+#ifdef MODULE -+ if (ixt->ixt_module) -+ try_module_get(ixt->ixt_module); -+#endif -+ atomic_inc(&ixt->ixt_refcnt); -+} -+static void __ipsec_alg_usage_dec(struct ipsec_alg *ixt) { -+ atomic_dec(&ixt->ixt_refcnt); -+#ifdef MODULE -+ if (ixt->ixt_module) -+ module_put(ixt->ixt_module); -+#endif -+} -+ -+#else -+ -+/* -+ * Must be already protected by lock -+ */ -+static void __ipsec_alg_usage_inc(struct ipsec_alg *ixt) { -+#ifdef MODULE -+ if (ixt->ixt_module) { -+ __MOD_INC_USE_COUNT(ixt->ixt_module); -+ } -+#endif -+ atomic_inc(&ixt->ixt_refcnt); -+} -+static void __ipsec_alg_usage_dec(struct ipsec_alg *ixt) { -+ atomic_dec(&ixt->ixt_refcnt); -+#ifdef MODULE -+ if (ixt->ixt_module) -+ __MOD_DEC_USE_COUNT(ixt->ixt_module); -+#endif -+} -+#endif -+ -+/* -+ * simple hash function, optimized for 0-hash (1 list) special -+ * case -+ */ -+#if IPSEC_ALG_HASHSZ > 1 -+static inline unsigned ipsec_alg_hashfn(int alg_type, int alg_id) { -+ return ((alg_type^alg_id)&(IPSEC_ALG_HASHSZ-1)); -+} -+#else -+#define ipsec_alg_hashfn(x,y) (0) -+#endif -+ -+/***************************************************************** -+ * -+ * INTERNAL table handling: insert, delete, find -+ * -+ *****************************************************************/ -+ -+/* -+ * hash table initialization, called from ipsec_alg_init() -+ */ -+static void ipsec_alg_hash_init(void) { -+ struct list_head *head = ipsec_alg_hash_table; -+ int i = IPSEC_ALG_HASHSZ; -+ do { -+ INIT_LIST_HEAD(head); -+ head++; -+ i--; -+ } while (i); -+} -+/* -+ * hash list lookup by {alg_type, alg_id} and table head, -+ * must be already protected by lock -+ */ -+static struct ipsec_alg *__ipsec_alg_find(unsigned alg_type, unsigned 
alg_id, struct list_head * head) { -+ struct list_head *p; -+ struct ipsec_alg *ixt=NULL; -+ for (p=head->next; p!=head; p=p->next) { -+ ixt = list_entry(p, struct ipsec_alg, ixt_list); -+ if (ixt->ixt_alg_type == alg_type && ixt->ixt_alg_id==alg_id) { -+ goto out; -+ } -+ } -+ ixt=NULL; -+out: -+ return ixt; -+} -+/* -+ * inserts (in front) a new entry in hash table, -+ * called from ipsec_alg_register() when new algorithm is registered. -+ */ -+static int ipsec_alg_insert(struct ipsec_alg *ixt) { -+ int ret=-EINVAL; -+ unsigned hashval=ipsec_alg_hashfn(ixt->ixt_alg_type, ixt->ixt_alg_id); -+ struct list_head *head= ipsec_alg_hash_table + hashval; -+ struct ipsec_alg *ixt_cur; -+ -+ /* new element must be virgin ... */ -+ if (ixt->ixt_list.next != &ixt->ixt_list || -+ ixt->ixt_list.prev != &ixt->ixt_list) { -+ printk(KERN_ERR "ipsec_alg_insert: ixt object \"%s\" " -+ "list head not initialized\n", -+ ixt->ixt_name); -+ return ret; -+ } -+ write_lock_bh(&ipsec_alg_lock); -+ -+ ixt_cur = __ipsec_alg_find(ixt->ixt_alg_type, ixt->ixt_alg_id, head); -+ -+ /* if previous (current) ipsec_alg found check excl flag of _anyone_ */ -+ if (ixt_cur -+ && ((ixt->ixt_state|ixt_cur->ixt_state) & IPSEC_ALG_ST_EXCL)) { -+ barf_out("ipsec_alg for alg_type=%d, alg_id=%d already exist. " -+ "Not loaded (ret=%d).\n", -+ ixt->ixt_alg_type, -+ ixt->ixt_alg_id, ret=-EEXIST); -+ } -+ list_add(&ixt->ixt_list, head); -+ ixt->ixt_state |= IPSEC_ALG_ST_REGISTERED; -+ ret=0; -+out: -+ write_unlock_bh(&ipsec_alg_lock); -+ return ret; -+} -+ -+/* -+ * deletes an existing entry in hash table, -+ * called from ipsec_alg_unregister() when algorithm is unregistered. 
-+ */ -+static int ipsec_alg_delete(struct ipsec_alg *ixt) { -+ write_lock_bh(&ipsec_alg_lock); -+ list_del(&ixt->ixt_list); -+ write_unlock_bh(&ipsec_alg_lock); -+ return 0; -+} -+ -+/* -+ * here @user context (read-only when @kernel bh context) -+ * -> no bh disabling -+ * -+ * called from ipsec_sa_init() -> ipsec_alg_sa_init() -+ */ -+static struct ipsec_alg *ipsec_alg_get(int alg_type, int alg_id) -+{ -+ unsigned hashval=ipsec_alg_hashfn(alg_type, alg_id); -+ struct list_head *head= ipsec_alg_hash_table + hashval; -+ struct ipsec_alg *ixt; -+ -+ read_lock(&ipsec_alg_lock); -+ ixt=__ipsec_alg_find(alg_type, alg_id, head); -+ if (ixt) __ipsec_alg_usage_inc(ixt); -+ read_unlock(&ipsec_alg_lock); -+ -+ return ixt; -+} -+ -+static void ipsec_alg_put(struct ipsec_alg *ixt) { -+ __ipsec_alg_usage_dec((struct ipsec_alg *)ixt); -+} -+ -+/***************************************************************** -+ * -+ * INTERFACE for ENC services: key creation, encrypt function -+ * -+ *****************************************************************/ -+ -+/* -+ * main encrypt service entry point -+ * called from ipsec_rcv() with encrypt=IPSEC_ALG_DECRYPT and -+ * ipsec_tunnel_start_xmit with encrypt=IPSEC_ALG_ENCRYPT -+ */ -+int ipsec_alg_esp_encrypt(struct ipsec_sa *sa_p, __u8 * idat, -+ int ilen, __u8 * iv, int encrypt) -+{ -+ int ret; -+ struct ipsec_alg_enc *ixt_e=sa_p->ips_alg_enc; -+#ifdef CONFIG_KLIPS_DEBUG -+ int debug_flag = (encrypt==IPSEC_ALG_ENCRYPT ? 
-+ debug_tunnel : debug_rcv); -+#endif -+ -+ KLIPS_PRINT(debug_flag, -+ "klips_debug:ipsec_alg_esp_encrypt: " -+ "entering with encalg=%d, ixt_e=%p\n", -+ sa_p->ips_encalg, ixt_e); -+ if (ixt_e == NULL) { -+ KLIPS_ERROR(debug_flag, -+ "klips_debug:ipsec_alg_esp_encrypt: " -+ "NULL ipsec_alg_enc object\n"); -+ return -1; -+ } -+ KLIPS_PRINT(debug_flag, -+ "klips_debug:ipsec_alg_esp_encrypt: " -+ "calling cbc_encrypt encalg=%d " -+ "ips_key_e=%p idat=%p ilen=%d iv=%p, encrypt=%d\n", -+ sa_p->ips_encalg, -+ sa_p->ips_key_e, idat, ilen, iv, encrypt); -+ ret=ixt_e->ixt_e_cbc_encrypt(ixt_e, sa_p->ips_key_e, idat, -+ ilen, iv, encrypt); -+ KLIPS_PRINT(debug_flag, -+ "klips_debug:ipsec_alg_esp_encrypt: " -+ "returned ret=%d\n", -+ ret); -+ return ret; -+} -+ -+/* -+ * encryption key context creation function -+ * called from pfkey_v2_parser.c:pfkey_ips_init() -+ */ -+int ipsec_alg_enc_key_create(struct ipsec_sa *sa_p) { -+ int ret=-EINVAL; -+ int keyminbits, keymaxbits; -+ caddr_t ekp; -+ struct ipsec_alg_enc *ixt_e=sa_p->ips_alg_enc; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_alg_enc_key_create: " -+ "entering with encalg=%d ixt_e=%p\n", -+ sa_p->ips_encalg, ixt_e); -+ if (!ixt_e) { -+ KLIPS_ERROR(debug_pfkey, -+ "klips_debug:ipsec_alg_enc_key_create: " -+ "NULL ipsec_alg_enc object\n"); -+ return -EPROTO; -+ } -+ -+ /* -+ * grRRR... DES 7bits jurassic stuff ... 
f*ckk --jjo -+ */ -+ switch(ixt_e->ixt_common.ixt_support.ias_id) { -+ case ESP_3DES: -+ keyminbits=keymaxbits=192;break; -+ case ESP_DES: -+ keyminbits=keymaxbits=64;break; -+ default: -+ keyminbits=ixt_e->ixt_common.ixt_support.ias_keyminbits; -+ keymaxbits=ixt_e->ixt_common.ixt_support.ias_keymaxbits; -+ } -+ if(sa_p->ips_key_bits_eips_key_bits_e>keymaxbits) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_alg_enc_key_create: " -+ "incorrect encryption key size for id=%d: %d bits -- " -+ "must be between %d,%d bits\n" /*octets (bytes)\n"*/, -+ ixt_e->ixt_common.ixt_support.ias_id, -+ sa_p->ips_key_bits_e, keyminbits, keymaxbits); -+ ret=-EINVAL; -+ goto ixt_out; -+ } -+ /* save encryption key pointer */ -+ ekp = sa_p->ips_key_e; -+ -+ -+ if (ixt_e->ixt_e_new_key) { -+ sa_p->ips_key_e = ixt_e->ixt_e_new_key(ixt_e, -+ ekp, sa_p->ips_key_bits_e/8); -+ ret = (sa_p->ips_key_e)? 0 : -EINVAL; -+ } else { -+ if((sa_p->ips_key_e = (caddr_t) -+ kmalloc((sa_p->ips_key_e_size = ixt_e->ixt_e_ctx_size), -+ GFP_ATOMIC)) == NULL) { -+ ret=-ENOMEM; -+ goto ixt_out; -+ } -+ /* zero-out key_e */ -+ memset(sa_p->ips_key_e, 0, sa_p->ips_key_e_size); -+ -+ /* I cast here to allow more decoupling in alg module */ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_alg_enc_key_create: about to call:" -+ "set_key(key_e=%p, ekp=%p, key_size=%d)\n", -+ (caddr_t)sa_p->ips_key_e, ekp, sa_p->ips_key_bits_e/8); -+ ret = ixt_e->ixt_e_set_key(ixt_e, (caddr_t)sa_p->ips_key_e, ekp, sa_p->ips_key_bits_e/8); -+ } -+ /* paranoid */ -+ memset(ekp, 0, sa_p->ips_key_bits_e/8); -+ kfree(ekp); -+ixt_out: -+ return ret; -+} -+ -+/*************************************************************** -+ * -+ * INTERFACE for AUTH services: key creation, hash functions -+ * -+ ***************************************************************/ -+ -+/* -+ * auth key context creation function -+ * called from pfkey_v2_parser.c:pfkey_ips_init() -+ */ -+int ipsec_alg_auth_key_create(struct ipsec_sa *sa_p) { -+ int 
ret=-EINVAL; -+ struct ipsec_alg_auth *ixt_a=sa_p->ips_alg_auth; -+ int keyminbits, keymaxbits; -+ unsigned char *akp; -+ unsigned int aks; -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_alg_auth_key_create: " -+ "entering with authalg=%d ixt_a=%p\n", -+ sa_p->ips_authalg, ixt_a); -+ if (!ixt_a) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_alg_auth_key_create: " -+ "NULL ipsec_alg_auth object\n"); -+ return -EPROTO; -+ } -+ keyminbits=ixt_a->ixt_common.ixt_support.ias_keyminbits; -+ keymaxbits=ixt_a->ixt_common.ixt_support.ias_keymaxbits; -+ if(sa_p->ips_key_bits_aips_key_bits_a>keymaxbits) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_alg_auth_key_create: incorrect auth" -+ "key size: %d bits -- must be between %d,%d bits\n"/*octets (bytes)\n"*/, -+ sa_p->ips_key_bits_a, keyminbits, keymaxbits); -+ ret=-EINVAL; -+ goto ixt_out; -+ } -+ /* save auth key pointer */ -+ sa_p->ips_auth_bits = ixt_a->ixt_a_keylen * 8; /* XXX XXX */ -+ akp = sa_p->ips_key_a; -+ aks = sa_p->ips_key_a_size; -+ -+ /* will hold: 2 ctx and a blocksize buffer: kb */ -+ sa_p->ips_key_a_size = ixt_a->ixt_a_ctx_size; -+ if((sa_p->ips_key_a = -+ (caddr_t) kmalloc(sa_p->ips_key_a_size, GFP_ATOMIC)) == NULL) { -+ ret=-ENOMEM; -+ goto ixt_out; -+ } -+ ixt_a->ixt_a_hmac_set_key(ixt_a, sa_p->ips_key_a, akp, sa_p->ips_key_bits_a/8); /* XXX XXX */ -+ ret=0; -+ memset(akp, 0, aks); -+ kfree(akp); -+ -+ixt_out: -+ return ret; -+} -+ -+ -+int ipsec_alg_sa_esp_hash(const struct ipsec_sa *sa_p, const __u8 *espp, -+ int len, __u8 *hash, int hashlen) -+{ -+ struct ipsec_alg_auth *ixt_a=sa_p->ips_alg_auth; -+ if (!ixt_a) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_sa_esp_hash: " -+ "NULL ipsec_alg_auth object\n"); -+ return -EPROTO; -+ } -+ KLIPS_PRINT(debug_tunnel|debug_rcv, -+ "klips_debug:ipsec_sa_esp_hash: " -+ "hashing %p (%d bytes) to %p (%d bytes)\n", -+ espp, len, -+ hash, hashlen); -+ ixt_a->ixt_a_hmac_hash(ixt_a, -+ sa_p->ips_key_a, -+ espp, len, -+ hash, hashlen); -+ return 0; 
-+} -+ -+/*************************************************************** -+ * -+ * INTERFACE for module loading,testing, and unloading -+ * -+ ***************************************************************/ -+ -+/* validation for registering (enc) module */ -+static int check_enc(struct ipsec_alg_enc *ixt) -+{ -+ int ret=-EINVAL; -+ if (ixt->ixt_common.ixt_blocksize==0) /* || ixt->ixt_common.ixt_blocksize%2) need for ESP_NULL */ -+ barf_out(KERN_ERR "invalid blocksize=%d\n", ixt->ixt_common.ixt_blocksize); -+ if (ixt->ixt_common.ixt_support.ias_keyminbits==0 -+ && ixt->ixt_common.ixt_support.ias_keymaxbits==0 -+ && ixt->ixt_e_keylen==0) -+ goto zero_key_ok; -+ -+ if (ixt->ixt_common.ixt_support.ias_keyminbits==0) -+ barf_out(KERN_ERR "invalid keyminbits=%d\n", ixt->ixt_common.ixt_support.ias_keyminbits); -+ -+ if (ixt->ixt_common.ixt_support.ias_keymaxbits==0) -+ barf_out(KERN_ERR "invalid keymaxbits=%d\n", ixt->ixt_common.ixt_support.ias_keymaxbits); -+ -+ if (ixt->ixt_e_keylen==0) -+ barf_out(KERN_ERR "invalid keysize=%d\n", ixt->ixt_e_keylen); -+ -+zero_key_ok: -+ if (ixt->ixt_e_ctx_size==0 && ixt->ixt_e_new_key == NULL) -+ barf_out(KERN_ERR "invalid key_e_size=%d and ixt_e_new_key=NULL\n", ixt->ixt_e_ctx_size); -+ if (ixt->ixt_e_cbc_encrypt==NULL) -+ barf_out(KERN_ERR "e_cbc_encrypt() must be not NULL\n"); -+ ret=0; -+out: -+ return ret; -+} -+ -+/* validation for registering (auth) module */ -+static int check_auth(struct ipsec_alg_auth *ixt) -+{ -+ int ret=-EINVAL; -+ if (ixt->ixt_common.ixt_support.ias_id==0 || ixt->ixt_common.ixt_support.ias_id > K_SADB_AALG_MAX) -+ barf_out("invalid alg_id=%d > %d (K_SADB_AALG_MAX)\n", -+ ixt->ixt_common.ixt_support.ias_id, K_SADB_AALG_MAX); -+ -+ if (ixt->ixt_common.ixt_blocksize==0 -+ || ixt->ixt_common.ixt_blocksize%2) -+ barf_out(KERN_ERR "invalid blocksize=%d\n", -+ ixt->ixt_common.ixt_blocksize); -+ -+ if (ixt->ixt_common.ixt_blocksize>AH_BLKLEN_MAX) -+ barf_out(KERN_ERR "sorry blocksize=%d > %d. 
" -+ "Please increase AH_BLKLEN_MAX and recompile\n", -+ ixt->ixt_common.ixt_blocksize, -+ AH_BLKLEN_MAX); -+ if (ixt->ixt_common.ixt_support.ias_keyminbits==0 && ixt->ixt_common.ixt_support.ias_keymaxbits==0 && ixt->ixt_a_keylen==0) -+ goto zero_key_ok; -+ if (ixt->ixt_common.ixt_support.ias_keyminbits==0) -+ barf_out(KERN_ERR "invalid keyminbits=%d\n", ixt->ixt_common.ixt_support.ias_keyminbits); -+ if (ixt->ixt_common.ixt_support.ias_keymaxbits==0) -+ barf_out(KERN_ERR "invalid keymaxbits=%d\n", ixt->ixt_common.ixt_support.ias_keymaxbits); -+ if (ixt->ixt_common.ixt_support.ias_keymaxbits!=ixt->ixt_common.ixt_support.ias_keyminbits) -+ barf_out(KERN_ERR "keymaxbits must equal keyminbits (not sure).\n"); -+ if (ixt->ixt_a_keylen==0) -+ barf_out(KERN_ERR "invalid keysize=%d\n", ixt->ixt_a_keylen); -+zero_key_ok: -+ if (ixt->ixt_a_ctx_size==0) -+ barf_out(KERN_ERR "invalid a_ctx_size=%d\n", ixt->ixt_a_ctx_size); -+ if (ixt->ixt_a_hmac_set_key==NULL) -+ barf_out(KERN_ERR "a_hmac_set_key() must be not NULL\n"); -+ if (ixt->ixt_a_hmac_hash==NULL) -+ barf_out(KERN_ERR "a_hmac_hash() must be not NULL\n"); -+ ret=0; -+out: -+ return ret; -+} -+ -+/* -+ * Generic (enc, auth) registration entry point -+ */ -+int register_ipsec_alg(struct ipsec_alg *ixt) -+{ -+ int ret=-EINVAL; -+ /* Validation */ -+ if (ixt==NULL) -+ barf_out("NULL ipsec_alg object passed\n"); -+ if ((ixt->ixt_version&0xffffff00) != (IPSEC_ALG_VERSION&0xffffff00)) -+ barf_out("incorrect version: %d.%d.%d-%d, " -+ "must be %d.%d.%d[-%d]\n", -+ IPSEC_ALG_VERSION_QUAD(ixt->ixt_version), -+ IPSEC_ALG_VERSION_QUAD(IPSEC_ALG_VERSION)); -+ -+ switch(ixt->ixt_alg_type) { -+ case IPSEC_ALG_TYPE_AUTH: -+ if ((ret=check_auth((struct ipsec_alg_auth *)ixt)<0)) -+ goto out; -+ break; -+ case IPSEC_ALG_TYPE_ENCRYPT: -+ if ((ret=check_enc((struct ipsec_alg_enc *)ixt)<0)) -+ goto out; -+ /* -+ * Adapted two lines below: -+ * ivlen == 0 is possible (NULL enc has blocksize==1) -+ * -+ * fixed NULL support by David De Reu -+ 
*/ -+ if (ixt->ixt_support.ias_ivlen == 0 -+ && ixt->ixt_blocksize > 1) { -+ ixt->ixt_support.ias_ivlen = ixt->ixt_blocksize*8; -+ } -+ break; -+ default: -+ barf_out("alg_type=%d not supported\n", ixt->ixt_alg_type); -+ } -+ INIT_LIST_HEAD(&ixt->ixt_list); -+ ret = ipsec_alg_insert(ixt); -+ if (ret<0) -+ barf_out(KERN_WARNING "ipsec_alg for alg_id=%d failed." -+ "Not loaded (ret=%d).\n", -+ ixt->ixt_support.ias_id, ret); -+ -+ -+ ret = pfkey_list_insert_supported((struct ipsec_alg_supported *)&ixt->ixt_support -+ , &(pfkey_supported_list[K_SADB_SATYPE_ESP])); -+ -+ if (ret==0) { -+ ixt->ixt_state |= IPSEC_ALG_ST_SUPP; -+ /* send register event to userspace */ -+ pfkey_register_reply(K_SADB_SATYPE_ESP, NULL); -+ } else -+ printk(KERN_ERR "pfkey_list_insert_supported returned %d. " -+ "Loading anyway.\n", ret); -+ ret=0; -+out: -+ return ret; -+} -+ -+/* -+ * unregister ipsec_alg object from own tables, if -+ * success => calls pfkey_list_remove_supported() -+ */ -+int unregister_ipsec_alg(struct ipsec_alg *ixt) { -+ int ret= -EINVAL; -+ switch(ixt->ixt_alg_type) { -+ case IPSEC_ALG_TYPE_AUTH: -+ case IPSEC_ALG_TYPE_ENCRYPT: -+ break; -+ default: -+ /* this is not a typo :) */ -+ barf_out("frog found in list (\"%s\"): ixt_p=NULL\n", -+ ixt->ixt_name); -+ } -+ -+ ret=ipsec_alg_delete(ixt); -+ if (ixt->ixt_state&IPSEC_ALG_ST_SUPP) { -+ ixt->ixt_state &= ~IPSEC_ALG_ST_SUPP; -+ pfkey_list_remove_supported((struct ipsec_alg_supported *)&ixt->ixt_support -+ , &(pfkey_supported_list[K_SADB_SATYPE_ESP])); -+ -+ /* send register event to userspace */ -+ pfkey_register_reply(K_SADB_SATYPE_ESP, NULL); -+ } -+ -+out: -+ return ret; -+} -+ -+/* -+ * Must be called from user context -+ * used at module load type for testing algo implementation -+ */ -+static int ipsec_alg_test_encrypt(int enc_alg, int test) { -+ int ret; -+ caddr_t buf = NULL; -+ int iv_size, keysize, key_e_size; -+ struct ipsec_alg_enc *ixt_e; -+ void *tmp_key_e = NULL; -+ #define BUFSZ 1024 -+ #define MARGIN 0 
-+ #define test_enc (buf+MARGIN) -+ #define test_dec (test_enc+BUFSZ+MARGIN) -+ #define test_tmp (test_dec+BUFSZ+MARGIN) -+ #define test_key_e (test_tmp+BUFSZ+MARGIN) -+ #define test_iv (test_key_e+key_e_size+MARGIN) -+ #define test_key (test_iv+iv_size+MARGIN) -+ #define test_size (BUFSZ*3+key_e_size+iv_size+keysize+MARGIN*7) -+ ixt_e=(struct ipsec_alg_enc *)ipsec_alg_get(IPSEC_ALG_TYPE_ENCRYPT, enc_alg); -+ if (ixt_e==NULL) { -+ KLIPS_PRINT(1, -+ "klips_debug: ipsec_alg_test_encrypt: " -+ "encalg=%d object not found\n", -+ enc_alg); -+ ret=-EINVAL; -+ goto out; -+ } -+ iv_size=ixt_e->ixt_common.ixt_support.ias_ivlen / 8; -+ key_e_size=ixt_e->ixt_e_ctx_size; -+ keysize=ixt_e->ixt_e_keylen; -+ KLIPS_PRINT(1, -+ "klips_debug: ipsec_alg_test_encrypt: " -+ "enc_alg=%d blocksize=%d key_e_size=%d keysize=%d\n", -+ enc_alg, iv_size, key_e_size, keysize); -+ if ((buf=kmalloc (test_size, GFP_KERNEL)) == NULL) { -+ ret= -ENOMEM; -+ goto out; -+ } -+ get_random_bytes(test_key, keysize); -+ get_random_bytes(test_iv, iv_size); -+ if (ixt_e->ixt_e_new_key) { -+ tmp_key_e = ixt_e->ixt_e_new_key(ixt_e, test_key, keysize); -+ ret = tmp_key_e ? 0 : -EINVAL; -+ } else { -+ tmp_key_e = test_key_e; -+ ret = ixt_e->ixt_e_set_key(ixt_e, test_key_e, test_key, keysize); -+ } -+ if (ret < 0) -+ goto out; -+ get_random_bytes(test_enc, BUFSZ); -+ memcpy(test_tmp, test_enc, BUFSZ); -+ ret=ixt_e->ixt_e_cbc_encrypt(ixt_e, tmp_key_e, test_enc, BUFSZ, test_iv, 1); -+ printk(KERN_INFO -+ "klips_info: ipsec_alg_test_encrypt: " -+ "cbc_encrypt=1 ret=%d\n", -+ ret); -+ ret=memcmp(test_enc, test_tmp, BUFSZ); -+ printk(KERN_INFO -+ "klips_info: ipsec_alg_test_encrypt: " -+ "memcmp(enc, tmp) ret=%d: %s\n", ret, -+ ret!=0? "OK. (encr->DIFFers)" : "FAIL! 
(encr->SAME)" ); -+ memcpy(test_dec, test_enc, BUFSZ); -+ ret=ixt_e->ixt_e_cbc_encrypt(ixt_e, tmp_key_e, test_dec, BUFSZ, test_iv, 0); -+ printk(KERN_INFO -+ "klips_info: ipsec_alg_test_encrypt: " -+ "cbc_encrypt=0 ret=%d\n", ret); -+ ret=memcmp(test_dec, test_tmp, BUFSZ); -+ printk(KERN_INFO -+ "klips_info: ipsec_alg_test_encrypt: " -+ "memcmp(dec,tmp) ret=%d: %s\n", ret, -+ ret==0? "OK. (encr->decr->SAME)" : "FAIL! (encr->decr->DIFFers)" ); -+ { -+ /* Shamelessly taken from drivers/md sources O:) */ -+ unsigned long now; -+ int i, count, max=0; -+ int encrypt, speed; -+ for (encrypt=0; encrypt <2;encrypt ++) { -+ for (i = 0; i < 5; i++) { -+ now = jiffies; -+ count = 0; -+ while (jiffies == now) { -+ mb(); -+ ixt_e->ixt_e_cbc_encrypt(ixt_e, -+ tmp_key_e, test_tmp, -+ BUFSZ, test_iv, encrypt); -+ mb(); -+ count++; -+ mb(); -+ } -+ if (count > max) -+ max = count; -+ } -+ speed = max * (HZ * BUFSZ / 1024); -+ printk(KERN_INFO -+ "klips_info: ipsec_alg_test_encrypt: " -+ "%s %s speed=%d KB/s\n", -+ ixt_e->ixt_common.ixt_name, -+ encrypt? 
"encrypt": "decrypt", speed); -+ } -+ } -+out: -+ if (tmp_key_e && ixt_e->ixt_e_destroy_key) ixt_e->ixt_e_destroy_key(ixt_e, tmp_key_e); -+ if (buf) kfree(buf); -+ if (ixt_e) ipsec_alg_put((struct ipsec_alg *)ixt_e); -+ return ret; -+ #undef test_enc -+ #undef test_dec -+ #undef test_tmp -+ #undef test_key_e -+ #undef test_iv -+ #undef test_key -+ #undef test_size -+} -+ -+/* -+ * Must be called from user context -+ * used at module load type for testing algo implementation -+ */ -+static int ipsec_alg_test_auth(int auth_alg, int test) { -+ int ret; -+ caddr_t buf = NULL; -+ int blocksize, keysize, key_a_size; -+ struct ipsec_alg_auth *ixt_a; -+ #define BUFSZ 1024 -+ #define MARGIN 0 -+ #define test_auth (buf+MARGIN) -+ #define test_key_a (test_auth+BUFSZ+MARGIN) -+ #define test_key (test_key_a+key_a_size+MARGIN) -+ #define test_hash (test_key+keysize+MARGIN) -+ #define test_size (BUFSZ+key_a_size+keysize+AHHMAC_HASHLEN+MARGIN*4) -+ ixt_a=(struct ipsec_alg_auth *)ipsec_alg_get(IPSEC_ALG_TYPE_AUTH, auth_alg); -+ if (ixt_a==NULL) { -+ KLIPS_PRINT(1, -+ "klips_debug: ipsec_alg_test_auth: " -+ "encalg=%d object not found\n", -+ auth_alg); -+ ret=-EINVAL; -+ goto out; -+ } -+ blocksize=ixt_a->ixt_common.ixt_blocksize; -+ key_a_size=ixt_a->ixt_a_ctx_size; -+ keysize=ixt_a->ixt_a_keylen; -+ KLIPS_PRINT(1, -+ "klips_debug: ipsec_alg_test_auth: " -+ "auth_alg=%d blocksize=%d key_a_size=%d keysize=%d\n", -+ auth_alg, blocksize, key_a_size, keysize); -+ if ((buf=kmalloc (test_size, GFP_KERNEL)) == NULL) { -+ ret= -ENOMEM; -+ goto out; -+ } -+ get_random_bytes(test_key, keysize); -+ ret = ixt_a->ixt_a_hmac_set_key(ixt_a, test_key_a, test_key, keysize); -+ if (ret < 0 ) -+ goto out; -+ get_random_bytes(test_auth, BUFSZ); -+ ret=ixt_a->ixt_a_hmac_hash(ixt_a, test_key_a, test_auth, BUFSZ, test_hash, AHHMAC_HASHLEN); -+ printk(KERN_INFO -+ "klips_info: ipsec_alg_test_auth: " -+ "ret=%d\n", ret); -+ { -+ /* Shamelessly taken from drivers/md sources O:) */ -+ unsigned long now; -+ 
int i, count, max=0; -+ int speed; -+ for (i = 0; i < 5; i++) { -+ now = jiffies; -+ count = 0; -+ while (jiffies == now) { -+ mb(); -+ ixt_a->ixt_a_hmac_hash(ixt_a, test_key_a, test_auth, BUFSZ, test_hash, AHHMAC_HASHLEN); -+ mb(); -+ count++; -+ mb(); -+ } -+ if (count > max) -+ max = count; -+ } -+ speed = max * (HZ * BUFSZ / 1024); -+ printk(KERN_INFO -+ "klips_info: ipsec_alg_test_auth: " -+ "%s hash speed=%d KB/s\n", -+ ixt_a->ixt_common.ixt_name, -+ speed); -+ } -+out: -+ if (buf) kfree(buf); -+ if (ixt_a) ipsec_alg_put((struct ipsec_alg *)ixt_a); -+ return ret; -+ #undef test_auth -+ #undef test_key_a -+ #undef test_key -+ #undef test_hash -+ #undef test_size -+} -+ -+int ipsec_alg_test(unsigned alg_type, unsigned alg_id, int test) { -+ switch(alg_type) { -+ case IPSEC_ALG_TYPE_ENCRYPT: -+ return ipsec_alg_test_encrypt(alg_id, test); -+ break; -+ case IPSEC_ALG_TYPE_AUTH: -+ return ipsec_alg_test_auth(alg_id, test); -+ break; -+ } -+ printk(KERN_ERR "klips_info: ipsec_alg_test() called incorrectly: " -+ "alg_type=%d alg_id=%d\n", -+ alg_type, alg_id); -+ return -EINVAL; -+} -+ -+int ipsec_alg_init(void) { -+ KLIPS_PRINT(1, "klips_info:ipsec_alg_init: " -+ "KLIPS alg v=%d.%d.%d-%d (EALG_MAX=%d, AALG_MAX=%d)\n", -+ IPSEC_ALG_VERSION_QUAD(IPSEC_ALG_VERSION), -+ K_SADB_EALG_MAX, K_SADB_AALG_MAX); -+ /* Initialize tables */ -+ write_lock_bh(&ipsec_alg_lock); -+ ipsec_alg_hash_init(); -+ write_unlock_bh(&ipsec_alg_lock); -+ -+ /* Initialize static algos */ -+ KLIPS_PRINT(1, "klips_info:ipsec_alg_init: " -+ "calling ipsec_alg_static_init()\n"); -+ -+ /* If we are suppose to use our AES, and don't have -+ * CryptoAPI enabled... 
-+ */ -+#if defined(CONFIG_KLIPS_ENC_AES) && CONFIG_KLIPS_ENC_AES && !defined(CONFIG_KLIPS_ENC_AES_MODULE) -+#if defined(CONFIG_KLIPS_ENC_CRYPTOAPI) && CONFIG_KLIPS_ENC_CRYPTOAPI -+#warning "Using built-in AES rather than CryptoAPI AES" -+#endif -+ { -+ extern int ipsec_aes_init(void); -+ ipsec_aes_init(); -+ } -+#endif -+ -+#if defined(CONFIG_KLIPS_ENC_3DES) && !defined(CONFIG_KLIPS_ENC_3DES_MODULE) -+#if defined(CONFIG_KLIPS_ENC_CRYPTOAPI) && CONFIG_KLIPS_ENC_CRYPTOAPI -+#warning "Using built-in 3des rather than CryptoAPI 3des" -+#endif -+ { -+ extern int ipsec_3des_init(void); -+ ipsec_3des_init(); -+ } -+#endif -+ -+ /* If we are doing CryptoAPI, then init */ -+#if defined(CONFIG_KLIPS_ENC_CRYPTOAPI) && CONFIG_KLIPS_ENC_CRYPTOAPI && !defined(CONFIG_KLIPS_ENC_CRYPTOAPI_MODULE) -+ { -+ extern int ipsec_cryptoapi_init(void); -+ ipsec_cryptoapi_init(); -+ } -+#endif -+ -+ -+ return 0; -+} -+ -+/********************************************** -+ * -+ * INTERFACE for ipsec_sa init and wipe -+ * -+ **********************************************/ -+ -+/* -+ * Called from pluto -> pfkey_v2_parser.c:pfkey_ipsec_sa_init() -+ */ -+int ipsec_alg_sa_init(struct ipsec_sa *sa_p) { -+ struct ipsec_alg_enc *ixt_e; -+ struct ipsec_alg_auth *ixt_a; -+ -+ /* Only ESP for now ... 
*/ -+ if (sa_p->ips_said.proto != IPPROTO_ESP) -+ return -EPROTONOSUPPORT; -+ -+ KLIPS_PRINT(debug_pfkey, "klips_debug: ipsec_alg_sa_init() :" -+ "entering for encalg=%d, authalg=%d\n", -+ sa_p->ips_encalg, sa_p->ips_authalg); -+ -+ if ((ixt_e=(struct ipsec_alg_enc *) -+ ipsec_alg_get(IPSEC_ALG_TYPE_ENCRYPT, sa_p->ips_encalg))) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug: ipsec_alg_sa_init() :" -+ "found ipsec_alg (ixt_e=%p) for encalg=%d\n", -+ ixt_e, sa_p->ips_encalg); -+ sa_p->ips_alg_enc=ixt_e; -+ } -+ -+ if ((ixt_a=(struct ipsec_alg_auth *) -+ ipsec_alg_get(IPSEC_ALG_TYPE_AUTH, sa_p->ips_authalg))) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug: ipsec_alg_sa_init() :" -+ "found ipsec_alg (ixt_a=%p) for auth=%d\n", -+ ixt_a, sa_p->ips_authalg); -+ sa_p->ips_alg_auth=ixt_a; -+ } -+ return 0; -+} -+ -+/* -+ * Called from pluto -> ipsec_sa.c:ipsec_sa_delchain() -+ */ -+int ipsec_alg_sa_wipe(struct ipsec_sa *sa_p) { -+ struct ipsec_alg *ixt; -+ if ((ixt=(struct ipsec_alg *)sa_p->ips_alg_enc)) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug: ipsec_alg_sa_wipe() :" -+ "unlinking for encalg=%d\n", -+ ixt->ixt_support.ias_id); -+ ipsec_alg_put(ixt); -+ } -+ if ((ixt=(struct ipsec_alg *)sa_p->ips_alg_auth)) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug: ipsec_alg_sa_wipe() :" -+ "unlinking for authalg=%d\n", -+ ixt->ixt_support.ias_id); -+ ipsec_alg_put(ixt); -+ } -+ return 0; -+} -+ -+IPSEC_PROCFS_DEBUG_NO_STATIC -+int -+ipsec_xform_get_info(char *buffer, -+ char **start, -+ off_t offset, -+ int length IPSEC_PROC_LAST_ARG) -+{ -+ int len = 0; -+ off_t begin = 0; -+ int i; -+ struct list_head *head; -+ struct ipsec_alg *ixt; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_PROCFS, -+ "klips_debug:ipsec_tncfg_get_info: " -+ "buffer=0p%p, *start=0p%p, offset=%d, length=%d\n", -+ buffer, -+ *start, -+ (int)offset, -+ length); -+ -+ for(i = 0, head = ipsec_alg_hash_table; -+ inext; p!=head; p=p->next) -+ { -+ ixt = list_entry(p, struct ipsec_alg, ixt_list); -+ len += 
ipsec_snprintf(buffer+len, length-len, -+ "VERSION=%d TYPE=%d ID=%d NAME=%s REFCNT=%d ", -+ ixt->ixt_version, ixt->ixt_alg_type, ixt->ixt_support.ias_id, -+ ixt->ixt_name, ixt->ixt_refcnt); -+ -+ len += ipsec_snprintf(buffer+len, length-len, -+ "STATE=%08x BLOCKSIZE=%d IVLEN=%d KEYMINBITS=%d KEYMAXBITS=%d ", -+ ixt->ixt_state, ixt->ixt_blocksize, -+ ixt->ixt_support.ias_ivlen, ixt->ixt_support.ias_keyminbits, ixt->ixt_support.ias_keymaxbits); -+ -+ len += ipsec_snprintf(buffer+len, length-len, -+ "IVLEN=%d KEYMINBITS=%d KEYMAXBITS=%d ", -+ ixt->ixt_support.ias_ivlen, ixt->ixt_support.ias_keyminbits, ixt->ixt_support.ias_keymaxbits); -+ -+ switch(ixt->ixt_alg_type) -+ { -+ case IPSEC_ALG_TYPE_AUTH: -+ { -+ struct ipsec_alg_auth *auth = (struct ipsec_alg_auth *)ixt; -+ -+ len += ipsec_snprintf(buffer+len, length-len, -+ "KEYLEN=%d CTXSIZE=%d AUTHLEN=%d ", -+ auth->ixt_a_keylen, auth->ixt_a_ctx_size, -+ auth->ixt_a_authlen); -+ break; -+ } -+ case IPSEC_ALG_TYPE_ENCRYPT: -+ { -+ struct ipsec_alg_enc *enc = (struct ipsec_alg_enc *)ixt; -+ len += ipsec_snprintf(buffer+len, length-len, -+ "KEYLEN=%d CTXSIZE=%d ", -+ enc->ixt_e_keylen, enc->ixt_e_ctx_size); -+ -+ break; -+ } -+ } -+ -+ len += ipsec_snprintf(buffer+len, length-len, "\n"); -+ } -+ } -+ -+ *start = buffer + (offset - begin); /* Start of wanted data */ -+ len -= (offset - begin); /* Start slop */ -+ if (len > length) -+ len = length; -+ return len; -+} -+ -+ -+/* -+ * As the author of this module, I ONLY ALLOW using it from -+ * GPL (or same LICENSE TERMS as kernel source) modules. -+ * -+ * In respect to hardware crypto engines this means: -+ * * Closed-source device drivers ARE NOT ALLOWED to use -+ * this interface. -+ * * Closed-source VHDL/Verilog firmware running on -+ * the crypto hardware device IS ALLOWED to use this interface -+ * via a GPL (or same LICENSE TERMS as kernel source) device driver. 
-+ * --Juan Jose Ciarlante 20/03/2002 (thanks RGB for the correct wording) -+ */ -+ -+/* -+ * These symbols can only be used from GPL modules -+ * for now, I'm disabling this because it creates false -+ * symbol problems for old modutils. -+ */ -+ -+#ifdef CONFIG_MODULES -+#ifndef NET_26 -+#if 0 -+#ifndef EXPORT_SYMBOL_GPL -+#undef EXPORT_SYMBOL_GPL -+#define EXPORT_SYMBOL_GPL EXPORT_SYMBOL -+#endif -+#endif -+EXPORT_SYMBOL(register_ipsec_alg); -+EXPORT_SYMBOL(unregister_ipsec_alg); -+EXPORT_SYMBOL(ipsec_alg_test); -+#endif -+#endif ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_alg_cryptoapi.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,515 @@ -+/* -+ * ipsec_alg to linux cryptoapi GLUE -+ * -+ * Authors: CODE.ar TEAM -+ * Harpo MAxx -+ * JuanJo Ciarlante -+ * Luciano Ruete -+ * -+ * ipsec_alg_cryptoapi.c,v 1.1.2.1 2003/11/21 18:12:23 jjo Exp -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ * Example usage: -+ * modinfo -p ipsec_cryptoapi (quite useful info, including supported algos) -+ * modprobe ipsec_cryptoapi -+ * modprobe ipsec_cryptoapi test=1 -+ * modprobe ipsec_cryptoapi excl=1 (exclusive cipher/algo) -+ * modprobe ipsec_cryptoapi noauto=1 aes=1 twofish=1 (only these ciphers) -+ * modprobe ipsec_cryptoapi aes=128,128 (force these keylens) -+ * modprobe ipsec_cryptoapi des_ede3=0 (everything but 3DES) -+ */ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+ -+/* -+ * special case: ipsec core modular with this static algo inside: -+ * must avoid MODULE magic for this file -+ */ -+#if CONFIG_KLIPS_MODULE && CONFIG_KLIPS_ENC_CRYPTOAPI -+#undef MODULE -+#endif -+ -+#include -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -+# include -+#endif -+#include -+ -+#include /* printk() */ -+#include /* error codes */ -+#include /* size_t */ -+#include -+ -+/* Check if __exit is defined, if not null it */ -+#ifndef __exit -+#define __exit -+#endif -+ -+/* warn the innocent */ -+#if !defined (CONFIG_CRYPTO) && !defined (CONFIG_CRYPTO_MODULE) -+#warning "No linux CryptoAPI configured, install 2.4.22+ or 2.6.x or enable CryptoAPI" -+#define NO_CRYPTOAPI_SUPPORT -+#endif -+ -+#include "openswan.h" -+#include "openswan/ipsec_alg.h" -+#include "openswan/ipsec_policy.h" -+ -+#include -+#ifdef CRYPTO_API_VERSION_CODE -+#warning "Old CryptoAPI is not supported. Only linux-2.4.22+ or linux-2.6.x are supported" -+#define NO_CRYPTOAPI_SUPPORT -+#endif -+ -+#ifdef NO_CRYPTOAPI_SUPPORT -+#warning "Building an unusable module :P" -+/* Catch old CryptoAPI by not allowing module to load */ -+IPSEC_ALG_MODULE_INIT_STATIC( ipsec_cryptoapi_init ) -+{ -+ printk(KERN_WARNING "ipsec_cryptoapi.o was not built on stock Linux CryptoAPI (2.4.22+ or 2.6.x), not loading.\n"); -+ return -EINVAL; -+} -+#else -+#include -+#include -+#include -+ -+/* -+ * CryptoAPI compat code - we use the current API and macro back to -+ * the older ones. 
-+ */ -+ -+#ifndef CRYPTO_TFM_MODE_CBC -+/* -+ * As of linux-2.6.21 this is no longer defined, and presumably no longer -+ * needed to be passed into the crypto core code. -+ */ -+#define CRYPTO_TFM_MODE_CBC 0 -+#define CRYPTO_TFM_MODE_ECB 0 -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) -+ /* -+ * Linux 2.6.19 introduced a new Crypto API, setup macro's to convert new -+ * API into old API. -+ */ -+ -+ /* Symmetric/Block Cipher */ -+ struct blkcipher_desc -+ { -+ struct crypto_tfm *tfm; -+ void *info; -+ }; -+ #define ecb(X) #X -+ #define cbc(X) #X -+ #define crypto_has_blkcipher(X, Y, Z) crypto_alg_available(X, 0) -+ #define crypto_blkcipher_cast(X) X -+ #define crypto_blkcipher_tfm(X) X -+ #define crypto_alloc_blkcipher(X, Y, Z) crypto_alloc_tfm(X, CRYPTO_TFM_MODE_CBC) -+ #define crypto_blkcipher_ivsize(X) crypto_tfm_alg_ivsize(X) -+ #define crypto_blkcipher_blocksize(X) crypto_tfm_alg_blocksize(X) -+ #define crypto_blkcipher_setkey(X, Y, Z) crypto_cipher_setkey(X, Y, Z) -+ #define crypto_blkcipher_encrypt_iv(W, X, Y, Z) \ -+ crypto_cipher_encrypt_iv((W)->tfm, X, Y, Z, (u8 *)((W)->info)) -+ #define crypto_blkcipher_decrypt_iv(W, X, Y, Z) \ -+ crypto_cipher_decrypt_iv((W)->tfm, X, Y, Z, (u8 *)((W)->info)) -+ -+ /* Hash/HMAC/Digest */ -+ struct hash_desc -+ { -+ struct crypto_tfm *tfm; -+ }; -+ #define hmac(X) #X -+ #define crypto_has_hash(X, Y, Z) crypto_alg_available(X, 0) -+ #define crypto_hash_cast(X) X -+ #define crypto_hash_tfm(X) X -+ #define crypto_alloc_hash(X, Y, Z) crypto_alloc_tfm(X, 0) -+ #define crypto_hash_digestsize(X) crypto_tfm_alg_digestsize(X) -+ #define crypto_hash_digest(W, X, Y, Z) \ -+ crypto_digest_digest((W)->tfm, X, sg_num, Z) -+ -+ /* Asymmetric Cipher */ -+ #define crypto_has_cipher(X, Y, Z) crypto_alg_available(X, 0) -+ -+ /* Compression */ -+ #define crypto_has_comp(X, Y, Z) crypto_alg_available(X, 0) -+ #define crypto_comp_tfm(X) X -+ #define crypto_comp_cast(X) X -+ #define crypto_alloc_comp(X, Y, Z) 
crypto_alloc_tfm(X, 0) -+#else -+ #define ecb(X) "ecb(" #X ")" -+ #define cbc(X) "cbc(" #X ")" -+ #define hmac(X) "hmac(" #X ")" -+#endif /* if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) */ -+ -+#define CIPHERNAME_AES cbc(aes) -+#define CIPHERNAME_1DES cbc(des) -+#define CIPHERNAME_3DES cbc(des3_ede) -+#define CIPHERNAME_BLOWFISH cbc(blowfish) -+#define CIPHERNAME_CAST cbc(cast5) -+#define CIPHERNAME_SERPENT cbc(serpent) -+#define CIPHERNAME_TWOFISH cbc(twofish) -+ -+#define DIGESTNAME_MD5 "md5" -+#define DIGESTNAME_SHA1 "sha1" -+ -+#define ESP_SERPENT 252 /* from ipsec drafts */ -+#define ESP_TWOFISH 253 /* from ipsec drafts */ -+ -+MODULE_AUTHOR("Juanjo Ciarlante, Harpo MAxx, Luciano Ruete"); -+static int debug_crypto=0; -+static int test_crypto=0; -+static int excl_crypto=0; -+static int noauto = 0; -+module_param(debug_crypto,int,0644); -+module_param(test_crypto,int,0644); -+module_param(excl_crypto,int,0644); -+module_param(noauto,int,0644); -+ -+MODULE_PARM_DESC(noauto, "Dont try all known algos, just setup enabled ones"); -+ -+#ifdef CONFIG_KLIPS_ENC_1DES -+static int des_ede1[] = {-1, -1}; -+#endif -+static int des_ede3[] = {-1, -1}; -+static int aes[] = {-1, -1}; -+static int blowfish[] = {-1, -1}; -+static int cast[] = {-1, -1}; -+static int serpent[] = {-1, -1}; -+static int twofish[] = {-1, -1}; -+ -+#ifdef CONFIG_KLIPS_ENC_1DES -+module_param_array(des_ede1,int,NULL,0444); -+#endif -+module_param_array(des_ede3,int,NULL,0444); -+module_param_array(aes,int,NULL,0444); -+module_param_array(blowfish,int,NULL,0444); -+module_param_array(cast,int,NULL,0444); -+module_param_array(serpent,int,NULL,0444); -+module_param_array(twofish,int,NULL,0444); -+ -+MODULE_PARM_DESC(des_ede1, "0: disable | 1: force_enable | min,max: dontuse"); -+MODULE_PARM_DESC(des_ede3, "0: disable | 1: force_enable | min,max: dontuse"); -+MODULE_PARM_DESC(aes, "0: disable | 1: force_enable | min,max: keybitlens"); -+MODULE_PARM_DESC(blowfish, "0: disable | 1: force_enable | min,max: 
keybitlens"); -+MODULE_PARM_DESC(cast, "0: disable | 1: force_enable | min,max: keybitlens"); -+MODULE_PARM_DESC(serpent, "0: disable | 1: force_enable | min,max: keybitlens"); -+MODULE_PARM_DESC(twofish, "0: disable | 1: force_enable | min,max: keybitlens"); -+ -+struct ipsec_alg_capi_cipher { -+ const char *ciphername; /* cryptoapi's ciphername */ -+ unsigned blocksize; -+ unsigned short minbits; -+ unsigned short maxbits; -+ int *parm; /* lkm param for this cipher */ -+ struct ipsec_alg_enc alg; /* note it's not a pointer */ -+}; -+ -+static struct ipsec_alg_capi_cipher alg_capi_carray[] = { -+ { CIPHERNAME_AES, 16, 128, 256, aes, { ixt_common:{ ixt_support:{ ias_id: ESP_AES}}}}, -+ { CIPHERNAME_TWOFISH, 16, 128, 256, twofish, { ixt_common:{ ixt_support:{ ias_id: ESP_TWOFISH,}}}}, -+ { CIPHERNAME_SERPENT, 16, 128, 256, serpent, { ixt_common:{ ixt_support:{ ias_id: ESP_SERPENT,}}}}, -+ { CIPHERNAME_CAST, 8, 128, 128, cast , { ixt_common:{ ixt_support:{ ias_id: ESP_CAST,}}}}, -+ { CIPHERNAME_BLOWFISH, 8, 96, 448, blowfish, { ixt_common:{ ixt_support:{ ias_id: ESP_BLOWFISH,}}}}, -+ { CIPHERNAME_3DES, 8, 192, 192, des_ede3, { ixt_common:{ ixt_support:{ ias_id: ESP_3DES,}}}}, -+#ifdef CONFIG_KLIPS_ENC_1DES -+ { CIPHERNAME_1DES, 8, 64, 64, des_ede1, { ixt_common:{ ixt_support:{ ias_id: ESP_DES,}}}}, -+#endif -+ { NULL, 0, 0, 0, NULL, {} } -+}; -+ -+#ifdef NOT_YET -+struct ipsec_alg_capi_digest { -+ const char *digestname; /* cryptoapi's digestname */ -+ struct digest_implementation *di; -+ struct ipsec_alg_auth alg; /* note it's not a pointer */ -+}; -+static struct ipsec_alg_capi_cipher alg_capi_darray[] = { -+ { DIGESTNAME_MD5, NULL, { ixt_alg_id: AH_MD5, }}, -+ { DIGESTNAME_SHA1, NULL, { ixt_alg_id: AH_SHA, }}, -+ { NULL, NULL, {} } -+}; -+#endif -+/* -+ * "generic" linux cryptoapi setup_cipher() function -+ */ -+int setup_cipher(const char *ciphername) -+{ -+ return crypto_has_blkcipher(ciphername, 0, CRYPTO_ALG_ASYNC); -+} -+ -+/* -+ * setups 
ipsec_alg_capi_cipher "hyper" struct components, calling -+ * register_ipsec_alg for cointaned ipsec_alg object -+ */ -+static void _capi_destroy_key (struct ipsec_alg_enc *alg, __u8 *key_e); -+static __u8 * _capi_new_key (struct ipsec_alg_enc *alg, const __u8 *key, size_t keylen); -+static int _capi_cbc_encrypt(struct ipsec_alg_enc *alg, __u8 * key_e, __u8 * in, int ilen, __u8 * iv, int encrypt); -+ -+static int -+setup_ipsec_alg_capi_cipher(struct ipsec_alg_capi_cipher *cptr) -+{ -+ int ret; -+ cptr->alg.ixt_common.ixt_version = IPSEC_ALG_VERSION; -+ cptr->alg.ixt_common.ixt_module = THIS_MODULE; -+ atomic_set (& cptr->alg.ixt_common.ixt_refcnt, 0); -+ strncpy (cptr->alg.ixt_common.ixt_name , cptr->ciphername, sizeof (cptr->alg.ixt_common.ixt_name)); -+ -+ cptr->alg.ixt_common.ixt_blocksize=cptr->blocksize; -+ cptr->alg.ixt_common.ixt_support.ias_keyminbits=cptr->minbits; -+ cptr->alg.ixt_common.ixt_support.ias_keymaxbits=cptr->maxbits; -+ cptr->alg.ixt_common.ixt_state = 0; -+ if (excl_crypto) cptr->alg.ixt_common.ixt_state |= IPSEC_ALG_ST_EXCL; -+ cptr->alg.ixt_e_keylen=cptr->alg.ixt_common.ixt_support.ias_keymaxbits/8; -+ cptr->alg.ixt_e_ctx_size = 0; -+ cptr->alg.ixt_common.ixt_support.ias_exttype = IPSEC_ALG_TYPE_ENCRYPT; -+ cptr->alg.ixt_e_new_key = _capi_new_key; -+ cptr->alg.ixt_e_destroy_key = _capi_destroy_key; -+ cptr->alg.ixt_e_cbc_encrypt = _capi_cbc_encrypt; -+ cptr->alg.ixt_common.ixt_data = cptr; -+ -+ ret=register_ipsec_alg_enc(&cptr->alg); -+ printk(KERN_INFO "KLIPS cryptoapi interface: " -+ "alg_type=%d alg_id=%d name=%s " -+ "keyminbits=%d keymaxbits=%d, %s(%d)\n", -+ cptr->alg.ixt_common.ixt_support.ias_exttype, -+ cptr->alg.ixt_common.ixt_support.ias_id, -+ cptr->alg.ixt_common.ixt_name, -+ cptr->alg.ixt_common.ixt_support.ias_keyminbits, -+ cptr->alg.ixt_common.ixt_support.ias_keymaxbits, -+ ret ? 
"not found" : "found", ret); -+ return ret; -+} -+/* -+ * called in ipsec_sa_wipe() time, will destroy key contexts -+ * and do 1 unbind() -+ */ -+static void -+_capi_destroy_key (struct ipsec_alg_enc *alg, __u8 *key_e) -+{ -+ struct crypto_tfm *tfm=(struct crypto_tfm*)key_e; -+ -+ if (debug_crypto > 0) -+ printk(KERN_DEBUG "klips_debug: _capi_destroy_key:" -+ "name=%s key_e=%p \n", -+ alg->ixt_common.ixt_name, key_e); -+ if (!key_e) { -+ printk(KERN_ERR "klips_debug: _capi_destroy_key:" -+ "name=%s NULL key_e!\n", -+ alg->ixt_common.ixt_name); -+ return; -+ } -+ crypto_free_tfm(tfm); -+} -+ -+/* -+ * create new key context, need alg->ixt_data to know which -+ * (of many) cipher inside this module is the target -+ */ -+static __u8 * -+_capi_new_key (struct ipsec_alg_enc *alg, const __u8 *key, size_t keylen) -+{ -+ struct ipsec_alg_capi_cipher *cptr; -+ struct crypto_tfm *tfm=NULL; -+ -+ cptr = alg->ixt_common.ixt_data; -+ if (!cptr) { -+ printk(KERN_ERR "_capi_new_key(): " -+ "NULL ixt_data (?!) 
for \"%s\" algo\n" -+ , alg->ixt_common.ixt_name); -+ goto err; -+ } -+ if (debug_crypto > 0) -+ printk(KERN_DEBUG "klips_debug:_capi_new_key:" -+ "name=%s cptr=%p key=%p keysize=%d\n", -+ alg->ixt_common.ixt_name, cptr, key, keylen); -+ -+ /* -+ * alloc tfm -+ */ -+ tfm = crypto_blkcipher_tfm(crypto_alloc_blkcipher(cptr->ciphername, 0, CRYPTO_ALG_ASYNC)); -+ if (!tfm) { -+ printk(KERN_ERR "_capi_new_key(): " -+ "NULL tfm for \"%s\" cryptoapi (\"%s\") algo\n" -+ , alg->ixt_common.ixt_name, cptr->ciphername); -+ goto err; -+ } -+ if (crypto_blkcipher_setkey(crypto_blkcipher_cast(tfm), key, keylen) < 0) { -+ printk(KERN_ERR "_capi_new_key(): " -+ "failed new_key() for \"%s\" cryptoapi algo (keylen=%d)\n" -+ , alg->ixt_common.ixt_name, keylen); -+ crypto_free_tfm(tfm); -+ tfm=NULL; -+ } -+err: -+ if (debug_crypto > 0) -+ printk(KERN_DEBUG "klips_debug:_capi_new_key:" -+ "name=%s key=%p keylen=%d tfm=%p\n", -+ alg->ixt_common.ixt_name, key, keylen, tfm); -+ return (__u8 *) tfm; -+} -+/* -+ * core encryption function: will use cx->ci to call actual cipher's -+ * cbc function -+ */ -+static int -+_capi_cbc_encrypt(struct ipsec_alg_enc *alg, __u8 * key_e, __u8 * in, int ilen, __u8 * iv, int encrypt) { -+ int error =0; -+ struct crypto_tfm *tfm=(struct crypto_tfm *)key_e; -+ struct scatterlist sg; -+ struct blkcipher_desc desc; -+ if (debug_crypto > 1) -+ printk(KERN_DEBUG "klips_debug:_capi_cbc_encrypt:" -+ "key_e=%p " -+ "in=%p out=%p ilen=%d iv=%p encrypt=%d\n" -+ , key_e -+ , in, in, ilen, iv, encrypt); -+ -+ memset(&sg, 0, sizeof(sg)); -+ sg_set_page(&sg, virt_to_page(in), ilen, offset_in_page(in)); -+ -+ memset(&desc, 0, sizeof(desc)); -+ desc.tfm = crypto_blkcipher_cast(tfm); -+ desc.info = (void *) iv; -+ -+ if (encrypt) -+ error = crypto_blkcipher_encrypt_iv (&desc, &sg, &sg, ilen); -+ else -+ error = crypto_blkcipher_decrypt_iv (&desc, &sg, &sg, ilen); -+ if (debug_crypto > 1) -+ printk(KERN_DEBUG "klips_debug:_capi_cbc_encrypt:" -+ "error=%d\n" -+ , error); -+ 
return (error<0)? error : ilen; -+} -+/* -+ * main initialization loop: for each cipher in list, do -+ * 1) setup cryptoapi cipher else continue -+ * 2) register ipsec_alg object -+ */ -+static int -+setup_cipher_list (struct ipsec_alg_capi_cipher* clist) -+{ -+ struct ipsec_alg_capi_cipher *cptr; -+ /* foreach cipher in list ... */ -+ for (cptr=clist;cptr->ciphername;cptr++) { -+ /* -+ * see if cipher has been disabled (0) or -+ * if noauto set and not enabled (1) -+ */ -+ if (cptr->parm[0] == 0 || (noauto && cptr->parm[0] < 0)) { -+ if (debug_crypto>0) -+ printk(KERN_INFO "setup_cipher_list(): " -+ "ciphername=%s skipped at user request: " -+ "noauto=%d parm[0]=%d parm[1]=%d\n" -+ , cptr->ciphername -+ , noauto -+ , cptr->parm[0] -+ , cptr->parm[1]); -+ continue; -+ } else { -+ if (debug_crypto>0) -+ printk(KERN_INFO "setup_cipher_list(): going to init ciphername=%s: noauto=%d parm[0]=%d parm[1]=%d\n", -+ , cptr->ciphername -+ , noauto -+ , cptr->parm[0] -+ , cptr->parm[1]); -+ } -+ /* -+ * use a local ci to avoid touching cptr->ci, -+ * if register ipsec_alg success then bind cipher -+ */ -+ if(cptr->alg.ixt_common.ixt_support.ias_name == NULL) { -+ cptr->alg.ixt_common.ixt_support.ias_name = cptr->ciphername; -+ } -+ -+ if( setup_cipher(cptr->ciphername) ) { -+ if (debug_crypto > 0) -+ printk(KERN_DEBUG "klips_debug:" -+ "setup_cipher_list():" -+ "ciphername=%s found\n" -+ , cptr->ciphername); -+ -+ if (setup_ipsec_alg_capi_cipher(cptr) != 0) { -+ printk(KERN_ERR "klips_debug:" -+ "setup_cipher_list():" -+ "ciphername=%s failed ipsec_alg_register\n" -+ , cptr->ciphername); -+ } -+ } else { -+ printk(KERN_INFO "KLIPS: lookup for ciphername=%s: not found \n", -+ cptr->ciphername); -+ } -+ } -+ return 0; -+} -+/* -+ * deregister ipsec_alg objects and unbind ciphers -+ */ -+static int -+unsetup_cipher_list (struct ipsec_alg_capi_cipher* clist) -+{ -+ struct ipsec_alg_capi_cipher *cptr; -+ /* foreach cipher in list ... 
*/ -+ for (cptr=clist;cptr->ciphername;cptr++) { -+ if (cptr->alg.ixt_common.ixt_state & IPSEC_ALG_ST_REGISTERED) { -+ unregister_ipsec_alg_enc(&cptr->alg); -+ } -+ } -+ return 0; -+} -+/* -+ * test loop for registered algos -+ */ -+static int -+test_cipher_list (struct ipsec_alg_capi_cipher* clist) -+{ -+ int test_ret; -+ struct ipsec_alg_capi_cipher *cptr; -+ /* foreach cipher in list ... */ -+ for (cptr=clist;cptr->ciphername;cptr++) { -+ if (cptr->alg.ixt_common.ixt_state & IPSEC_ALG_ST_REGISTERED) { -+ test_ret=ipsec_alg_test( -+ cptr->alg.ixt_common.ixt_support.ias_exttype, -+ cptr->alg.ixt_common.ixt_support.ias_id, -+ test_crypto); -+ printk("test_cipher_list(alg_type=%d alg_id=%d): test_ret=%d\n", -+ cptr->alg.ixt_common.ixt_support.ias_exttype, -+ cptr->alg.ixt_common.ixt_support.ias_id, -+ test_ret); -+ } -+ } -+ return 0; -+} -+ -+IPSEC_ALG_MODULE_INIT_STATIC( ipsec_cryptoapi_init ) -+{ -+ int ret, test_ret; -+ if ((ret=setup_cipher_list(alg_capi_carray)) < 0) -+ return -EPROTONOSUPPORT; -+ if (ret==0 && test_crypto) { -+ test_ret=test_cipher_list(alg_capi_carray); -+ } -+ return ret; -+} -+IPSEC_ALG_MODULE_EXIT_STATIC( ipsec_cryptoapi_fini ) -+{ -+ unsetup_cipher_list(alg_capi_carray); -+ return; -+} -+#ifdef MODULE_LICENSE -+MODULE_LICENSE("GPL"); -+#endif -+ -+#endif /* NO_CRYPTOAPI_SUPPORT */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_esp.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,584 @@ -+/* -+ * processing code for ESP -+ * Copyright (C) 2003 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License -+ * for more details. -+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+ -+#define __NO_VERSION__ -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+#include -+#ifdef SPINLOCK -+# ifdef SPINLOCK_23 -+# include /* *lock* */ -+# else /* SPINLOCK_23 */ -+# include /* *lock* */ -+# endif /* SPINLOCK_23 */ -+#endif /* SPINLOCK */ -+ -+#include -+#include -+ -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_sa.h" -+ -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_rcv.h" -+#include "openswan/ipsec_xmit.h" -+ -+#include "openswan/ipsec_auth.h" -+ -+#ifdef CONFIG_KLIPS_ESP -+#include "openswan/ipsec_esp.h" -+#endif /* CONFIG_KLIPS_ESP */ -+ -+#include "openswan/ipsec_proto.h" -+#include "openswan/ipsec_alg.h" -+#include "ipsec_ocf.h" -+ -+#ifdef CONFIG_KLIPS_DEBUG -+#define ESP_DMP(_x,_y,_z) if(debug_rcv && sysctl_ipsec_debug_verbose) ipsec_dmp_block(_x,_y,_z) -+#else -+#define ESP_DMP(_x,_y,_z) -+#endif -+ -+#ifdef CONFIG_KLIPS_ESP -+enum ipsec_rcv_value -+ipsec_rcv_esp_checks(struct ipsec_rcv_state *irs, -+ struct sk_buff *skb) -+{ -+ __u8 proto; -+ int len; /* packet length */ -+ -+ len = skb->len; -+ proto = irs->ipp->protocol; -+ -+ /* XXX this will need to be 8 for IPv6 */ -+ if ((proto == IPPROTO_ESP) && ((len - irs->iphlen) % 4)) { -+ printk("klips_error:ipsec_rcv: " -+ "got packet with content length = %d from %s -- should be on 4 octet boundary, packet dropped\n", -+ len - irs->iphlen, -+ irs->ipsaddr_txt); -+ 
if(irs->stats) { -+ irs->stats->rx_errors++; -+ } -+ return IPSEC_RCV_BADLEN; -+ } -+ -+ if(skb->len < (irs->hard_header_len + sizeof(struct iphdr) + sizeof(struct esphdr))) { -+ KLIPS_PRINT(debug_rcv & DB_RX_INAU, -+ "klips_debug:ipsec_rcv: " -+ "runt esp packet of skb->len=%d received from %s, dropped.\n", -+ skb->len, -+ irs->ipsaddr_txt); -+ if(irs->stats) { -+ irs->stats->rx_errors++; -+ } -+ return IPSEC_RCV_BADLEN; -+ } -+ -+ irs->protostuff.espstuff.espp = (struct esphdr *)skb_transport_header(skb); -+ irs->said.spi = irs->protostuff.espstuff.espp->esp_spi; -+ -+ return IPSEC_RCV_OK; -+} -+ -+enum ipsec_rcv_value -+ipsec_rcv_esp_decrypt_setup(struct ipsec_rcv_state *irs, -+ struct sk_buff *skb, -+ __u32 *replay, -+ unsigned char **authenticator) -+{ -+ struct esphdr *espp = irs->protostuff.espstuff.espp; -+ //unsigned char *idat = (unsigned char *)espp; -+ -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "packet from %s received with seq=%d (iv)=0x%08x%08x iplen=%d esplen=%d sa=%s\n", -+ irs->ipsaddr_txt, -+ (__u32)ntohl(espp->esp_rpl), -+ (__u32)ntohl(*((__u32 *)(espp->esp_iv) )), -+ (__u32)ntohl(*((__u32 *)(espp->esp_iv) + 1)), -+ irs->len, -+ irs->ilen, -+ irs->sa_len ? irs->sa : " (error)"); -+ -+ *replay = ntohl(espp->esp_rpl); -+ *authenticator = &(skb_transport_header(skb)[irs->ilen]); -+ -+ return IPSEC_RCV_OK; -+} -+ -+enum ipsec_rcv_value -+ipsec_rcv_esp_authcalc(struct ipsec_rcv_state *irs, -+ struct sk_buff *skb) -+{ -+ struct auth_alg *aa; -+ struct esphdr *espp = irs->protostuff.espstuff.espp; -+ union { -+ MD5_CTX md5; -+ SHA1_CTX sha1; -+ } tctx; -+ -+#ifdef CONFIG_KLIPS_OCF -+ if (irs->ipsp->ocf_in_use) -+ return(ipsec_ocf_rcv(irs)); -+#endif -+ -+#ifdef CONFIG_KLIPS_ALG -+ if (irs->ipsp->ips_alg_auth) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "ipsec_alg hashing proto=%d... 
", -+ irs->said.proto); -+ if(irs->said.proto == IPPROTO_ESP) { -+ ipsec_alg_sa_esp_hash(irs->ipsp, -+ (caddr_t)espp, irs->ilen, -+ irs->hash, AHHMAC_HASHLEN); -+ return IPSEC_RCV_OK; -+ } -+ return IPSEC_RCV_BADPROTO; -+ } -+#endif -+ aa = irs->authfuncs; -+ -+ /* copy the initialized keying material */ -+ memcpy(&tctx, irs->ictx, irs->ictx_len); -+ -+#ifdef HASH_DEBUG -+ ESP_DMP("ictx", irs->ictx, irs->ictx_len); -+ -+ ESP_DMP("mac_esp", (caddr_t)espp, irs->ilen); -+#endif -+ (*aa->update)((void *)&tctx, (caddr_t)espp, irs->ilen); -+ -+ (*aa->final)(irs->hash, (void *)&tctx); -+ -+#ifdef HASH_DEBUG -+ ESP_DMP("hash1", irs->hash, aa->hashlen); -+#endif -+ -+ memcpy(&tctx, irs->octx, irs->octx_len); -+ -+#ifdef HASH_DEBUG -+ ESP_DMP("octx", irs->octx, irs->octx_len); -+#endif -+ -+ (*aa->update)((void *)&tctx, irs->hash, aa->hashlen); -+ (*aa->final)(irs->hash, (void *)&tctx); -+ -+ return IPSEC_RCV_OK; -+} -+ -+ -+enum ipsec_rcv_value -+ipsec_rcv_esp_decrypt(struct ipsec_rcv_state *irs) -+{ -+ struct ipsec_sa *ipsp = irs->ipsp; -+#ifdef CONFIG_KLIPS_ALG -+ struct esphdr *espp = irs->protostuff.espstuff.espp; -+ __u8 *idat; /* pointer to content to be decrypted/authenticated */ -+ int encaplen = 0; -+ struct sk_buff *skb; -+ struct ipsec_alg_enc *ixt_e=NULL; -+#endif -+ -+#ifdef CONFIG_KLIPS_OCF -+ if (ipsp->ocf_in_use) -+ return(ipsec_ocf_rcv(irs)); -+#endif -+ -+#ifdef CONFIG_KLIPS_ALG -+ skb=irs->skb; -+ -+ idat = skb_transport_header(skb); -+ -+ /* encaplen is the distance between the end of the IP -+ * header and the beginning of the ESP header. -+ * on ESP headers it is zero, but on UDP-encap ESP -+ * it includes the space for the UDP header. -+ * -+ * Note: UDP-encap code has already moved the -+ * skb->data forward to accomodate this. 
-+ */ -+ encaplen = skb_transport_header(skb) - (skb_network_header(skb) + irs->iphlen); -+ -+ ixt_e=ipsp->ips_alg_enc; -+ irs->esphlen = ESP_HEADER_LEN + ixt_e->ixt_common.ixt_support.ias_ivlen/8; -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "encalg=%d esphlen=%d\n", -+ ipsp->ips_encalg, irs->esphlen); -+ -+ idat += irs->esphlen; -+ irs->ilen -= irs->esphlen; -+ -+ if (ipsec_alg_esp_encrypt(ipsp, -+ idat, irs->ilen, espp->esp_iv, -+ IPSEC_ALG_DECRYPT) <= 0) { -+#ifdef CONFIG_KLIPS_DEBUG -+ KLIPS_ERROR(debug_rcv, "klips_error:ipsec_rcv: " -+ "got packet with esplen = %d " -+ "from %s -- should be on " -+ "ENC(%d) octet boundary, " -+ "packet dropped\n", -+ irs->ilen, -+ irs->ipsaddr_txt, -+ ipsp->ips_encalg); -+#endif -+ if(irs->stats) { -+ irs->stats->rx_errors++; -+ } -+ return IPSEC_RCV_BAD_DECRYPT; -+ } -+ -+ return ipsec_rcv_esp_post_decrypt(irs); -+#else -+ return IPSEC_RCV_BAD_DECRYPT; -+#endif /* CONFIG_KLIPS_ALG */ -+} -+ -+ -+enum ipsec_rcv_value -+ipsec_rcv_esp_post_decrypt(struct ipsec_rcv_state *irs) -+{ -+ struct sk_buff *skb; -+ __u8 *idat; /* pointer to content to be decrypted/authenticated */ -+ struct ipsec_sa *ipsp = irs->ipsp; -+ int pad = 0, padlen; -+ int badpad = 0; -+ int i; -+ -+ skb = irs->skb; -+ -+ idat = skb_transport_header(skb) + irs->esphlen; -+ -+ ESP_DMP("postdecrypt", idat, irs->ilen); -+ -+ irs->next_header = idat[irs->ilen - 1]; -+ padlen = idat[irs->ilen - 2]; -+ pad = padlen + 2 + irs->authlen; -+ -+ KLIPS_PRINT(debug_rcv & DB_RX_IPAD, -+ "klips_debug:ipsec_rcv_esp_post_decrypt: " -+ "padlen=%d, contents: 0x: 0x 0x ...\n", -+ padlen); -+ -+ for (i = 1; i <= padlen; i++) { -+ if((i % 16) == 1) { -+ KLIPS_PRINT(debug_rcv & DB_RX_IPAD, -+ "klips_debug: %02x:", -+ i - 1); -+ } -+ KLIPS_PRINTMORE(debug_rcv & DB_RX_IPAD, -+ " %02x", -+ idat[irs->ilen - 2 - padlen + i - 1]); -+ if(i != idat[irs->ilen - 2 - padlen + i - 1]) { -+ badpad = 1; -+ } -+ if((i % 16) == 0) { -+ KLIPS_PRINTMORE(debug_rcv & DB_RX_IPAD, -+ "\n"); 
-+ } -+ } -+ if((i % 16) != 1) { -+ KLIPS_PRINTMORE(debug_rcv & DB_RX_IPAD, -+ "\n"); -+ } -+ if(badpad) { -+ KLIPS_PRINT(debug_rcv & DB_RX_IPAD, -+ "klips_debug:ipsec_rcv_esp_post_decrypt: " -+ "warning, decrypted packet from %s has bad padding\n", -+ irs->ipsaddr_txt); -+ KLIPS_PRINT(debug_rcv & DB_RX_IPAD, -+ "klips_debug:ipsec_rcv_esp_post_decrypt: " -+ "...may be bad decryption -- not dropped\n"); -+ ipsp->ips_errs.ips_encpad_errs += 1; -+ } -+ -+ KLIPS_PRINT(debug_rcv & DB_RX_IPAD, -+ "klips_debug:ipsec_rcv_esp_post_decrypt: " -+ "packet decrypted from %s: next_header = %d, padding = %d\n", -+ irs->ipsaddr_txt, -+ irs->next_header, -+ pad - 2 - irs->authlen); -+ -+ irs->ipp->tot_len = htons(ntohs(irs->ipp->tot_len) - (irs->esphlen + pad)); -+ -+ /* -+ * move the IP header forward by the size of the ESP header, which -+ * will remove the the ESP header from the packet. -+ * -+ * XXX this is really unnecessary, since odds we are in tunnel -+ * mode, and we will be *removing* this IP header. -+ * -+ */ -+ memmove((void *)(idat - irs->iphlen), -+ (void *)(skb_network_header(skb)), irs->iphlen); -+ -+ ESP_DMP("esp postmove", (idat - irs->iphlen), -+ irs->iphlen + irs->ilen); -+ -+ /* skb_pull below, will move up by esphlen */ -+ -+ /* XXX not clear how this can happen, as the message indicates */ -+ if(skb->len < irs->esphlen) { -+ printk(KERN_WARNING -+ "klips_error:ipsec_rcv_esp_post_decrypt: " -+ "tried to skb_pull esphlen=%d, %d available. 
This should never happen, please report.\n", -+ irs->esphlen, (int)(skb->len)); -+ return IPSEC_RCV_ESP_DECAPFAIL; -+ } -+ skb_pull(skb, irs->esphlen); -+ skb_set_network_header(skb, ipsec_skb_offset(skb, idat - irs->iphlen)); -+ irs->ipp = ip_hdr(skb); -+ -+ ESP_DMP("esp postpull", skb->data, skb->len); -+ -+ /* now, trip off the padding from the end */ -+ KLIPS_PRINT(debug_rcv & DB_RX_PKTRX, -+ "klips_debug:ipsec_rcv: " -+ "trimming to %d.\n", -+ irs->len - irs->esphlen - pad); -+ if(pad + irs->esphlen <= irs->len) { -+ skb_trim(skb, irs->len - irs->esphlen - pad); -+ } else { -+ KLIPS_PRINT(debug_rcv & DB_RX_PKTRX, -+ "klips_debug:ipsec_rcv: " -+ "bogus packet, size is zero or negative, dropping.\n"); -+ return IPSEC_RCV_DECAPFAIL; -+ } -+ -+ return IPSEC_RCV_OK; -+} -+ -+/* -+ * -+ */ -+enum ipsec_xmit_value -+ipsec_xmit_esp_setup(struct ipsec_xmit_state *ixs) -+{ -+#ifdef CONFIG_KLIPS_ENC_3DES -+ __u32 iv[2]; -+#endif -+ struct esphdr *espp; -+ int ilen = 0; -+ int padlen = 0, i; -+ unsigned char *dat; -+ unsigned char *idat, *pad; -+#if defined(CONFIG_KLIPS_AUTH_HMAC_MD5) || defined(CONFIG_KLIPS_AUTH_HMAC_SHA1) -+ __u8 hash[AH_AMAX]; -+ union { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ MD5_CTX md5; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ SHA1_CTX sha1; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ } tctx; -+#endif -+ -+ dat = (unsigned char *)ixs->iph; -+ -+ espp = (struct esphdr *)(dat + ixs->iphlen); -+ espp->esp_spi = ixs->ipsp->ips_said.spi; -+ espp->esp_rpl = htonl(++(ixs->ipsp->ips_replaywin_lastseq)); -+ -+ switch(ixs->ipsp->ips_encalg) { -+#if defined(CONFIG_KLIPS_ENC_3DES) -+#ifdef CONFIG_KLIPS_ENC_3DES -+ case ESP_3DES: -+#endif /* CONFIG_KLIPS_ENC_3DES */ -+ iv[0] = *((__u32*)&(espp->esp_iv) ) = -+ ((__u32*)(ixs->ipsp->ips_iv))[0]; -+ iv[1] = *((__u32*)&(espp->esp_iv) + 1) = -+ ((__u32*)(ixs->ipsp->ips_iv))[1]; -+ break; -+#endif /* defined(CONFIG_KLIPS_ENC_3DES) */ -+ default: -+ ixs->stats->tx_errors++; -+ 
return IPSEC_XMIT_ESP_BADALG; -+ } -+ -+ idat = dat + ixs->iphlen + sizeof(struct esphdr); -+ ilen = ixs->skb->len - (ixs->iphlen + sizeof(struct esphdr) + ixs->authlen); -+ -+ /* Self-describing padding */ -+ pad = &dat[ixs->skb->len - ixs->tailroom]; -+ padlen = ixs->tailroom - 2 - ixs->authlen; -+ for (i = 0; i < padlen; i++) { -+ pad[i] = i + 1; -+ } -+ dat[ixs->skb->len - ixs->authlen - 2] = padlen; -+ -+ dat[ixs->skb->len - ixs->authlen - 1] = ixs->iph->protocol; -+ ixs->iph->protocol = IPPROTO_ESP; -+ -+ switch(ixs->ipsp->ips_encalg) { -+#ifdef CONFIG_KLIPS_ENC_3DES -+ case ESP_3DES: -+ des_ede3_cbc_encrypt((des_cblock *)idat, -+ (des_cblock *)idat, -+ ilen, -+ ((struct des_eks *)(ixs->ipsp->ips_key_e))[0].ks, -+ ((struct des_eks *)(ixs->ipsp->ips_key_e))[1].ks, -+ ((struct des_eks *)(ixs->ipsp->ips_key_e))[2].ks, -+ (des_cblock *)iv, 1); -+ break; -+#endif /* CONFIG_KLIPS_ENC_3DES */ -+ default: -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_ESP_BADALG; -+ } -+ -+ switch(ixs->ipsp->ips_encalg) { -+#if defined(CONFIG_KLIPS_ENC_3DES) -+#ifdef CONFIG_KLIPS_ENC_3DES -+ case ESP_3DES: -+#endif /* CONFIG_KLIPS_ENC_3DES */ -+ /* XXX update IV with the last 8 octets of the encryption */ -+#if KLIPS_IMPAIRMENT_ESPIV_CBC_ATTACK -+ ((__u32*)(ixs->ipsp->ips_iv))[0] = -+ ((__u32 *)(idat))[(ilen >> 2) - 2]; -+ ((__u32*)(ixs->ipsp->ips_iv))[1] = -+ ((__u32 *)(idat))[(ilen >> 2) - 1]; -+#else /* KLIPS_IMPAIRMENT_ESPIV_CBC_ATTACK */ -+ prng_bytes(&ipsec_prng, (char *)ixs->ipsp->ips_iv, EMT_ESPDES_IV_SZ); -+#endif /* KLIPS_IMPAIRMENT_ESPIV_CBC_ATTACK */ -+ break; -+#endif /* defined(CONFIG_KLIPS_ENC_3DES) */ -+ default: -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_ESP_BADALG; -+ } -+ -+ switch(ixs->ipsp->ips_authalg) { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ case AH_MD5: -+ ipsec_xmit_dmp("espp", (char*)espp, ixs->skb->len - ixs->iphlen - ixs->authlen); -+ tctx.md5 = ((struct md5_ctx*)(ixs->ipsp->ips_key_a))->ictx; -+ ipsec_xmit_dmp("ictx", (char*)&tctx.md5, 
sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, (caddr_t)espp, ixs->skb->len - ixs->iphlen - ixs->authlen); -+ ipsec_xmit_dmp("ictx+dat", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Final(hash, &tctx.md5); -+ ipsec_xmit_dmp("ictx hash", (char*)&hash, sizeof(hash)); -+ tctx.md5 = ((struct md5_ctx*)(ixs->ipsp->ips_key_a))->octx; -+ ipsec_xmit_dmp("octx", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, hash, AHMD596_ALEN); -+ ipsec_xmit_dmp("octx+hash", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Final(hash, &tctx.md5); -+ ipsec_xmit_dmp("octx hash", (char*)&hash, sizeof(hash)); -+ memcpy(&(dat[ixs->skb->len - ixs->authlen]), hash, ixs->authlen); -+ -+ /* paranoid */ -+ memset((caddr_t)&tctx.md5, 0, sizeof(tctx.md5)); -+ memset((caddr_t)hash, 0, sizeof(*hash)); -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ case AH_SHA: -+ tctx.sha1 = ((struct sha1_ctx*)(ixs->ipsp->ips_key_a))->ictx; -+ SHA1Update(&tctx.sha1, (caddr_t)espp, ixs->skb->len - ixs->iphlen - ixs->authlen); -+ SHA1Final(hash, &tctx.sha1); -+ tctx.sha1 = ((struct sha1_ctx*)(ixs->ipsp->ips_key_a))->octx; -+ SHA1Update(&tctx.sha1, hash, AHSHA196_ALEN); -+ SHA1Final(hash, &tctx.sha1); -+ memcpy(&(dat[ixs->skb->len - ixs->authlen]), hash, ixs->authlen); -+ -+ /* paranoid */ -+ memset((caddr_t)&tctx.sha1, 0, sizeof(tctx.sha1)); -+ memset((caddr_t)hash, 0, sizeof(*hash)); -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ case AH_NONE: -+ break; -+ default: -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_AH_BADALG; -+ } -+ -+ skb_set_transport_header(ixs->skb, ipsec_skb_offset(ixs->skb, espp)); -+ -+ return IPSEC_XMIT_OK; -+} -+ -+ -+struct xform_functions esp_xform_funcs[]={ -+ { -+ protocol: IPPROTO_ESP, -+ rcv_checks: ipsec_rcv_esp_checks, -+ rcv_setup_auth: ipsec_rcv_esp_decrypt_setup, -+ rcv_calc_auth: ipsec_rcv_esp_authcalc, -+ rcv_decrypt: ipsec_rcv_esp_decrypt, -+ -+ xmit_setup: ipsec_xmit_esp_setup, -+ xmit_headroom: sizeof(struct esphdr), 
-+ xmit_needtailroom: 1, -+ }, -+}; -+ -+#ifndef CONFIG_XFRM_ALTERNATE_STACK -+#ifdef NET_26 -+struct inet_protocol esp_protocol = { -+ .handler = ipsec_rcv, -+ .no_policy = 1, -+}; -+#else -+struct inet_protocol esp_protocol = -+{ -+ ipsec_rcv, /* ESP handler */ -+ NULL, /* TUNNEL error control */ -+#ifdef NETDEV_25 -+ 1, /* no policy */ -+#else -+ 0, /* next */ -+ IPPROTO_ESP, /* protocol ID */ -+ 0, /* copy */ -+ NULL, /* data */ -+ "ESP" /* name */ -+#endif -+}; -+#endif /* NET_26 */ -+#endif /* CONFIG_XFRM_ALTERNATE_STACK */ -+ -+#endif /* !CONFIG_KLIPS_ESP */ -+ -+/* -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_init.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,460 @@ -+/* -+ * @(#) Initialization code. -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998 - 2002 Richard Guy Briggs -+ * 2001 - 2004 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * /proc system code was split out into ipsec_proc.c after rev. 1.70. 
-+ * -+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include /* struct sockaddr_in */ -+#include -+#include /* get_random_bytes() */ -+#include -+ -+#include -+ -+#ifdef SPINLOCK -+# ifdef SPINLOCK_23 -+# include /* *lock* */ -+# else /* 23_SPINLOCK */ -+# include /* *lock* */ -+# endif /* 23_SPINLOCK */ -+#endif /* SPINLOCK */ -+ -+#include -+ -+#ifdef CONFIG_PROC_FS -+# include -+#endif /* CONFIG_PROC_FS */ -+ -+#ifdef NETLINK_SOCK -+# include -+#else -+# include -+#endif -+ -+#include "openswan/radij.h" -+ -+#include "openswan/ipsec_life.h" -+#include "openswan/ipsec_stats.h" -+#include "openswan/ipsec_sa.h" -+ -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_mast.h" -+ -+#include "openswan/ipsec_rcv.h" -+#include "openswan/ipsec_xmit.h" -+#include "openswan/ipsec_ah.h" -+#include "openswan/ipsec_esp.h" -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+# include "openswan/ipcomp.h" -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ -+#include "openswan/ipsec_proto.h" -+#include "openswan/ipsec_alg.h" -+ -+#ifdef CONFIG_KLIPS_OCF -+#include "ipsec_ocf.h" -+#endif -+ -+#include -+#include -+ -+#if defined(NET_26) && defined(CONFIG_IPSEC_NAT_TRAVERSAL) -+#include -+#endif -+ -+#ifndef HAVE_UDP_ENCAP_CONVERT -+# if defined(NET_26) && defined(CONFIG_IPSEC_NAT_TRAVERSAL) && !defined(HAVE_XFRM4_UDP_REGISTER) -+# warning "You are trying to build KLIPS2.6 with NAT-T support, but you did not" -+# error "properly apply the NAT-T patch to your 2.6 kernel source 
tree." -+# endif -+#endif -+ -+#if !defined(CONFIG_KLIPS_ESP) && !defined(CONFIG_KLIPS_AH) -+#error "kernel configuration must include ESP or AH" -+#endif -+ -+/* -+ * seems to be present in 2.4.10 (Linus), but also in some RH and other -+ * distro kernels of a lower number. -+ */ -+#ifdef MODULE_LICENSE -+MODULE_LICENSE("GPL"); -+#endif -+ -+struct prng ipsec_prng; -+ -+#if defined(NET_26) && defined(CONFIG_IPSEC_NAT_TRAVERSAL) -+xfrm4_rcv_encap_t klips_old_encap = NULL; -+#endif -+ -+extern int ipsec_device_event(struct notifier_block *dnot, unsigned long event, void *ptr); -+/* -+ * the following structure is required so that we receive -+ * event notifications when network devices are enabled and -+ * disabled (ifconfig up and down). -+ */ -+static struct notifier_block ipsec_dev_notifier={ -+ .notifier_call = ipsec_device_event -+}; -+ -+#ifdef CONFIG_SYSCTL -+extern int ipsec_sysctl_register(void); -+extern void ipsec_sysctl_unregister(void); -+#endif -+ -+/* -+ * inet_*_protocol returns void on 2.4.x, int on 2.6.x -+ * So we need our own wrapper -+*/ -+#ifdef NET_26 -+static inline int -+openswan_inet_add_protocol(struct inet_protocol *prot, unsigned protocol, char *protstr) -+{ -+ int err = inet_add_protocol(prot, protocol); -+ if (err) -+ printk(KERN_ERR "KLIPS: can not register %s protocol - recompile with CONFIG_INET_%s disabled or as module\n", protstr,protstr); -+ return err; -+} -+ -+static inline int -+openswan_inet_del_protocol(struct inet_protocol *prot, unsigned protocol) -+{ -+ return inet_del_protocol(prot, protocol); -+} -+ -+#else -+static inline int -+openswan_inet_add_protocol(struct inet_protocol *prot, unsigned protocol) -+{ -+#ifdef IPSKB_XFRM_TUNNEL_SIZE -+ inet_add_protocol(prot, protocol); -+#else -+ inet_add_protocol(prot); -+#endif -+ return 0; -+} -+ -+static inline int -+openswan_inet_del_protocol(struct inet_protocol *prot, unsigned protocol) -+{ -+#ifdef IPSKB_XFRM_TUNNEL_SIZE -+ inet_del_protocol(prot, protocol); -+#else -+ 
inet_del_protocol(prot); -+#endif -+ return 0; -+} -+ -+#endif -+ -+/* void */ -+int -+ipsec_klips_init(void) -+{ -+ int error = 0; -+ unsigned char seed[256]; -+#ifdef CONFIG_KLIPS_ENC_3DES -+ extern int des_check_key; -+ -+ /* turn off checking of keys */ -+ des_check_key=0; -+#endif /* CONFIG_KLIPS_ENC_3DES */ -+ -+ KLIPS_PRINT(1, "klips_info:ipsec_init: " -+ "KLIPS startup, Openswan KLIPS IPsec stack version: %s\n", -+ ipsec_version_code()); -+ -+ error = ipsec_xmit_state_cache_init (); -+ if (error) -+ goto error_xmit_state_cache; -+ -+ error = ipsec_rcv_state_cache_init (); -+ if (error) -+ goto error_rcv_state_cache; -+ -+ error |= ipsec_proc_init(); -+ if (error) -+ goto error_proc_init; -+ -+#ifdef SPINLOCK -+ ipsec_sadb.sadb_lock = SPIN_LOCK_UNLOCKED; -+#else /* SPINLOCK */ -+ ipsec_sadb.sadb_lock = 0; -+#endif /* SPINLOCK */ -+ -+#ifndef SPINLOCK -+ tdb_lock.lock = 0; -+ eroute_lock.lock = 0; -+#endif /* !SPINLOCK */ -+ -+ error |= ipsec_sadb_init(); -+ if (error) -+ goto error_sadb_init; -+ -+ error |= ipsec_radijinit(); -+ if (error) -+ goto error_radijinit; -+ -+ error |= pfkey_init(); -+ if (error) -+ goto error_pfkey_init; -+ -+ error |= register_netdevice_notifier(&ipsec_dev_notifier); -+ if (error) -+ goto error_netdev_notifier; -+ -+#ifdef CONFIG_XFRM_ALTERNATE_STACK -+ error = xfrm_register_alternate_rcv (ipsec_rcv); -+ if (error) -+ goto error_xfrm_register; -+ -+#else // CONFIG_XFRM_ALTERNATE_STACK -+ -+#ifdef CONFIG_KLIPS_ESP -+ error |= openswan_inet_add_protocol(&esp_protocol, IPPROTO_ESP,"ESP"); -+ if (error) -+ goto error_openswan_inet_add_protocol_esp; -+ -+#endif /* CONFIG_KLIPS_ESP */ -+ -+#ifdef CONFIG_KLIPS_AH -+ error |= openswan_inet_add_protocol(&ah_protocol, IPPROTO_AH,"AH"); -+ if (error) -+ goto error_openswan_inet_add_protocol_ah; -+#endif /* CONFIG_KLIPS_AH */ -+ -+/* we never actually link IPCOMP to the stack */ -+#ifdef IPCOMP_USED_ALONE -+#ifdef CONFIG_KLIPS_IPCOMP -+ error |= openswan_inet_add_protocol(&comp_protocol, 
IPPROTO_COMP,"IPCOMP"); -+ if (error) -+ goto error_openswan_inet_add_protocol_comp; -+#endif /* CONFIG_KLIPS_IPCOMP */ -+#endif -+ -+#endif // CONFIG_XFRM_ALTERNATE_STACK -+ -+ error |= ipsec_tunnel_init_devices(); -+ if (error) -+ goto error_tunnel_init_devices; -+ -+ error |= ipsec_mast_init_devices(); -+ -+#if defined(NET_26) && defined(CONFIG_IPSEC_NAT_TRAVERSAL) -+ /* register our ESP-UDP handler */ -+ if(udp4_register_esp_rcvencap(klips26_rcv_encap -+ , &klips_old_encap)!=0) { -+ printk(KERN_ERR "KLIPS: can not register klips_rcv_encap function\n"); -+ } -+#endif -+ -+#ifdef CONFIG_SYSCTL -+ error |= ipsec_sysctl_register(); -+ if (error) -+ goto error_sysctl_register; -+#endif -+ -+#ifdef CONFIG_KLIPS_ALG -+ ipsec_alg_init(); -+#endif -+ -+#ifdef CONFIG_KLIPS_OCF -+ ipsec_ocf_init(); -+#endif -+ -+ get_random_bytes((void *)seed, sizeof(seed)); -+ prng_init(&ipsec_prng, seed, sizeof(seed)); -+ return error; -+ -+ // undo ipsec_sysctl_register -+error_sysctl_register: -+ ipsec_tunnel_cleanup_devices(); -+error_tunnel_init_devices: -+#ifdef CONFIG_XFRM_ALTERNATE_STACK -+ xfrm_deregister_alternate_rcv(ipsec_rcv); -+error_xfrm_register: -+#else // CONFIG_XFRM_ALTERNATE_STACK -+#ifdef IPCOMP_USED_ALONE -+#ifdef CONFIG_KLIPS_IPCOMP -+error_openswan_inet_add_protocol_comp: -+ openswan_inet_del_protocol(&comp_protocol, IPPROTO_COMP); -+#endif /* CONFIG_KLIPS_IPCOMP */ -+#endif -+error_openswan_inet_add_protocol_ah: -+ openswan_inet_del_protocol(&ah_protocol, IPPROTO_AH); -+error_openswan_inet_add_protocol_esp: -+ openswan_inet_del_protocol(&esp_protocol, IPPROTO_ESP); -+#endif -+ unregister_netdevice_notifier(&ipsec_dev_notifier); -+error_netdev_notifier: -+ pfkey_cleanup(); -+error_pfkey_init: -+ ipsec_radijcleanup(); -+error_radijinit: -+ ipsec_sadb_cleanup(0); -+ ipsec_sadb_free(); -+error_sadb_init: -+error_proc_init: -+ // ipsec_proc_init() does not cleanup after itself, so we have to do it here -+ // TODO: ipsec_proc_init() should roll back what it chaned on 
failure -+ ipsec_proc_cleanup(); -+ ipsec_rcv_state_cache_cleanup (); -+error_rcv_state_cache: -+ ipsec_xmit_state_cache_cleanup (); -+error_xmit_state_cache: -+ return error; -+} -+ -+ -+/* void */ -+int -+ipsec_cleanup(void) -+{ -+ int error = 0; -+ -+#ifdef CONFIG_SYSCTL -+ ipsec_sysctl_unregister(); -+#endif -+#if defined(NET_26) && defined(CONFIG_IPSEC_NAT_TRAVERSAL) -+ if(udp4_unregister_esp_rcvencap(klips_old_encap) < 0) { -+ printk(KERN_ERR "KLIPS: can not unregister klips_rcv_encap function\n"); -+ } -+#endif -+ -+ KLIPS_PRINT(debug_netlink, /* debug_tunnel & DB_TN_INIT, */ -+ "klips_debug:ipsec_cleanup: " -+ "calling ipsec_tunnel_cleanup_devices.\n"); -+ error |= ipsec_tunnel_cleanup_devices(); -+ -+ KLIPS_PRINT(debug_netlink, "called ipsec_tunnel_cleanup_devices"); -+ -+#ifdef CONFIG_XFRM_ALTERNATE_STACK -+ -+ xfrm_deregister_alternate_rcv(ipsec_rcv); -+ -+#else // CONFIG_XFRM_ALTERNATE_STACK -+ -+/* we never actually link IPCOMP to the stack */ -+#ifdef IPCOMP_USED_ALONE -+#ifdef CONFIG_KLIPS_IPCOMP -+ if (openswan_inet_del_protocol(&comp_protocol, IPPROTO_COMP) < 0) -+ printk(KERN_INFO "klips_debug:ipsec_cleanup: " -+ "comp close: can't remove protocol\n"); -+#endif /* CONFIG_KLIPS_IPCOMP */ -+#endif /* IPCOMP_USED_ALONE */ -+ -+#ifdef CONFIG_KLIPS_AH -+ if (openswan_inet_del_protocol(&ah_protocol, IPPROTO_AH) < 0) -+ printk(KERN_INFO "klips_debug:ipsec_cleanup: " -+ "ah close: can't remove protocol\n"); -+#endif /* CONFIG_KLIPS_AH */ -+ -+#ifdef CONFIG_KLIPS_ESP -+ if (openswan_inet_del_protocol(&esp_protocol, IPPROTO_ESP) < 0) -+ printk(KERN_INFO "klips_debug:ipsec_cleanup: " -+ "esp close: can't remove protocol\n"); -+#endif /* CONFIG_KLIPS_ESP */ -+ -+#endif // CONFIG_XFRM_ALTERNATE_STACK -+ -+ error |= unregister_netdevice_notifier(&ipsec_dev_notifier); -+ -+ KLIPS_PRINT(debug_netlink, /* debug_tunnel & DB_TN_INIT, */ -+ "klips_debug:ipsec_cleanup: " -+ "calling ipsec_sadb_cleanup.\n"); -+ error |= ipsec_sadb_cleanup(0); -+ error |= 
ipsec_sadb_free(); -+ -+ KLIPS_PRINT(debug_netlink, /* debug_tunnel & DB_TN_INIT, */ -+ "klips_debug:ipsec_cleanup: " -+ "calling ipsec_radijcleanup.\n"); -+ error |= ipsec_radijcleanup(); -+ -+ KLIPS_PRINT(debug_pfkey, /* debug_tunnel & DB_TN_INIT, */ -+ "klips_debug:ipsec_cleanup: " -+ "calling pfkey_cleanup.\n"); -+ error |= pfkey_cleanup(); -+ -+ ipsec_rcv_state_cache_cleanup (); -+ ipsec_xmit_state_cache_cleanup (); -+ -+ ipsec_proc_cleanup(); -+ -+ prng_final(&ipsec_prng); -+ -+ return error; -+} -+ -+#ifdef MODULE -+int -+init_module(void) -+{ -+ int error = 0; -+ -+ error |= ipsec_klips_init(); -+ -+ return error; -+} -+ -+#ifndef NET_26 -+void -+cleanup_module(void) -+{ -+ KLIPS_PRINT(debug_netlink, /* debug_tunnel & DB_TN_INIT, */ -+ "klips_debug:cleanup_module: " -+ "calling ipsec_cleanup.\n"); -+ -+ ipsec_cleanup(); -+ -+ KLIPS_PRINT(1, "klips_info:cleanup_module: " -+ "ipsec module unloaded.\n"); -+} -+#endif -+#endif /* MODULE */ -+ -+/* -+ * -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_ipcomp.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,253 @@ -+/* -+ * processing code for IPCOMP -+ * Copyright (C) 2003 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+ -+#define __NO_VERSION__ -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+#include -+#ifdef SPINLOCK -+# ifdef SPINLOCK_23 -+# include /* *lock* */ -+# else /* SPINLOCK_23 */ -+# include /* *lock* */ -+# endif /* SPINLOCK_23 */ -+#endif /* SPINLOCK */ -+ -+#include -+ -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_sa.h" -+ -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_rcv.h" -+#include "openswan/ipsec_xmit.h" -+ -+#include "openswan/ipsec_auth.h" -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+#include "openswan/ipsec_ipcomp.h" -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ -+#include "openswan/ipsec_proto.h" -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+enum ipsec_rcv_value -+ipsec_rcv_ipcomp_checks(struct ipsec_rcv_state *irs, -+ struct sk_buff *skb) -+{ -+ int ipcompminlen; -+ -+ ipcompminlen = sizeof(struct iphdr); -+ -+ if(skb->len < (ipcompminlen + sizeof(struct ipcomphdr))) { -+ KLIPS_PRINT(debug_rcv & DB_RX_INAU, -+ "klips_debug:ipsec_rcv: " -+ "runt comp packet of skb->len=%d received from %s, dropped.\n", -+ skb->len, -+ irs->ipsaddr_txt); -+ if(irs->stats) { -+ irs->stats->rx_errors++; -+ } -+ return IPSEC_RCV_BADLEN; -+ } -+ -+ irs->protostuff.ipcompstuff.compp = (struct ipcomphdr *)skb_transport_header(skb); -+ irs->said.spi = htonl((__u32)ntohs(irs->protostuff.ipcompstuff.compp->ipcomp_cpi)); -+ return IPSEC_RCV_OK; -+} -+ -+enum ipsec_rcv_value -+ipsec_rcv_ipcomp_decomp(struct ipsec_rcv_state *irs) -+{ -+ unsigned int 
flags = 0; -+ struct ipsec_sa *ipsp = irs->ipsp; -+ struct sk_buff *skb; -+ -+ skb=irs->skb; -+ -+ ipsec_xmit_dmp("ipcomp", skb_transport_header(skb), skb->len); -+ -+ if(ipsp == NULL) { -+ return IPSEC_RCV_SAIDNOTFOUND; -+ } -+ -+ if(sysctl_ipsec_inbound_policy_check && -+ ((((ntohl(ipsp->ips_said.spi) & 0x0000ffff) != (ntohl(irs->said.spi) & 0x0000ffff)) && -+ (ipsp->ips_encalg != ntohl(irs->said.spi)) /* this is a workaround for peer non-compliance with rfc2393 */ -+ ))) { -+ char sa2[SATOT_BUF]; -+ size_t sa_len2 = 0; -+ -+ sa_len2 = KLIPS_SATOT(debug_rcv, &ipsp->ips_said, 0, sa2, sizeof(sa2)); -+ -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "Incoming packet with SA(IPCA):%s does not match policy SA(IPCA):%s cpi=%04x cpi->spi=%08x spi=%08x, spi->cpi=%04x for SA grouping, dropped.\n", -+ irs->sa_len ? irs->sa : " (error)", -+ ipsp != NULL ? (sa_len2 ? sa2 : " (error)") : "NULL", -+ ntohs(irs->protostuff.ipcompstuff.compp->ipcomp_cpi), -+ (__u32)ntohl(irs->said.spi), -+ ipsp != NULL ? (__u32)ntohl((ipsp->ips_said.spi)) : 0, -+ ipsp != NULL ? 
(__u16)(ntohl(ipsp->ips_said.spi) & 0x0000ffff) : 0); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ return IPSEC_RCV_SAIDNOTFOUND; -+ } -+ -+ ipsp->ips_comp_ratio_cbytes += ntohs(irs->ipp->tot_len); -+ irs->next_header = irs->protostuff.ipcompstuff.compp->ipcomp_nh; -+ -+ skb = skb_decompress(skb, ipsp, &flags); -+ if (!skb || flags) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "skb_decompress() returned error flags=%x, dropped.\n", -+ flags); -+ if (irs->stats) { -+ if (flags) -+ irs->stats->rx_errors++; -+ else -+ irs->stats->rx_dropped++; -+ } -+ return IPSEC_RCV_IPCOMPFAILED; -+ } -+ -+ /* make sure we update the pointer */ -+ irs->skb = skb; -+ -+#ifdef NET_21 -+ irs->ipp = ip_hdr(skb); -+#else /* NET_21 */ -+ irs->ipp = skb->ip_hdr; -+#endif /* NET_21 */ -+ -+ ipsp->ips_comp_ratio_dbytes += ntohs(irs->ipp->tot_len); -+ -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "packet decompressed SA(IPCA):%s cpi->spi=%08x spi=%08x, spi->cpi=%04x, nh=%d.\n", -+ irs->sa_len ? irs->sa : " (error)", -+ (__u32)ntohl(irs->said.spi), -+ ipsp != NULL ? (__u32)ntohl((ipsp->ips_said.spi)) : 0, -+ ipsp != NULL ? 
(__u16)(ntohl(ipsp->ips_said.spi) & 0x0000ffff) : 0, -+ irs->next_header); -+ KLIPS_IP_PRINT(debug_rcv & DB_RX_PKTRX, irs->ipp); -+ -+ return IPSEC_RCV_OK; -+} -+ -+enum ipsec_xmit_value -+ipsec_xmit_ipcomp_setup(struct ipsec_xmit_state *ixs) -+{ -+ unsigned int flags = 0; -+#ifdef CONFIG_KLIPS_DEBUG -+ unsigned int old_tot_len = ntohs(ixs->iph->tot_len); -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ ixs->ipsp->ips_comp_ratio_dbytes += ntohs(ixs->iph->tot_len); -+ -+ ixs->skb = skb_compress(ixs->skb, ixs->ipsp, &flags); -+ -+#ifdef NET_21 -+ ixs->iph = ip_hdr(ixs->skb); -+#else /* NET_21 */ -+ ixs->iph = ixs->skb->ip_hdr; -+#endif /* NET_21 */ -+ -+ ixs->ipsp->ips_comp_ratio_cbytes += ntohs(ixs->iph->tot_len); -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if (debug_tunnel & DB_TN_CROUT) -+ { -+ if (old_tot_len > ntohs(ixs->iph->tot_len)) -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_once: " -+ "packet shrunk from %d to %d bytes after compression, cpi=%04x (should be from spi=%08x, spi&0xffff=%04x.\n", -+ old_tot_len, ntohs(ixs->iph->tot_len), -+ ntohs(((struct ipcomphdr*)(((char*)ixs->iph) + ((ixs->iph->ihl) << 2)))->ipcomp_cpi), -+ ntohl(ixs->ipsp->ips_said.spi), -+ (__u16)(ntohl(ixs->ipsp->ips_said.spi) & 0x0000ffff)); -+ else -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_once: " -+ "packet did not compress (flags = %d).\n", -+ flags); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ return IPSEC_XMIT_OK; -+} -+ -+struct xform_functions ipcomp_xform_funcs[]={ -+ { -+ protocol: IPPROTO_COMP, -+ rcv_checks: ipsec_rcv_ipcomp_checks, -+ rcv_decrypt: ipsec_rcv_ipcomp_decomp, -+ xmit_setup: ipsec_xmit_ipcomp_setup, -+ xmit_headroom: 0, -+ xmit_needtailroom: 0, -+ }, -+}; -+ -+#if 0 -+/* We probably don't want to install a pure IPCOMP protocol handler, but -+ only want to handle IPCOMP if it is encapsulated inside an ESP payload -+ (which is already handled) */ -+#ifndef CONFIG_XFRM_ALTERNATE_STACK -+#ifdef CONFIG_KLIPS_IPCOMP 
-+struct inet_protocol comp_protocol = -+{ -+ ipsec_rcv, /* COMP handler */ -+ NULL, /* COMP error control */ -+#ifdef NETDEV_25 -+ 1, /* no policy */ -+#else -+ 0, /* next */ -+ IPPROTO_COMP, /* protocol ID */ -+ 0, /* copy */ -+ NULL, /* data */ -+ "COMP" /* name */ -+#endif -+}; -+#endif /* CONFIG_KLIPS_IPCOMP */ -+#endif /* CONFIG_XFRM_ALTERNATE_STACK */ -+#endif -+ -+#endif /* CONFIG_KLIPS_IPCOMP */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_ipip.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,123 @@ -+/* -+ * processing code for IPIP -+ * Copyright (C) 2003 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+ -+#define __NO_VERSION__ -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+#include -+#ifdef SPINLOCK -+# ifdef SPINLOCK_23 -+# include /* *lock* */ -+# else /* SPINLOCK_23 */ -+# include /* *lock* */ -+# endif /* SPINLOCK_23 */ -+#endif /* SPINLOCK */ -+ -+#include -+ -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_sa.h" -+ -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_rcv.h" -+#include "openswan/ipsec_xmit.h" -+ -+#include "openswan/ipsec_auth.h" -+#include "openswan/ipsec_ipip.h" -+#include "openswan/ipsec_param.h" -+ -+#include "openswan/ipsec_proto.h" -+ -+enum ipsec_xmit_value -+ipsec_xmit_ipip_setup(struct ipsec_xmit_state *ixs) -+{ -+ ixs->iph->version = 4; -+ -+ switch(sysctl_ipsec_tos) { -+ case 0: -+#ifdef NET_21 -+ ixs->iph->tos = ip_hdr(ixs->skb)->tos; -+#else /* NET_21 */ -+ ixs->iph->tos = ixs->skb->ip_hdr->tos; -+#endif /* NET_21 */ -+ break; -+ case 1: -+ ixs->iph->tos = 0; -+ break; -+ default: -+ break; -+ } -+ ixs->iph->ttl = SYSCTL_IPSEC_DEFAULT_TTL; -+ ixs->iph->frag_off = 0; -+ ixs->iph->saddr = ((struct sockaddr_in*)(ixs->ipsp->ips_addr_s))->sin_addr.s_addr; -+ ixs->iph->daddr = ((struct sockaddr_in*)(ixs->ipsp->ips_addr_d))->sin_addr.s_addr; -+ ixs->iph->protocol = IPPROTO_IPIP; -+ ixs->iph->ihl = sizeof(struct iphdr) >> 2; -+ -+ KLIPS_IP_SELECT_IDENT(ixs->iph, ixs->skb); -+ -+ ixs->newdst = (__u32)ixs->iph->daddr; -+ ixs->newsrc = (__u32)ixs->iph->saddr; -+ -+#ifdef NET_21 -+ 
skb_set_transport_header(ixs->skb, ipsec_skb_offset(ixs->skb, ip_hdr(ixs->skb))); -+#endif /* NET_21 */ -+ return IPSEC_XMIT_OK; -+} -+ -+struct xform_functions ipip_xform_funcs[]={ -+ { -+ protocol: IPPROTO_IPIP, -+ rcv_checks: NULL, -+ rcv_setup_auth: NULL, -+ rcv_calc_auth: NULL, -+ rcv_decrypt: NULL, -+ -+ xmit_setup: ipsec_xmit_ipip_setup, -+ xmit_headroom: sizeof(struct iphdr), -+ xmit_needtailroom: 0, -+ }, -+}; -+ -+ -+ -+ -+ -+ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_kern24.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,74 @@ -+/* -+ * Copyright 2005 (C) Michael Richardson -+ * -+ * This is a file of functions which are present in 2.6 kernels, -+ * but are not available by default in the 2.4 series. -+ * -+ * As such this code is usually from the Linux kernel, and is covered by -+ * GPL. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * $Id: ipsec_kern24.c,v 1.2 2005/05/20 03:19:18 mcr Exp $ -+ * -+ */ -+ -+#include -+#include -+#include -+ -+/* -+ * printk rate limiting, lifted from the networking subsystem. -+ * -+ * This enforces a rate limit: not more than one kernel message -+ * every printk_ratelimit_jiffies to make a denial-of-service -+ * attack impossible. 
-+ */ -+static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; -+ -+int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) -+{ -+ static unsigned long toks = 10*5*HZ; -+ static unsigned long last_msg; -+ static int missed; -+ unsigned long flags; -+ unsigned long now = jiffies; -+ -+ spin_lock_irqsave(&ratelimit_lock, flags); -+ toks += now - last_msg; -+ last_msg = now; -+ if (toks > (ratelimit_burst * ratelimit_jiffies)) -+ toks = ratelimit_burst * ratelimit_jiffies; -+ if (toks >= ratelimit_jiffies) { -+ int lost = missed; -+ missed = 0; -+ toks -= ratelimit_jiffies; -+ spin_unlock_irqrestore(&ratelimit_lock, flags); -+ if (lost) -+ printk(KERN_WARNING "printk: %d messages suppressed.\n", lost); -+ return 1; -+ } -+ missed++; -+ spin_unlock_irqrestore(&ratelimit_lock, flags); -+ return 0; -+} -+ -+/* minimum time in jiffies between messages */ -+int printk_ratelimit_jiffies = 5*HZ; -+ -+/* number of messages we send before ratelimiting */ -+int printk_ratelimit_burst = 10; -+ -+int printk_ratelimit(void) -+{ -+ return __printk_ratelimit(printk_ratelimit_jiffies, -+ printk_ratelimit_burst); -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_life.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,221 @@ -+/* -+ * @(#) lifetime structure utilities -+ * -+ * Copyright (C) 2001 Richard Guy Briggs -+ * and Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ * -+ */ -+ -+/* -+ * This provides series of utility functions for dealing with lifetime -+ * structures. -+ * -+ * ipsec_check_lifetime - returns -1 hard lifetime exceeded -+ * 0 soft lifetime exceeded -+ * 1 everything is okay -+ * based upon whether or not the count exceeds hard/soft -+ * -+ */ -+ -+#define __NO_VERSION__ -+#include -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif /* for CONFIG_IP_FORWARD */ -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#include /* struct device, struct net_device_stats and other headers */ -+#include /* eth_type_trans */ -+#include -+#include -+#include -+ -+#include "openswan/radij.h" -+#include "openswan/ipsec_life.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_eroute.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_radij.h" -+ -+#include "openswan/ipsec_sa.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_ipe4.h" -+#include "openswan/ipsec_ah.h" -+#include "openswan/ipsec_esp.h" -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+#include "openswan/ipcomp.h" -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ -+#include -+#include -+ -+#include "openswan/ipsec_proto.h" -+ -+ -+enum ipsec_life_alive -+ipsec_lifetime_check(struct ipsec_lifetime64 *il64, -+ const char *lifename, -+ const char *saname, -+ enum ipsec_life_type ilt, -+ enum ipsec_direction idir, -+ struct ipsec_sa *ips) -+{ -+ __u64 count; -+ const char *dir; -+ -+ if(saname == NULL) { -+ saname = "unknown-SA"; -+ } -+ -+ if(idir == ipsec_incoming) { -+ dir = "incoming"; -+ } else { -+ dir = "outgoing"; -+ } -+ -+ -+ if(ilt == ipsec_life_timebased) { -+ count = jiffies/HZ - il64->ipl_count; -+ } else { -+ count = il64->ipl_count; -+ } -+ -+ if(il64->ipl_hard && -+ (count > il64->ipl_hard)) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_lifetime_check: " -+ "hard %s lifetime of SA:<%s%s%s> %s has been reached, SA expired, " -+ "%s packet dropped.\n", -+ lifename, -+ 
IPS_XFORM_NAME(ips), -+ saname, -+ dir); -+ -+ pfkey_expire(ips, 1); -+ return ipsec_life_harddied; -+ } -+ -+ if(il64->ipl_soft && -+ (count > il64->ipl_soft)) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_lifetime_check: " -+ "soft %s lifetime of SA:<%s%s%s> %s has been reached, SA expiring, " -+ "soft expire message sent up, %s packet still processed.\n", -+ lifename, -+ IPS_XFORM_NAME(ips), -+ saname, -+ dir); -+ -+ if(ips->ips_state != K_SADB_SASTATE_DYING) { -+ pfkey_expire(ips, 0); -+ } -+ ips->ips_state = K_SADB_SASTATE_DYING; -+ -+ return ipsec_life_softdied; -+ } -+ return ipsec_life_okay; -+} -+ -+ -+/* -+ * This function takes a buffer (with length), a lifetime name and type, -+ * and formats a string to represent the current values of the lifetime. -+ * -+ * It returns the number of bytes that the format took (or would take, -+ * if the buffer were large enough: snprintf semantics). -+ * This is used in /proc routines and in debug output. -+ */ -+int -+ipsec_lifetime_format(char *buffer, -+ int buflen, -+ char *lifename, -+ enum ipsec_life_type timebaselife, -+ struct ipsec_lifetime64 *lifetime) -+{ -+ int len = 0; -+ __u64 count; -+ -+ if(timebaselife == ipsec_life_timebased) { -+ count = jiffies/HZ - lifetime->ipl_count; -+ } else { -+ count = lifetime->ipl_count; -+ } -+ -+ if(lifetime->ipl_count > 1 || -+ lifetime->ipl_soft || -+ lifetime->ipl_hard) { -+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,0)) -+ len = ipsec_snprintf(buffer, buflen, -+ "%s(%Lu,%Lu,%Lu)", -+ lifename, -+ count, -+ lifetime->ipl_soft, -+ lifetime->ipl_hard); -+#else /* XXX high 32 bits are not displayed */ -+ len = ipsec_snprintf(buffer, buflen, -+ "%s(%lu,%lu,%lu)", -+ lifename, -+ (unsigned long)count, -+ (unsigned long)lifetime->ipl_soft, -+ (unsigned long)lifetime->ipl_hard); -+#endif -+ } -+ -+ return len; -+} -+ -+void -+ipsec_lifetime_update_hard(struct ipsec_lifetime64 *lifetime, -+ __u64 newvalue) -+{ -+ if(newvalue && -+ (!lifetime->ipl_hard || 
-+ (newvalue < lifetime->ipl_hard))) { -+ lifetime->ipl_hard = newvalue; -+ -+ if(!lifetime->ipl_soft && -+ (lifetime->ipl_hard < lifetime->ipl_soft)) { -+ lifetime->ipl_soft = lifetime->ipl_hard; -+ } -+ } -+} -+ -+void -+ipsec_lifetime_update_soft(struct ipsec_lifetime64 *lifetime, -+ __u64 newvalue) -+{ -+ if(newvalue && -+ (!lifetime->ipl_soft || -+ (newvalue < lifetime->ipl_soft))) { -+ lifetime->ipl_soft = newvalue; -+ -+ if(lifetime->ipl_hard && -+ (lifetime->ipl_hard < lifetime->ipl_soft)) { -+ lifetime->ipl_soft = lifetime->ipl_hard; -+ } -+ } -+} -+ -+ -+/* -+ * -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_mast.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,869 @@ -+/* -+ * IPSEC MAST code. -+ * Copyright (C) 2005 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ */ -+ -+char ipsec_mast_c_version[] = "RCSID $Id: ipsec_mast.c,v 1.7 2005/04/29 05:10:22 mcr Exp $"; -+ -+#define __NO_VERSION__ -+#include -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif /* for CONFIG_IP_FORWARD */ -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include -+#include -+#include -+#include -+ -+#include /* struct device, struct net_device_stats, dev_queue_xmit() and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+#include -+ -+#include -+ -+#include /* icmp_send() */ -+#include -+#ifdef NETDEV_23 -+# include -+#endif /* NETDEV_23 */ -+ -+#include -+ -+#include "openswan/ipsec_kversion.h" -+#include "openswan/radij.h" -+#include "openswan/ipsec_life.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_eroute.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_sa.h" -+#include "openswan/ipsec_xmit.h" -+#include "openswan/ipsec_mast.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_ipe4.h" -+#include "openswan/ipsec_ah.h" -+#include "openswan/ipsec_esp.h" -+#include "openswan/ipsec_kern24.h" -+ -+#include -+#include -+ -+#include "openswan/ipsec_proto.h" -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+#include -+#endif -+ -+int ipsec_mastdevice_count = -1; -+int debug_mast; -+ -+static __u32 zeroes[64]; -+ -+DEBUG_NO_STATIC int -+ipsec_mast_open(struct net_device *dev) -+{ -+ struct mastpriv *prv = dev->priv; -+ -+ prv = prv; -+ -+ /* -+ * Can't open until attached. 
-+ */ -+ -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_open: " -+ "dev = %s\n", -+ dev->name); -+ -+ return 0; -+} -+ -+DEBUG_NO_STATIC int -+ipsec_mast_close(struct net_device *dev) -+{ -+ return 0; -+} -+ -+static inline int ipsec_mast_xmit2(struct sk_buff *skb) -+{ -+ return dst_output(skb); -+} -+ -+#ifdef HAVE_IPSEC_SAREF -+int ip_cmsg_send_ipsec(struct cmsghdr *cmsg, struct ipcm_cookie *ipc) -+{ -+ struct ipsec_sa *sa1; -+ xfrm_sec_unique_t *ref; -+ struct sec_path *sp; -+ -+ if(cmsg->cmsg_len != CMSG_LEN(sizeof(xfrm_sec_unique_t))) { -+ return -EINVAL; -+ } -+ -+ ref = (xfrm_sec_unique_t *)CMSG_DATA(cmsg); -+ -+ sp = secpath_dup(NULL); -+ if(!sp) { -+ return -EINVAL; -+ } -+ -+ sp->ref = *ref; -+ KLIPS_PRINT(debug_mast, "sending with saref=%u\n", sp->ref); -+ -+ sa1 = ipsec_sa_getbyref(sp->ref); -+ if(sa1 && sa1->ips_out) { -+ ipc->oif = sa1->ips_out->ifindex; -+ KLIPS_PRINT(debug_mast, "setting oif: %d\n", ipc->oif); -+ } -+ ipsec_sa_put(sa1); -+ -+ ipc->sp = sp; -+ -+ return 0; -+} -+#endif -+ -+#if 0 -+/* Paul: This seems to be unused dead code */ -+enum ipsec_xmit_value -+ipsec_mast_send(struct ipsec_xmit_state*ixs) -+{ -+ /* new route/dst cache code from James Morris */ -+ ixs->skb->dev = ixs->physdev; -+ /*skb_orphan(ixs->skb);*/ -+ if((ixs->error = ip_route_output(&ixs->route, -+ ixs->skb->nh.iph->daddr, -+ ixs->pass ? 0 : ixs->skb->nh.iph->saddr, -+ RT_TOS(ixs->skb->nh.iph->tos), -+ ixs->physdev->ifindex /* rgb: should this be 0? */))) { -+ ixs->stats->tx_errors++; -+ KLIPS_PRINT(debug_mast & DB_MAST_XMIT, -+ "klips_debug:ipsec_xmit_send: " -+ "ip_route_output failed with error code %d, rt->u.dst.dev=%s, dropped\n", -+ ixs->error, -+ ixs->route->u.dst.dev->name); -+ return IPSEC_XMIT_ROUTEERR; -+ } -+ if(ixs->dev == ixs->route->u.dst.dev) { -+ ip_rt_put(ixs->route); -+ /* This is recursion, drop it. 
*/ -+ ixs->stats->tx_errors++; -+ KLIPS_PRINT(debug_mast & DB_MAST_XMIT, -+ "klips_debug:ipsec_xmit_send: " -+ "suspect recursion, dev=rt->u.dst.dev=%s, dropped\n", -+ ixs->dev->name); -+ return IPSEC_XMIT_RECURSDETECT; -+ } -+ dst_release(ixs->skb->dst); -+ ixs->skb->dst = &ixs->route->u.dst; -+ ixs->stats->tx_bytes += ixs->skb->len; -+ if(ixs->skb->len < ixs->skb->nh.raw - ixs->skb->data) { -+ ixs->stats->tx_errors++; -+ printk(KERN_WARNING -+ "klips_error:ipsec_xmit_send: " -+ "tried to __skb_pull nh-data=%ld, %d available. This should never happen, please report.\n", -+ (unsigned long)(ixs->skb->nh.raw - ixs->skb->data), -+ ixs->skb->len); -+ return IPSEC_XMIT_PUSHPULLERR; -+ } -+ __skb_pull(ixs->skb, ixs->skb->nh.raw - ixs->skb->data); -+ -+ ipsec_nf_reset(ixs->skb); -+ -+ KLIPS_PRINT(debug_mast & DB_MAST_XMIT, -+ "klips_debug:ipsec_xmit_send: " -+ "...done, calling ip_send() on device:%s\n", -+ ixs->skb->dev ? ixs->skb->dev->name : "NULL"); -+ KLIPS_IP_PRINT(debug_mast & DB_MAST_XMIT, ixs->skb->nh.iph); -+ { -+ int err; -+ -+ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, ixs->skb, NULL, ixs->route->u.dst.dev, -+ ipsec_mast_xmit2); -+ if(err != NET_XMIT_SUCCESS && err != NET_XMIT_CN) { -+ if(net_ratelimit()) -+ printk(KERN_ERR -+ "klips_error:ipsec_xmit_send: " -+ "ip_send() failed, err=%d\n", -+ -err); -+ ixs->stats->tx_errors++; -+ ixs->stats->tx_aborted_errors++; -+ ixs->skb = NULL; -+ return IPSEC_XMIT_IPSENDFAILURE; -+ } -+ } -+ ixs->stats->tx_packets++; -+ ixs->skb = NULL; -+ -+ return IPSEC_XMIT_OK; -+} -+#endif -+ -+static void -+ipsec_mast_xsm_complete( -+ struct ipsec_xmit_state *ixs, -+ enum ipsec_xmit_value stat) -+{ -+ if (stat != IPSEC_XMIT_OK) { -+ KLIPS_PRINT(debug_mast, -+ "klips_debug:ipsec_mast_xsm_complete: ipsec_xsm failed: %d\n", -+ stat); -+ goto cleanup; -+ } -+ -+ /* do any final NAT-encapsulation */ -+ stat = ipsec_nat_encap(ixs); -+ if(stat != IPSEC_XMIT_OK) { -+ goto cleanup; -+ } -+ -+ /* now send the packet again */ -+ { -+ struct flowi 
fl; -+ -+ memset(&fl, 0, sizeof(fl)); -+ ipsec_xmit_send(ixs, &fl); -+ } -+ -+cleanup: -+ ipsec_xmit_cleanup(ixs); -+ -+ if(ixs->ipsp) { -+ ipsec_sa_put(ixs->ipsp); -+ ixs->ipsp=NULL; -+ } -+ if(ixs->skb) { -+ ipsec_kfree_skb(ixs->skb); -+ ixs->skb=NULL; -+ } -+ ipsec_xmit_state_delete(ixs); -+} -+ -+/* -+ * This function assumes it is being called from dev_queue_xmit() -+ * and that skb is filled properly by that function. -+ */ -+int -+ipsec_mast_start_xmit(struct sk_buff *skb, struct net_device *dev) -+{ -+ struct ipsec_xmit_state *ixs; -+ IPsecSAref_t SAref; -+ -+ if(skb == NULL) { -+ printk("mast start_xmit passed NULL\n"); -+ return 0; -+ } -+ -+ ixs = ipsec_xmit_state_new(); -+ if(ixs == NULL) { -+ printk("mast failed to allocate IXS\n"); -+ return 0; -+ } -+ -+ ixs->skb = skb; -+ SAref = 0; -+ if(skb->nfmark & 0x80000000) { -+ SAref = NFmark2IPsecSAref(skb->nfmark); -+ KLIPS_PRINT(debug_mast, "getting SAref=%d from nfmark\n", -+ SAref); -+ } -+ -+#ifdef HAVE_IPSEC_SAREF -+ if(skb->sp && skb->sp->ref != IPSEC_SAREF_NULL) { -+ SAref = skb->sp->ref; -+ KLIPS_PRINT(debug_mast, "getting SAref=%d from sec_path\n", -+ SAref); -+ } -+#endif -+ KLIPS_PRINT(debug_mast, "skb=%p\n", skb); -+ -+ ipsec_xmit_sanity_check_skb(ixs); -+ -+ ixs->ipsp = ipsec_sa_getbyref(SAref); -+ if(ixs->ipsp == NULL) { -+ KLIPS_ERROR(debug_mast, "%s: no SA for saref=%d (sp=%p)\n", -+ dev->name, SAref, skb->sp); -+ ipsec_kfree_skb(skb); -+ return 0; -+ } -+ -+ /* -+ * we should be calculating the MTU by looking up a route -+ * based upon the destination in the SA, and then cache -+ * it into the SA, but we don't do that right now. 
-+ */ -+ ixs->cur_mtu = 1460; -+ ixs->physmtu = 1460; -+ -+ ixs->xsm_complete = ipsec_mast_xsm_complete; -+ ixs->state = IPSEC_XSM_INIT2; /* we start later in the process */ -+ -+ ipsec_xsm(ixs); -+ return 0; -+ -+} -+ -+DEBUG_NO_STATIC struct net_device_stats * -+ipsec_mast_get_stats(struct net_device *dev) -+{ -+ return &(((struct mastpriv *)(dev->priv))->mystats); -+} -+ -+#if 0 -+/* -+ * Revectored calls. -+ * For each of these calls, a field exists in our private structure. -+ */ -+DEBUG_NO_STATIC int -+ipsec_mast_hard_header(struct sk_buff *skb, struct net_device *dev, -+ unsigned short type, void *daddr, void *saddr, unsigned len) -+{ -+ struct mastpriv *prv = dev->priv; -+ struct net_device_stats *stats; /* This device's statistics */ -+ int ret = 0; -+ -+ if(skb == NULL) { -+ KLIPS_PRINT(debug_mast & DB_MAST_REVEC, -+ "klips_debug:ipsec_mast_hard_header: " -+ "no skb...\n"); -+ return -ENODATA; -+ } -+ -+ if(dev == NULL) { -+ KLIPS_PRINT(debug_mast & DB_MAST_REVEC, -+ "klips_debug:ipsec_mast_hard_header: " -+ "no device...\n"); -+ return -ENODEV; -+ } -+ -+ KLIPS_PRINT(debug_mast & DB_MAST_REVEC, -+ "klips_debug:ipsec_mast_hard_header: " -+ "skb->dev=%s\n", -+ dev->name); -+ -+ if(prv == NULL) { -+ KLIPS_PRINT(debug_mast & DB_MAST_REVEC, -+ "klips_debug:ipsec_mast_hard_header: " -+ "no private space associated with dev=%s\n", -+ dev->name ? dev->name : "NULL"); -+ return -ENODEV; -+ } -+ -+ stats = (struct net_device_stats *) &(prv->mystats); -+ -+ /* check if we have to send a IPv6 packet. It might be a Router -+ Solicitation, where the building of the packet happens in -+ reverse order: -+ 1. ll hdr, -+ 2. IPv6 hdr, -+ 3. ICMPv6 hdr -+ -> skb->nh.raw is still uninitialized when this function is -+ called!! 
If this is no IPv6 packet, we can print debugging -+ messages, otherwise we skip all debugging messages and just -+ build the ll header */ -+ if(type != ETH_P_IPV6) { -+ /* execute this only, if we don't have to build the -+ header for a IPv6 packet */ -+ if(!prv->hard_header) { -+ KLIPS_PRINT(debug_mast & DB_MAST_REVEC, -+ "klips_debug:ipsec_mast_hard_header: " -+ "physical device has been detached, packet dropped 0p%p->0p%p len=%d type=%d dev=%s->NULL ", -+ saddr, -+ daddr, -+ len, -+ type, -+ dev->name); -+ KLIPS_PRINTMORE(debug_mast & DB_MAST_REVEC, -+ "ip=%08x->%08x\n", -+ (__u32)ntohl(skb->nh.iph->saddr), -+ (__u32)ntohl(skb->nh.iph->daddr) ); -+ stats->tx_dropped++; -+ return -ENODEV; -+ } -+ } else { -+ KLIPS_PRINT(debug_mast, -+ "klips_debug:ipsec_mast_hard_header: " -+ "is IPv6 packet, skip debugging messages, only revector and build linklocal header.\n"); -+ } -+ -+ return ret; -+} -+ -+DEBUG_NO_STATIC int -+ipsec_mast_rebuild_header(struct sk_buff *skb) -+{ -+ struct mastpriv *prv = skb->dev->priv; -+ -+ prv = prv; -+ return 0; -+} -+ -+DEBUG_NO_STATIC int -+ipsec_mast_set_mac_address(struct net_device *dev, void *addr) -+{ -+ struct mastpriv *prv = dev->priv; -+ -+ prv = prv; -+ return 0; -+ -+} -+ -+DEBUG_NO_STATIC void -+ipsec_mast_cache_update(struct hh_cache *hh, struct net_device *dev, unsigned char * haddr) -+{ -+ struct mastpriv *prv = dev->priv; -+ -+ if(dev == NULL) { -+ KLIPS_PRINT(debug_mast & DB_MAST_REVEC, -+ "klips_debug:ipsec_mast_cache_update: " -+ "no device..."); -+ return; -+ } -+ -+ if(prv == NULL) { -+ KLIPS_PRINT(debug_mast & DB_MAST_REVEC, -+ "klips_debug:ipsec_mast_cache_update: " -+ "no private space associated with dev=%s", -+ dev->name ? 
dev->name : "NULL"); -+ return; -+ } -+ -+ KLIPS_PRINT(debug_mast & DB_MAST_REVEC, -+ "klips_debug:ipsec_mast: " -+ "Revectored cache_update\n"); -+ return; -+} -+#endif -+ -+DEBUG_NO_STATIC int -+ipsec_mast_neigh_setup(struct neighbour *n) -+{ -+ KLIPS_PRINT(debug_mast & DB_MAST_REVEC, -+ "klips_debug:ipsec_mast_neigh_setup:\n"); -+ -+ if (n->nud_state == NUD_NONE) { -+ n->ops = &arp_broken_ops; -+ n->output = n->ops->output; -+ } -+ return 0; -+} -+ -+DEBUG_NO_STATIC int -+ipsec_mast_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p) -+{ -+ KLIPS_PRINT(debug_mast & DB_MAST_REVEC, -+ "klips_debug:ipsec_mast_neigh_setup_dev: " -+ "setting up %s\n", -+ dev ? dev->name : "NULL"); -+ -+ if (p->tbl->family == AF_INET) { -+ p->neigh_setup = ipsec_mast_neigh_setup; -+ p->ucast_probes = 0; -+ p->mcast_probes = 0; -+ } -+ return 0; -+} -+ -+DEBUG_NO_STATIC int -+ipsec_mast_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -+{ -+ struct ipsecmastconf *cf = (struct ipsecmastconf *)&ifr->ifr_data; -+ struct ipsecpriv *prv = dev->priv; -+ -+ cf = cf; -+ prv=prv; -+ -+ if(dev == NULL) { -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_ioctl: " -+ "device not supplied.\n"); -+ return -ENODEV; -+ } -+ -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_ioctl: " -+ "tncfg service call #%d for dev=%s\n", -+ cmd, -+ dev->name ? 
dev->name : "NULL"); -+ -+ switch (cmd) { -+ default: -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_ioctl: " -+ "unknown command %d.\n", -+ cmd); -+ return -EOPNOTSUPP; -+ -+ } -+} -+ -+int -+ipsec_mast_device_event(struct notifier_block *unused, unsigned long event, void *ptr) -+{ -+ struct net_device *dev = ptr; -+ struct mastpriv *priv = dev->priv; -+ -+ priv = priv; -+ -+ if (dev == NULL) { -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_device_event: " -+ "dev=NULL for event type %ld.\n", -+ event); -+ return(NOTIFY_DONE); -+ } -+ -+ /* check for loopback devices */ -+ if (dev && (dev->flags & IFF_LOOPBACK)) { -+ return(NOTIFY_DONE); -+ } -+ -+ switch (event) { -+ case NETDEV_DOWN: -+ /* look very carefully at the scope of these compiler -+ directives before changing anything... -- RGB */ -+ -+ case NETDEV_UNREGISTER: -+ switch (event) { -+ case NETDEV_DOWN: -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_device_event: " -+ "NETDEV_DOWN dev=%s flags=%x\n", -+ dev->name, -+ dev->flags); -+ if(strncmp(dev->name, "ipsec", strlen("ipsec")) == 0) { -+ printk(KERN_CRIT "IPSEC EVENT: KLIPS device %s shut down.\n", -+ dev->name); -+ } -+ break; -+ case NETDEV_UNREGISTER: -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_device_event: " -+ "NETDEV_UNREGISTER dev=%s flags=%x\n", -+ dev->name, -+ dev->flags); -+ break; -+ } -+ break; -+ -+ case NETDEV_UP: -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_device_event: " -+ "NETDEV_UP dev=%s\n", -+ dev->name); -+ break; -+ -+ case NETDEV_REBOOT: -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_device_event: " -+ "NETDEV_REBOOT dev=%s\n", -+ dev->name); -+ break; -+ -+ case NETDEV_CHANGE: -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_device_event: " -+ "NETDEV_CHANGE dev=%s flags=%x\n", -+ dev->name, -+ dev->flags); -+ break; -+ -+ case NETDEV_REGISTER: -+ KLIPS_PRINT(debug_mast 
& DB_MAST_INIT, -+ "klips_debug:ipsec_mast_device_event: " -+ "NETDEV_REGISTER dev=%s\n", -+ dev->name); -+ break; -+ -+ case NETDEV_CHANGEMTU: -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_device_event: " -+ "NETDEV_CHANGEMTU dev=%s to mtu=%d\n", -+ dev->name, -+ dev->mtu); -+ break; -+ -+ case NETDEV_CHANGEADDR: -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_device_event: " -+ "NETDEV_CHANGEADDR dev=%s\n", -+ dev->name); -+ break; -+ -+ case NETDEV_GOING_DOWN: -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_device_event: " -+ "NETDEV_GOING_DOWN dev=%s\n", -+ dev->name); -+ break; -+ -+ case NETDEV_CHANGENAME: -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_device_event: " -+ "NETDEV_CHANGENAME dev=%s\n", -+ dev->name); -+ break; -+ -+ default: -+ KLIPS_PRINT(debug_mast & DB_MAST_INIT, -+ "klips_debug:ipsec_mast_device_event: " -+ "event type %ld unrecognised for dev=%s\n", -+ event, -+ dev->name); -+ break; -+ } -+ return NOTIFY_DONE; -+} -+ -+/* -+ * Called when an ipsec mast device is initialized. -+ * The ipsec mast device structure is passed to us. -+ */ -+int -+ipsec_mast_probe(struct net_device *dev) -+{ -+ int i; -+ -+ KLIPS_PRINT(debug_mast, -+ "klips_debug:ipsec_mast_init: " -+ "allocating %lu bytes initialising device: %s\n", -+ (unsigned long) sizeof(struct mastpriv), -+ dev->name ? 
dev->name : "NULL"); -+ -+ /* Add our mast functions to the device */ -+ dev->open = ipsec_mast_open; -+ dev->stop = ipsec_mast_close; -+ dev->hard_start_xmit = ipsec_mast_start_xmit; -+ dev->get_stats = ipsec_mast_get_stats; -+ -+ dev->priv = kmalloc(sizeof(struct mastpriv), GFP_KERNEL); -+ if (dev->priv == NULL) -+ return -ENOMEM; -+ memset((caddr_t)(dev->priv), 0, sizeof(struct mastpriv)); -+ -+ for(i = 0; i < sizeof(zeroes); i++) { -+ ((__u8*)(zeroes))[i] = 0; -+ } -+ -+ dev->set_multicast_list = NULL; -+ dev->do_ioctl = ipsec_mast_ioctl; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+ dev->header_ops = NULL; -+#else -+ dev->hard_header = NULL; -+ dev->rebuild_header = NULL; -+ dev->header_cache_update= NULL; -+#endif -+ dev->set_mac_address = NULL; -+ dev->neigh_setup = ipsec_mast_neigh_setup_dev; -+ dev->hard_header_len = 8+20+20+8; -+ dev->mtu = 0; -+ dev->addr_len = 0; -+ dev->type = ARPHRD_NONE; -+ dev->tx_queue_len = 10; -+ memset((caddr_t)(dev->broadcast),0xFF, ETH_ALEN); /* what if this is not attached to ethernet? */ -+ -+ /* New-style flags. */ -+ dev->flags = IFF_NOARP; -+ -+ /* We're done. Have I forgotten anything? 
*/ -+ return 0; -+} -+ -+#ifdef alloc_netdev -+static void ipsec_mast_netdev_setup(struct net_device *dev) -+{ -+} -+#endif -+struct net_device *mastdevices[IPSEC_NUM_IFMAX]; -+int mastdevices_max=-1; -+ -+int ipsec_mast_createnum(int vifnum) -+{ -+ struct net_device *im; -+ int vifentry; -+ char name[IFNAMSIZ]; -+ -+ if(vifnum > IPSEC_NUM_IFMAX) { -+ return -ENOENT; -+ } -+ -+ if(mastdevices[vifnum]!=NULL) { -+ return -EEXIST; -+ } -+ -+ /* no identical device */ -+ if(vifnum > mastdevices_max) { -+ mastdevices_max=vifnum; -+ } -+ vifentry = vifnum; -+ -+ snprintf(name, IFNAMSIZ, MAST_DEV_FORMAT, vifnum); -+ -+#ifdef alloc_netdev -+ im = alloc_netdev(0, name, ipsec_mast_netdev_setup); -+#else -+ im = (struct net_device *)kmalloc(sizeof(struct net_device),GFP_KERNEL); -+#endif -+ if(im == NULL) { -+ printk(KERN_ERR "failed to allocate space for mast%d device\n", vifnum); -+ return -ENOMEM; -+ } -+ -+#ifndef alloc_netdev -+ memset((caddr_t)im, 0, sizeof(struct net_device)); -+ memcpy(im->name, name, IFNAMSIZ); -+#endif -+ -+ im->init = ipsec_mast_probe; -+ -+ if(register_netdev(im) != 0) { -+ printk(KERN_ERR "ipsec_mast: failed to register %s\n", -+ im->name); -+ return -EIO; -+ } -+ -+ dev_hold(im); -+ mastdevices[vifentry]=im; -+ -+ return 0; -+} -+ -+ -+int -+ipsec_mast_deletenum(int vifnum) -+{ -+ struct net_device *dev_ipsec; -+ -+ if(vifnum > IPSEC_NUM_IFMAX) { -+ return -ENOENT; -+ } -+ -+ dev_ipsec = mastdevices[vifnum]; -+ if(dev_ipsec == NULL) { -+ return -ENOENT; -+ } -+ -+ /* release reference */ -+ mastdevices[vifnum]=NULL; -+ ipsec_dev_put(dev_ipsec); -+ -+ KLIPS_PRINT(debug_tunnel, "Unregistering %s (refcnt=%d)\n", -+ dev_ipsec->name, -+ atomic_read(&dev_ipsec->refcnt)); -+ unregister_netdev(dev_ipsec); -+ KLIPS_PRINT(debug_tunnel, "Unregisted %s\n", dev_ipsec->name); -+#ifndef NETDEV_23 -+ kfree(dev_ipsec->name); -+ dev_ipsec->name=NULL; -+#endif /* !NETDEV_23 */ -+ kfree(dev_ipsec->priv); -+ dev_ipsec->priv=NULL; -+ -+ return 0; -+} -+ -+ -+struct 
net_device * -+ipsec_mast_get_device(int vifnum) -+{ -+ int ovifnum = vifnum; -+ -+ if(vifnum > IPSECDEV_OFFSET) { -+ return ipsec_tunnel_get_device(vifnum-IPSECDEV_OFFSET); -+ } else { -+ struct net_device *nd; -+ -+ if(vifnum >= MASTTRANSPORT_OFFSET) { -+ vifnum -= MASTTRANSPORT_OFFSET; -+ } -+ -+ if(vifnum <= mastdevices_max) { -+ nd = mastdevices[vifnum]; -+ -+ if(nd) dev_hold(nd); -+ return nd; -+ } else { -+ KLIPS_ERROR(debug_tunnel, -+ "no such vif %d (ovif=%d)\n", vifnum, ovifnum); -+ return NULL; -+ } -+ } -+} -+ -+unsigned int -+ipsec_mast_is_transport(int vifnum) -+{ -+ if(vifnum > MASTTRANSPORT_OFFSET && vifnum priv); -+ dev_mast->priv=NULL; -+ dev_put(mastdevices[i]); -+ mastdevices[i]=NULL; -+ } -+ } -+ return error; -+} -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ -+ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_md5c.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,406 @@ -+/* -+ * RCSID $Id: ipsec_md5c.c,v 1.10 2005/04/15 01:25:57 mcr Exp $ -+ */ -+ -+/* -+ * The rest of the code is derived from MD5C.C by RSADSI. Minor cosmetic -+ * changes to accomodate it in the kernel by ji. -+ */ -+ -+#include -+#include -+ -+#include "openswan/ipsec_md5h.h" -+ -+/* MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm -+ */ -+ -+/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All -+rights reserved. -+ -+License to copy and use this software is granted provided that it -+is identified as the "RSA Data Security, Inc. MD5 Message-Digest -+Algorithm" in all material mentioning or referencing this software -+or this function. -+ -+License is also granted to make and use derivative works provided -+that such works are identified as "derived from the RSA Data -+Security, Inc. MD5 Message-Digest Algorithm" in all material -+mentioning or referencing the derived work. -+ -+RSA Data Security, Inc. 
makes no representations concerning either -+the merchantability of this software or the suitability of this -+software for any particular purpose. It is provided "as is" -+without express or implied warranty of any kind. -+ -+These notices must be retained in any copies of any part of this -+documentation and/or software. -+ */ -+ -+/* -+ * Additions by JI -+ * -+ * HAVEMEMCOPY is defined if mem* routines are available -+ * -+ * HAVEHTON is defined if htons() and htonl() can be used -+ * for big/little endian conversions -+ * -+ */ -+ -+#define HAVEMEMCOPY -+#ifdef __LITTLE_ENDIAN -+#define LITTLENDIAN -+#endif -+#ifdef __BIG_ENDIAN -+#define BIGENDIAN -+#endif -+ -+/* Constants for MD5Transform routine. -+ */ -+ -+#define S11 7 -+#define S12 12 -+#define S13 17 -+#define S14 22 -+#define S21 5 -+#define S22 9 -+#define S23 14 -+#define S24 20 -+#define S31 4 -+#define S32 11 -+#define S33 16 -+#define S34 23 -+#define S41 6 -+#define S42 10 -+#define S43 15 -+#define S44 21 -+ -+static void MD5Transform PROTO_LIST ((UINT4 [4], unsigned char [64])); -+ -+#ifdef LITTLEENDIAN -+#define Encode MD5_memcpy -+#define Decode MD5_memcpy -+#else -+static void Encode PROTO_LIST -+ ((unsigned char *, UINT4 *, unsigned int)); -+static void Decode PROTO_LIST -+ ((UINT4 *, unsigned char *, unsigned int)); -+#endif -+ -+#ifdef HAVEMEMCOPY -+/* no need to include here; defines these */ -+#define MD5_memcpy memcpy -+#define MD5_memset memset -+#else -+#ifdef HAVEBCOPY -+#define MD5_memcpy(_a,_b,_c) bcopy((_b),(_a),(_c)) -+#define MD5_memset(_a,_b,_c) bzero((_a),(_c)) -+#else -+static void MD5_memcpy PROTO_LIST ((POINTER, POINTER, unsigned int)); -+static void MD5_memset PROTO_LIST ((POINTER, int, unsigned int)); -+#endif -+#endif -+static unsigned char PADDING[64] = { -+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -+}; -+ -+/* 
F, G, H and I are basic MD5 functions. -+ */ -+#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) -+#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) -+#define H(x, y, z) ((x) ^ (y) ^ (z)) -+#define I(x, y, z) ((y) ^ ((x) | (~z))) -+ -+/* ROTATE_LEFT rotates x left n bits. -+ */ -+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) -+ -+/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. -+Rotation is separate from addition to prevent recomputation. -+ */ -+#define FF(a, b, c, d, x, s, ac) { \ -+ (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ -+ (a) = ROTATE_LEFT ((a), (s)); \ -+ (a) += (b); \ -+ } -+#define GG(a, b, c, d, x, s, ac) { \ -+ (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ -+ (a) = ROTATE_LEFT ((a), (s)); \ -+ (a) += (b); \ -+ } -+#define HH(a, b, c, d, x, s, ac) { \ -+ (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ -+ (a) = ROTATE_LEFT ((a), (s)); \ -+ (a) += (b); \ -+ } -+#define II(a, b, c, d, x, s, ac) { \ -+ (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ -+ (a) = ROTATE_LEFT ((a), (s)); \ -+ (a) += (b); \ -+ } -+ -+/* -+ * MD5 initialization. Begins an MD5 operation, writing a new context. -+ */ -+void osMD5Init(void *vcontext) -+{ -+ MD5_CTX *context = vcontext; -+ -+ context->count[0] = context->count[1] = 0; -+ /* Load magic initialization constants.*/ -+ context->state[0] = 0x67452301; -+ context->state[1] = 0xefcdab89; -+ context->state[2] = 0x98badcfe; -+ context->state[3] = 0x10325476; -+} -+ -+/* MD5 block update operation. Continues an MD5 message-digest -+ operation, processing another message block, and updating the -+ context. 
-+ */ -+void osMD5Update (vcontext, input, inputLen) -+ void *vcontext; -+ unsigned char *input; /* input block */ -+ __u32 inputLen; /* length of input block */ -+{ -+ MD5_CTX *context = vcontext; -+ __u32 i; -+ unsigned int index, partLen; -+ -+ /* Compute number of bytes mod 64 */ -+ index = (unsigned int)((context->count[0] >> 3) & 0x3F); -+ -+ /* Update number of bits */ -+ if ((context->count[0] += ((UINT4)inputLen << 3)) -+ < ((UINT4)inputLen << 3)) -+ context->count[1]++; -+ context->count[1] += ((UINT4)inputLen >> 29); -+ -+ partLen = 64 - index; -+ -+ /* Transform as many times as possible. -+*/ -+ if (inputLen >= partLen) { -+ MD5_memcpy -+ ((POINTER)&context->buffer[index], (POINTER)input, partLen); -+ MD5Transform (context->state, context->buffer); -+ -+ for (i = partLen; i + 63 < inputLen; i += 64) -+ MD5Transform (context->state, &input[i]); -+ -+ index = 0; -+ } -+ else -+ i = 0; -+ -+ /* Buffer remaining input */ -+ MD5_memcpy -+ ((POINTER)&context->buffer[index], (POINTER)&input[i], -+ inputLen-i); -+} -+ -+/* MD5 finalization. Ends an MD5 message-digest operation, writing the -+ the message digest and zeroizing the context. -+ */ -+void osMD5Final (digest, vcontext) -+unsigned char digest[16]; /* message digest */ -+void *vcontext; /* context */ -+{ -+ MD5_CTX *context = vcontext; -+ unsigned char bits[8]; -+ unsigned int index, padLen; -+ -+ /* Save number of bits */ -+ Encode (bits, context->count, 8); -+ -+ /* Pad out to 56 mod 64. -+*/ -+ index = (unsigned int)((context->count[0] >> 3) & 0x3f); -+ padLen = (index < 56) ? (56 - index) : (120 - index); -+ osMD5Update (context, PADDING, padLen); -+ -+ /* Append length (before padding) */ -+ osMD5Update (context, bits, 8); -+ -+ if (digest != NULL) /* Bill Simpson's padding */ -+ { -+ /* store state in digest */ -+ Encode (digest, context->state, 16); -+ -+ /* Zeroize sensitive information. -+ */ -+ MD5_memset ((POINTER)context, 0, sizeof (*context)); -+ } -+} -+ -+/* MD5 basic transformation. 
Transforms state based on block. -+ */ -+static void MD5Transform (state, block) -+UINT4 state[4]; -+unsigned char block[64]; -+{ -+ UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; -+ -+ Decode (x, block, 64); -+ -+ /* Round 1 */ -+ FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ -+ FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ -+ FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ -+ FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ -+ FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ -+ FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ -+ FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ -+ FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ -+ FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ -+ FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ -+ FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ -+ FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ -+ FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ -+ FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ -+ FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ -+ FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ -+ -+ /* Round 2 */ -+ GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ -+ GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ -+ GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ -+ GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ -+ GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ -+ GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ -+ GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ -+ GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ -+ GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ -+ GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ -+ GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ -+ GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ -+ GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ -+ GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ -+ GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ -+ GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ -+ -+ /* Round 3 */ -+ HH (a, b, c, d, x[ 
5], S31, 0xfffa3942); /* 33 */ -+ HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ -+ HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ -+ HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ -+ HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ -+ HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ -+ HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ -+ HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ -+ HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ -+ HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ -+ HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ -+ HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ -+ HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ -+ HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ -+ HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ -+ HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ -+ -+ /* Round 4 */ -+ II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ -+ II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ -+ II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ -+ II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ -+ II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ -+ II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ -+ II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ -+ II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ -+ II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ -+ II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ -+ II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */ -+ II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ -+ II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ -+ II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ -+ II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ -+ II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ -+ -+ state[0] += a; -+ state[1] += b; -+ state[2] += c; -+ state[3] += d; -+ -+ /* Zeroize sensitive information. -+*/ -+ MD5_memset ((POINTER)x, 0, sizeof (x)); -+} -+ -+#ifndef LITTLEENDIAN -+ -+/* Encodes input (UINT4) into output (unsigned char). Assumes len is -+ a multiple of 4. 
-+ */ -+static void Encode (output, input, len) -+unsigned char *output; -+UINT4 *input; -+unsigned int len; -+{ -+ unsigned int i, j; -+ -+ for (i = 0, j = 0; j < len; i++, j += 4) { -+ output[j] = (unsigned char)(input[i] & 0xff); -+ output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); -+ output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); -+ output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); -+ } -+} -+ -+/* Decodes input (unsigned char) into output (UINT4). Assumes len is -+ a multiple of 4. -+ */ -+static void Decode (output, input, len) -+UINT4 *output; -+unsigned char *input; -+unsigned int len; -+{ -+ unsigned int i, j; -+ -+ for (i = 0, j = 0; j < len; i++, j += 4) -+ output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) | -+ (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24); -+} -+ -+#endif -+ -+#ifndef HAVEMEMCOPY -+#ifndef HAVEBCOPY -+/* Note: Replace "for loop" with standard memcpy if possible. -+ */ -+ -+static void MD5_memcpy (output, input, len) -+POINTER output; -+POINTER input; -+unsigned int len; -+{ -+ unsigned int i; -+ -+ for (i = 0; i < len; i++) -+ -+ output[i] = input[i]; -+} -+ -+/* Note: Replace "for loop" with standard memset if possible. -+ */ -+ -+static void MD5_memset (output, value, len) -+POINTER output; -+int value; -+unsigned int len; -+{ -+ unsigned int i; -+ -+ for (i = 0; i < len; i++) -+ ((char *)output)[i] = (char)value; -+} -+#endif -+#endif -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_ocf.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,739 @@ -+/* -+ * IPSEC OCF support -+ * -+ * This code written by David McCullough -+ * Copyright (C) 2005 Intel Corporation. All Rights Reserved. 
-+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+ -+#define __NO_VERSION__ -+#include -+#include /* printk() */ -+ -+#include /* error codes */ -+#include /* size_t */ -+ -+#include -+ -+#include -+ -+#include -+#include "openswan/ipsec_sa.h" -+#include "openswan/ipsec_rcv.h" -+#include "openswan/ipsec_xmit.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_auth.h" -+#include "openswan/ipsec_esp.h" -+#include "openswan/ipsec_ah.h" -+ -+#include -+#include -+ -+#include "ipsec_ocf.h" -+ -+extern int debug_pfkey; -+extern int debug_rcv; -+ -+int ipsec_ocf_crid = (CRYPTOCAP_F_HARDWARE|CRYPTOCAP_F_SOFTWARE); -+#if 0 /* LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) */ -+/* -+ * allow users to force us to a particular OCF driver -+ */ -+char *ipsec_ocf_driver = NULL; -+module_parm(ipsec_ocf_driver, charp, 0644); -+MODULE_PARM_DESC(ipsec_ocf_driver, -+ "Driver name (ie., cryptosoft), hw, sw, both (default both)"); -+#endif -+ -+/* -+ * Tuning parameters, the settings below appear best for -+ * the IXP -+ */ -+#define USE_BATCH 1 /* enable batch mode */ -+#define USE_CBIMM 1 /* enable immediate callbacks */ -+#define FORCE_QS 0 /* force use of queues for continuation of state machine */ -+#ifdef DECLARE_TASKLET -+#define USE_TASKLET 1 /* use tasklet for continuation of state machine */ -+#else -+#define USE_TASKLET 0 /* don't use tasklet for continuation of state machine */ -+#endif 
-+/* -+ * Because some OCF operations are synchronous (ie., software encryption) -+ * we need to protect ourselves from distructive re-entry. All we do -+ * is track where we are at and either callback immediately or Q the -+ * callback to avoid conflicts. This allows us to deal with the fact that -+ * OCF doesn't tell us if our crypto operations will be async or sync. -+ */ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,21) -+#define _INIT_WORK(wq, fn, arg) INIT_WORK(&(wq), (void (*)(struct work_struct *))(fn)) -+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) -+#define _INIT_WORK(wq, fn, arg) INIT_WORK(&(wq), (void (*)(struct work_queue *))(fn)) -+#else -+#define _INIT_WORK(wq, fn, arg) INIT_WORK(&(wq), (void (*)(void *))(fn), (void *)(arg)) -+#endif -+ -+#define PROCESS_LATER(wq, sm, arg) \ -+ ({ \ -+ _INIT_WORK(wq, sm, arg); \ -+ schedule_work(&(wq)); \ -+ }) -+ -+#define PROCESS_NOW(sm, arg) \ -+ ({ \ -+ (*sm)(arg); \ -+ }) -+ -+#if USE_TASKLET == 1 -+ #define PROCESS_NEXT(this, wqsm, sm) ({ \ -+ tasklet_init(&this->tasklet, \ -+ (void (*)(unsigned long)) sm, (unsigned long)this); \ -+ tasklet_schedule(&this->tasklet); \ -+ }) -+#elif FORCE_QS == 0 -+ #define PROCESS_NEXT(this, wqsm, sm) \ -+ if (in_interrupt()) { \ -+ PROCESS_LATER(this->workq, wqsm, this); \ -+ } else { \ -+ PROCESS_NOW(sm, this); \ -+ } -+#else -+ #define PROCESS_NEXT(this, wqsm, sm) PROCESS_LATER(this->workq, wqsm, this) -+#endif -+ -+/* -+ * convert openswan values to OCF values -+ */ -+ -+static int -+ipsec_ocf_authalg(int authalg) -+{ -+ switch (authalg) { -+ case AH_SHA: return CRYPTO_SHA1_HMAC; -+ case AH_MD5: return CRYPTO_MD5_HMAC; -+ } -+ return 0; -+} -+ -+ -+static int -+ipsec_ocf_encalg(int encalg) -+{ -+ switch (encalg) { -+ case ESP_NULL: return CRYPTO_NULL_CBC; -+ case ESP_DES: return CRYPTO_DES_CBC; -+ case ESP_3DES: return CRYPTO_3DES_CBC; -+ case ESP_AES: return CRYPTO_AES_CBC; -+ case ESP_CAST: return CRYPTO_CAST_CBC; -+ case ESP_BLOWFISH: return CRYPTO_BLF_CBC; -+ } 
-+ return 0; -+} -+ -+/* -+ * if we can do the request ops, setup the sessions and return true -+ * otherwise return false with ipsp unchanged -+ */ -+ -+int -+ipsec_ocf_sa_init(struct ipsec_sa *ipsp, int authalg, int encalg) -+{ -+ struct cryptoini crie, cria; -+ int error; -+ -+ KLIPS_PRINT(debug_pfkey, "klips_debug:ipsec_ocf_sa_init(a=0x%x,e=0x%x)\n", -+ authalg, encalg); -+ -+ if (authalg && ipsp->ips_key_bits_a == 0) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_ocf_sa_init(a=0x%x,e=0x%x) a-key-bits=0\n", -+ authalg, encalg); -+ /* pretend we are happy with this */ -+ return 1; -+ } -+ -+ if (encalg && ipsp->ips_key_bits_e == 0) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_ocf_sa_init(a=0x%x,e=0x%x) e-key-bits=0\n", -+ authalg, encalg); -+ /* pretend we are happy with this */ -+ return 1; -+ } -+ -+ memset(&crie, 0, sizeof(crie)); -+ memset(&cria, 0, sizeof(cria)); -+ -+ cria.cri_alg = ipsec_ocf_authalg(authalg); -+ cria.cri_klen = ipsp->ips_key_bits_a; -+ cria.cri_key = ipsp->ips_key_a; -+ cria.cri_mlen = 12; -+ -+ crie.cri_alg = ipsec_ocf_encalg(encalg); -+ crie.cri_klen = ipsp->ips_key_bits_e; -+ crie.cri_key = ipsp->ips_key_e; -+ switch (crie.cri_alg) { -+ case CRYPTO_AES_CBC: -+ ipsp->ips_iv_size = 16; -+ break; -+ case CRYPTO_DES_CBC: -+ case CRYPTO_3DES_CBC: -+ ipsp->ips_iv_size = 8; -+ break; -+ default: -+ ipsp->ips_iv_size = 0; -+ break; -+ } -+ ipsp->ips_iv_bits = ipsp->ips_iv_size * 8; -+ ipsp->ips_auth_bits = ipsp->ips_key_bits_a; -+ -+ if (authalg && encalg) { -+ crie.cri_next = &cria; -+ error = crypto_newsession(&ipsp->ocf_cryptoid, &crie, ipsec_ocf_crid); -+ } else if (encalg) { -+ error = crypto_newsession(&ipsp->ocf_cryptoid, &crie, ipsec_ocf_crid); -+ } else if (authalg) { -+ error = crypto_newsession(&ipsp->ocf_cryptoid, &cria, ipsec_ocf_crid); -+ } else { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:ipsec_ocf_sa_init: " -+ "no authalg or encalg\n"); -+ return 0; -+ } -+ -+ if (error) { -+ KLIPS_PRINT(debug_pfkey, 
"klips_debug:ipsec_ocf_sa_init: " -+ "crypto_newsession failed 0x%x\n", error); -+ return 0; -+ } -+ -+ /* make sure no ALG stuff bites us */ -+ if (ipsp->ips_alg_enc) -+ printk("We received an ALG initted SA\n"); -+ ipsp->ips_alg_enc = NULL; -+ -+ ipsp->ocf_in_use = 1; -+ return 1; -+} -+ -+ -+int -+ipsec_ocf_sa_free(struct ipsec_sa *ipsp) -+{ -+ KLIPS_PRINT(debug_pfkey, "klips_debug:ipsec_ocf_sa_free()\n"); -+ crypto_freesession(ipsp->ocf_cryptoid); -+ ipsp->ocf_cryptoid = -1; -+ ipsp->ocf_in_use = 0; -+ return 1; -+} -+ -+#if USE_TASKLET == 0 -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) -+static void -+ipsec_rsm_wq(struct work_struct *work) -+{ -+ struct ipsec_rcv_state *irs = container_of(work, struct ipsec_rcv_state, workq); -+ ipsec_rsm(irs); -+} -+#else -+#define ipsec_rsm_wq ipsec_rsm -+#endif -+#endif /* USE_TASKLET */ -+ -+static int -+ipsec_ocf_rcv_cb(struct cryptop *crp) -+{ -+ struct ipsec_rcv_state *irs = (struct ipsec_rcv_state *)crp->crp_opaque; -+ -+ KLIPS_PRINT(debug_rcv, "klips_debug:ipsec_ocf_rcv_cb\n"); -+ if (irs == NULL) { -+ KLIPS_PRINT(debug_rcv, "klips_debug:ipsec_ocf_rcv_cb: " -+ "NULL irs in callback\n"); -+ return 0; -+ } -+ -+ /* -+ * we must update the state before returning to the state machine. 
-+ * if we have an error, terminate the processing by moving to the DONE -+ * state -+ */ -+ -+ irs->state = IPSEC_RSM_DONE; /* assume it went badly */ -+ if (crp->crp_etype) { -+ KLIPS_PRINT(debug_rcv, "klips_debug:ipsec_ocf_rcv_cb: " -+ "error in processing 0x%x\n", crp->crp_etype); -+ } else { -+ if (!irs->ipsp->ips_encalg) { -+ /* AH post processing, put back fields we had to zero */ -+ irs->ipp->ttl = irs->ttl; -+ irs->ipp->check = irs->check; -+ irs->ipp->frag_off = irs->frag_off; -+ irs->ipp->tos = irs->tos; -+ irs->state = IPSEC_RSM_AUTH_CHK; -+ /* pull up the IP header again after processing */ -+ skb_pull(irs->skb, ((unsigned char *)irs->protostuff.ahstuff.ahp) - -+ ((unsigned char *)irs->ipp)); -+ } else if (ipsec_rcv_esp_post_decrypt(irs) == IPSEC_RCV_OK) { -+ /* this one came up good, set next state */ -+ irs->state = IPSEC_RSM_DECAP_CONT; -+ } -+ } -+ -+ crypto_freereq(crp); -+ crp = NULL; -+ -+ /* setup the rest of the processing now */ -+ PROCESS_NEXT(irs, ipsec_rsm_wq, ipsec_rsm); -+ return 0; -+} -+ -+enum ipsec_rcv_value -+ipsec_ocf_rcv(struct ipsec_rcv_state *irs) -+{ -+ struct cryptop *crp; -+ struct cryptodesc *crde, *crda = NULL; -+ struct ipsec_sa *ipsp; -+ -+ KLIPS_PRINT(debug_rcv, "klips_debug:ipsec_ocf_rcv\n"); -+ -+ ipsp = irs->ipsp; -+ if (!ipsp) { -+ KLIPS_PRINT(debug_rcv, "klips_debug:ipsec_ocf_rcv: " -+ "no SA for rcv processing\n"); -+ return IPSEC_RCV_SAIDNOTFOUND; -+ } -+ -+ if (!irs->skb) { -+ KLIPS_PRINT(debug_rcv, "klips_debug:ipsec_ocf_rcv: no skb\n"); -+ return IPSEC_RCV_SAIDNOTFOUND; -+ } -+ -+ crp = crypto_getreq((ipsp->ips_authalg && ipsp->ips_encalg) ? 
2 : 1); -+ if (!crp) { -+ KLIPS_PRINT(debug_rcv, "klips_debug:ipsec_ocf_rcv: " -+ "crypto_getreq returned NULL\n"); -+ return IPSEC_RCV_REALLYBAD; -+ } -+ -+ if (ipsp->ips_authalg) { -+ crda = crp->crp_desc; -+ crde = crda->crd_next; -+ } else { -+ crde = crp->crp_desc; -+ crda = crde->crd_next; -+ } -+ -+ if (crda) { -+ /* Authentication descriptor */ -+ crda->crd_alg = ipsec_ocf_authalg(ipsp->ips_authalg); -+ if (!crda->crd_alg) { -+ KLIPS_PRINT(debug_rcv, "klips_debug:ipsec_ocf_rcv: " -+ "bad auth alg 0x%x\n", ipsp->ips_authalg); -+ crypto_freereq(crp); -+ return IPSEC_RCV_BADPROTO; -+ } -+ -+ if (!crde) { /* assuming AH processing */ -+ /* push the IP header so we can authenticate it */ -+ skb_push(irs->skb, ((unsigned char *)irs->protostuff.ahstuff.ahp) - -+ ((unsigned char *)irs->ipp)); -+ } -+ -+ crda->crd_key = ipsp->ips_key_a; -+ crda->crd_klen = ipsp->ips_key_bits_a; -+ crda->crd_inject = irs->authenticator - irs->skb->data; -+ /* Copy the authenticator to check aganinst later */ -+ memcpy(irs->hash, irs->authenticator, 12); -+ -+ if (!crde) { /* assume AH processing */ -+ /* AH processing, save fields we have to zero */ -+ irs->ttl = irs->ipp->ttl; -+ irs->check = irs->ipp->check; -+ irs->frag_off = irs->ipp->frag_off; -+ irs->tos = irs->ipp->tos; -+ irs->ipp->ttl = 0; -+ irs->ipp->check = 0; -+ irs->ipp->frag_off = 0; -+ irs->ipp->tos = 0; -+ crda->crd_len = irs->skb->len; -+ crda->crd_skip = ((unsigned char *)irs->ipp) - irs->skb->data; -+ memset(irs->authenticator, 0, 12); -+ } else { -+ crda->crd_len = irs->ilen; -+ crda->crd_skip = -+ ((unsigned char *) irs->protostuff.espstuff.espp) - -+ irs->skb->data; -+ /* -+ * It would be nice to clear the authenticator here -+ * to be sure we do not see it again later when checking. -+ * We cannot. Some HW actually expects to check the in-data -+ * hash and and flag an error if it is incorrect. -+ * -+ * What we do to allow this is to pass in the current in-data -+ * value. 
Your OCF driver must ensure that it fails a request -+ * for hash+decrypt with an invalid hash value, or returns the -+ * computed in-data hash as requested. -+ * -+ * If your driver does not check the in-data hash but just -+ * computes it value, you must ensure that it does not return -+ * the original in-data hash by accident. It must invalidate the -+ * in-data hash itself to force an auth check error. -+ * -+ * All existing drivers that do not care about the current -+ * in-data hash do this by clearing the in-data hash before -+ * processing, either directly or via their implementation. -+ */ -+#if 0 -+ memset(irs->authenticator, 0, 12); -+#endif -+ } -+ } -+ -+ if (crde) { -+ crde->crd_alg = ipsec_ocf_encalg(ipsp->ips_encalg); -+ if (!crde->crd_alg) { -+ KLIPS_PRINT(debug_rcv, "klips_debug:ipsec_ocf_rcv: " -+ "bad enc alg 0x%x\n", ipsp->ips_encalg); -+ crypto_freereq(crp); -+ return IPSEC_RCV_BADPROTO; -+ } -+ -+ irs->esphlen = ESP_HEADER_LEN + ipsp->ips_iv_size; -+ irs->ilen -= irs->esphlen; -+ crde->crd_skip = (skb_transport_header(irs->skb) - irs->skb->data) + irs->esphlen; -+ crde->crd_len = irs->ilen; -+ crde->crd_inject = crde->crd_skip - ipsp->ips_iv_size; -+ crde->crd_klen = ipsp->ips_key_bits_e; -+ crde->crd_key = ipsp->ips_key_e; -+ } -+ -+ crp->crp_ilen = irs->skb->len; /* Total input length */ -+ crp->crp_flags = -+ CRYPTO_F_SKBUF | -+#if USE_CBIMM == 1 -+ CRYPTO_F_CBIMM | -+#endif -+#if USE_BATCH == 1 -+ CRYPTO_F_BATCH | -+#endif -+ 0; -+ crp->crp_buf = (caddr_t) irs->skb; -+ crp->crp_callback = ipsec_ocf_rcv_cb; -+ crp->crp_sid = ipsp->ocf_cryptoid; -+ crp->crp_opaque = (caddr_t) irs; -+ if (crypto_dispatch(crp)){ -+ crypto_freereq(crp); -+ return IPSEC_RCV_REALLYBAD; -+ } -+ return(IPSEC_RCV_PENDING); -+} -+ -+#if USE_TASKLET == 0 -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) -+static void -+ipsec_xsm_wq(struct work_struct *work) -+{ -+ struct ipsec_xmit_state *ixs = container_of(work, struct ipsec_xmit_state, workq); -+ ipsec_xsm(ixs); 
-+} -+#else -+#define ipsec_xsm_wq ipsec_xsm -+#endif -+#endif /* USE_TASKLET */ -+ -+static int -+ipsec_ocf_xmit_cb(struct cryptop *crp) -+{ -+ struct ipsec_xmit_state *ixs = (struct ipsec_xmit_state *)crp->crp_opaque; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, "klips_debug:ipsec_ocf_xmit_cb\n"); -+ -+ if (ixs == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, "klips_debug:ipsec_ocf_xmit_cb: " -+ "NULL ixs in callback\n"); -+ return 0; -+ } -+ -+ /* -+ * we must update the state before returning to the state machine. -+ * if we have an error, terminate the processing by moving to the DONE -+ * state -+ */ -+ -+ ixs->state = IPSEC_XSM_DONE; /* assume bad xmit */ -+ if (crp->crp_etype) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, "klips_debug:ipsec_ocf_xmit_cb: " -+ "error in processing 0x%x\n", crp->crp_etype); -+ } else { -+ if (!ixs->ipsp->ips_encalg) { -+ /* AH post processing, put back fields we had to zero */ -+ ixs->iph->ttl = ixs->ttl; -+ ixs->iph->check = ixs->check; -+ ixs->iph->frag_off = ixs->frag_off; -+ ixs->iph->tos = ixs->tos; -+ } -+ ixs->state = IPSEC_XSM_CONT; /* ESP was all good */ -+ } -+ -+ crypto_freereq(crp); -+ crp = NULL; -+ -+ /* setup the rest of the processing now */ -+ PROCESS_NEXT(ixs, ipsec_xsm_wq, ipsec_xsm); -+ return 0; -+} -+ -+ -+enum ipsec_xmit_value -+ipsec_ocf_xmit(struct ipsec_xmit_state *ixs) -+{ -+ struct cryptop *crp; -+ struct cryptodesc *crde, *crda; -+ struct ipsec_sa *ipsp; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, "klips_debug:ipsec_ocf_xmit\n"); -+ -+ ipsp = ixs->ipsp; -+ if (!ipsp) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, "klips_debug:ipsec_ocf_xmit: " -+ "no SA for rcv processing\n"); -+ return IPSEC_XMIT_SAIDNOTFOUND; -+ } -+ -+ if (!ixs->skb) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_ocf_xmit: no skb\n"); -+ return IPSEC_XMIT_SAIDNOTFOUND; -+ } -+ -+ crp = crypto_getreq((ipsp->ips_authalg && ipsp->ips_encalg) ? 
2 : 1); -+ if (!crp) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, "klips_debug:ipsec_ocf_xmit: " -+ "crypto_getreq returned NULL\n"); -+ return IPSEC_XMIT_ERRMEMALLOC; -+ } -+ -+ if (ipsp->ips_encalg) { -+ crde = crp->crp_desc; -+ crda = crde->crd_next; -+ } else { -+ crda = crp->crp_desc; -+ crde = crda->crd_next; -+ } -+ -+ if (crda) { -+ /* Authentication descriptor */ -+ crda->crd_alg = ipsec_ocf_authalg(ipsp->ips_authalg); -+ if (!crda->crd_alg) { -+ KLIPS_PRINT(debug_tunnel&DB_TN_XMIT, "klips_debug:ipsec_ocf_xmit: " -+ "bad auth alg 0x%x\n", ipsp->ips_authalg); -+ crypto_freereq(crp); -+ return IPSEC_RCV_BADPROTO; -+ } -+ if (!crde) { /* assume AH processing */ -+ /* AH processing, save fields we have to zero */ -+ crda->crd_skip = ((unsigned char *) ixs->iph) - ixs->skb->data; -+ ixs->ttl = ixs->iph->ttl; -+ ixs->check = ixs->iph->check; -+ ixs->frag_off = ixs->iph->frag_off; -+ ixs->tos = ixs->iph->tos; -+ ixs->iph->ttl = 0; -+ ixs->iph->check = 0; -+ ixs->iph->frag_off = 0; -+ ixs->iph->tos = 0; -+ crda->crd_inject = -+ (((struct ahhdr *)(ixs->dat + ixs->iphlen))->ah_data) - -+ ixs->skb->data; -+ crda->crd_len = ixs->len - ixs->authlen; -+ memset(ixs->skb->data + crda->crd_inject, 0, 12); // DM -+ } else { -+ crda->crd_skip = ((unsigned char *) ixs->espp) - ixs->skb->data; -+ crda->crd_inject = ixs->len - ixs->authlen; -+ crda->crd_len = ixs->len - ixs->iphlen - ixs->authlen; -+ } -+ crda->crd_key = ipsp->ips_key_a; -+ crda->crd_klen = ipsp->ips_key_bits_a; -+ } -+ -+ if (crde) { -+ /* Encryption descriptor */ -+ crde->crd_alg = ipsec_ocf_encalg(ipsp->ips_encalg); -+ if (!crde->crd_alg) { -+ KLIPS_PRINT(debug_tunnel&DB_TN_XMIT, "klips_debug:ipsec_ocf_xmit: " -+ "bad enc alg 0x%x\n", ipsp->ips_encalg); -+ crypto_freereq(crp); -+ return IPSEC_RCV_BADPROTO; -+ } -+ crde->crd_flags = CRD_F_ENCRYPT; -+ crde->crd_skip = ixs->idat - ixs->dat; -+ crde->crd_len = ixs->ilen; -+ crde->crd_inject = ((unsigned char *) ixs->espp->esp_iv) - ixs->dat; -+ crde->crd_klen = 
ipsp->ips_key_bits_e; -+ crde->crd_key = ipsp->ips_key_e; -+ } -+ -+ crp->crp_ilen = ixs->skb->len; /* Total input length */ -+ crp->crp_flags = -+ CRYPTO_F_SKBUF | -+#if USE_CBIMM == 1 -+ CRYPTO_F_CBIMM | -+#endif -+#if USE_BATCH == 1 -+ CRYPTO_F_BATCH | -+#endif -+ 0; -+ crp->crp_buf = (caddr_t) ixs->skb; -+ crp->crp_callback = ipsec_ocf_xmit_cb; -+ crp->crp_sid = ipsp->ocf_cryptoid; -+ crp->crp_opaque = (caddr_t) ixs; -+ if (crypto_dispatch(crp)){ -+ crypto_freereq(crp); -+ return IPSEC_XMIT_ERRMEMALLOC; -+ } -+ return(IPSEC_XMIT_PENDING); -+} -+ -+ -+ -+ -+#ifdef CONFIG_KLIPS_AH -+static struct ipsec_alg_supported ocf_ah_algs[] = { -+ { -+ .ias_name = "ocf-md5hmac", -+ .ias_id = AH_MD5, -+ .ias_exttype = SADB_EXT_SUPPORTED_AUTH, -+ .ias_ivlen = 0, -+ .ias_keyminbits = 128, -+ .ias_keymaxbits = 128, -+ }, -+ { -+ .ias_name = "ocf-sha1hmac", -+ .ias_id = AH_SHA, -+ .ias_exttype = SADB_EXT_SUPPORTED_AUTH, -+ .ias_ivlen = 0, -+ .ias_keyminbits = 160, -+ .ias_keymaxbits = 160, -+ }, -+ { -+ .ias_name = NULL, -+ .ias_id = 0, -+ .ias_exttype = 0, -+ .ias_ivlen = 0, -+ .ias_keyminbits = 0, -+ .ias_keymaxbits = 0, -+ } -+}; -+#endif /* CONFIG_KLIPS_AH */ -+ -+static struct ipsec_alg_supported ocf_esp_algs[] = { -+ { -+ .ias_name = "ocf-md5hmac", -+ .ias_id = AH_MD5, -+ .ias_exttype = SADB_EXT_SUPPORTED_AUTH, -+ .ias_ivlen = 0, -+ .ias_keyminbits = 128, -+ .ias_keymaxbits = 128, -+ }, -+ { -+ .ias_name = "ocf-sha1hmac", -+ .ias_id = AH_SHA, -+ .ias_exttype = SADB_EXT_SUPPORTED_AUTH, -+ .ias_ivlen = 0, -+ .ias_keyminbits = 160, -+ .ias_keymaxbits = 160, -+ }, -+ { -+ .ias_name = "ocf-aes", -+ .ias_id = ESP_AES, -+ .ias_exttype = SADB_EXT_SUPPORTED_ENCRYPT, -+ .ias_ivlen = 16, -+ .ias_keyminbits = 128, -+ .ias_keymaxbits = 256, -+ }, -+ { -+ .ias_name = "ocf-3des", -+ .ias_id = ESP_3DES, -+ .ias_exttype = SADB_EXT_SUPPORTED_ENCRYPT, -+ .ias_ivlen = 8, -+ .ias_keyminbits = 192, -+ .ias_keymaxbits = 192, -+ }, -+ { -+ .ias_name = "ocf-des", -+ .ias_id = ESP_DES, -+ 
.ias_exttype = SADB_EXT_SUPPORTED_ENCRYPT, -+ .ias_ivlen = 8, -+ .ias_keyminbits = 64, -+ .ias_keymaxbits = 64, -+ }, -+ { -+ .ias_name = NULL, -+ .ias_id = 0, -+ .ias_exttype = 0, -+ .ias_ivlen = 0, -+ .ias_keyminbits = 0, -+ .ias_keymaxbits = 0, -+ } -+}; -+ -+static int -+ipsec_ocf_check_alg(struct ipsec_alg_supported *s) -+{ -+ struct cryptoini cri; -+ int64_t cryptoid; -+ -+ memset(&cri, 0, sizeof(cri)); -+ if (s->ias_exttype == SADB_EXT_SUPPORTED_ENCRYPT) -+ cri.cri_alg = ipsec_ocf_encalg(s->ias_id); -+ else -+ cri.cri_alg = ipsec_ocf_authalg(s->ias_id); -+ cri.cri_klen = s->ias_keyminbits; -+ cri.cri_key = "0123456789abcdefghijklmnopqrstuvwxyz"; -+ -+ if (crypto_newsession(&cryptoid, &cri, ipsec_ocf_crid)) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:ipsec_ocf:%s not supported\n", -+ s->ias_name); -+ return 0; -+ } -+ crypto_freesession(cryptoid); -+ KLIPS_PRINT(debug_pfkey, "klips_debug:ipsec_ocf:%s supported\n", -+ s->ias_name); -+ return 1; -+} -+ -+void -+ipsec_ocf_init(void) -+{ -+ struct ipsec_alg_supported *s; -+ -+ for (s = ocf_esp_algs; s->ias_name; s++) { -+ if (ipsec_ocf_check_alg(s)) -+ (void)pfkey_list_insert_supported(s, -+ &(pfkey_supported_list[SADB_SATYPE_ESP])); -+ } -+ -+#ifdef CONFIG_KLIPS_AH -+ for (s = ocf_ah_algs; s->ias_name; s++) { -+ if (ipsec_ocf_check_alg(s)) -+ (void)pfkey_list_insert_supported(s, -+ &(pfkey_supported_list[SADB_SATYPE_AH])); -+ } -+#endif -+ -+ /* send register event to userspace */ -+ pfkey_register_reply(SADB_SATYPE_ESP, NULL); -+ pfkey_register_reply(SADB_SATYPE_AH, NULL); -+} -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_ocf.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,41 @@ -+#ifndef _IPSEC_OCF_H_ -+#define _IPSEC_OCF_H_ -+/****************************************************************************/ -+/* -+ * IPSEC OCF support -+ * -+ * This code written by David McCullough -+ * Copyright (C) 2005 Intel Corporation. All Rights Reserved. 
-+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+#include -+ -+#ifdef CONFIG_KLIPS_OCF -+ -+#include -+ -+extern int ipsec_ocf_sa_init(struct ipsec_sa *ipsp, int authalg, int encalg); -+extern int ipsec_ocf_sa_free(struct ipsec_sa *ipsp); -+extern enum ipsec_rcv_value ipsec_ocf_rcv(struct ipsec_rcv_state *irs); -+extern enum ipsec_xmit_value ipsec_ocf_xmit(struct ipsec_xmit_state *ixs); -+extern void ipsec_ocf_init(void); -+ -+#endif -+ -+/****************************************************************************/ -+#endif /* _IPSEC_OCF_H_ */ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_proc.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,1056 @@ -+/* -+ * @(#) /proc file system interface code. -+ * -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs -+ * 2001 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * Split out from ipsec_init.c version 1.70. 
-+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+#define __NO_VERSION__ -+#include -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -+#include -+#endif -+#include /* printk() */ -+#include /* struct iphdr */ -+ -+#include "openswan/ipsec_kversion.h" -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct sockaddr_in */ -+#include -+#include /* copy_from_user */ -+#include -+#ifdef SPINLOCK -+#ifdef SPINLOCK_23 -+#include /* *lock* */ -+#else /* SPINLOCK_23 */ -+#include /* *lock* */ -+#endif /* SPINLOCK_23 */ -+#endif /* SPINLOCK */ -+ -+#include -+#ifdef CONFIG_PROC_FS -+#include -+#endif /* CONFIG_PROC_FS */ -+#ifdef NETLINK_SOCK -+#include -+#else -+#include -+#endif -+ -+#include "openswan/radij.h" -+ -+#include "openswan/ipsec_life.h" -+#include "openswan/ipsec_stats.h" -+#include "openswan/ipsec_sa.h" -+ -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_xmit.h" -+ -+#include "openswan/ipsec_rcv.h" -+#include "openswan/ipsec_ah.h" -+#include "openswan/ipsec_esp.h" -+#include "openswan/ipsec_kern24.h" -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+#include "openswan/ipcomp.h" -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ -+#include "openswan/ipsec_proto.h" -+ -+#include -+#include -+ -+#ifdef CONFIG_PROC_FS -+ -+#ifdef IPSEC_PROC_SUBDIRS -+static struct proc_dir_entry *proc_net_ipsec_dir = NULL; -+static struct proc_dir_entry *proc_eroute_dir = NULL; -+static struct proc_dir_entry *proc_spi_dir = NULL; -+static struct proc_dir_entry *proc_spigrp_dir = NULL; -+static struct proc_dir_entry *proc_birth_dir = NULL; -+static struct proc_dir_entry 
*proc_stats_dir = NULL; -+#endif -+ -+struct ipsec_birth_reply ipsec_ipv4_birth_packet; -+struct ipsec_birth_reply ipsec_ipv6_birth_packet; -+ -+#ifdef CONFIG_KLIPS_DEBUG -+int debug_esp = 0; -+int debug_ah = 0; -+int sysctl_ipsec_inbound_policy_check = 1; -+int debug_tunnel = 0; -+int debug_xmit = 0; -+int debug_xform = 0; -+int debug_eroute = 0; -+int debug_spi = 0; -+int debug_radij = 0; -+int debug_pfkey = 0; -+int debug_rcv = 0; -+int debug_netlink = 0; -+int sysctl_ipsec_debug_verbose = 0; -+int sysctl_ipsec_debug_ipcomp =0; -+int sysctl_ipsec_icmp = 0; -+int sysctl_ipsec_tos = 0; -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+#define DECREMENT_UNSIGNED(X, amount) ((amount < (X)) ? (X)-amount : 0) -+ -+#ifdef CONFIG_KLIPS_ALG -+extern int ipsec_xform_get_info(char *buffer, char **start, -+ off_t offset, int length IPSEC_PROC_LAST_ARG); -+#endif -+ -+IPSEC_PROCFS_DEBUG_NO_STATIC -+int -+ipsec_eroute_get_info(char *buffer, -+ char **start, -+ off_t offset, -+ int length IPSEC_PROC_LAST_ARG) -+{ -+ struct wsbuf w = {buffer, length, offset, 0, 0}; -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if (debug_radij & DB_RJ_DUMPTREES) -+ rj_dumptrees(); /* XXXXXXXXX */ -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_PROCFS, -+ "klips_debug:ipsec_eroute_get_info: " -+ "buffer=0p%p, *start=0p%p, offset=%d, length=%d\n", -+ buffer, -+ *start, -+ (int)offset, -+ length); -+ -+ spin_lock_bh(&eroute_lock); -+ -+ rj_walktree(rnh, ipsec_rj_walker_procprint, &w); -+/* rj_walktree(mask_rjhead, ipsec_rj_walker_procprint, &w); */ -+ -+ spin_unlock_bh(&eroute_lock); -+ -+ *start = buffer + (offset - w.begin); /* Start of wanted data */ -+ return w.len - (offset - w.begin); -+} -+ -+IPSEC_PROCFS_DEBUG_NO_STATIC -+int -+ipsec_spi_get_info(char *buffer, -+ char **start, -+ off_t offset, -+ int length IPSEC_PROC_LAST_ARG) -+{ -+ const int max_content = length > 0? 
length-1 : 0; -+ int len = 0; -+ off_t begin = 0; -+ int i; -+ struct ipsec_sa *sa_p; -+ char sa[SATOT_BUF]; -+ char buf_s[SUBNETTOA_BUF]; -+ char buf_d[SUBNETTOA_BUF]; -+ size_t sa_len; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_PROCFS, -+ "klips_debug:ipsec_spi_get_info: " -+ "buffer=0p%p, *start=0p%p, offset=%d, length=%d\n", -+ buffer, -+ *start, -+ (int)offset, -+ length); -+ -+ spin_lock_bh(&tdb_lock); -+ -+ for (i = 0; i < SADB_HASHMOD; i++) { -+ for (sa_p = ipsec_sadb_hash[i]; -+ sa_p; -+ sa_p = sa_p->ips_hnext) { -+ ipsec_sa_get(sa_p); -+ sa_len = satot(&sa_p->ips_said, 'x', sa, sizeof(sa)); -+ len += ipsec_snprintf(buffer+len, length-len, "%s ", -+ sa_len ? sa : " (error)"); -+ -+ len += ipsec_snprintf(buffer+len, length-len, "%s%s%s", -+ IPS_XFORM_NAME(sa_p)); -+ -+ len += ipsec_snprintf(buffer+len, length-len, ": dir=%s", -+ (sa_p->ips_flags & EMT_INBOUND) ? -+ "in " : "out"); -+ -+ if(sa_p->ips_addr_s) { -+ addrtoa(((struct sockaddr_in*)(sa_p->ips_addr_s))->sin_addr, -+ 0, buf_s, sizeof(buf_s)); -+ len += ipsec_snprintf(buffer+len, length-len, " src=%s", -+ buf_s); -+ } -+ -+ if((sa_p->ips_said.proto == IPPROTO_IPIP) -+ && (sa_p->ips_flags & SADB_X_SAFLAGS_INFLOW)) { -+ subnettoa(sa_p->ips_flow_s.u.v4.sin_addr, -+ sa_p->ips_mask_s.u.v4.sin_addr, -+ 0, -+ buf_s, -+ sizeof(buf_s)); -+ -+ subnettoa(sa_p->ips_flow_d.u.v4.sin_addr, -+ sa_p->ips_mask_d.u.v4.sin_addr, -+ 0, -+ buf_d, -+ sizeof(buf_d)); -+ -+ len += ipsec_snprintf(buffer+len, length-len, " policy=%s->%s", -+ buf_s, buf_d); -+ } -+ -+ if(sa_p->ips_iv_bits) { -+ int j; -+ len += ipsec_snprintf(buffer+len, length-len, " iv_bits=%dbits iv=0x", -+ sa_p->ips_iv_bits); -+ -+#ifdef CONFIG_KLIPS_OCF -+ if (!sa_p->ips_iv) { -+ /* ocf doesn't set the IV, fake it for the UML tests */ -+ len += ipsec_snprintf(buffer+len, length-len, "0cf0"); -+ for (j = 0; j < (sa_p->ips_iv_bits / 8) - 2; j++) { -+ len += ipsec_snprintf(buffer+len, length-len, "%02x", -+ (int) ((((long)sa_p) >> j) & 0xff)); -+ } -+ } else 
-+#endif -+ for(j = 0; j < sa_p->ips_iv_bits / 8; j++) { -+ len += ipsec_snprintf(buffer+len, length-len, "%02x", -+ (__u32)((__u8*)(sa_p->ips_iv))[j]); -+ } -+ } -+ -+ if(sa_p->ips_encalg || sa_p->ips_authalg) { -+ if(sa_p->ips_replaywin) { -+ len += ipsec_snprintf(buffer+len, length-len, " ooowin=%d", -+ sa_p->ips_replaywin); -+ } -+ if(sa_p->ips_errs.ips_replaywin_errs) { -+ len += ipsec_snprintf(buffer+len, length-len, " ooo_errs=%d", -+ sa_p->ips_errs.ips_replaywin_errs); -+ } -+ if(sa_p->ips_replaywin_lastseq) { -+ len += ipsec_snprintf(buffer+len, length-len, " seq=%d", -+ sa_p->ips_replaywin_lastseq); -+ } -+ if(sa_p->ips_replaywin_bitmap) { -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,0) -+ len += ipsec_snprintf(buffer+len, length-len, " bit=0x%Lx", -+ sa_p->ips_replaywin_bitmap); -+#else -+ len += ipsec_snprintf(buffer+len, length-len, " bit=0x%x%08x", -+ (__u32)(sa_p->ips_replaywin_bitmap >> 32), -+ (__u32)sa_p->ips_replaywin_bitmap); -+#endif -+ } -+ if(sa_p->ips_replaywin_maxdiff) { -+ len += ipsec_snprintf(buffer+len, length-len, " max_seq_diff=%d", -+ sa_p->ips_replaywin_maxdiff); -+ } -+ } -+ if(sa_p->ips_flags & ~EMT_INBOUND) { -+ len += ipsec_snprintf(buffer+len, length-len, " flags=0x%x", -+ sa_p->ips_flags & ~EMT_INBOUND); -+ len += ipsec_snprintf(buffer+len, length-len, "<"); -+ /* flag printing goes here */ -+ len += ipsec_snprintf(buffer+len, length-len, ">"); -+ } -+ if(sa_p->ips_auth_bits) { -+ len += ipsec_snprintf(buffer+len, length-len, " alen=%d", -+ sa_p->ips_auth_bits); -+ } -+ if(sa_p->ips_key_bits_a) { -+ len += ipsec_snprintf(buffer+len, length-len, " aklen=%d", -+ sa_p->ips_key_bits_a); -+ } -+ if(sa_p->ips_errs.ips_auth_errs) { -+ len += ipsec_snprintf(buffer+len, length-len, " auth_errs=%d", -+ sa_p->ips_errs.ips_auth_errs); -+ } -+ if(sa_p->ips_key_bits_e) { -+ len += ipsec_snprintf(buffer+len, length-len, " eklen=%d", -+ sa_p->ips_key_bits_e); -+ } -+ if(sa_p->ips_errs.ips_encsize_errs) { -+ len += ipsec_snprintf(buffer+len, 
length-len, " encr_size_errs=%d", -+ sa_p->ips_errs.ips_encsize_errs); -+ } -+ if(sa_p->ips_errs.ips_encpad_errs) { -+ len += ipsec_snprintf(buffer+len, length-len, " encr_pad_errs=%d", -+ sa_p->ips_errs.ips_encpad_errs); -+ } -+ -+ len += ipsec_snprintf(buffer+len, length-len, " life(c,s,h)="); -+ -+ len += ipsec_lifetime_format(buffer + len, -+ length - len, -+ "alloc", -+ ipsec_life_countbased, -+ &sa_p->ips_life.ipl_allocations); -+ -+ len += ipsec_lifetime_format(buffer + len, -+ length - len, -+ "bytes", -+ ipsec_life_countbased, -+ &sa_p->ips_life.ipl_bytes); -+ -+ len += ipsec_lifetime_format(buffer + len, -+ length - len, -+ "addtime", -+ ipsec_life_timebased, -+ &sa_p->ips_life.ipl_addtime); -+ -+ len += ipsec_lifetime_format(buffer + len, -+ length - len, -+ "usetime", -+ ipsec_life_timebased, -+ &sa_p->ips_life.ipl_usetime); -+ -+ len += ipsec_lifetime_format(buffer + len, -+ length - len, -+ "packets", -+ ipsec_life_countbased, -+ &sa_p->ips_life.ipl_packets); -+ -+ if(sa_p->ips_life.ipl_usetime.ipl_last) { /* XXX-MCR should be last? 
*/ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,0) -+ len += ipsec_snprintf(buffer+len, length-len, " idle=%Ld", -+ jiffies / HZ - sa_p->ips_life.ipl_usetime.ipl_last); -+#else -+ len += ipsec_snprintf(buffer+len, length-len, " idle=%lu", -+ jiffies / HZ - (unsigned long)sa_p->ips_life.ipl_usetime.ipl_last); -+#endif -+ } -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+ if(sa_p->ips_said.proto == IPPROTO_COMP && -+ (sa_p->ips_comp_ratio_dbytes || -+ sa_p->ips_comp_ratio_cbytes)) { -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,0) -+ len += ipsec_snprintf(buffer+len, length-len, " ratio=%Ld:%Ld", -+ sa_p->ips_comp_ratio_dbytes, -+ sa_p->ips_comp_ratio_cbytes); -+#else -+ len += ipsec_snprintf(buffer+len, length-len, " ratio=%lu:%lu", -+ (unsigned long)sa_p->ips_comp_ratio_dbytes, -+ (unsigned long)sa_p->ips_comp_ratio_cbytes); -+#endif -+ } -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ { -+ char *natttype_name; -+ -+ switch(sa_p->ips_natt_type) -+ { -+ case 0: -+ natttype_name="none"; -+ break; -+ case ESPINUDP_WITH_NON_IKE: -+ natttype_name="nonike"; -+ break; -+ case ESPINUDP_WITH_NON_ESP: -+ natttype_name="nonesp"; -+ break; -+ default: -+ natttype_name = "unknown"; -+ break; -+ } -+ -+ len += ipsec_snprintf(buffer + len, length-len, " natencap=%s", -+ natttype_name); -+ -+ len += ipsec_snprintf(buffer + len, length-len, " natsport=%d", -+ sa_p->ips_natt_sport); -+ -+ len += ipsec_snprintf(buffer + len,length-len, " natdport=%d", -+ sa_p->ips_natt_dport); -+ } -+#else -+ len += ipsec_snprintf(buffer + len, length-len, " natencap=na"); -+#endif /* CONFIG_IPSEC_NAT_TRAVERSAL */ -+ -+ /* we decrement by one, because this SA has been referenced in order to dump this info */ -+ len += ipsec_snprintf(buffer + len,length-len, " refcount=%d", -+ atomic_read(&sa_p->ips_refcount)-1); -+ -+ len += ipsec_snprintf(buffer+len, length-len, " ref=%d", -+ sa_p->ips_ref); -+ len += ipsec_snprintf(buffer+len, length-len, " refhim=%d", -+ sa_p->ips_refhim); -+ -+ 
if(sa_p->ips_out) { -+ len += ipsec_snprintf(buffer+len, length-len, " outif=%s:%d", -+ sa_p->ips_out->name, -+ sa_p->ips_transport_direct); -+ } -+#ifdef CONFIG_KLIPS_DEBUG -+ if(debug_xform) { -+ len += ipsec_snprintf(buffer+len, length-len, " reftable=%lu refentry=%lu", -+ (unsigned long)IPsecSAref2table(sa_p->ips_ref), -+ (unsigned long)IPsecSAref2entry(sa_p->ips_ref)); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ len += ipsec_snprintf(buffer+len, length-len, "\n"); -+ -+ ipsec_sa_put(sa_p); -+ -+ if (len >= max_content) { -+ /* we've done all that can fit -- stop loops */ -+ len = max_content; /* truncate crap */ -+ goto done_spi_i; -+ } else { -+ const off_t pos = begin + len; /* file position of end of what we've generated */ -+ -+ if (pos <= offset) { -+ /* all is before first interesting character: -+ * discard, but note where we are. -+ */ -+ len = 0; -+ begin = pos; -+ } -+ } -+ } -+ } -+ -+done_spi_i: -+ spin_unlock_bh(&tdb_lock); -+ -+ *start = buffer + (offset - begin); /* Start of wanted data */ -+ return len - (offset - begin); -+} -+ -+IPSEC_PROCFS_DEBUG_NO_STATIC -+int -+ipsec_spigrp_get_info(char *buffer, -+ char **start, -+ off_t offset, -+ int length IPSEC_PROC_LAST_ARG) -+{ -+ /* Limit of useful snprintf output */ -+ const int max_content = length > 0? length-1 : 0; -+ -+ int len = 0; -+ off_t begin = 0; -+ int i; -+ struct ipsec_sa *sa_p, *sa_p2; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_PROCFS, -+ "klips_debug:ipsec_spigrp_get_info: " -+ "buffer=0p%p, *start=0p%p, offset=%d, length=%d\n", -+ buffer, -+ *start, -+ (int)offset, -+ length); -+ -+ spin_lock_bh(&tdb_lock); -+ -+ for (i = 0; i < SADB_HASHMOD; i++) { -+ for (sa_p = ipsec_sadb_hash[i]; -+ sa_p != NULL; -+ sa_p = sa_p->ips_hnext) -+ { -+ sa_p2 = sa_p; -+ while(sa_p2 != NULL) { -+ struct ipsec_sa *sa2n; -+ sa_len = satot(&sa_p2->ips_said, -+ 'x', sa, sizeof(sa)); -+ -+ len += ipsec_snprintf(buffer+len, length-len, "%s ", -+ sa_len ? 
sa : " (error)"); -+ -+ sa2n = sa_p2->ips_next; -+ sa_p2 = sa2n; -+ } -+ len += ipsec_snprintf(buffer+len, length-len, "\n"); -+ -+ if (len >= max_content) { -+ /* we've done all that can fit -- stop loops */ -+ len = max_content; /* truncate crap */ -+ goto done_spigrp_i; -+ } else { -+ const off_t pos = begin + len; -+ -+ if (pos <= offset) { -+ /* all is before first interesting character: -+ * discard, but note where we are. -+ */ -+ len = 0; -+ begin = pos; -+ } -+ } -+ } -+ } -+ -+done_spigrp_i: -+ spin_unlock_bh(&tdb_lock); -+ -+ *start = buffer + (offset - begin); /* Start of wanted data */ -+ return len - (offset - begin); -+} -+ -+ -+IPSEC_PROCFS_DEBUG_NO_STATIC -+int -+ipsec_tncfg_get_info(char *buffer, -+ char **start, -+ off_t offset, -+ int length IPSEC_PROC_LAST_ARG) -+{ -+ /* limit of useful snprintf output */ -+ const int max_content = length > 0? length-1 : 0; -+ int len = 0; -+ off_t begin = 0; -+ int i; -+ char name[9]; -+ struct net_device *dev, *privdev; -+ struct ipsecpriv *priv; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_PROCFS, -+ "klips_debug:ipsec_tncfg_get_info: " -+ "buffer=0p%p, *start=0p%p, offset=%d, length=%d\n", -+ buffer, -+ *start, -+ (int)offset, -+ length); -+ -+ for(i = 0; i < IPSEC_NUM_IF; i++) { -+ ipsec_snprintf(name, (ssize_t) sizeof(name), IPSEC_DEV_FORMAT, i); -+ dev = __ipsec_dev_get(name); -+ if(dev) { -+ priv = (struct ipsecpriv *)(dev->priv); -+ len += ipsec_snprintf(buffer+len, length-len, "%s", -+ dev->name); -+ if(priv) { -+ privdev = (struct net_device *)(priv->dev); -+ len += ipsec_snprintf(buffer+len, length-len, " -> %s", -+ privdev ? privdev->name : "NULL"); -+ len += ipsec_snprintf(buffer+len, length-len, " mtu=%d(%d) -> %d", -+ dev->mtu, -+ priv->mtu, -+ privdev ? 
privdev->mtu : 0); -+ } else { -+ KLIPS_PRINT(debug_tunnel & DB_TN_PROCFS, -+ "klips_debug:ipsec_tncfg_get_info: device '%s' has no private data space!\n", -+ dev->name); -+ } -+ len += ipsec_snprintf(buffer+len, length-len, "\n"); -+ -+ if (len >= max_content) { -+ /* we've done all that can fit -- stop loop */ -+ len = max_content; /* truncate crap */ -+ break; -+ } else { -+ const off_t pos = begin + len; -+ if (pos <= offset) { -+ len = 0; -+ begin = pos; -+ } -+ } -+ } -+ } -+ *start = buffer + (offset - begin); /* Start of wanted data */ -+ len -= (offset - begin); /* Start slop */ -+ if (len > length) -+ len = length; -+ return len; -+} -+ -+IPSEC_PROCFS_DEBUG_NO_STATIC -+int -+ipsec_version_get_info(char *buffer, -+ char **start, -+ off_t offset, -+ int length IPSEC_PROC_LAST_ARG) -+{ -+ int len = 0; -+ off_t begin = 0; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_PROCFS, -+ "klips_debug:ipsec_version_get_info: " -+ "buffer=0p%p, *start=0p%p, offset=%d, length=%d\n", -+ buffer, -+ *start, -+ (int)offset, -+ length); -+ -+ len += ipsec_snprintf(buffer + len,length-len, "Openswan version: %s\n", -+ ipsec_version_code()); -+#if 0 -+ KLIPS_PRINT(debug_tunnel & DB_TN_PROCFS, -+ "klips_debug:ipsec_version_get_info: " -+ "ipsec_init version: %s\n", -+ ipsec_init_c_version); -+ KLIPS_PRINT(debug_tunnel & DB_TN_PROCFS, -+ "klips_debug:ipsec_version_get_info: " -+ "ipsec_tunnel version: %s\n", -+ ipsec_tunnel_c_version); -+ KLIPS_PRINT(debug_tunnel & DB_TN_PROCFS, -+ "klips_debug:ipsec_version_get_info: " -+ "ipsec_netlink version: %s\n", -+ ipsec_netlink_c_version); -+ KLIPS_PRINT(debug_tunnel & DB_TN_PROCFS, -+ "klips_debug:ipsec_version_get_info: " -+ "radij_c_version: %s\n", -+ radij_c_version); -+#endif -+ -+ -+ *start = buffer + (offset - begin); /* Start of wanted data */ -+ len -= (offset - begin); /* Start slop */ -+ if (len > length) -+ len = length; -+ return len; -+} -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+unsigned int natt_available = 1; -+#else -+unsigned 
int natt_available = 0; -+#endif -+module_param(natt_available,int,0644); -+ -+IPSEC_PROCFS_DEBUG_NO_STATIC -+int -+ipsec_natt_get_info(char *buffer, -+ char **start, -+ off_t offset, -+ int length IPSEC_PROC_LAST_ARG) -+{ -+ int len = 0; -+ off_t begin = 0; -+ -+ len += ipsec_snprintf(buffer + len, -+ length-len, "%d\n", -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ 1 -+#else -+ 0 -+#endif -+ ); -+ -+ *start = buffer + (offset - begin); /* Start of wanted data */ -+ len -= (offset - begin); /* Start slop */ -+ if (len > length) -+ len = length; -+ return len; -+} -+ -+IPSEC_PROCFS_DEBUG_NO_STATIC -+int -+ipsec_birth_info(char *page, -+ char **start, -+ off_t offset, -+ int count, -+ int *eof, -+ void *data) -+{ -+ struct ipsec_birth_reply *ibr = (struct ipsec_birth_reply *)data; -+ int len; -+ -+ if(offset >= ibr->packet_template_len) { -+ if(eof) { -+ *eof=1; -+ } -+ return 0; -+ } -+ -+ len = ibr->packet_template_len; -+ len -= offset; -+ if (len > count) -+ len = count; -+ -+ memcpy(page + offset, ibr->packet_template+offset, len); -+ -+ return len; -+} -+ -+IPSEC_PROCFS_DEBUG_NO_STATIC -+int -+ipsec_birth_set(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ struct ipsec_birth_reply *ibr = (struct ipsec_birth_reply *)data; -+ int len; -+ -+ KLIPS_INC_USE; -+ if(count > IPSEC_BIRTH_TEMPLATE_MAXLEN) { -+ len = IPSEC_BIRTH_TEMPLATE_MAXLEN; -+ } else { -+ len = count; -+ } -+ -+ if(copy_from_user(ibr->packet_template, buffer, len)) { -+ KLIPS_DEC_USE; -+ return -EFAULT; -+ } -+ ibr->packet_template_len = len; -+ -+ KLIPS_DEC_USE; -+ -+ return len; -+} -+ -+ -+#ifdef CONFIG_KLIPS_DEBUG -+IPSEC_PROCFS_DEBUG_NO_STATIC -+int -+ipsec_klipsdebug_get_info(char *buffer, -+ char **start, -+ off_t offset, -+ int length IPSEC_PROC_LAST_ARG) -+{ -+ int len = 0; -+ off_t begin = 0; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_PROCFS, -+ "klips_debug:ipsec_klipsdebug_get_info: " -+ "buffer=0p%p, *start=0p%p, offset=%d, length=%d\n", -+ buffer, -+ *start, -+ 
(int)offset, -+ length); -+ -+ len += ipsec_snprintf(buffer+len, length-len, "debug_tunnel=%08x.\n", debug_tunnel); -+ len += ipsec_snprintf(buffer+len, length-len, "debug_xform=%08x.\n", debug_xform); -+ len += ipsec_snprintf(buffer+len, length-len, "debug_eroute=%08x.\n", debug_eroute); -+ len += ipsec_snprintf(buffer+len, length-len, "debug_spi=%08x.\n", debug_spi); -+ len += ipsec_snprintf(buffer+len, length-len, "debug_radij=%08x.\n", debug_radij); -+ len += ipsec_snprintf(buffer+len, length-len, "debug_esp=%08x.\n", debug_esp); -+ len += ipsec_snprintf(buffer+len, length-len, "debug_ah=%08x.\n", debug_ah); -+ len += ipsec_snprintf(buffer+len, length-len, "debug_rcv=%08x.\n", debug_rcv); -+ len += ipsec_snprintf(buffer+len, length-len, "debug_pfkey=%08x.\n", debug_pfkey); -+ -+ *start = buffer + (offset - begin); /* Start of wanted data */ -+ len -= (offset - begin); /* Start slop */ -+ if (len > length) -+ len = length; -+ return len; -+} -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+IPSEC_PROCFS_DEBUG_NO_STATIC -+int -+ipsec_stats_get_int_info(char *buffer, -+ char **start, -+ off_t offset, -+ int length, -+ int *eof, -+ void *data) -+{ -+ -+ const int max_content = length > 0? length-1 : 0; -+ int len = 0; -+ int *thing; -+ -+ thing = (int *)data; -+ -+ len = ipsec_snprintf(buffer+len, length-len, "%08x\n", *thing); -+ -+ if (len >= max_content) -+ len = max_content; /* truncate crap */ -+ -+ *start = buffer + offset; /* Start of wanted data */ -+ return len > offset? 
len - offset : 0; -+ -+} -+ -+#ifndef PROC_FS_2325 -+struct proc_dir_entry ipsec_eroute = -+{ -+ 0, -+ 12, "ipsec_eroute", -+ S_IFREG | S_IRUGO, 1, 0, 0, 0, -+ &proc_net_inode_operations, -+ ipsec_eroute_get_info, -+ NULL, NULL, NULL, NULL, NULL -+}; -+ -+struct proc_dir_entry ipsec_spi = -+{ -+ 0, -+ 9, "ipsec_spi", -+ S_IFREG | S_IRUGO, 1, 0, 0, 0, -+ &proc_net_inode_operations, -+ ipsec_spi_get_info, -+ NULL, NULL, NULL, NULL, NULL -+}; -+ -+struct proc_dir_entry ipsec_spigrp = -+{ -+ 0, -+ 12, "ipsec_spigrp", -+ S_IFREG | S_IRUGO, 1, 0, 0, 0, -+ &proc_net_inode_operations, -+ ipsec_spigrp_get_info, -+ NULL, NULL, NULL, NULL, NULL -+}; -+ -+struct proc_dir_entry ipsec_tncfg = -+{ -+ 0, -+ 11, "ipsec_tncfg", -+ S_IFREG | S_IRUGO, 1, 0, 0, 0, -+ &proc_net_inode_operations, -+ ipsec_tncfg_get_info, -+ NULL, NULL, NULL, NULL, NULL -+}; -+ -+struct proc_dir_entry ipsec_version = -+{ -+ 0, -+ 13, "ipsec_version", -+ S_IFREG | S_IRUGO, 1, 0, 0, 0, -+ &proc_net_inode_operations, -+ ipsec_version_get_info, -+ NULL, NULL, NULL, NULL, NULL -+}; -+ -+#ifdef CONFIG_KLIPS_DEBUG -+struct proc_dir_entry ipsec_klipsdebug = -+{ -+ 0, -+ 16, "ipsec_klipsdebug", -+ S_IFREG | S_IRUGO, 1, 0, 0, 0, -+ &proc_net_inode_operations, -+ ipsec_klipsdebug_get_info, -+ NULL, NULL, NULL, NULL, NULL -+}; -+#endif /* CONFIG_KLIPS_DEBUG */ -+#endif /* !PROC_FS_2325 */ -+#endif /* CONFIG_PROC_FS */ -+ -+#if defined(PROC_FS_2325) -+struct ipsec_proc_list { -+ char *name; -+ struct proc_dir_entry **parent; -+ struct proc_dir_entry **dir; -+ read_proc_t *readthing; -+ write_proc_t *writething; -+ void *data; -+}; -+static struct ipsec_proc_list proc_items[]={ -+#ifdef CONFIG_KLIPS_DEBUG -+ {"klipsdebug", &proc_net_ipsec_dir, NULL, ipsec_klipsdebug_get_info, NULL, NULL}, -+#endif -+ {"eroute", &proc_net_ipsec_dir, &proc_eroute_dir, NULL, NULL, NULL}, -+ {"all", &proc_eroute_dir, NULL, ipsec_eroute_get_info, NULL, NULL}, -+ {"spi", &proc_net_ipsec_dir, &proc_spi_dir, NULL, NULL, NULL}, -+ {"all", 
&proc_spi_dir, NULL, ipsec_spi_get_info, NULL, NULL}, -+ {"spigrp", &proc_net_ipsec_dir, &proc_spigrp_dir, NULL, NULL, NULL}, -+ {"all", &proc_spigrp_dir, NULL, ipsec_spigrp_get_info, NULL, NULL}, -+ {"birth", &proc_net_ipsec_dir, &proc_birth_dir, NULL, NULL, NULL}, -+ {"ipv4", &proc_birth_dir, NULL, ipsec_birth_info, ipsec_birth_set, (void *)&ipsec_ipv4_birth_packet}, -+ {"ipv6", &proc_birth_dir, NULL, ipsec_birth_info, ipsec_birth_set, (void *)&ipsec_ipv6_birth_packet}, -+ {"tncfg", &proc_net_ipsec_dir, NULL, ipsec_tncfg_get_info, NULL, NULL}, -+#ifdef CONFIG_KLIPS_ALG -+ -+ {"xforms", &proc_net_ipsec_dir, NULL, ipsec_xform_get_info, NULL, NULL}, -+#endif -+ {"stats", &proc_net_ipsec_dir, &proc_stats_dir, NULL, NULL, NULL}, -+ {"trap_count", &proc_stats_dir, NULL, ipsec_stats_get_int_info, NULL, &ipsec_xmit_trap_count}, -+ {"trap_sendcount", &proc_stats_dir, NULL, ipsec_stats_get_int_info, NULL, &ipsec_xmit_trap_sendcount}, -+ {"natt", &proc_net_ipsec_dir, NULL, ipsec_natt_get_info, NULL, NULL}, -+ {"version", &proc_net_ipsec_dir, NULL, ipsec_version_get_info, NULL, NULL}, -+ {NULL, NULL, NULL, NULL, NULL, NULL} -+}; -+#endif -+ -+int -+ipsec_proc_init() -+{ -+ int error = 0; -+#ifdef IPSEC_PROC_SUBDIRS -+ struct proc_dir_entry *item; -+#endif -+ -+ /* -+ * just complain because pluto won't run without /proc! 
-+ */ -+#ifndef CONFIG_PROC_FS -+#error You must have PROC_FS built in to use KLIPS -+#endif -+ -+ /* for 2.0 kernels */ -+#if !defined(PROC_FS_2325) && !defined(PROC_FS_21) -+ error |= proc_register_dynamic(&PROC_NET, &ipsec_eroute); -+ error |= proc_register_dynamic(&PROC_NET, &ipsec_spi); -+ error |= proc_register_dynamic(&PROC_NET, &ipsec_spigrp); -+ error |= proc_register_dynamic(&PROC_NET, &ipsec_tncfg); -+ error |= proc_register_dynamic(&PROC_NET, &ipsec_version); -+#ifdef CONFIG_KLIPS_DEBUG -+ error |= proc_register_dynamic(&PROC_NET, &ipsec_klipsdebug); -+#endif /* CONFIG_KLIPS_DEBUG */ -+#endif -+ -+ /* for 2.2 kernels */ -+#if !defined(PROC_FS_2325) && defined(PROC_FS_21) -+ error |= proc_register(PROC_NET, &ipsec_eroute); -+ error |= proc_register(PROC_NET, &ipsec_spi); -+ error |= proc_register(PROC_NET, &ipsec_spigrp); -+ error |= proc_register(PROC_NET, &ipsec_tncfg); -+ error |= proc_register(PROC_NET, &ipsec_version); -+#ifdef CONFIG_KLIPS_DEBUG -+ error |= proc_register(PROC_NET, &ipsec_klipsdebug); -+#endif /* CONFIG_KLIPS_DEBUG */ -+#endif -+ -+ /* for 2.4 kernels */ -+#if defined(PROC_FS_2325) -+ /* create /proc/net/ipsec */ -+ -+ /* zero these out before we initialize /proc/net/ipsec/birth/stuff */ -+ memset(&ipsec_ipv4_birth_packet, 0, sizeof(struct ipsec_birth_reply)); -+ memset(&ipsec_ipv6_birth_packet, 0, sizeof(struct ipsec_birth_reply)); -+ -+ proc_net_ipsec_dir = proc_mkdir("ipsec", PROC_NET); -+ if(proc_net_ipsec_dir == NULL) { -+ /* no point in continuing */ -+ return 1; -+ } -+ -+ { -+ struct ipsec_proc_list *it; -+ -+ it=proc_items; -+ while(it->name!=NULL) { -+ if(it->dir) { -+ /* make a dir instead */ -+ item = proc_mkdir(it->name, *it->parent); -+ *it->dir = item; -+ } else { -+ item = create_proc_entry(it->name, 0400, *it->parent); -+ } -+ if(item) { -+ item->read_proc = it->readthing; -+ item->write_proc = it->writething; -+ item->data = it->data; -+#ifdef MODULE -+ item->owner = THIS_MODULE; -+#endif -+ } else { -+ error |= 1; 
-+ } -+ it++; -+ } -+ } -+ -+ /* now create some symlinks to provide compatibility */ -+ proc_symlink("ipsec_eroute", PROC_NET, "ipsec/eroute/all"); -+ proc_symlink("ipsec_spi", PROC_NET, "ipsec/spi/all"); -+ proc_symlink("ipsec_spigrp", PROC_NET, "ipsec/spigrp/all"); -+ proc_symlink("ipsec_tncfg", PROC_NET, "ipsec/tncfg"); -+ proc_symlink("ipsec_version",PROC_NET, "ipsec/version"); -+ proc_symlink("ipsec_klipsdebug",PROC_NET,"ipsec/klipsdebug"); -+ -+#endif /* !PROC_FS_2325 */ -+ -+ return error; -+} -+ -+void -+ipsec_proc_cleanup() -+{ -+ -+ /* for 2.0 and 2.2 kernels */ -+#if !defined(PROC_FS_2325) -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if (proc_net_unregister(ipsec_klipsdebug.low_ino) != 0) -+ printk("klips_debug:ipsec_cleanup: " -+ "cannot unregister /proc/net/ipsec_klipsdebug\n"); -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ if (proc_net_unregister(ipsec_version.low_ino) != 0) -+ printk("klips_debug:ipsec_cleanup: " -+ "cannot unregister /proc/net/ipsec_version\n"); -+ if (proc_net_unregister(ipsec_eroute.low_ino) != 0) -+ printk("klips_debug:ipsec_cleanup: " -+ "cannot unregister /proc/net/ipsec_eroute\n"); -+ if (proc_net_unregister(ipsec_spi.low_ino) != 0) -+ printk("klips_debug:ipsec_cleanup: " -+ "cannot unregister /proc/net/ipsec_spi\n"); -+ if (proc_net_unregister(ipsec_spigrp.low_ino) != 0) -+ printk("klips_debug:ipsec_cleanup: " -+ "cannot unregister /proc/net/ipsec_spigrp\n"); -+ if (proc_net_unregister(ipsec_tncfg.low_ino) != 0) -+ printk("klips_debug:ipsec_cleanup: " -+ "cannot unregister /proc/net/ipsec_tncfg\n"); -+#endif -+ -+ /* for 2.4 kernels */ -+#if defined(PROC_FS_2325) -+ { -+ struct ipsec_proc_list *it; -+ -+ /* find end of list */ -+ it=proc_items; -+ while(it->name!=NULL) { -+ it++; -+ } -+ it--; -+ -+ do { -+ remove_proc_entry(it->name, *it->parent); -+ it--; -+ } while(it >= proc_items); -+ } -+ -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ remove_proc_entry("ipsec_klipsdebug", PROC_NET); -+#endif /* CONFIG_KLIPS_DEBUG */ -+ 
remove_proc_entry("ipsec_eroute", PROC_NET); -+ remove_proc_entry("ipsec_spi", PROC_NET); -+ remove_proc_entry("ipsec_spigrp", PROC_NET); -+ remove_proc_entry("ipsec_tncfg", PROC_NET); -+ remove_proc_entry("ipsec_version", PROC_NET); -+ remove_proc_entry("ipsec", PROC_NET); -+#endif /* 2.4 kernel */ -+} -+ -+/* -+ * -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_radij.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,554 @@ -+/* -+ * Interface between the IPSEC code and the radix (radij) tree code -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, struct net_device_stats and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+#include -+#ifdef SPINLOCK -+# ifdef SPINLOCK_23 -+# include /* *lock* */ -+# else /* 23_SPINLOCK */ -+# include /* *lock* */ -+# endif /* 23_SPINLOCK */ -+#endif /* SPINLOCK */ -+ -+#include -+ -+#include "openswan/ipsec_eroute.h" -+#include "openswan/ipsec_sa.h" -+ -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_tunnel.h" /* struct ipsecpriv */ -+#include "openswan/ipsec_xform.h" -+ -+#include -+#include -+ -+#include "openswan/ipsec_proto.h" -+ -+struct radij_node_head *rnh = NULL; -+#ifdef SPINLOCK -+spinlock_t eroute_lock = SPIN_LOCK_UNLOCKED; -+#else /* SPINLOCK */ -+spinlock_t eroute_lock; -+#endif /* SPINLOCK */ -+ -+int -+ipsec_radijinit(void) -+{ -+ maj_keylen = sizeof (struct sockaddr_encap); -+ -+ rj_init(); -+ -+ if (rj_inithead((void **)&rnh, /*16*/offsetof(struct sockaddr_encap, sen_type) * sizeof(__u8)) == 0) /* 16 is bit offset of sen_type */ -+ return -1; -+ return 0; -+} -+ -+int -+ipsec_radijcleanup(void) -+{ -+ int error = 0; -+ -+ spin_lock_bh(&eroute_lock); -+ -+ error = radijcleanup(); -+ -+ spin_unlock_bh(&eroute_lock); -+ -+ return error; -+} -+ -+int -+ipsec_cleareroutes(void) -+{ -+ int error; -+ -+ spin_lock_bh(&eroute_lock); -+ -+ error = radijcleartree(); -+ -+ spin_unlock_bh(&eroute_lock); -+ -+ return error; -+} -+ -+int -+ipsec_breakroute(struct sockaddr_encap *eaddr, -+ struct sockaddr_encap 
*emask, -+ struct sk_buff **first, -+ struct sk_buff **last) -+{ -+ struct eroute *ro; -+ struct radij_node *rn; -+ int error; -+#ifdef CONFIG_KLIPS_DEBUG -+ -+ if (debug_eroute) { -+ char buf1[SUBNETTOA_BUF], buf2[SUBNETTOA_BUF]; -+ subnettoa(eaddr->sen_ip_src, emask->sen_ip_src, 0, buf1, sizeof(buf1)); -+ subnettoa(eaddr->sen_ip_dst, emask->sen_ip_dst, 0, buf2, sizeof(buf2)); -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:ipsec_breakroute: " -+ "attempting to delete eroute for %s:%d->%s:%d %d\n", -+ buf1, ntohs(eaddr->sen_sport), -+ buf2, ntohs(eaddr->sen_dport), eaddr->sen_proto); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ spin_lock_bh(&eroute_lock); -+ -+ if ((error = rj_delete(eaddr, emask, rnh, &rn)) != 0) { -+ spin_unlock_bh(&eroute_lock); -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:ipsec_breakroute: " -+ "node not found, eroute delete failed.\n"); -+ return error; -+ } -+ -+ spin_unlock_bh(&eroute_lock); -+ -+ ro = (struct eroute *)rn; -+ -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:ipsec_breakroute: " -+ "deleted eroute=0p%p, ident=0p%p->0p%p, first=0p%p, last=0p%p\n", -+ ro, -+ ro->er_ident_s.data, -+ ro->er_ident_d.data, -+ ro->er_first, -+ ro->er_last); -+ -+ if (ro->er_ident_s.data != NULL) { -+ kfree(ro->er_ident_s.data); -+ } -+ if (ro->er_ident_d.data != NULL) { -+ kfree(ro->er_ident_d.data); -+ } -+ if (ro->er_first != NULL) { -+#if 0 -+ struct net_device_stats *stats = (struct net_device_stats *) &(((struct ipsecpriv *)(ro->er_first->dev->priv))->mystats); -+ stats->tx_dropped--; -+#endif -+ *first = ro->er_first; -+ } -+ if (ro->er_last != NULL) { -+#if 0 -+ struct net_device_stats *stats = (struct net_device_stats *) &(((struct ipsecpriv *)(ro->er_last->dev->priv))->mystats); -+ stats->tx_dropped--; -+#endif -+ *last = ro->er_last; -+ } -+ -+ if (rn->rj_flags & (RJF_ACTIVE | RJF_ROOT)) -+ panic ("ipsec_breakroute RMT_DELEROUTE root or active node\n"); -+ memset((caddr_t)rn, 0, sizeof (struct eroute)); -+ kfree(rn); -+ -+ return 0; -+} -+ -+int 
-+ipsec_makeroute(struct sockaddr_encap *eaddr, -+ struct sockaddr_encap *emask, -+ ip_said said, -+ uint32_t pid, -+ struct sk_buff *skb, -+ struct ident *ident_s, -+ struct ident *ident_d) -+{ -+ struct eroute *retrt; -+ int error; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ -+ if (debug_eroute) { -+ -+ { -+ char buf1[SUBNETTOA_BUF], buf2[SUBNETTOA_BUF]; -+ -+ subnettoa(eaddr->sen_ip_src, emask->sen_ip_src, 0, buf1, sizeof(buf1)); -+ subnettoa(eaddr->sen_ip_dst, emask->sen_ip_dst, 0, buf2, sizeof(buf2)); -+ sa_len = satot(&said, 0, sa, sizeof(sa)); -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:ipsec_makeroute: " -+ "attempting to allocate %lu bytes to insert eroute for %s->%s, SA: %s, PID:%d, skb=0p%p, ident:%s->%s\n", -+ (unsigned long) sizeof(struct eroute), -+ buf1, -+ buf2, -+ sa_len ? sa : " (error)", -+ pid, -+ skb, -+ (ident_s ? (ident_s->data ? ident_s->data : "NULL") : "NULL"), -+ (ident_d ? (ident_d->data ? ident_d->data : "NULL") : "NULL")); -+ } -+ { -+ char buf1[sizeof(struct sockaddr_encap)*2 + 1], -+ buf2[sizeof(struct sockaddr_encap)*2 + 1]; -+ int i; -+ unsigned char *b1 = buf1, -+ *b2 = buf2, -+ *ea = (unsigned char *)eaddr, -+ *em = (unsigned char *)emask; -+ -+ -+ for (i=0; ier_eaddr = *eaddr; -+ retrt->er_emask = *emask; -+ retrt->er_said = said; -+ retrt->er_pid = pid; -+ retrt->er_count = 0; -+ retrt->er_lasttime = jiffies/HZ; -+ -+ { -+ /* this is because gcc 3. 
doesn't like cast's as lvalues */ -+ struct rjtentry *rje = (struct rjtentry *)&(retrt->er_rjt); -+ caddr_t er = (caddr_t)&(retrt->er_eaddr); -+ -+ rje->rd_nodes->rj_key= er; -+ } -+ -+ if (ident_s && ident_s->type != SADB_IDENTTYPE_RESERVED) { -+ int data_len = ident_s->len * IPSEC_PFKEYv2_ALIGN - sizeof(struct sadb_ident); -+ -+ retrt->er_ident_s.type = ident_s->type; -+ retrt->er_ident_s.id = ident_s->id; -+ retrt->er_ident_s.len = ident_s->len; -+ if(data_len) { -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:ipsec_makeroute: " -+ "attempting to allocate %u bytes for ident_s.\n", -+ data_len); -+ if(!(retrt->er_ident_s.data = kmalloc(data_len, GFP_KERNEL))) { -+ kfree(retrt); -+ printk("klips_error:ipsec_makeroute: not able to allocate kernel memory (%d)\n", data_len); -+ return ENOMEM; -+ } -+ memcpy(retrt->er_ident_s.data, ident_s->data, data_len); -+ } else { -+ retrt->er_ident_s.data = NULL; -+ } -+ } -+ -+ if (ident_d && ident_d->type != SADB_IDENTTYPE_RESERVED) { -+ int data_len = ident_d->len * IPSEC_PFKEYv2_ALIGN - sizeof(struct sadb_ident); -+ -+ retrt->er_ident_d.type = ident_d->type; -+ retrt->er_ident_d.id = ident_d->id; -+ retrt->er_ident_d.len = ident_d->len; -+ if(data_len) { -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:ipsec_makeroute: " -+ "attempting to allocate %u bytes for ident_d.\n", -+ data_len); -+ if(!(retrt->er_ident_d.data = kmalloc(data_len, GFP_KERNEL))) { -+ if (retrt->er_ident_s.data) -+ kfree(retrt->er_ident_s.data); -+ kfree(retrt); -+ printk("klips_error:ipsec_makeroute: not able to allocate kernel memory (%d)\n", data_len); -+ return ENOMEM; -+ } -+ memcpy(retrt->er_ident_d.data, ident_d->data, data_len); -+ } else { -+ retrt->er_ident_d.data = NULL; -+ } -+ } -+ retrt->er_first = skb; -+ retrt->er_last = NULL; -+ -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:ipsec_makeroute: " -+ "calling rj_addroute now\n"); -+ -+ spin_lock_bh(&eroute_lock); -+ -+ error = rj_addroute(&(retrt->er_eaddr), &(retrt->er_emask), -+ rnh, 
retrt->er_rjt.rd_nodes); -+ -+ spin_unlock_bh(&eroute_lock); -+ -+ if(error) { -+ sa_len = KLIPS_SATOT(debug_eroute, &said, 0, sa, sizeof(sa)); -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:ipsec_makeroute: " -+ "rj_addroute not able to insert eroute for SA:%s (error:%d)\n", -+ sa_len ? sa : " (error)", error); -+ if (retrt->er_ident_s.data) -+ kfree(retrt->er_ident_s.data); -+ if (retrt->er_ident_d.data) -+ kfree(retrt->er_ident_d.data); -+ -+ kfree(retrt); -+ -+ return error; -+ } -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if (debug_eroute) { -+ char buf1[SUBNETTOA_BUF], buf2[SUBNETTOA_BUF]; -+/* -+ subnettoa(eaddr->sen_ip_src, emask->sen_ip_src, 0, buf1, sizeof(buf1)); -+ subnettoa(eaddr->sen_ip_dst, emask->sen_ip_dst, 0, buf2, sizeof(buf2)); -+*/ -+ subnettoa(rd_key((&(retrt->er_rjt)))->sen_ip_src, rd_mask((&(retrt->er_rjt)))->sen_ip_src, 0, buf1, sizeof(buf1)); -+ subnettoa(rd_key((&(retrt->er_rjt)))->sen_ip_dst, rd_mask((&(retrt->er_rjt)))->sen_ip_dst, 0, buf2, sizeof(buf2)); -+ sa_len = satot(&retrt->er_said, 0, sa, sizeof(sa)); -+ -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:ipsec_makeroute: " -+ "pid=%05d " -+ "count=%10d " -+ "lasttime=%6d " -+ "%-18s -> %-18s => %s\n", -+ retrt->er_pid, -+ retrt->er_count, -+ (int)(jiffies/HZ - retrt->er_lasttime), -+ buf1, -+ buf2, -+ sa_len ? 
sa : " (error)"); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:ipsec_makeroute: " -+ "succeeded.\n"); -+ return 0; -+} -+ -+struct eroute * -+ipsec_findroute(struct sockaddr_encap *eaddr) -+{ -+ struct radij_node *rn; -+#ifdef CONFIG_KLIPS_DEBUG -+ char buf1[ADDRTOA_BUF], buf2[ADDRTOA_BUF]; -+ -+ if (debug_radij & DB_RJ_FINDROUTE) { -+ addrtoa(eaddr->sen_ip_src, 0, buf1, sizeof(buf1)); -+ addrtoa(eaddr->sen_ip_dst, 0, buf2, sizeof(buf2)); -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:ipsec_findroute: " -+ "%s:%d->%s:%d %d\n", -+ buf1, ntohs(eaddr->sen_sport), -+ buf2, ntohs(eaddr->sen_dport), -+ eaddr->sen_proto); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ rn = rj_match((caddr_t)eaddr, rnh); -+ if(rn) { -+ KLIPS_PRINT(debug_eroute && sysctl_ipsec_debug_verbose, -+ "klips_debug:ipsec_findroute: " -+ "found, points to proto=%d, spi=%x, dst=%x.\n", -+ ((struct eroute*)rn)->er_said.proto, -+ ntohl(((struct eroute*)rn)->er_said.spi), -+ ntohl(((struct eroute*)rn)->er_said.dst.u.v4.sin_addr.s_addr)); -+ } -+ return (struct eroute *)rn; -+} -+ -+#ifdef CONFIG_PROC_FS -+/** ipsec_rj_walker_procprint: print one line of eroute table output. -+ * -+ * Theoretical BUG: if w->length is less than the length -+ * of some line we should produce, that line will never -+ * be finished. In effect, the "file" will stop part way -+ * through that line. 
-+ */ -+int -+ipsec_rj_walker_procprint(struct radij_node *rn, void *w0) -+{ -+ struct eroute *ro = (struct eroute *)rn; -+ struct rjtentry *rd = (struct rjtentry *)rn; -+ struct wsbuf *w = (struct wsbuf *)w0; -+ char buf1[SUBNETTOA_BUF], buf2[SUBNETTOA_BUF]; -+ char buf3[16]; -+ char sa[SATOT_BUF]; -+ size_t sa_len, buf_len; -+ struct sockaddr_encap *key, *mask; -+ -+ KLIPS_PRINT(debug_radij, -+ "klips_debug:ipsec_rj_walker_procprint: " -+ "rn=0p%p, w0=0p%p\n", -+ rn, -+ w0); -+ if (rn->rj_b >= 0) { -+ return 0; -+ } -+ -+ key = rd_key(rd); -+ mask = rd_mask(rd); -+ -+ if (key == NULL || mask == NULL) { -+ return 0; -+ } -+ -+ buf_len = subnettoa(key->sen_ip_src, mask->sen_ip_src, 0, buf1, sizeof(buf1)); -+ if(key->sen_sport != 0) { -+ sprintf(buf1+buf_len-1, ":%d", ntohs(key->sen_sport)); -+ } -+ -+ buf_len = subnettoa(key->sen_ip_dst, mask->sen_ip_dst, 0, buf2, sizeof(buf2)); -+ if(key->sen_dport != 0) { -+ sprintf(buf2+buf_len-1, ":%d", ntohs(key->sen_dport)); -+ } -+ -+ buf3[0]='\0'; -+ if(key->sen_proto != 0) { -+ sprintf(buf3, ":%d", key->sen_proto); -+ } -+ -+ sa_len = satot(&ro->er_said, 'x', sa, sizeof(sa)); -+ w->len += ipsec_snprintf(w->buffer + w->len, -+ w->length - w->len, -+ "%-10d " -+ "%-18s -> %-18s => %s%s\n", -+ ro->er_count, -+ buf1, -+ buf2, -+ sa_len ? sa : " (error)", -+ buf3); -+ -+ { -+ /* snprintf can only fill the last character with NUL -+ * so the maximum useful character is w->length-1. -+ * However, if w->length == 0, we cannot go back. -+ * (w->length surely cannot be negative.) -+ */ -+ int max_content = w->length > 0? w->length-1 : 0; -+ -+ if (w->len >= max_content) { -+ /* we've done all that can fit -- stop treewalking */ -+ w->len = max_content; /* truncate crap */ -+ return -ENOBUFS; -+ } else { -+ const off_t pos = w->begin + w->len; /* file position of end of what we've generated */ -+ -+ if (pos <= w->offset) { -+ /* all is before first interesting character: -+ * discard, but note where we are. 
-+ */ -+ w->len = 0; -+ w->begin = pos; -+ } -+ return 0; -+ } -+ } -+} -+#endif /* CONFIG_PROC_FS */ -+ -+int -+ipsec_rj_walker_delete(struct radij_node *rn, void *w0) -+{ -+ struct eroute *ro; -+ struct rjtentry *rd = (struct rjtentry *)rn; -+ struct radij_node *rn2; -+ int error; -+ struct sockaddr_encap *key, *mask; -+ -+ key = rd_key(rd); -+ mask = rd_mask(rd); -+ -+ if(!key || !mask) { -+ return -ENODATA; -+ } -+#ifdef CONFIG_KLIPS_DEBUG -+ if(debug_radij) { -+ char buf1[SUBNETTOA_BUF], buf2[SUBNETTOA_BUF]; -+ subnettoa(key->sen_ip_src, mask->sen_ip_src, 0, buf1, sizeof(buf1)); -+ subnettoa(key->sen_ip_dst, mask->sen_ip_dst, 0, buf2, sizeof(buf2)); -+ KLIPS_PRINT(debug_radij, -+ "klips_debug:ipsec_rj_walker_delete: " -+ "deleting: %s -> %s\n", -+ buf1, -+ buf2); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ if((error = rj_delete(key, mask, rnh, &rn2))) { -+ KLIPS_PRINT(debug_radij, -+ "klips_debug:ipsec_rj_walker_delete: " -+ "rj_delete failed with error=%d.\n", error); -+ return error; -+ } -+ -+ if(rn2 != rn) { -+ printk("klips_debug:ipsec_rj_walker_delete: " -+ "tried to delete a different node?!? This should never happen!\n"); -+ } -+ -+ ro = (struct eroute *)rn; -+ -+ if (ro->er_ident_s.data) -+ kfree(ro->er_ident_s.data); -+ if (ro->er_ident_d.data) -+ kfree(ro->er_ident_d.data); -+ -+ memset((caddr_t)rn, 0, sizeof (struct eroute)); -+ kfree(rn); -+ -+ return 0; -+} -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_rcv.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,2124 @@ -+/* -+ * receive code -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998-2003 Richard Guy Briggs. -+ * Copyright (C) 2004-2007 Michael Richardson -+ * Copyright (C) 2007-2008 Paul Wouters -+ * -+ * OCF/receive state machine written by -+ * David McCullough -+ * Copyright (C) 2004-2005 Intel Corporation. All Rights Reserved. 
-+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+ -+#define __NO_VERSION__ -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef SPINLOCK -+# ifdef SPINLOCK_23 -+# include /* *lock* */ -+# ifdef NEED_SPINLOCK_TYPES -+# include -+# endif -+# else /* SPINLOCK_23 */ -+# include /* *lock* */ -+# endif /* SPINLOCK_23 */ -+#endif /* SPINLOCK */ -+ -+#include -+ -+#include "openswan/ipsec_kern24.h" -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_sa.h" -+ -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_mast.h" -+#include "openswan/ipsec_rcv.h" -+ -+#include "openswan/ipsec_auth.h" -+ -+#include "openswan/ipsec_esp.h" -+ -+#ifdef CONFIG_KLIPS_AH -+#include "openswan/ipsec_ah.h" -+#endif /* CONFIG_KLIPS_AH */ -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+#include "openswan/ipsec_ipcomp.h" -+#endif /* CONFIG_KLIPS_COMP */ -+ -+#include -+#include -+ -+#include "openswan/ipsec_proto.h" -+#include 
"openswan/ipsec_alg.h" -+#include "openswan/ipsec_kern24.h" -+ -+#ifdef CONFIG_KLIPS_OCF -+#include "ipsec_ocf.h" -+#endif -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+#include -+#endif -+ -+/* This is a private use protocol, and AT&T should be ashamed. They should have -+ * used protocol # 59, which is "no next header" instead of 0xFE. -+ */ -+#ifndef IPPROTO_ATT_HEARTBEAT -+#define IPPROTO_ATT_HEARTBEAT 0xFE -+#endif -+ -+/* management of buffers */ -+static struct ipsec_rcv_state *ipsec_rcv_state_new (void); -+static void ipsec_rcv_state_delete (struct ipsec_rcv_state *irs); -+ -+/* -+ * Check-replay-window routine, adapted from the original -+ * by J. Hughes, from draft-ietf-ipsec-esp-des-md5-03.txt -+ * -+ * This is a routine that implements a 64 packet window. This is intend- -+ * ed on being an implementation sample. -+ */ -+ -+DEBUG_NO_STATIC int -+ipsec_checkreplaywindow(struct ipsec_sa*ipsp, __u32 seq) -+{ -+ __u32 diff; -+ -+ if (ipsp->ips_replaywin == 0) /* replay shut off */ -+ return 1; -+ if (seq == 0) -+ return 0; /* first == 0 or wrapped */ -+ -+ /* new larger sequence number */ -+ if (seq > ipsp->ips_replaywin_lastseq) { -+ return 1; /* larger is good */ -+ } -+ diff = ipsp->ips_replaywin_lastseq - seq; -+ -+ /* too old or wrapped */ /* if wrapped, kill off SA? 
*/ -+ if (diff >= ipsp->ips_replaywin) { -+ return 0; -+ } -+ /* this packet already seen */ -+ if (ipsp->ips_replaywin_bitmap & (1 << diff)) -+ return 0; -+ return 1; /* out of order but good */ -+} -+ -+DEBUG_NO_STATIC int -+ipsec_updatereplaywindow(struct ipsec_sa*ipsp, __u32 seq) -+{ -+ __u32 diff; -+ -+ if (ipsp->ips_replaywin == 0) /* replay shut off */ -+ return 1; -+ if (seq == 0) -+ return 0; /* first == 0 or wrapped */ -+ -+ /* new larger sequence number */ -+ if (seq > ipsp->ips_replaywin_lastseq) { -+ diff = seq - ipsp->ips_replaywin_lastseq; -+ -+ /* In win, set bit for this pkt */ -+ if (diff < ipsp->ips_replaywin) -+ ipsp->ips_replaywin_bitmap = -+ (ipsp->ips_replaywin_bitmap << diff) | 1; -+ else -+ /* This packet has way larger seq num */ -+ ipsp->ips_replaywin_bitmap = 1; -+ -+ if(seq - ipsp->ips_replaywin_lastseq - 1 > ipsp->ips_replaywin_maxdiff) { -+ ipsp->ips_replaywin_maxdiff = seq - ipsp->ips_replaywin_lastseq - 1; -+ } -+ ipsp->ips_replaywin_lastseq = seq; -+ return 1; /* larger is good */ -+ } -+ diff = ipsp->ips_replaywin_lastseq - seq; -+ -+ /* too old or wrapped */ /* if wrapped, kill off SA? 
*/ -+ if (diff >= ipsp->ips_replaywin) { -+/* -+ if(seq < 0.25*max && ipsp->ips_replaywin_lastseq > 0.75*max) { -+ ipsec_sa_delchain(ipsp); -+ } -+*/ -+ return 0; -+ } -+ /* this packet already seen */ -+ if (ipsp->ips_replaywin_bitmap & (1 << diff)) -+ return 0; -+ ipsp->ips_replaywin_bitmap |= (1 << diff); /* mark as seen */ -+ return 1; /* out of order but good */ -+} -+ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+struct auth_alg ipsec_rcv_md5[]={ -+ {osMD5Init, osMD5Update, osMD5Final, AHMD596_ALEN} -+}; -+ -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+struct auth_alg ipsec_rcv_sha1[]={ -+ {SHA1Init, SHA1Update, SHA1Final, AHSHA196_ALEN} -+}; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+ -+ -+static inline void ipsec_rcv_redodebug(struct ipsec_rcv_state *irs) -+{ -+ struct iphdr * ipp = irs->ipp; -+ struct in_addr ipsaddr, ipdaddr; -+ -+ ipsaddr.s_addr = ipp->saddr; -+ addrtoa(ipsaddr, 0, irs->ipsaddr_txt, sizeof(irs->ipsaddr_txt)); -+ ipdaddr.s_addr = ipp->daddr; -+ addrtoa(ipdaddr, 0, irs->ipdaddr_txt, sizeof(irs->ipdaddr_txt)); -+} -+ -+ -+#ifdef CONFIG_KLIPS_DEBUG -+DEBUG_NO_STATIC char * -+ipsec_rcv_err(int err) -+{ -+ static char tmp[32]; -+ switch ((int) err) { -+ case IPSEC_RCV_PENDING: return("IPSEC_RCV_PENDING"); -+ case IPSEC_RCV_LASTPROTO: return("IPSEC_RCV_LASTPROTO"); -+ case IPSEC_RCV_OK: return("IPSEC_RCV_OK"); -+ case IPSEC_RCV_BADPROTO: return("IPSEC_RCV_BADPROTO"); -+ case IPSEC_RCV_BADLEN: return("IPSEC_RCV_BADLEN"); -+ case IPSEC_RCV_ESP_BADALG: return("IPSEC_RCV_ESP_BADALG"); -+ case IPSEC_RCV_3DES_BADBLOCKING:return("IPSEC_RCV_3DES_BADBLOCKING"); -+ case IPSEC_RCV_ESP_DECAPFAIL: return("IPSEC_RCV_ESP_DECAPFAIL"); -+ case IPSEC_RCV_DECAPFAIL: return("IPSEC_RCV_DECAPFAIL"); -+ case IPSEC_RCV_SAIDNOTFOUND: return("IPSEC_RCV_SAIDNOTFOUND"); -+ case IPSEC_RCV_IPCOMPALONE: return("IPSEC_RCV_IPCOMPALONE"); -+ case IPSEC_RCV_IPCOMPFAILED: return("IPSEC_RCV_IPCOMPFAILED"); -+ case IPSEC_RCV_SAIDNOTLIVE: 
return("IPSEC_RCV_SAIDNOTLIVE"); -+ case IPSEC_RCV_FAILEDINBOUND: return("IPSEC_RCV_FAILEDINBOUND"); -+ case IPSEC_RCV_LIFETIMEFAILED: return("IPSEC_RCV_LIFETIMEFAILED"); -+ case IPSEC_RCV_BADAUTH: return("IPSEC_RCV_BADAUTH"); -+ case IPSEC_RCV_REPLAYFAILED: return("IPSEC_RCV_REPLAYFAILED"); -+ case IPSEC_RCV_AUTHFAILED: return("IPSEC_RCV_AUTHFAILED"); -+ case IPSEC_RCV_REPLAYROLLED: return("IPSEC_RCV_REPLAYROLLED"); -+ case IPSEC_RCV_BAD_DECRYPT: return("IPSEC_RCV_BAD_DECRYPT"); -+ case IPSEC_RCV_REALLYBAD: return("IPSEC_RCV_REALLYBAD"); -+ } -+ snprintf(tmp, sizeof(tmp), "%d", err); -+ return tmp; -+} -+#endif -+ -+/* -+ * here is a state machine to handle receiving ipsec packets. -+ * basically we keep getting re-entered until processing is -+ * complete. For the simple case we step down the states and finish. -+ * each state is ideally some logical part of the process. If a state -+ * can pend (ie., require async processing to complete), then this -+ * should be the part of last action before it returns IPSEC_RCV_PENDING -+ * -+ * Any particular action may alter the next_state in irs to move us to -+ * a state other than the preferred "next_state", but this is the -+ * exception and is highlighted when it is done. 
-+ * -+ * prototypes for state action -+ */ -+ -+static enum ipsec_rcv_value ipsec_rcv_init(struct ipsec_rcv_state *irs); -+static enum ipsec_rcv_value ipsec_rcv_decap_init(struct ipsec_rcv_state *irs); -+static enum ipsec_rcv_value ipsec_rcv_decap_lookup(struct ipsec_rcv_state *irs); -+static enum ipsec_rcv_value ipsec_rcv_auth_init(struct ipsec_rcv_state *irs); -+static enum ipsec_rcv_value ipsec_rcv_auth_decap(struct ipsec_rcv_state *irs); -+static enum ipsec_rcv_value ipsec_rcv_auth_calc(struct ipsec_rcv_state *irs); -+static enum ipsec_rcv_value ipsec_rcv_auth_chk(struct ipsec_rcv_state *irs); -+static enum ipsec_rcv_value ipsec_rcv_decrypt(struct ipsec_rcv_state *irs); -+static enum ipsec_rcv_value ipsec_rcv_decap_cont(struct ipsec_rcv_state *irs); -+static enum ipsec_rcv_value ipsec_rcv_cleanup(struct ipsec_rcv_state *irs); -+static enum ipsec_rcv_value ipsec_rcv_complete(struct ipsec_rcv_state *irs); -+ -+/* -+ * the state table and each action -+ */ -+ -+struct { -+ enum ipsec_rcv_value (*action)(struct ipsec_rcv_state *irs); -+ int next_state; -+} rcv_state_table[] = { -+ [IPSEC_RSM_INIT] = {ipsec_rcv_init, IPSEC_RSM_DECAP_INIT }, -+ [IPSEC_RSM_DECAP_INIT] = {ipsec_rcv_decap_init, IPSEC_RSM_DECAP_LOOKUP }, -+ [IPSEC_RSM_DECAP_LOOKUP] = {ipsec_rcv_decap_lookup,IPSEC_RSM_AUTH_INIT }, -+ [IPSEC_RSM_AUTH_INIT] = {ipsec_rcv_auth_init, IPSEC_RSM_AUTH_DECAP }, -+ [IPSEC_RSM_AUTH_DECAP] = {ipsec_rcv_auth_decap, IPSEC_RSM_AUTH_CALC }, -+ [IPSEC_RSM_AUTH_CALC] = {ipsec_rcv_auth_calc, IPSEC_RSM_AUTH_CHK }, -+ [IPSEC_RSM_AUTH_CHK] = {ipsec_rcv_auth_chk, IPSEC_RSM_DECRYPT }, -+ [IPSEC_RSM_DECRYPT] = {ipsec_rcv_decrypt, IPSEC_RSM_DECAP_CONT }, -+ [IPSEC_RSM_DECAP_CONT] = {ipsec_rcv_decap_cont, IPSEC_RSM_CLEANUP }, -+ [IPSEC_RSM_CLEANUP] = {ipsec_rcv_cleanup, IPSEC_RSM_COMPLETE }, -+ [IPSEC_RSM_COMPLETE] = {ipsec_rcv_complete, IPSEC_RSM_DONE }, -+ -+ [IPSEC_RSM_DONE] = {NULL, IPSEC_RSM_DONE}, -+}; -+ -+ -+ -+struct sk_buff *ipsec_rcv_unclone(struct sk_buff *skb, -+ 
struct ipsec_rcv_state *irs) -+{ -+ /* if skb was cloned (most likely due to a packet sniffer such as -+ tcpdump being momentarily attached to the interface), make -+ a copy of our own to modify */ -+ if(skb_cloned(skb)) { -+ /* include any mac header while copying.. */ -+ if(skb_headroom(skb) < irs->hard_header_len) { -+ printk(KERN_WARNING "klips_error:ipsec_rcv: " -+ "tried to skb_push hhlen=%d, %d available. This should never happen, please report.\n", -+ irs->hard_header_len, -+ skb_headroom(skb)); -+ goto rcvleave; -+ } -+ skb_push(skb, irs->hard_header_len); -+ if -+#ifdef SKB_COW_NEW -+ (skb_cow(skb, skb_headroom(skb)) != 0) -+#else /* SKB_COW_NEW */ -+ ((skb = skb_cow(skb, skb_headroom(skb))) == NULL) -+#endif /* SKB_COW_NEW */ -+ { -+ goto rcvleave; -+ } -+ if(skb->len < irs->hard_header_len) { -+ printk(KERN_WARNING "klips_error:ipsec_rcv: " -+ "tried to skb_pull hhlen=%d, %d available. This should never happen, please report.\n", -+ irs->hard_header_len, -+ skb->len); -+ goto rcvleave; -+ } -+ skb_pull(skb, irs->hard_header_len); -+ } -+ return skb; -+ -+rcvleave: -+ ipsec_kfree_skb(skb); -+ return NULL; -+} -+ -+ -+ -+ -+#if !defined(NET_26) && defined(CONFIG_IPSEC_NAT_TRAVERSAL) -+/* -+ * decapsulate a UDP encapsulated ESP packet -+ */ -+struct sk_buff *ipsec_rcv_natt_decap(struct sk_buff *skb -+ , struct ipsec_rcv_state *irs -+ , int *udp_decap_ret_p) -+{ -+ *udp_decap_ret_p = 0; -+ if (skb->sk && ip_hdr(skb) && ip_hdr(skb)->protocol==IPPROTO_UDP) { -+ /** -+ * Packet comes from udp_queue_rcv_skb so it is already defrag, -+ * checksum verified, ... (ie safe to use) -+ * -+ * If the packet is not for us, return -1 and udp_queue_rcv_skb -+ * will continue to handle it (do not kfree skb !!). 
-+ */ -+ -+#ifndef UDP_OPT_IN_SOCK -+ struct udp_opt { -+ __u32 esp_in_udp; -+ }; -+ struct udp_opt *tp = (struct udp_opt *)&(skb->sk->tp_pinfo.af_tcp); -+#else -+ struct udp_opt *tp = &(skb->sk->tp_pinfo.af_udp); -+#endif -+ -+ struct iphdr *ip = ip_hdr(skb); -+ struct udphdr *udp = (struct udphdr *)((__u32 *)ip+ip->ihl); -+ __u8 *udpdata = (__u8 *)udp + sizeof(struct udphdr); -+ __u32 *udpdata32 = (__u32 *)udpdata; -+ -+ irs->natt_sport = ntohs(udp->source); -+ irs->natt_dport = ntohs(udp->dest); -+ -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "suspected ESPinUDP packet (NAT-Traversal) [%d].\n", -+ tp->esp_in_udp); -+ KLIPS_IP_PRINT(debug_rcv, ip); -+ -+ if (udpdata < skb->tail) { -+ unsigned int len = skb->tail - udpdata; -+ if ((len==1) && (udpdata[0]==0xff)) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ /* not IPv6 compliant message */ -+ "NAT-keepalive from %d.%d.%d.%d.\n", NIPQUAD(ip->saddr)); -+ *udp_decap_ret_p = 0; -+ return NULL; -+ } -+ else if ( (tp->esp_in_udp == ESPINUDP_WITH_NON_IKE) && -+ (len > (2*sizeof(__u32) + sizeof(struct esphdr))) && -+ (udpdata32[0]==0) && (udpdata32[1]==0) ) { -+ /* ESP Packet with Non-IKE header */ -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "ESPinUDP pkt with Non-IKE - spi=0x%x\n", -+ ntohl(udpdata32[2])); -+ irs->natt_type = ESPINUDP_WITH_NON_IKE; -+ irs->natt_len = sizeof(struct udphdr)+(2*sizeof(__u32)); -+ } -+ else if ( (tp->esp_in_udp == ESPINUDP_WITH_NON_ESP) && -+ (len > sizeof(struct esphdr)) && -+ (udpdata32[0]!=0) ) { -+ /* ESP Packet without Non-ESP header */ -+ irs->natt_type = ESPINUDP_WITH_NON_ESP; -+ irs->natt_len = sizeof(struct udphdr); -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "ESPinUDP pkt without Non-ESP - spi=0x%x\n", -+ ntohl(udpdata32[0])); -+ } -+ else { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "IKE packet - not handled here\n"); -+ *udp_decap_ret_p = -1; -+ return NULL; -+ } -+ } -+ else { -+ return NULL; -+ } -+ } -+ 
return skb; -+} -+#endif -+ -+#ifdef HAVE_IPSEC_SAREF -+void ip_cmsg_recv_ipsec(struct msghdr *msg, struct sk_buff *skb) -+{ -+ struct ipsec_sa *sa1; -+ struct sec_path *sp; -+ xfrm_sec_unique_t refs[2]; -+ -+ sp = skb->sp; -+ -+ if(sp==NULL) return; -+ -+ KLIPS_PRINT(debug_rcv, "retrieving saref=%u from skb=%p\n", -+ sp->ref, skb); -+ -+ sa1 = ipsec_sa_getbyref(sp->ref); -+ if(sa1) { -+ refs[1]= sa1->ips_refhim; -+ } -+ refs[0]=sp->ref; -+ -+ put_cmsg(msg, SOL_IP, IP_IPSEC_REFINFO, -+ sizeof(xfrm_sec_unique_t)*2, &refs); -+} -+#endif -+ -+ -+void ipsec_rcv_setoutif(struct ipsec_rcv_state *irs) -+{ -+ struct sk_buff *skb = irs->skb; -+ -+ if(skb!=NULL && irs->ipsp->ips_out) { -+ if(skb->dev != irs->ipsp->ips_out) { -+ KLIPS_PRINT(debug_rcv, -+ "changing originating interface from %s to %s\n", -+ skb->dev->name, -+ irs->ipsp->ips_out->name); -+ } -+ skb->dev = irs->ipsp->ips_out; -+ -+ if(skb->dev && skb->dev->get_stats) { -+ struct net_device_stats *stats = skb->dev->get_stats(skb->dev); -+ irs->stats = stats; -+ } -+ } -+} -+ -+static enum ipsec_rcv_value -+ipsec_rcv_decap_ipip(struct ipsec_rcv_state *irs) -+{ -+ struct ipsec_sa *ipsp = NULL; -+ struct ipsec_sa* ipsnext = NULL; -+ struct iphdr *ipp; -+ struct sk_buff *skb; -+ enum ipsec_rcv_value result = IPSEC_RCV_DECAPFAIL; -+ -+ ipp = irs->ipp; -+ ipsp = irs->ipsp; -+ skb = irs->skb; -+ irs->sa_len = satot(&irs->said, 0, irs->sa, sizeof(irs->sa)); -+ if((ipp->protocol != IPPROTO_IPIP) && -+ (ipp->protocol != IPPROTO_ATT_HEARTBEAT)) { /* AT&T heartbeats to SIG/GIG */ -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "SA:%s, Hey! How did this get through? Dropped.\n", -+ irs->sa_len ? 
irs->sa : " (error)"); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ goto rcvleave; -+ } -+ if(sysctl_ipsec_inbound_policy_check) { -+ struct sockaddr_in *psin = (struct sockaddr_in*)(ipsp->ips_addr_s); -+ if((ipsnext = ipsp->ips_next)) { -+ char sa2[SATOT_BUF]; -+ size_t sa_len2; -+ sa_len2 = satot(&ipsnext->ips_said, 0, sa2, sizeof(sa2)); -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "unexpected SA:%s after IPIP SA:%s\n", -+ sa_len2 ? sa2 : " (error)", -+ irs->sa_len ? irs->sa : " (error)"); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ goto rcvleave; -+ } -+ if(ipp->saddr != psin->sin_addr.s_addr) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "SA:%s, src=%s(%08x) does match expected 0x%08x.\n", -+ irs->sa_len ? irs->sa : " (error)", -+ irs->ipsaddr_txt, -+ ipp->saddr, psin->sin_addr.s_addr); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ goto rcvleave; -+ } -+ } -+ -+ ipsec_rcv_setoutif(irs); -+ -+ if(ipp->protocol == IPPROTO_IPIP) /* added to support AT&T heartbeats to SIG/GIG */ -+ { -+ /* -+ * XXX this needs to be locked from when it was first looked -+ * up in the decapsulation loop. Perhaps it is better to put -+ * the IPIP decap inside the loop. -+ */ -+ ipsp->ips_life.ipl_bytes.ipl_count += skb->len; -+ ipsp->ips_life.ipl_bytes.ipl_last = skb->len; -+ -+ if(!ipsp->ips_life.ipl_usetime.ipl_count) { -+ ipsp->ips_life.ipl_usetime.ipl_count = jiffies / HZ; -+ } -+ ipsp->ips_life.ipl_usetime.ipl_last = jiffies / HZ; -+ ipsp->ips_life.ipl_packets.ipl_count += 1; -+ -+ if(skb->len < irs->iphlen) { -+ printk(KERN_WARNING "klips_debug:ipsec_rcv: " -+ "tried to skb_pull iphlen=%d, %d available. This should never happen, please report.\n", -+ irs->iphlen, -+ (int)(skb->len)); -+ -+ goto rcvleave; -+ } -+ -+ /* -+ * we need to pull up by size of IP header, -+ * options, but also by any UDP/ESP encap there might -+ * have been, and this deals with all cases. 
-+ */ -+ skb_pull(skb, (skb_transport_header(skb) - skb_network_header(skb))); -+ -+ /* new L3 header is where L4 payload was */ -+ skb_set_network_header(skb, ipsec_skb_offset(skb, skb_transport_header(skb))); -+ -+ /* now setup new L4 payload location */ -+ ipp = (struct iphdr *)skb_network_header(skb); -+ skb_set_transport_header(skb, ipsec_skb_offset(skb, skb_network_header(skb) + (ipp->ihl << 2))); -+ -+ -+ /* remove any saved options that we might have, -+ * since we have a new IP header. -+ */ -+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); -+ -+#if 0 -+ KLIPS_PRINT(debug_rcv, "csum: %d\n", ip_fast_csum((u8 *)ipp, ipp->ihl)); -+#endif -+ -+ /* re-do any strings for debugging */ -+ irs->ipp = ipp; -+ ipsec_rcv_redodebug(irs); -+ -+ skb->protocol = htons(ETH_P_IP); -+ skb->ip_summed = 0; -+ KLIPS_PRINT(debug_rcv & DB_RX_PKTRX, -+ "klips_debug:ipsec_rcv: " -+ "IPIP tunnel stripped.\n"); -+ KLIPS_IP_PRINT(debug_rcv & DB_RX_PKTRX, ipp); -+ } -+ -+ if(sysctl_ipsec_inbound_policy_check -+ /* -+ Note: "xor" (^) logically replaces "not equal" -+ (!=) and "bitwise or" (|) logically replaces -+ "boolean or" (||). This is done to speed up -+ execution by doing only bitwise operations and -+ no branch operations -+ */ -+ && (((ipp->saddr & ipsp->ips_mask_s.u.v4.sin_addr.s_addr) -+ ^ ipsp->ips_flow_s.u.v4.sin_addr.s_addr) -+ | ((ipp->daddr & ipsp->ips_mask_d.u.v4.sin_addr.s_addr) -+ ^ ipsp->ips_flow_d.u.v4.sin_addr.s_addr)) ) -+ { -+ char sflow_txt[SUBNETTOA_BUF], dflow_txt[SUBNETTOA_BUF]; -+ -+ subnettoa(ipsp->ips_flow_s.u.v4.sin_addr, -+ ipsp->ips_mask_s.u.v4.sin_addr, -+ 0, sflow_txt, sizeof(sflow_txt)); -+ subnettoa(ipsp->ips_flow_d.u.v4.sin_addr, -+ ipsp->ips_mask_d.u.v4.sin_addr, -+ 0, dflow_txt, sizeof(dflow_txt)); -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "SA:%s, inner tunnel policy [%s -> %s] does not agree with pkt contents [%s -> %s].\n", -+ irs->sa_len ? 
irs->sa : " (error)", -+ sflow_txt, -+ dflow_txt, -+ irs->ipsaddr_txt, -+ irs->ipdaddr_txt); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ goto rcvleave; -+ } -+#ifdef CONFIG_NETFILTER -+ skb->nfmark = (skb->nfmark & (~(IPsecSAref2NFmark(IPSEC_SA_REF_TABLE_MASK)))) -+ | IPsecSAref2NFmark(IPsecSA2SAref(ipsp)); -+ KLIPS_PRINT(debug_rcv & DB_RX_PKTRX, -+ "klips_debug:ipsec_rcv: " -+ "IPIP SA sets skb->nfmark=0x%x.\n", -+ (unsigned)skb->nfmark); -+#endif /* CONFIG_NETFILTER */ -+ -+ result = IPSEC_RCV_OK; -+ -+rcvleave: -+ return result; -+} -+ -+/* -+ * get all the initial checking and setup done. Not of this can be off -+ * loaded by any currently support hardware -+ * -+ * the following things should be setup when we exit this function. -+ * -+ * irs->stats == stats structure (or NULL) -+ * irs->ipp = IP header. -+ * irs->len = total length of packet -+ * skb->nh.iph = ipp; -+ * skb->h.raw = start of payload -+ * irs->ipsp = NULL. -+ * irs->iphlen = N/A = is recalculated. -+ * irs->ilen = 0; -+ * irs->authlen = 0; -+ * irs->authfuncs = NULL; -+ * irs->skb = the skb; -+ * -+ * proto_funcs should be from ipsec_esp.c, ipsec_ah.c or ipsec_ipcomp.c. 
-+ * -+ */ -+ -+static enum ipsec_rcv_value -+ipsec_rcv_init(struct ipsec_rcv_state *irs) -+{ -+#ifdef CONFIG_KLIPS_DEBUG -+ struct net_device *dev; -+#endif /* CONFIG_KLIPS_DEBUG */ -+ unsigned char protoc; -+ struct iphdr *ipp; -+ struct net_device_stats *stats = NULL; /* This device's statistics */ -+ int i; -+ struct sk_buff *skb; -+ -+ KLIPS_PRINT(debug_rcv, "klips_debug: %s(st=%d,nxt=%d)\n", __FUNCTION__, -+ irs->state, irs->next_state); -+ -+ if (irs == NULL) { -+ KLIPS_PRINT(debug_rcv, "klips_debug:ipsec_rcv_init: NULL irs."); -+ return IPSEC_RCV_REALLYBAD; -+ } -+ -+ skb = irs->skb; -+ if (!skb) { -+ KLIPS_PRINT(debug_rcv, "klips_debug:ipsec_rcv_init: NULL skb."); -+ return IPSEC_RCV_REALLYBAD; -+ } -+ dev = skb->dev; -+ -+ if (skb->data == NULL) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "NULL skb->data passed in, packet is bogus, dropping.\n"); -+ return IPSEC_RCV_REALLYBAD; -+ } -+ -+ /* dev->hard_header_len is unreliable and should not be used */ -+ /* klips26_rcv_encap will have already set hard_header_len for us */ -+ if (irs->hard_header_len == 0) { -+ irs->hard_header_len = skb_mac_header(skb) ? (skb_network_header(skb) - skb_mac_header(skb)) : 0; -+ if((irs->hard_header_len < 0) || (irs->hard_header_len > skb_headroom(skb))) -+ irs->hard_header_len = 0; -+ } -+ -+ skb = ipsec_rcv_unclone(skb, irs); -+ if(skb == NULL) { -+ return IPSEC_RCV_REALLYBAD; -+ } -+ -+#if IP_FRAGMENT_LINEARIZE -+ /* In Linux 2.4.4, we may have to reassemble fragments. They are -+ not assembled automatically to save TCP from having to copy -+ twice. 
-+ */ -+ if (skb_is_nonlinear(skb)) { -+#ifdef HAVE_NEW_SKB_LINEARIZE -+ if (skb_linearize_cow(skb) != 0) -+#else -+ if (skb_linearize(skb, GFP_ATOMIC) != 0) -+#endif -+ { -+ return IPSEC_RCV_REALLYBAD; -+ } -+ } -+#endif /* IP_FRAGMENT_LINEARIZE */ -+ -+ ipp = ip_hdr(skb); -+ irs->ipp = ipp; -+ -+#if defined(CONFIG_IPSEC_NAT_TRAVERSAL) && !defined(NET_26) -+ if (irs->natt_len) { -+ /** -+ * Now, we are sure packet is ESPinUDP, and we have a private -+ * copy that has been linearized, remove natt_len bytes -+ * from packet and modify protocol to ESP. -+ */ -+ if (((unsigned char *)skb->data > (unsigned char *)ip_hdr(skb)) -+ && ((unsigned char *)ip_hdr(skb) > (unsigned char *)skb->head)) -+ { -+ unsigned int _len = (unsigned char *)skb->data - -+ (unsigned char *)ip_hdr(skb); -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: adjusting skb: skb_push(%u)\n", -+ _len); -+ skb_push(skb, _len); -+ } -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "removing %d bytes from ESPinUDP packet\n" -+ , irs->natt_len); -+ -+ ipp = (struct iphdr *)skb->data; -+ irs->iphlen = ipp->ihl << 2; -+ ipp->tot_len = htons(ntohs(ipp->tot_len) - irs->natt_len); -+ if (skb->len < irs->iphlen + irs->natt_len) { -+ printk(KERN_WARNING -+ "klips_error:ipsec_rcv: " -+ "ESPinUDP packet is too small (%d < %d+%d). " -+ "This should never happen, please report.\n", -+ (int)(skb->len), irs->iphlen, irs->natt_len); -+ return IPSEC_RCV_REALLYBAD; -+ } -+ -+ /* advance payload pointer to point past the UDP header */ -+ skb->h.raw = skb->h.raw + irs->natt_len; -+ -+ /* modify protocol */ -+ ipp->protocol = IPPROTO_ESP; -+ -+ skb->sk = NULL; -+ -+ KLIPS_IP_PRINT(debug_rcv, ip_hdr(skb)); -+ } -+#endif -+ -+ if (debug_rcv) -+ ipsec_rcv_redodebug(irs); -+ -+ irs->iphlen = ipp->ihl << 2; -+ -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "<<< Info -- "); -+ KLIPS_PRINTMORE(debug_rcv && skb->dev, "skb->dev=%s ", -+ skb->dev->name ? 
skb->dev->name : "NULL"); -+ KLIPS_PRINTMORE(debug_rcv && dev, "dev=%s ", -+ dev->name ? dev->name : "NULL"); -+ KLIPS_PRINTMORE(debug_rcv, "\n"); -+ -+ KLIPS_PRINT(debug_rcv && !(skb->dev && dev && (skb->dev == dev)), -+ "klips_debug:ipsec_rcv: " -+ "Informational -- **if this happens, find out why** skb->dev:%s is not equal to dev:%s\n", -+ skb->dev ? (skb->dev->name ? skb->dev->name : "NULL") : "NULL", -+ dev ? (dev->name ? dev->name : "NULL") : "NULL"); -+ -+ protoc = ipp->protocol; -+#ifndef NET_21 -+ if((!protocol) || (protocol->protocol != protoc)) { -+ KLIPS_PRINT(debug_rcv & DB_RX_IPSA, -+ "klips_debug:ipsec_rcv: " -+ "protocol arg is NULL or unequal to the packet contents, this is odd, using value in packet.\n"); -+ } -+#endif /* !NET_21 */ -+ -+ if( (protoc != IPPROTO_AH) && -+#ifdef CONFIG_KLIPS_IPCOMP_disabled_until_we_register_IPCOMP_HANDLER -+ (protoc != IPPROTO_COMP) && -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ (protoc != IPPROTO_ESP) ) { -+ KLIPS_PRINT(debug_rcv & DB_RX_IPSA, -+ "klips_debug:ipsec_rcv: Why the hell is someone " -+ "passing me a non-ipsec protocol = %d packet? -- dropped.\n", -+ protoc); -+ return IPSEC_RCV_REALLYBAD; -+ } -+ -+ /* -+ * if there is an attached ipsec device, then use that device for -+ * stats until we know better. 
-+ */ -+ if(skb->dev) { -+ struct ipsecpriv *prvdev = NULL; -+ struct net_device *ipsecdev = NULL; -+ -+ for(i = 0; i <= ipsecdevices_max; i++) { -+ if(ipsecdevices[i] == NULL) continue; -+ prvdev = ipsecdevices[i]->priv; -+ -+ if(prvdev == NULL) continue; -+ -+ if(prvdev->dev == skb->dev) { -+ ipsecdev = ipsecdevices[i]; -+ break; -+ } -+ } -+ -+ if(ipsecdev) { -+ skb->dev = ipsecdev; -+ } else { -+ skb->dev = ipsec_mast_get_device(0); -+ -+ /* ipsec_mast_get takes the device */ -+ if(skb->dev) dev_put(skb->dev); -+ } -+ -+ if(prvdev) { -+ stats = (struct net_device_stats *) &(prvdev->mystats); -+ } -+ } -+ -+ if(stats) { -+ stats->rx_packets++; -+ } -+ -+ KLIPS_IP_PRINT(debug_rcv, ipp); -+ -+ /* set up for decap */ -+ irs->stats= stats; -+ irs->ipp = ipp; -+ irs->ipsp = NULL; -+ irs->ilen = 0; -+ irs->authlen=0; -+ irs->authfuncs=NULL; -+ irs->skb = skb; -+ return IPSEC_RCV_OK; -+} -+ -+ -+static enum ipsec_rcv_value -+ipsec_rcv_decap_init(struct ipsec_rcv_state *irs) -+{ -+ KLIPS_PRINT(debug_rcv, "klips_debug: %s(st=%d,nxt=%d)\n", __FUNCTION__, -+ irs->state, irs->next_state); -+ -+ switch (irs->ipp->protocol) { -+ case IPPROTO_ESP: -+ irs->proto_funcs = esp_xform_funcs; -+ break; -+ -+#ifdef CONFIG_KLIPS_AH -+ case IPPROTO_AH: -+ irs->proto_funcs = ah_xform_funcs; -+ break; -+#endif /* !CONFIG_KLIPS_AH */ -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+ case IPPROTO_COMP: -+ irs->proto_funcs = ipcomp_xform_funcs; -+ break; -+#endif /* !CONFIG_KLIPS_IPCOMP */ -+ -+ default: -+ if (irs->stats) { -+ irs->stats->rx_errors++; -+ } -+ return IPSEC_RCV_BADPROTO; -+ } -+ return IPSEC_RCV_OK; -+} -+ -+ -+static enum ipsec_rcv_value -+ipsec_rcv_decap_lookup(struct ipsec_rcv_state *irs) -+{ -+ struct iphdr *ipp; -+ struct sk_buff *skb; -+ -+ KLIPS_PRINT(debug_rcv, "klips_debug: %s(st=%d,nxt=%d)\n", __FUNCTION__, -+ irs->state, irs->next_state); -+ -+ irs->replay = 0; -+#ifdef CONFIG_KLIPS_ALG -+ irs->ixt_a = NULL; -+#endif /* CONFIG_KLIPS_ALG */ -+ -+ skb = irs->skb; -+ irs->len = 
skb->len; -+ ipp = irs->ipp; -+ irs->proto = ipp->protocol; -+ if (debug_rcv) -+ ipsec_rcv_redodebug(irs); -+ -+ irs->iphlen = ipp->ihl << 2; -+ ipp->check = 0; /* we know the sum is good */ -+ -+ irs->said.dst.u.v4.sin_addr.s_addr = ipp->daddr; -+ irs->said.dst.u.v4.sin_family = AF_INET; -+ -+ /* note: rcv_checks set up the said.spi value, if appropriate */ -+ if (irs->proto_funcs->rcv_checks) -+ return (*irs->proto_funcs->rcv_checks)(irs, irs->skb); -+ -+ return IPSEC_RCV_OK; -+} -+ -+ -+static enum ipsec_rcv_value -+ipsec_rcv_auth_init(struct ipsec_rcv_state *irs) -+{ -+ struct ipsec_sa *newipsp; -+ -+ KLIPS_PRINT(debug_rcv, "klips_debug: %s(st=%d,nxt=%d)\n", __FUNCTION__, -+ irs->state, irs->next_state); -+ -+ irs->said.proto = irs->proto; -+ if (debug_rcv) { -+ irs->sa_len = satot(&irs->said, 0, irs->sa, sizeof(irs->sa)); -+ if(irs->sa_len == 0) { -+ strcpy(irs->sa, "(error)"); -+ } -+ } else -+ irs->sa_len = 0; -+ -+ newipsp = ipsec_sa_getbyid(&irs->said); -+ if (newipsp == NULL) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "no ipsec_sa for SA:%s: incoming packet with no SA dropped\n", -+ irs->sa_len ? irs->sa : " (error)"); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ return IPSEC_RCV_SAIDNOTFOUND; -+ } -+ -+ /* If it is in larval state, drop the packet, we cannot process yet. 
*/ -+ if(newipsp->ips_state == K_SADB_SASTATE_LARVAL) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "ipsec_sa in larval state, cannot be used yet, dropping packet.\n"); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ ipsec_sa_put(newipsp); -+ return IPSEC_RCV_SAIDNOTLIVE; -+ } -+ -+ if(newipsp->ips_state == K_SADB_SASTATE_DEAD) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "ipsec_sa in dead state, cannot be used any more, dropping packet.\n"); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ ipsec_sa_put(newipsp); -+ return IPSEC_RCV_SAIDNOTLIVE; -+ } -+ -+ if(sysctl_ipsec_inbound_policy_check) { -+ if(irs->ipp->saddr != ((struct sockaddr_in*)(newipsp->ips_addr_s))->sin_addr.s_addr) { -+ KLIPS_ERROR(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "SA:%s, src=%s of pkt does not agree with expected SA source address policy.\n", -+ irs->sa_len ? irs->sa : " (error)", -+ irs->ipsaddr_txt); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ ipsec_sa_put(newipsp); -+ return IPSEC_RCV_FAILEDINBOUND; -+ } -+ -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "SA:%s, src=%s of pkt agrees with expected SA source address policy.\n", -+ irs->sa_len ? irs->sa : " (error)", -+ irs->ipsaddr_txt); -+ -+ /* -+ * at this point, we have looked up a new SA, and we want to -+ * make sure that if this isn't the first SA in the list, -+ * that the previous SA actually points at this one. -+ */ -+ if(irs->ipsp) { -+ if(irs->ipsp->ips_next != newipsp) { -+ KLIPS_ERROR(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "unexpected SA:%s: does not agree with ips->inext policy, dropped\n", -+ irs->sa_len ? irs->sa : " (error)"); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ ipsec_sa_put(newipsp); -+ return IPSEC_RCV_FAILEDINBOUND; -+ } -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "SA:%s grouping from previous SA is OK.\n", -+ irs->sa_len ? 
irs->sa : " (error)"); -+ } else { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "SA:%s First SA in group.\n", -+ irs->sa_len ? irs->sa : " (error)"); -+ } -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ if (irs->proto == IPPROTO_ESP) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "natt_type=%u tdbp->ips_natt_type=%u : %s\n", -+ irs->natt_type, newipsp->ips_natt_type, -+ (irs->natt_type==newipsp->ips_natt_type)?"ok":"bad"); -+ if (irs->natt_type != newipsp->ips_natt_type) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "SA:%s does not agree with expected NAT-T policy.\n", -+ irs->sa_len ? irs->sa : " (error)"); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ ipsec_sa_put(newipsp); -+ return IPSEC_RCV_FAILEDINBOUND; -+ } -+ } -+#endif -+ } -+ -+ irs->ipsp=newipsp; -+ -+ return IPSEC_RCV_OK; -+} -+ -+static enum ipsec_rcv_value -+ipsec_rcv_auth_decap(struct ipsec_rcv_state *irs) -+{ -+ ipsec_rcv_setoutif(irs); -+ -+ irs->proto_funcs = irs->ipsp->ips_xformfuncs; -+ if (irs->proto_funcs == NULL) -+ return IPSEC_RCV_BADPROTO; -+ -+ if (irs->proto_funcs->protocol != irs->ipp->protocol) { -+ if(irs->proto_funcs->protocol == IPPROTO_COMP) { -+ /* looks like an IPCOMP that we can skip */ -+ struct ipsec_sa *newipsp = NULL; -+ -+ newipsp = irs->ipsp->ips_next; -+ if(newipsp) { -+ ipsec_sa_get(newipsp); -+ } -+ if(irs->lastipsp) { -+ ipsec_sa_put(irs->lastipsp); -+ } -+ irs->lastipsp = irs->ipsp; -+ irs->ipsp=newipsp; -+ -+ /* come back into here with the next transform */ -+ irs->next_state = IPSEC_RSM_AUTH_DECAP; -+ return IPSEC_RCV_OK; -+ } -+ -+ if(irs->stats) { -+ irs->stats->rx_errors++; -+ } -+ return IPSEC_RCV_FAILEDINBOUND; -+ } -+ -+ if (debug_rcv) -+ ipsec_rcv_redodebug(irs); -+ -+ /* now check the lifetimes */ -+ if(ipsec_lifetime_check(&irs->ipsp->ips_life.ipl_bytes, "bytes", -+ irs->sa, ipsec_life_countbased, ipsec_incoming, -+ irs->ipsp) == ipsec_life_harddied || -+ ipsec_lifetime_check(&irs->ipsp->ips_life.ipl_addtime, 
"addtime", -+ irs->sa, ipsec_life_timebased, ipsec_incoming, -+ irs->ipsp) == ipsec_life_harddied || -+ ipsec_lifetime_check(&irs->ipsp->ips_life.ipl_addtime, "usetime", -+ irs->sa, ipsec_life_timebased, ipsec_incoming, -+ irs->ipsp) == ipsec_life_harddied || -+ ipsec_lifetime_check(&irs->ipsp->ips_life.ipl_packets, "packets", -+ irs->sa, ipsec_life_countbased, ipsec_incoming, -+ irs->ipsp) == ipsec_life_harddied) { -+ -+ /* -+ * disconnect SA from the hash table, so it can not be -+ * found again. -+ */ -+ ipsec_sa_rm(irs->ipsp); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv_decap_once: " -+ "decap (%d) failed lifetime check\n", -+ irs->proto); -+ -+ return IPSEC_RCV_LIFETIMEFAILED; -+ } -+ -+#if 0 -+ /* -+ * This is removed for some reasons: -+ * 1) it needs to happen *after* authentication. -+ * 2) do we really care, if it authenticates, if it came -+ * from the wrong location? -+ * 3) the NAT_KA messages in IKE will also get to pluto -+ * and it will figure out that stuff has moved. -+ * 4) the 2.6 udp-esp encap function does not pass us -+ * the originating port number, and I can't tell -+ * if skb->sk is guaranteed to be valid here. -+ * 2005-04-16: mcr@xelerance.com -+ */ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ /* -+ * -+ * XXX we should ONLY update pluto if the SA passes all checks, -+ * which we clearly do not now. 
-+ */ -+ if ((irs->natt_type) && -+ ( (irs->ipp->saddr != (((struct sockaddr_in*)(newipsp->ips_addr_s))->sin_addr.s_addr)) || -+ (irs->natt_sport != newipsp->ips_natt_sport) -+ )) { -+ struct sockaddr sipaddr; -+ struct sockaddr_in *psin = (struct sockaddr_in*)(newipsp->ips_addr_s); -+ -+ /** Advertise NAT-T addr change to pluto **/ -+ sipaddr.sa_family = AF_INET; -+ ((struct sockaddr_in*)&sipaddr)->sin_addr.s_addr = irs->ipp->saddr; -+ ((struct sockaddr_in*)&sipaddr)->sin_port = htons(irs->natt_sport); -+ pfkey_nat_t_new_mapping(newipsp, &sipaddr, irs->natt_sport); -+ -+ /** -+ * Then allow or block packet depending on -+ * sysctl_ipsec_inbound_policy_check. -+ * -+ * In all cases, pluto will update SA if new mapping is -+ * accepted. -+ */ -+ if (sysctl_ipsec_inbound_policy_check) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "SA:%s, src=%s:%u of pkt does not agree with expected " -+ "SA source address [%08x:%u] (notifying pluto of change).\n", -+ irs->sa_len ? irs->sa : " (error)", -+ irs->ipsaddr_txt, irs->natt_sport, -+ psin->sin_addr.s_addr, -+ newipsp->ips_natt_sport); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ ipsec_sa_put(newipsp); -+ return IPSEC_RCV_FAILEDINBOUND; -+ } -+ } -+#endif -+#endif -+ -+ irs->authfuncs=NULL; -+ -+ /* authenticate, if required */ -+#ifdef CONFIG_KLIPS_OCF -+ if (irs->ipsp->ocf_in_use) { -+ irs->authlen = AHHMAC_HASHLEN; -+ irs->authfuncs = NULL; -+ irs->ictx = NULL; -+ irs->octx = NULL; -+ irs->ictx_len = 0; -+ irs->octx_len = 0; -+ } else -+#endif /* CONFIG_KLIPS_OCF */ -+#ifdef CONFIG_KLIPS_ALG -+ /* authenticate, if required */ -+ if ((irs->ixt_a=irs->ipsp->ips_alg_auth)) { -+ irs->authlen = AHHMAC_HASHLEN; -+ irs->authfuncs = NULL; -+ irs->ictx = NULL; -+ irs->octx = NULL; -+ irs->ictx_len = 0; -+ irs->octx_len = 0; -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "authalg=%d authlen=%d\n", -+ irs->ipsp->ips_authalg, -+ irs->authlen); -+ } else -+#endif /* CONFIG_KLIPS_ALG */ -+ 
switch(irs->ipsp->ips_authalg) { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ case AH_MD5: -+ irs->authlen = AHHMAC_HASHLEN; -+ irs->authfuncs = ipsec_rcv_md5; -+ irs->ictx = (void *)&((struct md5_ctx*)(irs->ipsp->ips_key_a))->ictx; -+ irs->octx = (void *)&((struct md5_ctx*)(irs->ipsp->ips_key_a))->octx; -+ irs->ictx_len = sizeof(((struct md5_ctx*)(irs->ipsp->ips_key_a))->ictx); -+ irs->octx_len = sizeof(((struct md5_ctx*)(irs->ipsp->ips_key_a))->octx); -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ case AH_SHA: -+ irs->authlen = AHHMAC_HASHLEN; -+ irs->authfuncs = ipsec_rcv_sha1; -+ irs->ictx = (void *)&((struct sha1_ctx*)(irs->ipsp->ips_key_a))->ictx; -+ irs->octx = (void *)&((struct sha1_ctx*)(irs->ipsp->ips_key_a))->octx; -+ irs->ictx_len = sizeof(((struct sha1_ctx*)(irs->ipsp->ips_key_a))->ictx); -+ irs->octx_len = sizeof(((struct sha1_ctx*)(irs->ipsp->ips_key_a))->octx); -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ case AH_NONE: -+ irs->authlen = 0; -+ irs->authfuncs = NULL; -+ irs->ictx = NULL; -+ irs->octx = NULL; -+ irs->ictx_len = 0; -+ irs->octx_len = 0; -+ break; -+ default: -+ irs->ipsp->ips_errs.ips_alg_errs += 1; -+ if(irs->stats) { -+ irs->stats->rx_errors++; -+ } -+ return IPSEC_RCV_BADAUTH; -+ } -+ -+ /* ilen counts number of bytes in ESP portion */ -+ irs->ilen = ((irs->skb->data + irs->skb->len) - skb_transport_header(irs->skb)) - irs->authlen; -+ if(irs->ilen <= 0) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "runt %s packet with no data, dropping.\n", -+ (irs->proto == IPPROTO_ESP ? 
"esp" : "ah")); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ return IPSEC_RCV_BADLEN; -+ } -+ -+ if(irs->authfuncs || -+#ifdef CONFIG_KLIPS_OCF -+ irs->ipsp->ocf_in_use || -+#endif -+#ifdef CONFIG_KLIPS_ALG -+ irs->ixt_a || -+#endif -+ 0) { -+ if(irs->proto_funcs->rcv_setup_auth) -+ return (*irs->proto_funcs->rcv_setup_auth)(irs, irs->skb, -+ &irs->replay, &irs->authenticator); -+ } -+ return IPSEC_RCV_OK; -+} -+ -+ -+static enum ipsec_rcv_value -+ipsec_rcv_auth_calc(struct ipsec_rcv_state *irs) -+{ -+ KLIPS_PRINT(debug_rcv, "klips_debug: %s(st=%d,nxt=%d)\n", __FUNCTION__, -+ irs->state, irs->next_state); -+ -+ if(irs->authfuncs || -+#ifdef CONFIG_KLIPS_OCF -+ irs->ipsp->ocf_in_use || -+#endif -+#ifdef CONFIG_KLIPS_ALG -+ irs->ixt_a || -+#endif -+ 0) { -+ if(!irs->authenticator) { -+ irs->ipsp->ips_errs.ips_auth_errs += 1; -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ return IPSEC_RCV_BADAUTH; -+ } -+ -+ if(!ipsec_checkreplaywindow(irs->ipsp, irs->replay)) { -+ irs->ipsp->ips_errs.ips_replaywin_errs += 1; -+ KLIPS_PRINT(debug_rcv & DB_RX_REPLAY, -+ "klips_debug:ipsec_rcv: " -+ "duplicate frame from %s, packet dropped\n", -+ irs->ipsaddr_txt); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ return IPSEC_RCV_REPLAYFAILED; -+ } -+ -+ /* -+ * verify authenticator -+ */ -+ -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "encalg = %d, authalg = %d.\n", -+ irs->ipsp->ips_encalg, -+ irs->ipsp->ips_authalg); -+ -+ /* calculate authenticator */ -+ if(irs->proto_funcs->rcv_calc_auth == NULL) { -+ return IPSEC_RCV_BADAUTH; -+ } -+ return (*irs->proto_funcs->rcv_calc_auth)(irs, irs->skb); -+ } -+ return IPSEC_RCV_OK; -+} -+ -+static enum ipsec_rcv_value -+ipsec_rcv_auth_chk(struct ipsec_rcv_state *irs) -+{ -+ KLIPS_PRINT(debug_rcv, "klips_debug: %s(st=%d,nxt=%d) - %s\n", __FUNCTION__, -+ irs->state, irs->next_state, -+ irs->auth_checked ? 
"already checked" : "will check"); -+ -+ if (irs->auth_checked) -+ return IPSEC_RCV_OK; -+ -+ if(irs->authfuncs || -+#ifdef CONFIG_KLIPS_OCF -+ irs->ipsp->ocf_in_use || -+#endif -+#ifdef CONFIG_KLIPS_ALG -+ irs->ixt_a || -+#endif -+ 0) { -+ if (memcmp(irs->hash, irs->authenticator, irs->authlen)) { -+ irs->ipsp->ips_errs.ips_auth_errs += 1; -+ KLIPS_ERROR(debug_rcv & DB_RX_INAU, -+ "klips_debug:ipsec_rcv: " -+ "auth failed on incoming packet from %s (replay=%d): calculated hash=%08x%08x%08x received hash=%08x%08x%08x, dropped\n", -+ irs->ipsaddr_txt, -+ irs->replay, -+ ntohl(*(__u32*)&irs->hash[0]), -+ ntohl(*(__u32*)&irs->hash[4]), -+ ntohl(*(__u32*)&irs->hash[8]), -+ ntohl(*(__u32*)irs->authenticator), -+ ntohl(*((__u32*)irs->authenticator + 1)), -+ ntohl(*((__u32*)irs->authenticator + 2))); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ return IPSEC_RCV_AUTHFAILED; -+ } else { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "authentication successful.\n"); -+ } -+ -+ /* Crypto hygiene: clear memory used to calculate autheticator. -+ * The length varies with the algorithm. 
-+ */ -+ memset(irs->hash, 0, irs->authlen); -+ -+ /* If the sequence number == 0, expire SA, it had rolled */ -+ if(irs->ipsp->ips_replaywin && !irs->replay /* !irs->ipsp->ips_replaywin_lastseq */) { -+ /* we need to remove it from the sadb hash, so that it can't be found again */ -+ ipsec_sa_rm(irs->ipsp); -+ -+ KLIPS_ERROR(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "replay window counter rolled, expiring SA.\n"); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ return IPSEC_RCV_REPLAYROLLED; -+ } -+ -+ /* now update the replay counter */ -+ if (!ipsec_updatereplaywindow(irs->ipsp, irs->replay)) { -+ irs->ipsp->ips_errs.ips_replaywin_errs += 1; -+ KLIPS_ERROR(debug_rcv & DB_RX_REPLAY, -+ "klips_debug:ipsec_rcv: " -+ "duplicate frame from %s, packet dropped\n", -+ irs->ipsaddr_txt); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ return IPSEC_RCV_REPLAYROLLED; -+ } -+ irs->auth_checked = 1; -+ } -+ return IPSEC_RCV_OK; -+} -+ -+static enum ipsec_rcv_value -+ipsec_rcv_decrypt(struct ipsec_rcv_state *irs) -+{ -+ KLIPS_PRINT(debug_rcv, "klips_debug: %s(st=%d,nxt=%d)\n", __FUNCTION__, -+ irs->state, irs->next_state); -+ -+ if (irs->proto_funcs->rcv_decrypt) { -+ return (*irs->proto_funcs->rcv_decrypt)(irs); -+ } -+ return IPSEC_RCV_OK; -+} -+ -+/* -+ * here we decide if there is more decapsulating required and -+ * change the next state appropriately -+ */ -+static enum ipsec_rcv_value -+ipsec_rcv_decap_cont(struct ipsec_rcv_state *irs) -+{ -+ struct sk_buff *skb; -+ struct iphdr *ipp; -+ struct ipsec_sa *ipsnext = NULL; /* next SA towards inside of packet */ -+ enum ipsec_rcv_value rv; -+ -+ KLIPS_PRINT(debug_rcv, "klips_debug: %s(st=%d,nxt=%d)\n", __FUNCTION__, -+ irs->state, irs->next_state); -+ -+ /* -+ * if we haven't checked the auth values yet, do it now. -+ * This is needed for the case where drivers do crypt+hash -+ * in one operation. 
-+ */ -+ rv = ipsec_rcv_auth_chk(irs); -+ if (rv != IPSEC_RCV_OK) -+ return rv; -+ -+ /* -+ * Adjust pointers after decrypt -+ */ -+ skb = irs->skb; -+ irs->len = skb->len; -+ ipp = irs->ipp = ip_hdr(skb); -+ irs->iphlen = ipp->ihl<<2; -+ skb_set_transport_header(skb, ipsec_skb_offset(skb, skb_network_header(skb) + irs->iphlen)); -+ -+ /* zero any options that there might be */ -+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); -+ -+ if (debug_rcv) -+ ipsec_rcv_redodebug(irs); -+ -+ /* -+ * Discard the original ESP/AH header -+ */ -+ ipp->protocol = irs->next_header; -+ -+ ipp->check = 0; /* NOTE: this will be included in checksum */ -+ ipp->check = ip_fast_csum((unsigned char *)ip_hdr(skb), irs->iphlen >> 2); -+ -+ KLIPS_PRINT(debug_rcv & DB_RX_PKTRX, -+ "klips_debug:ipsec_rcv: " -+ "after <%s%s%s>, SA:%s:\n", -+ IPS_XFORM_NAME(irs->ipsp), -+ irs->sa_len ? irs->sa : " (error)"); -+ KLIPS_IP_PRINT(debug_rcv & DB_RX_PKTRX, ipp); -+ -+ skb->protocol = htons(ETH_P_IP); -+ skb->ip_summed = 0; -+ -+ ipsnext = irs->ipsp->ips_next; -+ if(sysctl_ipsec_inbound_policy_check) { -+ if(ipsnext) { -+ if( -+ ipp->protocol != IPPROTO_AH -+ && ipp->protocol != IPPROTO_ESP -+#ifdef CONFIG_KLIPS_IPCOMP -+ && ipp->protocol != IPPROTO_COMP -+ && (ipsnext->ips_said.proto != IPPROTO_COMP -+ || ipsnext->ips_next) -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ && ipp->protocol != IPPROTO_IPIP -+ && ipp->protocol != IPPROTO_ATT_HEARTBEAT /* heartbeats to AT&T SIG/GIG */ -+ ) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "packet with incomplete policy dropped, last successful SA:%s.\n", -+ irs->sa_len ? irs->sa : " (error)"); -+ if(irs->stats) { -+ irs->stats->rx_dropped++; -+ } -+ return IPSEC_RCV_FAILEDINBOUND; -+ } -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "SA:%s, Another IPSEC header to process.\n", -+ irs->sa_len ? irs->sa : " (error)"); -+ } else { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "No ips_inext from this SA:%s.\n", -+ irs->sa_len ? 
irs->sa : " (error)"); -+ } -+ } -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+ /* update ipcomp ratio counters, even if no ipcomp packet is present */ -+ if (ipsnext -+ && ipsnext->ips_said.proto == IPPROTO_COMP -+ && ipp->protocol != IPPROTO_COMP) { -+ ipsnext->ips_comp_ratio_cbytes += ntohs(ipp->tot_len); -+ ipsnext->ips_comp_ratio_dbytes += ntohs(ipp->tot_len); -+ } -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ -+ irs->ipsp->ips_life.ipl_bytes.ipl_count += irs->len; -+ irs->ipsp->ips_life.ipl_bytes.ipl_last = irs->len; -+ -+ if(!irs->ipsp->ips_life.ipl_usetime.ipl_count) { -+ irs->ipsp->ips_life.ipl_usetime.ipl_count = jiffies / HZ; -+ } -+ irs->ipsp->ips_life.ipl_usetime.ipl_last = jiffies / HZ; -+ irs->ipsp->ips_life.ipl_packets.ipl_count += 1; -+ -+#ifdef CONFIG_NETFILTER -+ if(irs->proto == IPPROTO_ESP || irs->proto == IPPROTO_AH) { -+ skb->nfmark = (skb->nfmark & (~(IPsecSAref2NFmark(IPSEC_SA_REF_MASK)))) -+ | IPsecSAref2NFmark(IPsecSA2SAref(irs->ipsp)); -+ KLIPS_PRINT(debug_rcv & DB_RX_PKTRX, -+ "klips_debug:ipsec_rcv: " -+ "%s SA sets skb->nfmark=0x%x.\n", -+ irs->proto == IPPROTO_ESP ? 
"ESP" : "AH", -+ (unsigned)skb->nfmark); -+ } -+#endif /* CONFIG_NETFILTER */ -+ -+ /* okay, acted on this SA, so free any previous SA, and record a new one */ -+ if(irs->ipsp) { -+ struct ipsec_sa *newipsp = NULL; -+ newipsp = irs->ipsp->ips_next; -+ if(newipsp) { -+ ipsec_sa_get(newipsp); -+ } -+ if(irs->lastipsp) { -+ ipsec_sa_put(irs->lastipsp); -+ } -+ irs->lastipsp = irs->ipsp; -+ irs->ipsp=newipsp; -+ } -+ -+ /* do we need to do more decapsulation */ -+ if ((irs->ipp->protocol == IPPROTO_ESP || -+ irs->ipp->protocol == IPPROTO_AH || -+#ifdef CONFIG_KLIPS_IPCOMP -+ irs->ipp->protocol == IPPROTO_COMP || -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ 0) && irs->ipsp != NULL) { -+ irs->next_state = IPSEC_RSM_AUTH_DECAP; -+ } -+ return IPSEC_RCV_OK; -+} -+ -+ -+static enum ipsec_rcv_value -+ipsec_rcv_cleanup(struct ipsec_rcv_state *irs) -+{ -+ struct sk_buff *skb; -+ struct iphdr *ipp; -+ struct ipsec_sa *ipsp = NULL; -+ -+ KLIPS_PRINT(debug_rcv, "klips_debug: %s(st=%d,nxt=%d)\n", __FUNCTION__, -+ irs->state, irs->next_state); -+ -+ /* set up for decap loop */ -+ ipp = irs->ipp; -+ ipsp = irs->ipsp; -+ skb = irs->skb; -+ -+ /* if there is an IPCOMP, but we don't have an IPPROTO_COMP, -+ * then we can just skip it -+ */ -+#ifdef CONFIG_KLIPS_IPCOMP -+ if(irs->ipsp && irs->ipsp->ips_said.proto == IPPROTO_COMP) { -+ struct ipsec_sa *newipsp = NULL; -+ newipsp = irs->ipsp->ips_next; -+ if(newipsp) { -+ ipsec_sa_get(newipsp); -+ } -+ if(irs->lastipsp) { -+ ipsec_sa_put(irs->lastipsp); -+ } -+ irs->lastipsp = irs->ipsp; -+ irs->ipsp=newipsp; -+ } -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ if ((irs->natt_type) && (ipp->protocol != IPPROTO_IPIP)) { -+ /** -+ * NAT-Traversal and Transport Mode: -+ * we need to correct TCP/UDP checksum -+ * -+ * If we've got NAT-OA, we can fix checksum without recalculation. -+ */ -+ __u32 natt_oa = ipsp->ips_natt_oa ? 
-+ ((struct sockaddr_in*)(ipsp->ips_natt_oa))->sin_addr.s_addr : 0; -+ -+ if(natt_oa != 0) { -+ /* reset source address to what it was before NAT */ -+ ipp->saddr = natt_oa; -+ ipp->check = 0; -+ ipp->check = ip_fast_csum((unsigned char *)ipp, ipp->ihl); -+ KLIPS_PRINT(debug_rcv, "csum: %04x\n", ipp->check); -+ } -+ } -+#endif -+ -+ /* -+ * the SA is still locked from the loop -+ */ -+ if(irs->ipsp && irs->ipsp->ips_xformfuncs->protocol == IPPROTO_IPIP) { -+ enum ipsec_rcv_value decap_stat; -+ -+ decap_stat = ipsec_rcv_decap_ipip(irs); -+ if(decap_stat != IPSEC_RCV_OK) { -+ return decap_stat; -+ } -+ } -+ -+ if(irs->stats) { -+ irs->stats->rx_bytes += skb->len; -+ } -+ -+ /* -+ * if we are supposed to return the packet directly to the transport -+ * layer, then dump it out correctly. -+ */ -+ if(unlikely(!irs->lastipsp)) -+ printk("%s,%d: %s lastipsp should never be NULL\n", -+ __FILE__, __LINE__, __FUNCTION__); -+ if(irs->lastipsp->ips_transport_direct) { -+ KLIPS_PRINT(debug_rcv, "receiving packet as transport direct\n"); -+ skb->ip_summed=CHECKSUM_UNNECESSARY; -+ /* STUFF */ -+ } -+ -+#ifdef HAVE_IPSEC_SAREF -+ if(skb->sp) { -+ secpath_put(skb->sp); -+ } -+ skb->sp = secpath_dup(NULL); -+ skb->sp->ref = irs->lastipsp->ips_ref; -+#endif -+ -+ /* release the dst that was attached, since we have likely -+ * changed the actual destination of the packet. 
-+ */ -+ if(skb->dst) { -+ dst_release(skb->dst); -+ skb->dst = NULL; -+ } -+ skb->pkt_type = PACKET_HOST; -+ if(irs->hard_header_len && -+ (skb_mac_header(skb) != (skb_network_header(skb) - irs->hard_header_len)) && -+ (irs->hard_header_len <= skb_headroom(skb))) { -+ /* copy back original MAC header */ -+ memmove(skb_network_header(skb) - irs->hard_header_len, -+ skb_mac_header(skb), irs->hard_header_len); -+ skb_set_mac_header(skb, ipsec_skb_offset(skb, skb_network_header(skb) - irs->hard_header_len)); -+ } -+ return IPSEC_RCV_OK; -+} -+ -+ -+static enum ipsec_rcv_value -+ipsec_rcv_complete(struct ipsec_rcv_state *irs) -+{ -+ KLIPS_PRINT(debug_rcv, "klips_debug: %s(st=%d,nxt=%d)\n", __FUNCTION__, -+ irs->state, irs->next_state); -+ -+ /* -+ * make sure that data now starts at IP header, since we are going -+ * to pass this back to ip_input (aka netif_rx). Rules for what the -+ * pointers wind up a different for 2.6 vs 2.4, so we just fudge it here. -+ */ -+#ifdef NET_26 -+ irs->skb->data = skb_push(irs->skb, skb_transport_header(irs->skb) - skb_network_header(irs->skb)); -+#else -+ irs->skb->data = skb_network_header(irs->skb); -+ { -+ struct iphdr *iph = ip_hdr(irs->skb); -+ int len = ntohs(iph->tot_len); -+ irs->skb->len = len; -+ } -+#endif -+ -+ ipsec_nf_reset(irs->skb); -+ -+ KLIPS_PRINT(debug_rcv & DB_RX_PKTRX, -+ "klips_debug:ipsec_rcv: " -+ "netif_rx(%s) called.\n", irs->skb->dev->name); -+ netif_rx(irs->skb); -+ irs->skb = NULL; -+ return IPSEC_RCV_OK; -+} -+ -+ -+ -+/* -+ * ipsec_rsm is responsible for walking us through the state machine -+ * it is the only entry point into the receive processing and does -+ * appropriate checks and state changes for us. 
-+ */ -+ -+void -+ipsec_rsm(struct ipsec_rcv_state *irs) -+{ -+ if (irs == NULL) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rsm: " -+ "irs == NULL.\n"); -+ return; -+ } -+ -+ /* -+ * make sure nothing is removed from underneath us -+ */ -+ spin_lock_bh(&tdb_lock); -+ -+ /* -+ * if we have a valid said, then we must check it here to ensure it -+ * hasn't gone away while we were waiting for a task to complete -+ */ -+ -+ if (irs->said.proto && ipsec_sa_getbyid(&irs->said) == NULL) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "no ipsec_sa for SA:%s: incoming packet with no SA dropped\n", -+ irs->sa_len ? irs->sa : " (error)"); -+ if (irs->stats) -+ irs->stats->rx_dropped++; -+ -+ /* drop through and cleanup */ -+ irs->state = IPSEC_RSM_DONE; -+ } -+ -+ while (irs->state != IPSEC_RSM_DONE) { -+ int rc; -+ -+ irs->next_state = rcv_state_table[irs->state].next_state; -+ -+ rc = rcv_state_table[irs->state].action(irs); -+ -+ if (rc == IPSEC_RCV_OK) { -+ /* some functions change the next state, see the state table */ -+ irs->state = irs->next_state; -+ } else if (rc == IPSEC_RCV_PENDING) { -+ /* -+ * things are on hold until we return here in the next/new state -+ * we check our SA is valid when we return -+ */ -+ spin_unlock_bh(&tdb_lock); -+ return; -+ } else { -+ /* bad result, force state change to done */ -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rsm: " -+ "processing completed due to %s.\n", -+ ipsec_rcv_err(rc)); -+ irs->state = IPSEC_RSM_DONE; -+ } -+ } -+ -+ /* -+ * all done with anything needing locks -+ */ -+ spin_unlock_bh(&tdb_lock); -+ -+ if (irs->lastipsp) { -+ ipsec_sa_put(irs->lastipsp); -+ irs->lastipsp=NULL; -+ } -+ -+ if (irs->ipsp) { -+ ipsec_sa_put(irs->ipsp); -+ irs->ipsp=NULL; -+ } -+ -+ if (irs->skb) { -+ ipsec_kfree_skb(irs->skb); -+ irs->skb = NULL; -+ } -+ -+ ipsec_rcv_state_delete(irs); -+ -+ KLIPS_DEC_USE; /* once less packet using the driver */ -+} -+ -+ -+int -+ipsec_rcv(struct sk_buff *skb -+#ifndef 
PROTO_HANDLER_SINGLE_PARM -+ unsigned short xlen -+#endif /* PROTO_HANDLER_SINGLE_PARM */ -+ ) -+{ -+ struct ipsec_rcv_state *irs = NULL; -+ -+ /* Don't unlink in the middle of a turnaround */ -+ KLIPS_INC_USE; -+ -+ if (skb == NULL) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "NULL skb passed in.\n"); -+ goto rcvleave; -+ } -+ -+ if (skb->data == NULL) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "NULL skb->data passed in, packet is bogus, dropping.\n"); -+ goto rcvleave; -+ } -+ -+ irs = ipsec_rcv_state_new(); -+ if (unlikely (! irs)) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "failled to allocate a rcv state object\n"); -+ goto rcvleave; -+ } -+ -+#if defined(CONFIG_IPSEC_NAT_TRAVERSAL) && !defined(NET_26) -+ { -+ /* NET_26 NAT-T is handled by seperate function */ -+ struct sk_buff *nskb; -+ int udp_decap_ret = 0; -+ -+ nskb = ipsec_rcv_natt_decap(skb, irs, &udp_decap_ret); -+ if(nskb == NULL) { -+ /* return with non-zero, because UDP.c code -+ * need to send it upstream. -+ */ -+ if(skb && udp_decap_ret == 0) { -+ ipsec_kfree_skb(skb); -+ } -+ if (irs) { -+ ipsec_rcv_state_delete(irs); -+ } -+ KLIPS_DEC_USE; -+ return(udp_decap_ret); -+ } -+ skb = nskb; -+ } -+#endif /* NAT_T */ -+ -+ irs->skb = skb; -+ -+ /* -+ * we hand off real early to the state machine because we just cannot -+ * know how much processing it is off-loading -+ */ -+ ipsec_rsm(irs); -+ -+ return(0); -+ -+ rcvleave: -+ if (irs) { -+ ipsec_rcv_state_delete(irs); -+ } -+ if (skb) { -+ ipsec_kfree_skb(skb); -+ } -+ KLIPS_DEC_USE; -+ return(0); -+} -+ -+ -+#ifdef NET_26 -+/* -+ * this entry point is not a protocol entry point, so the entry -+ * is a bit different. -+ * -+ * skb->iph->tot_len has been byte-swapped, and reduced by the size of -+ * the IP header (and options). -+ * -+ * skb->h.raw has been pulled up the ESP header. 
-+ * -+ * skb->iph->protocol = 50 IPPROTO_ESP; -+ * -+ */ -+int klips26_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) -+{ -+ return klips26_rcv_encap(skb, udp_sk(sk)->encap_type); -+} -+ -+int klips26_rcv_encap(struct sk_buff *skb, __u16 encap_type) -+{ -+ struct ipsec_rcv_state *irs = NULL; -+ -+ /* Don't unlink in the middle of a turnaround */ -+ KLIPS_INC_USE; -+ -+ if (skb == NULL) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "NULL skb passed in.\n"); -+ goto rcvleave; -+ } -+ -+ if (skb->data == NULL) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "NULL skb->data passed in, packet is bogus, dropping.\n"); -+ goto rcvleave; -+ } -+ -+ irs = ipsec_rcv_state_new(); -+ if (unlikely (! irs)) { -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv: " -+ "failled to allocate a rcv state object\n"); -+ goto rcvleave; -+ } -+ -+ /* XXX fudge it so that all nat-t stuff comes from ipsec0 */ -+ /* eventually, the SA itself will determine which device -+ * it comes from -+ */ -+ { -+ skb->dev = ipsec_get_device(0); -+ } -+ irs->hard_header_len = skb->dev->hard_header_len; -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ switch(encap_type) { -+ case UDP_ENCAP_ESPINUDP: -+ irs->natt_type = ESPINUDP_WITH_NON_ESP; -+ break; -+ -+ case UDP_ENCAP_ESPINUDP_NON_IKE: -+ irs->natt_type = ESPINUDP_WITH_NON_IKE; -+ break; -+ -+ default: -+ if(printk_ratelimit()) { -+ printk(KERN_INFO "KLIPS received unknown UDP-ESP encap type %u\n", -+ encap_type); -+ } -+ goto rcvleave; -+ } -+#endif /* NAT_T */ -+ -+ irs->skb = skb; -+ -+ /* -+ * we hand off real early to the state machine because we just cannot -+ * know how much processing it is off-loading -+ */ -+ ipsec_rsm(irs); -+ -+ return(0); -+ -+ rcvleave: -+ if (irs) { -+ ipsec_rcv_state_delete(irs); -+ } -+ if (skb) { -+ ipsec_kfree_skb(skb); -+ } -+ KLIPS_DEC_USE; -+ return(0); -+} -+#endif -+ -+// ------------------------------------------------------------------------ -+// this handles creating and managing 
state for recv path -+ -+static spinlock_t irs_cache_lock = SPIN_LOCK_UNLOCKED; -+#ifdef HAVE_KMEM_CACHE_MACRO -+static struct kmem_cache *irs_cache_allocator = NULL; -+#else -+static kmem_cache_t *irs_cache_allocator = NULL; -+#endif -+static unsigned irs_cache_allocated_count = 0; -+ -+int ipsec_irs_cache_allocated_max = 1000; -+module_param(ipsec_irs_cache_allocated_max,int,0644); -+MODULE_PARM_DESC(ipsec_irs_cache_allocated_max, -+ "Maximum outstanding receive packets (before they are dropped)"); -+ -+int -+ipsec_rcv_state_cache_init (void) -+{ -+ if (irs_cache_allocator) -+ return -EBUSY; -+ -+ spin_lock_init(&irs_cache_lock); -+#ifdef HAVE_KMEM_CACHE_MACRO -+ /* irs_cache_allocator = KMEM_CACHE(ipsec_irs,0); */ -+ irs_cache_allocator = kmem_cache_create ("ipsec_irs", -+ sizeof (struct ipsec_rcv_state), 0, -+ 0, NULL); -+#else -+ irs_cache_allocator = kmem_cache_create ("ipsec_irs", -+ sizeof (struct ipsec_rcv_state), 0, -+ 0, NULL, NULL); -+#endif -+ if (! irs_cache_allocator) -+ return -ENOMEM; -+ -+ return 0; -+} -+ -+void -+ipsec_rcv_state_cache_cleanup (void) -+{ -+ if (unlikely (irs_cache_allocated_count)) -+ printk ("ipsec: deleting ipsec_irs kmem_cache while in use\n"); -+ -+ if (irs_cache_allocator) { -+ kmem_cache_destroy (irs_cache_allocator); -+ irs_cache_allocator = NULL; -+ } -+ irs_cache_allocated_count = 0; -+} -+ -+static struct ipsec_rcv_state * -+ipsec_rcv_state_new (void) -+{ -+ struct ipsec_rcv_state *irs; -+ -+ spin_lock_bh (&irs_cache_lock); -+ -+ if (irs_cache_allocated_count >= ipsec_irs_cache_allocated_max) { -+ spin_unlock_bh (&irs_cache_lock); -+ KLIPS_PRINT(debug_rcv, -+ "klips_debug:ipsec_rcv_state_new: " -+ "exceeded maximum outstanding RX packet cnt %d\n", -+ irs_cache_allocated_count); -+ return NULL; -+ } -+ -+ irs = kmem_cache_alloc (irs_cache_allocator, GFP_ATOMIC); -+ -+ if (likely (irs != NULL)) -+ irs_cache_allocated_count++; -+ -+ spin_unlock_bh (&irs_cache_lock); -+ -+ if (unlikely (NULL == irs)) -+ goto bail; -+ -+ // 
initialize the object -+#if 1 -+ memset((caddr_t)irs, 0, sizeof(*irs)); -+#else -+ /* optimised to only clear the essentials */ -+ irs->state = 0; -+ irs->next_state = 0; -+ irs->auth_checked = 0; -+ irs->stats = NULL; -+ irs->authenticator = NULL; -+ irs->said.proto = 0; -+ -+ irs->hard_header_len = 0; -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ irs->natt_type = 0; -+ irs->natt_len = 0; -+#endif -+ -+ irs->lastipsp = NULL; -+#endif -+ -+bail: -+ return irs; -+} -+ -+static void -+ipsec_rcv_state_delete (struct ipsec_rcv_state *irs) -+{ -+ if (unlikely (! irs)) -+ return; -+ -+ spin_lock_bh (&irs_cache_lock); -+ -+ irs_cache_allocated_count--; -+ kmem_cache_free (irs_cache_allocator, irs); -+ -+ spin_unlock_bh (&irs_cache_lock); -+} -+ -+/* -+ * -+ * Local Variables: -+ * c-set-style: linux -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_sa.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,1575 @@ -+/* -+ * Common routines for IPsec SA maintenance routines. -+ * -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001, 2002 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ * RCSID $Id: ipsec_sa.c,v 1.31 2005/11/11 04:38:56 paul Exp $ -+ * -+ * This is the file formerly known as "ipsec_xform.h" -+ * -+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* vmalloc() */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+#include -+#ifdef SPINLOCK -+#ifdef SPINLOCK_23 -+#include /* *lock* */ -+#else /* SPINLOCK_23 */ -+#include /* *lock* */ -+#endif /* SPINLOCK_23 */ -+#endif /* SPINLOCK */ -+ -+#include -+ -+#include "openswan/radij.h" -+ -+#include "openswan/ipsec_stats.h" -+#include "openswan/ipsec_life.h" -+#include "openswan/ipsec_sa.h" -+#include "openswan/ipsec_xform.h" -+ -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_ipe4.h" -+#include "openswan/ipsec_ah.h" -+#include "openswan/ipsec_esp.h" -+#include "openswan/ipsec_ipip.h" -+#ifdef CONFIG_KLIPS_IPCOMP -+#include "openswan/ipsec_ipcomp.h" -+#endif /* CONFIG_KLIPS_COMP */ -+ -+#include -+#include -+ -+#include "openswan/ipsec_proto.h" -+#include "openswan/ipsec_alg.h" -+ -+#include "ipsec_ocf.h" -+ -+ -+#define SENDERR(_x) do { error = -(_x); goto errlab; } while (0) -+ -+struct ipsec_sa *ipsec_sadb_hash[SADB_HASHMOD]; -+#ifdef SPINLOCK -+spinlock_t tdb_lock = SPIN_LOCK_UNLOCKED; -+#else /* SPINLOCK */ -+spinlock_t tdb_lock; -+#endif /* SPINLOCK */ -+ -+struct ipsec_sadb ipsec_sadb; -+ -+/* the sub table must be narrower (or equal) in bits than the variable type -+ in the main table to count the number of unused entries in it. 
*/ -+typedef struct { -+ int testSizeOf_refSubTable : -+ ((sizeof(IPsecRefTableUnusedCount) * 8) < IPSEC_SA_REF_SUBTABLE_IDX_WIDTH ? -1 : 1); -+} dummy; -+ -+ -+/* The field where the saref will be hosted in the skb must be wide enough to -+ accomodate the information it needs to store. */ -+typedef struct { -+ int testSizeOf_refField : -+ (IPSEC_SA_REF_HOST_FIELD_WIDTH < IPSEC_SA_REF_TABLE_IDX_WIDTH ? -1 : 1 ); -+} dummy2; -+ -+ -+#define IPS_HASH(said) (((said)->spi + (said)->dst.u.v4.sin_addr.s_addr + (said)->proto) % SADB_HASHMOD) -+ -+int -+ipsec_SAref_recycle(void) -+{ -+ int table, i; -+ int error = 0; -+ int entry; -+ int addone; -+ -+ ipsec_sadb.refFreeListHead = IPSEC_SAREF_NULL; -+ ipsec_sadb.refFreeListTail = IPSEC_SAREF_NULL; -+ -+ if(ipsec_sadb.refFreeListCont == IPSEC_SA_REF_MAINTABLE_NUM_ENTRIES * IPSEC_SA_REF_SUBTABLE_NUM_ENTRIES) { -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_SAref_recycle: " -+ "end of table reached, continuing at start..\n"); -+ ipsec_sadb.refFreeListCont = IPSEC_SAREF_FIRST; -+ } -+ -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_SAref_recycle: " -+ "recycling, continuing from SAref=%d (0p%p), table=%d, entry=%d.\n", -+ ipsec_sadb.refFreeListCont, -+ (ipsec_sadb.refTable[IPsecSAref2table(ipsec_sadb.refFreeListCont)] != NULL) ? 
IPsecSAref2SA(ipsec_sadb.refFreeListCont) : NULL, -+ IPsecSAref2table(ipsec_sadb.refFreeListCont), -+ IPsecSAref2entry(ipsec_sadb.refFreeListCont)); -+ -+ /* add one additional table entry */ -+ addone = 0; -+ -+ ipsec_sadb.refFreeListHead = IPSEC_SAREF_FIRST; -+ for(i = 0; i < IPSEC_SA_REF_FREELIST_NUM_ENTRIES; i++) { -+ table = IPsecSAref2table(ipsec_sadb.refFreeListCont); -+ if(addone == 0 && ipsec_sadb.refTable[table] == NULL) { -+ addone = 1; -+ error = ipsec_SArefSubTable_alloc(table); -+ if(error) { -+ return error; -+ } -+ } -+ for(entry = IPsecSAref2entry(ipsec_sadb.refFreeListCont); -+ entry < IPSEC_SA_REF_SUBTABLE_NUM_ENTRIES; -+ entry++) { -+ if(ipsec_sadb.refTable[table]->entry[entry] == NULL) { -+ ipsec_sadb.refFreeList[++ipsec_sadb.refFreeListTail] = IPsecSArefBuild(table, entry); -+ if(ipsec_sadb.refFreeListTail == (IPSEC_SA_REF_FREELIST_NUM_ENTRIES - 1)) { -+ ipsec_sadb.refFreeListHead = IPSEC_SAREF_FIRST; -+ ipsec_sadb.refFreeListCont = ipsec_sadb.refFreeList[ipsec_sadb.refFreeListTail] + 1; -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_SAref_recycle: " -+ "SArefFreeList refilled.\n"); -+ return 0; -+ } -+ } -+ } -+ ipsec_sadb.refFreeListCont++; -+ ipsec_sadb.refFreeListTail=i; -+ } -+ -+ if(ipsec_sadb.refFreeListTail == IPSEC_SAREF_NULL) { -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_SAref_recycle: " -+ "out of room in the SArefTable.\n"); -+ -+ return(-ENOSPC); -+ } -+ -+ ipsec_sadb.refFreeListHead = IPSEC_SAREF_FIRST; -+ ipsec_sadb.refFreeListCont = ipsec_sadb.refFreeList[ipsec_sadb.refFreeListTail] + 1; -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_SAref_recycle: " -+ "SArefFreeList partly refilled to %d of %d.\n", -+ ipsec_sadb.refFreeListTail, -+ IPSEC_SA_REF_FREELIST_NUM_ENTRIES); -+ return 0; -+} -+ -+int -+ipsec_SArefSubTable_alloc(unsigned table) -+{ -+ unsigned entry; -+ struct IPsecSArefSubTable* SArefsub; -+ -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_SArefSubTable_alloc: " -+ "allocating %lu bytes for table %u of 
%u.\n", -+ (unsigned long) (IPSEC_SA_REF_SUBTABLE_NUM_ENTRIES * sizeof(struct ipsec_sa *)), -+ table, -+ IPSEC_SA_REF_MAINTABLE_NUM_ENTRIES); -+ -+ /* allocate another sub-table */ -+ SArefsub = vmalloc(IPSEC_SA_REF_SUBTABLE_NUM_ENTRIES * sizeof(struct ipsec_sa *)); -+ if(SArefsub == NULL) { -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_SArefSubTable_alloc: " -+ "error allocating memory for table %u of %u!\n", -+ table, -+ IPSEC_SA_REF_MAINTABLE_NUM_ENTRIES); -+ return -ENOMEM; -+ } -+ -+ /* add this sub-table to the main table */ -+ ipsec_sadb.refTable[table] = SArefsub; -+ -+ /* initialise each element to NULL */ -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_SArefSubTable_alloc: " -+ "initialising %u elements (2 ^ %u) of table %u.\n", -+ IPSEC_SA_REF_SUBTABLE_NUM_ENTRIES, -+ IPSEC_SA_REF_SUBTABLE_IDX_WIDTH, -+ table); -+ for(entry = 0; entry < IPSEC_SA_REF_SUBTABLE_NUM_ENTRIES; entry++) { -+ SArefsub->entry[entry] = NULL; -+ } -+ -+ return 0; -+} -+ -+int -+ipsec_saref_verify_slot(IPsecSAref_t ref) -+{ -+ int ref_table=IPsecSAref2table(ref); -+ -+ if(ipsec_sadb.refTable[ref_table] == NULL) { -+ return ipsec_SArefSubTable_alloc(ref_table); -+ } -+ return 0; -+} -+ -+int -+ipsec_saref_freelist_init(void) -+{ -+ int i; -+ -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_saref_freelist_init: " -+ "initialising %u elements of FreeList.\n", -+ IPSEC_SA_REF_FREELIST_NUM_ENTRIES); -+ -+ for(i = 0; i < IPSEC_SA_REF_FREELIST_NUM_ENTRIES; i++) { -+ ipsec_sadb.refFreeList[i] = IPSEC_SAREF_NULL; -+ } -+ ipsec_sadb.refFreeListHead = IPSEC_SAREF_NULL; -+ ipsec_sadb.refFreeListCont = IPSEC_SAREF_FIRST; -+ ipsec_sadb.refFreeListTail = IPSEC_SAREF_NULL; -+ -+ return 0; -+} -+ -+int -+ipsec_sadb_init(void) -+{ -+ int error = 0; -+ unsigned i; -+ -+ for(i = 0; i < SADB_HASHMOD; i++) { -+ ipsec_sadb_hash[i] = NULL; -+ } -+ /* parts above are for the old style SADB hash table */ -+ -+ -+ /* initialise SA reference table */ -+ -+ /* initialise the main table */ -+ 
KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sadb_init: " -+ "initialising main table of size %u (2 ^ %u).\n", -+ IPSEC_SA_REF_MAINTABLE_NUM_ENTRIES, -+ IPSEC_SA_REF_MAINTABLE_IDX_WIDTH); -+ { -+ unsigned table; -+ for(table = 0; table < IPSEC_SA_REF_MAINTABLE_NUM_ENTRIES; table++) { -+ ipsec_sadb.refTable[table] = NULL; -+ } -+ } -+ -+ /* allocate the first sub-table */ -+ error = ipsec_SArefSubTable_alloc(0); -+ if(error) { -+ return error; -+ } -+ -+ error = ipsec_saref_freelist_init(); -+ return error; -+} -+ -+IPsecSAref_t -+ipsec_SAref_alloc(int*error) /* pass in error var by pointer */ -+{ -+ IPsecSAref_t SAref; -+ -+ KLIPS_PRINT(debug_xform, -+ "ipsec_SAref_alloc: " -+ "SAref requested... head=%d, cont=%d, tail=%d, listsize=%d.\n", -+ ipsec_sadb.refFreeListHead, -+ ipsec_sadb.refFreeListCont, -+ ipsec_sadb.refFreeListTail, -+ IPSEC_SA_REF_FREELIST_NUM_ENTRIES); -+ -+ if(ipsec_sadb.refFreeListHead == IPSEC_SAREF_NULL) { -+ KLIPS_PRINT(debug_xform, -+ "ipsec_SAref_alloc: " -+ "FreeList empty, recycling...\n"); -+ *error = ipsec_SAref_recycle(); -+ if(*error) { -+ return IPSEC_SAREF_NULL; -+ } -+ } -+ -+ SAref = ipsec_sadb.refFreeList[ipsec_sadb.refFreeListHead]; -+ if(SAref == IPSEC_SAREF_NULL) { -+ KLIPS_ERROR(debug_xform, -+ "ipsec_SAref_alloc: " -+ "unexpected error, refFreeListHead = %d points to invalid entry.\n", -+ ipsec_sadb.refFreeListHead); -+ *error = -ESPIPE; -+ return IPSEC_SAREF_NULL; -+ } -+ -+ KLIPS_PRINT(debug_xform, -+ "ipsec_SAref_alloc: " -+ "allocating SAref=%d, table=%u, entry=%u of %u.\n", -+ SAref, -+ IPsecSAref2table(SAref), -+ IPsecSAref2entry(SAref), -+ IPSEC_SA_REF_MAINTABLE_NUM_ENTRIES * IPSEC_SA_REF_SUBTABLE_NUM_ENTRIES); -+ -+ ipsec_sadb.refFreeList[ipsec_sadb.refFreeListHead] = IPSEC_SAREF_NULL; -+ ipsec_sadb.refFreeListHead++; -+ if(ipsec_sadb.refFreeListHead > ipsec_sadb.refFreeListTail) { -+ KLIPS_PRINT(debug_xform, -+ "ipsec_SAref_alloc: " -+ "last FreeList entry allocated, resetting list head to empty.\n"); -+ 
ipsec_sadb.refFreeListHead = IPSEC_SAREF_NULL; -+ } -+ -+ return SAref; -+} -+ -+int -+ipsec_sa_print(struct ipsec_sa *ips) -+{ -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ -+ printk(KERN_INFO "klips_debug: SA:"); -+ if(ips == NULL) { -+ printk("NULL\n"); -+ return -ENOENT; -+ } -+ printk(" ref=%d", ips->ips_ref); -+ printk(" refcount=%d", atomic_read(&ips->ips_refcount)); -+ if(ips->ips_hnext != NULL) { -+ printk(" hnext=0p%p", ips->ips_hnext); -+ } -+ if(ips->ips_next != NULL) { -+ printk(" next=0p%p", ips->ips_next); -+ } -+ sa_len = satot(&ips->ips_said, 0, sa, sizeof(sa)); -+ printk(" said=%s", sa_len ? sa : " (error)"); -+ if(ips->ips_seq) { -+ printk(" seq=%u", ips->ips_seq); -+ } -+ if(ips->ips_pid) { -+ printk(" pid=%u", ips->ips_pid); -+ } -+ if(ips->ips_authalg) { -+ printk(" authalg=%u", ips->ips_authalg); -+ } -+ if(ips->ips_encalg) { -+ printk(" encalg=%u", ips->ips_encalg); -+ } -+ printk(" XFORM=%s%s%s", IPS_XFORM_NAME(ips)); -+ if(ips->ips_replaywin) { -+ printk(" ooowin=%u", ips->ips_replaywin); -+ } -+ if(ips->ips_flags) { -+ printk(" flags=%u", ips->ips_flags); -+ } -+ if(ips->ips_addr_s) { -+ char buf[SUBNETTOA_BUF]; -+ addrtoa(((struct sockaddr_in*)(ips->ips_addr_s))->sin_addr, -+ 0, buf, sizeof(buf)); -+ printk(" src=%s", buf); -+ } -+ if(ips->ips_addr_d) { -+ char buf[SUBNETTOA_BUF]; -+ addrtoa(((struct sockaddr_in*)(ips->ips_addr_s))->sin_addr, -+ 0, buf, sizeof(buf)); -+ printk(" dst=%s", buf); -+ } -+ if(ips->ips_addr_p) { -+ char buf[SUBNETTOA_BUF]; -+ addrtoa(((struct sockaddr_in*)(ips->ips_addr_p))->sin_addr, -+ 0, buf, sizeof(buf)); -+ printk(" proxy=%s", buf); -+ } -+ if(ips->ips_key_bits_a) { -+ printk(" key_bits_a=%u", ips->ips_key_bits_a); -+ } -+ if(ips->ips_key_bits_e) { -+ printk(" key_bits_e=%u", ips->ips_key_bits_e); -+ } -+ -+ printk("\n"); -+ return 0; -+} -+ -+struct ipsec_sa* -+ipsec_sa_alloc(int*error) /* pass in error var by pointer */ -+{ -+ struct ipsec_sa* ips; -+ -+ if((ips = kmalloc(sizeof(*ips), GFP_ATOMIC) ) == 
NULL) { -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_alloc: " -+ "memory allocation error\n"); -+ *error = -ENOMEM; -+ return NULL; -+ } -+ memset((caddr_t)ips, 0, sizeof(*ips)); -+ -+ /* return with at least counter = 1 */ -+ ipsec_sa_get(ips); -+ -+ *error = 0; -+ return(ips); -+} -+ -+void -+ipsec_sa_untern(struct ipsec_sa *ips) -+{ -+ IPsecSAref_t ref = ips->ips_ref; -+ int error; -+ -+ /* verify that we are removing correct item! */ -+ error = ipsec_saref_verify_slot(ref); -+ if(error) { -+ return; -+ } -+ -+ if(IPsecSAref2SA(ref) == ips) { -+ IPsecSAref2SA(ref) = NULL; -+ ipsec_sa_put(ips); -+ } else { -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_untern: " -+ "ref=%u -> %p but untern'ing %p\n", ref, -+ IPsecSAref2SA(ref), ips); -+ } -+ -+} -+ -+int -+ipsec_sa_intern(struct ipsec_sa *ips) -+{ -+ int error; -+ IPsecSAref_t ref = ips->ips_ref; -+ -+ if(ref == IPSEC_SAREF_NULL) { -+ ref = ipsec_SAref_alloc(&error); /* pass in error return by pointer */ -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_intern: " -+ "allocated ref=%u for sa %p\n", ref, ips); -+ -+ if(ref == IPSEC_SAREF_NULL) { -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_intern: " -+ "SAref allocation error\n"); -+ return error; -+ } -+ -+ ips->ips_ref = ref; -+ } -+ -+ error = ipsec_saref_verify_slot(ref); -+ if(error) { -+ return error; -+ } -+ -+ ipsec_sa_get(ips); -+ /* -+ * if there is an existing SA at this reference, then free it -+ * note, that nsa might == ips!. That's okay, we just incremented -+ * the reference count above. 
-+ */ -+ { -+ struct ipsec_sa *nsa = IPsecSAref2SA(ref); -+ if(nsa) { -+ ipsec_sa_put(nsa); -+ } -+ } -+ -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_alloc: " -+ "SAref[%d]=%p\n", -+ ips->ips_ref, ips); -+ IPsecSAref2SA(ips->ips_ref) = ips; -+ -+ /* return OK */ -+ return 0; -+} -+ -+ -+struct ipsec_sa * -+ipsec_sa_getbyid(ip_said *said) -+{ -+ int hashval; -+ struct ipsec_sa *ips; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ -+ if(said == NULL) { -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_getbyid: " -+ "null pointer passed in!\n"); -+ return NULL; -+ } -+ -+ hashval = IPS_HASH(said); -+ -+ sa_len = KLIPS_SATOT(debug_xform, said, 0, sa, sizeof(sa)); -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_getbyid: " -+ "linked entry in ipsec_sa table for hash=%d of SA:%s requested.\n", -+ hashval, -+ sa_len ? sa : " (error)"); -+ -+ if((ips = ipsec_sadb_hash[hashval]) == NULL) { -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_getbyid: " -+ "no entries in ipsec_sa table for hash=%d of SA:%s.\n", -+ hashval, -+ sa_len ? sa : " (error)"); -+ return NULL; -+ } -+ -+ for (; ips; ips = ips->ips_hnext) { -+ if ((ips->ips_said.spi == said->spi) && -+ (ips->ips_said.dst.u.v4.sin_addr.s_addr == said->dst.u.v4.sin_addr.s_addr) && -+ (ips->ips_said.proto == said->proto)) { -+ ipsec_sa_get(ips); -+ return ips; -+ } -+ } -+ -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_getbyid: " -+ "no entry in linked list for hash=%d of SA:%s.\n", -+ hashval, -+ sa_len ? 
sa : " (error)"); -+ return NULL; -+} -+ -+struct ipsec_sa * -+ipsec_sa_getbyref(IPsecSAref_t ref) -+{ -+ struct ipsec_sa *ips; -+ struct IPsecSArefSubTable *st = ipsec_sadb.refTable[IPsecSAref2table(ref)]; -+ -+ if(st == NULL) { -+ return NULL; -+ } -+ -+ ips = st->entry[IPsecSAref2entry(ref)]; -+ if(ips) { -+ ipsec_sa_get(ips); -+ } -+ return ips; -+} -+ -+ -+void -+__ipsec_sa_put(struct ipsec_sa *ips, const char *func, int line) -+{ -+ if(ips == NULL) { -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_put: " -+ "null pointer passed in!\n"); -+ return; -+ } -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if(debug_xform) { -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ sa_len = satot(&ips->ips_said, 0, sa, sizeof(sa)); -+ -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_put: " -+ "ipsec_sa %p SA:%s, ref:%d reference count (%d--) decremented by %s:%d.\n", -+ ips, -+ sa_len ? sa : " (error)", -+ ips->ips_ref, -+ atomic_read(&ips->ips_refcount), -+ func, line); -+ } -+#endif -+ -+ if(atomic_dec_and_test(&ips->ips_refcount)) { -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_put: freeing %p\n", -+ ips); -+ /* it was zero */ -+ ipsec_sa_wipe(ips); -+ } -+ -+ return; -+} -+ -+struct ipsec_sa * -+__ipsec_sa_get(struct ipsec_sa *ips, const char *func, int line) -+{ -+ if (ips == NULL) -+ return NULL; -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if(debug_xform) { -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ sa_len = satot(&ips->ips_said, 0, sa, sizeof(sa)); -+ -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_get: " -+ "ipsec_sa %p SA:%s, ref:%d reference count (%d++) incremented by %s:%d.\n", -+ ips, -+ sa_len ? sa : " (error)", -+ ips->ips_ref, -+ atomic_read(&ips->ips_refcount), -+ func, line); -+ } -+#endif -+ -+ atomic_inc(&ips->ips_refcount); -+ -+#if 0 -+ /* -+ * DAVIDM: if we include this code it means the SA is freed immediately -+ * on creation and then reused ! Not sure why it is here. 
-+ */ -+ -+ if(atomic_dec_and_test(&ips->ips_refcount)) { -+ KLIPS_PRINT(debug_xform, -+ "ipsec_sa_get: freeing %p\n", -+ ips); -+ /* it was zero */ -+ ipsec_sa_wipe(ips); -+ } -+#endif -+ -+ return ips; -+} -+ -+/* -+ The ipsec_sa table better *NOT* be locked before it is handed in, or SMP locks will happen -+*/ -+int -+ipsec_sa_add(struct ipsec_sa *ips) -+{ -+ int error = 0; -+ unsigned int hashval; -+ -+ ips = ipsec_sa_get(ips); -+ -+ if(ips == NULL) { -+ KLIPS_PRINT(debug_xform, -+ "klips_error:ipsec_sa_add: " -+ "null pointer passed in!\n"); -+ return -ENODATA; -+ } -+ hashval = IPS_HASH(&ips->ips_said); -+ -+ ipsec_sa_get(ips); -+ spin_lock_bh(&tdb_lock); -+ -+ ips->ips_hnext = ipsec_sadb_hash[hashval]; -+ ipsec_sadb_hash[hashval] = ips; -+ -+ spin_unlock_bh(&tdb_lock); -+ -+ return error; -+} -+ -+/* -+ * remove it from the hash chain, decrementing hash count -+ */ -+void ipsec_sa_rm(struct ipsec_sa *ips) -+{ -+ unsigned int hashval; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ -+ -+ if(ips == NULL) return; -+ -+ -+ hashval = IPS_HASH(&ips->ips_said); -+ -+ sa_len = KLIPS_SATOT(debug_xform, &ips->ips_said, 0, sa, sizeof(sa)); -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sa_del: " -+ "unhashing SA:%s (ref=%u), hashval=%d.\n", -+ sa_len ? 
sa : " (error)", -+ ips->ips_ref, -+ hashval); -+ -+ if(ipsec_sadb_hash[hashval] == NULL) { -+ return; -+ } -+ -+ if (ips == ipsec_sadb_hash[hashval]) { -+ ipsec_sadb_hash[hashval] = ipsec_sadb_hash[hashval]->ips_hnext; -+ ips->ips_hnext = NULL; -+ ipsec_sa_put(ips); -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sa_del: " -+ "successfully unhashed first ipsec_sa in chain.\n"); -+ return; -+ } else { -+ struct ipsec_sa *ipstp; -+ -+ for (ipstp = ipsec_sadb_hash[hashval]; -+ ipstp; -+ ipstp = ipstp->ips_hnext) { -+ if (ipstp->ips_hnext == ips) { -+ ipstp->ips_hnext = ips->ips_hnext; -+ ips->ips_hnext = NULL; -+ ipsec_sa_put(ips); -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sa_del: " -+ "successfully unhashed link in ipsec_sa chain.\n"); -+ return; -+ } -+ } -+ } -+} -+ -+ -+#if 0 -+/* -+ * The ipsec_sa table better be locked before it is handed in, -+ * or races might happen. -+ * -+ * this routine assumes the SA has a refcount==0, and we free it. -+ * we also assume that the pointers are already cleaned up. -+ */ -+static int -+ipsec_sa_del(struct ipsec_sa *ips) -+{ -+ unsigned int hashval; -+ struct ipsec_sa *ipstp; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ -+ if(ips == NULL) { -+ KLIPS_ERROR(debug_xform, -+ "klips_error:ipsec_sa_del: " -+ "null pointer passed in!\n"); -+ return -ENODATA; -+ } -+ -+ if(ips->ips_next) { -+ struct ipsec_sa *in = ips->ips_next; -+ -+ ips->ips_next=NULL; -+ ipsec_sa_put(in); -+ } -+ -+ sa_len = KLIPS_SATOT(debug_xform, &ips->ips_said, 0, sa, sizeof(sa)); -+ hashval = IPS_HASH(&ips->ips_said); -+ -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sa_del: " -+ "deleting SA:%s (ref=%u), hashval=%d.\n", -+ sa_len ? sa : " (error)", -+ ips->ips_ref, -+ hashval); -+ -+ if(ipsec_sadb_hash[hashval] == NULL) { -+ /* if this is NULL, then we can be sure that the SA was never -+ * added to the SADB, so we just free it. 
-+ */ -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sa_del: " -+ "no entries in ipsec_sa table for hash=%d (ref=%u) of SA:%s.\n", -+ hashval, -+ ips->ips_ref, -+ sa_len ? sa : " (error)"); -+ return -ENOENT; -+ } -+ -+ if (ips == ipsec_sadb_hash[hashval]) { -+ ipsec_sadb_hash[hashval] = ipsec_sadb_hash[hashval]->ips_hnext; -+ ips->ips_hnext = NULL; -+ -+ ipsec_sa_put(ips); -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sa_del: " -+ "successfully deleted first ipsec_sa in chain.\n"); -+ return 0; -+ } else { -+ for (ipstp = ipsec_sadb_hash[hashval]; -+ ipstp; -+ ipstp = ipstp->ips_hnext) { -+ if (ipstp->ips_hnext == ips) { -+ ipstp->ips_hnext = ips->ips_hnext; -+ ips->ips_hnext = NULL; -+ ipsec_sa_put(ips); -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sa_del: " -+ "successfully deleted link in ipsec_sa chain.\n"); -+ return 0; -+ } -+ } -+ } -+ -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sa_del: " -+ "no entries in linked list for hash=%d of SA:%s.\n", -+ hashval, -+ sa_len ? 
sa : " (error)"); -+ return -ENOENT; -+} -+#endif -+ -+int -+ipsec_sadb_cleanup(__u8 proto) -+{ -+ unsigned i; -+ int error = 0; -+ struct ipsec_sa *ips; -+ //struct ipsec_sa *ipsnext, **ipsprev; -+ //char sa[SATOT_BUF]; -+ //size_t sa_len; -+ -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sadb_cleanup: " -+ "cleaning up proto=%d.\n", -+ proto); -+ -+ spin_lock_bh(&tdb_lock); -+ -+ for (i = 0; i < SADB_HASHMOD; i++) { -+ ips = ipsec_sadb_hash[i]; -+ -+ while(ips) { -+ ipsec_sadb_hash[i]=ips->ips_hnext; -+ ips->ips_hnext=NULL; -+ ipsec_sa_put(ips); -+ -+ ips = ipsec_sadb_hash[i]; -+ } -+ } -+ -+//errlab: -+ -+ spin_unlock_bh(&tdb_lock); -+ -+ -+#if IPSEC_SA_REF_CODE -+ /* clean up SA reference table */ -+ -+ /* go through the ref table and clean out all the SAs */ -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sadb_cleanup: " -+ "removing SAref entries and tables."); -+ { -+ unsigned table, entry; -+ for(table = 0; table < IPSEC_SA_REF_MAINTABLE_NUM_ENTRIES; table++) { -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sadb_cleanup: " -+ "cleaning SAref table=%u.\n", -+ table); -+ if(ipsec_sadb.refTable[table] == NULL) { -+ printk("\n"); -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sadb_cleanup: " -+ "cleaned %u used refTables.\n", -+ table); -+ break; -+ } -+ for(entry = 0; entry < IPSEC_SA_REF_SUBTABLE_NUM_ENTRIES; entry++) { -+ if(ipsec_sadb.refTable[table]->entry[entry] != NULL) { -+ struct ipsec_sa *sa1 = ipsec_sadb.refTable[table]->entry[entry]; -+ ipsec_sa_put(sa1); -+ ipsec_sadb.refTable[table]->entry[entry] = NULL; -+ } -+ } -+ } -+ } -+#endif /* IPSEC_SA_REF_CODE */ -+ -+ return(error); -+} -+ -+int -+ipsec_sadb_free(void) -+{ -+ int error = 0; -+ -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sadb_free: " -+ "freeing SArefTable memory.\n"); -+ -+ /* clean up SA reference table */ -+ -+ /* go through the ref table and clean out all the SAs if any are -+ left and free table memory */ -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sadb_free: 
" -+ "removing SAref entries and tables.\n"); -+ { -+ unsigned table, entry; -+ for(table = 0; table < IPSEC_SA_REF_MAINTABLE_NUM_ENTRIES; table++) { -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sadb_free: " -+ "removing SAref table=%u.\n", -+ table); -+ if(ipsec_sadb.refTable[table] == NULL) { -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sadb_free: " -+ "removed %u used refTables.\n", -+ table); -+ break; -+ } -+ for(entry = 0; entry < IPSEC_SA_REF_SUBTABLE_NUM_ENTRIES; entry++) { -+ if(ipsec_sadb.refTable[table]->entry[entry] != NULL) { -+ struct ipsec_sa *sa1 = ipsec_sadb.refTable[table]->entry[entry]; -+ -+ BUG_ON(atomic_read(&sa1->ips_refcount) == 1); -+ ipsec_sa_put(sa1); -+ ipsec_sadb.refTable[table]->entry[entry] = NULL; -+ } -+ } -+ vfree(ipsec_sadb.refTable[table]); -+ ipsec_sadb.refTable[table] = NULL; -+ } -+ } -+ -+ return(error); -+} -+ -+int -+ipsec_sa_wipe(struct ipsec_sa *ips) -+{ -+ if(ips == NULL) { -+ return -ENODATA; -+ } -+ -+#if IPSEC_SA_REF_CODE -+ /* remove me from the SArefTable */ -+ if(debug_xform) -+ { -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ struct IPsecSArefSubTable *subtable = NULL; -+ -+ if(IPsecSAref2table(IPsecSA2SAref(ips))ips_said, 0, sa, sizeof(sa)); -+ KLIPS_PRINT(debug_xform, -+ "klips_debug:ipsec_sa_wipe: " -+ "removing SA=%s(0p%p), SAref=%d, table=%d(0p%p), entry=%d from the refTable.\n", -+ sa_len ? sa : " (error)", -+ ips, -+ ips->ips_ref, -+ IPsecSAref2table(IPsecSA2SAref(ips)), -+ subtable, -+ subtable ? 
IPsecSAref2entry(IPsecSA2SAref(ips)) : 0); -+ } -+ -+ if(ips->ips_ref != IPSEC_SAREF_NULL) { -+ struct IPsecSArefSubTable *subtable = NULL; -+ int ref_table=IPsecSAref2table(IPsecSA2SAref(ips)); -+ int ref_entry=IPsecSAref2entry(IPsecSA2SAref(ips)); -+ -+ if(ref_table < IPSEC_SA_REF_SUBTABLE_NUM_ENTRIES) { -+ subtable = ipsec_sadb.refTable[ref_table]; -+ if(subtable!=NULL && subtable->entry[ref_entry] == ips) { -+ -+ subtable->entry[ref_entry] = NULL; -+ } -+ } -+ ips->ips_ref = IPSEC_SAREF_NULL; -+ } -+#endif /* IPSEC_SA_REF_CODE */ -+ -+ /* paranoid clean up */ -+ if(ips->ips_addr_s != NULL) { -+ memset((caddr_t)(ips->ips_addr_s), 0, ips->ips_addr_s_size); -+ kfree(ips->ips_addr_s); -+ } -+ ips->ips_addr_s = NULL; -+ -+ if(ips->ips_addr_d != NULL) { -+ memset((caddr_t)(ips->ips_addr_d), 0, ips->ips_addr_d_size); -+ kfree(ips->ips_addr_d); -+ } -+ ips->ips_addr_d = NULL; -+ -+ if(ips->ips_addr_p != NULL) { -+ memset((caddr_t)(ips->ips_addr_p), 0, ips->ips_addr_p_size); -+ kfree(ips->ips_addr_p); -+ } -+ ips->ips_addr_p = NULL; -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ if(ips->ips_natt_oa) { -+ memset((caddr_t)(ips->ips_natt_oa), 0, ips->ips_natt_oa_size); -+ kfree(ips->ips_natt_oa); -+ } -+ ips->ips_natt_oa = NULL; -+#endif -+ -+ if(ips->ips_key_a != NULL) { -+ memset((caddr_t)(ips->ips_key_a), 0, ips->ips_key_a_size); -+ kfree(ips->ips_key_a); -+ } -+ ips->ips_key_a = NULL; -+ -+ if(ips->ips_key_e != NULL) { -+#ifdef CONFIG_KLIPS_ALG -+ if (ips->ips_alg_enc && -+ ips->ips_alg_enc->ixt_e_destroy_key) -+ { -+ ips->ips_alg_enc->ixt_e_destroy_key(ips->ips_alg_enc, -+ ips->ips_key_e); -+ } else -+#endif -+ { -+ memset((caddr_t)(ips->ips_key_e), 0, ips->ips_key_e_size); -+ kfree(ips->ips_key_e); -+ } -+ } -+ ips->ips_key_e = NULL; -+ -+ if(ips->ips_iv != NULL) { -+ memset((caddr_t)(ips->ips_iv), 0, ips->ips_iv_size); -+ kfree(ips->ips_iv); -+ } -+ ips->ips_iv = NULL; -+ -+#ifdef CONFIG_KLIPS_OCF -+ if (ips->ocf_in_use) -+ ipsec_ocf_sa_free(ips); -+#endif -+ -+ 
if(ips->ips_ident_s.data != NULL) { -+ memset((caddr_t)(ips->ips_ident_s.data), -+ 0, -+ ips->ips_ident_s.len * IPSEC_PFKEYv2_ALIGN - sizeof(struct sadb_ident)); -+ kfree(ips->ips_ident_s.data); -+ } -+ ips->ips_ident_s.data = NULL; -+ -+ if(ips->ips_ident_d.data != NULL) { -+ memset((caddr_t)(ips->ips_ident_d.data), -+ 0, -+ ips->ips_ident_d.len * IPSEC_PFKEYv2_ALIGN - sizeof(struct sadb_ident)); -+ kfree(ips->ips_ident_d.data); -+ } -+ ips->ips_ident_d.data = NULL; -+ -+#ifdef CONFIG_KLIPS_ALG -+ if (ips->ips_alg_enc||ips->ips_alg_auth) { -+ ipsec_alg_sa_wipe(ips); -+ } -+#endif -+ -+ BUG_ON(atomic_read(&ips->ips_refcount) != 0); -+ -+ memset((caddr_t)ips, 0, sizeof(*ips)); -+ kfree(ips); -+ ips = NULL; -+ -+ return 0; -+} -+ -+extern int sysctl_ipsec_debug_verbose; -+ -+int ipsec_sa_init(struct ipsec_sa *ipsp) -+{ -+ int error = 0; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ char ipaddr_txt[ADDRTOA_BUF]; -+ char ipaddr2_txt[ADDRTOA_BUF]; -+#if defined (CONFIG_KLIPS_AUTH_HMAC_MD5) || defined (CONFIG_KLIPS_AUTH_HMAC_SHA1) -+ unsigned char kb[AHMD596_BLKLEN]; -+#endif -+#ifdef CONFIG_KLIPS_ALG -+ struct ipsec_alg_enc *ixt_e = NULL; -+ struct ipsec_alg_auth *ixt_a = NULL; -+ int i; -+#endif -+ -+ if(ipsp == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "ipsec_sa_init: " -+ "ipsp is NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ sa_len = KLIPS_SATOT(debug_pfkey, &ipsp->ips_said, 0, sa, sizeof(sa)); -+ -+ KLIPS_PRINT(debug_pfkey, -+ "ipsec_sa_init: " -+ "(pfkey defined) called for SA:%s\n", -+ sa_len ? 
sa : " (error)"); -+ -+ KLIPS_PRINT(debug_pfkey, -+ "ipsec_sa_init: " -+ "calling init routine of %s%s%s\n", -+ IPS_XFORM_NAME(ipsp)); -+ -+ switch(ipsp->ips_said.proto) { -+#ifdef CONFIG_KLIPS_IPIP -+ case IPPROTO_IPIP: { -+ ipsp->ips_xformfuncs = ipip_xform_funcs; -+ addrtoa(((struct sockaddr_in*)(ipsp->ips_addr_s))->sin_addr, -+ 0, -+ ipaddr_txt, sizeof(ipaddr_txt)); -+ addrtoa(((struct sockaddr_in*)(ipsp->ips_addr_d))->sin_addr, -+ 0, -+ ipaddr2_txt, sizeof(ipaddr_txt)); -+ KLIPS_PRINT(debug_pfkey, -+ "ipsec_sa_init: " -+ "(pfkey defined) IPIP ipsec_sa set for %s->%s.\n", -+ ipaddr_txt, -+ ipaddr2_txt); -+ } -+ break; -+#endif /* !CONFIG_KLIPS_IPIP */ -+ -+#ifdef CONFIG_KLIPS_AH -+ case IPPROTO_AH: -+ -+#ifdef CONFIG_KLIPS_OCF -+ if (ipsec_ocf_sa_init(ipsp, ipsp->ips_authalg, 0)) -+ break; -+#endif -+ -+ ipsp->ips_xformfuncs = ah_xform_funcs; -+ switch(ipsp->ips_authalg) { -+# ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ case AH_MD5: { -+ unsigned char *akp; -+ unsigned int aks; -+ MD5_CTX *ictx; -+ MD5_CTX *octx; -+ -+ if(ipsp->ips_key_bits_a != (AHMD596_KLEN * 8)) { -+ KLIPS_PRINT(debug_pfkey, -+ "ipsec_sa_init: " -+ "incorrect key size: %d bits -- must be %d bits\n"/*octets (bytes)\n"*/, -+ ipsp->ips_key_bits_a, AHMD596_KLEN * 8); -+ SENDERR(EINVAL); -+ } -+ -+# if KLIPS_DIVULGE_HMAC_KEY -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "ipsec_sa_init: " -+ "hmac md5-96 key is 0x%08x %08x %08x %08x\n", -+ ntohl(*(((__u32 *)ipsp->ips_key_a)+0)), -+ ntohl(*(((__u32 *)ipsp->ips_key_a)+1)), -+ ntohl(*(((__u32 *)ipsp->ips_key_a)+2)), -+ ntohl(*(((__u32 *)ipsp->ips_key_a)+3))); -+# endif /* KLIPS_DIVULGE_HMAC_KEY */ -+ -+ ipsp->ips_auth_bits = AHMD596_ALEN * 8; -+ -+ /* save the pointer to the key material */ -+ akp = ipsp->ips_key_a; -+ aks = ipsp->ips_key_a_size; -+ -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "ipsec_sa_init: " -+ "allocating %lu bytes for md5_ctx.\n", -+ (unsigned long) sizeof(struct md5_ctx)); -+ if((ipsp->ips_key_a = 
(caddr_t) -+ kmalloc(sizeof(struct md5_ctx), GFP_ATOMIC)) == NULL) { -+ ipsp->ips_key_a = akp; -+ SENDERR(ENOMEM); -+ } -+ ipsp->ips_key_a_size = sizeof(struct md5_ctx); -+ -+ for (i = 0; i < DIVUP(ipsp->ips_key_bits_a, 8); i++) { -+ kb[i] = akp[i] ^ HMAC_IPAD; -+ } -+ for (; i < AHMD596_BLKLEN; i++) { -+ kb[i] = HMAC_IPAD; -+ } -+ -+ ictx = &(((struct md5_ctx*)(ipsp->ips_key_a))->ictx); -+ osMD5Init(ictx); -+ osMD5Update(ictx, kb, AHMD596_BLKLEN); -+ -+ for (i = 0; i < AHMD596_BLKLEN; i++) { -+ kb[i] ^= (HMAC_IPAD ^ HMAC_OPAD); -+ } -+ -+ octx = &(((struct md5_ctx*)(ipsp->ips_key_a))->octx); -+ osMD5Init(octx); -+ osMD5Update(octx, kb, AHMD596_BLKLEN); -+ -+# if KLIPS_DIVULGE_HMAC_KEY -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "ipsec_sa_init: " -+ "MD5 ictx=0x%08x %08x %08x %08x octx=0x%08x %08x %08x %08x\n", -+ ((__u32*)ictx)[0], -+ ((__u32*)ictx)[1], -+ ((__u32*)ictx)[2], -+ ((__u32*)ictx)[3], -+ ((__u32*)octx)[0], -+ ((__u32*)octx)[1], -+ ((__u32*)octx)[2], -+ ((__u32*)octx)[3] ); -+# endif /* KLIPS_DIVULGE_HMAC_KEY */ -+ -+ /* zero key buffer -- paranoid */ -+ memset(akp, 0, aks); -+ kfree(akp); -+ } -+ break; -+# endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+# ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ case AH_SHA: { -+ unsigned char *akp; -+ unsigned int aks; -+ SHA1_CTX *ictx; -+ SHA1_CTX *octx; -+ -+ if(ipsp->ips_key_bits_a != (AHSHA196_KLEN * 8)) { -+ KLIPS_PRINT(debug_pfkey, -+ "ipsec_sa_init: " -+ "incorrect key size: %d bits -- must be %d bits\n"/*octets (bytes)\n"*/, -+ ipsp->ips_key_bits_a, AHSHA196_KLEN * 8); -+ SENDERR(EINVAL); -+ } -+ -+# if KLIPS_DIVULGE_HMAC_KEY -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "ipsec_sa_init: " -+ "hmac sha1-96 key is 0x%08x %08x %08x %08x\n", -+ ntohl(*(((__u32 *)ipsp->ips_key_a)+0)), -+ ntohl(*(((__u32 *)ipsp->ips_key_a)+1)), -+ ntohl(*(((__u32 *)ipsp->ips_key_a)+2)), -+ ntohl(*(((__u32 *)ipsp->ips_key_a)+3))); -+# endif /* KLIPS_DIVULGE_HMAC_KEY */ -+ -+ ipsp->ips_auth_bits = AHSHA196_ALEN 
* 8; -+ -+ /* save the pointer to the key material */ -+ akp = ipsp->ips_key_a; -+ aks = ipsp->ips_key_a_size; -+ -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "ipsec_sa_init: " -+ "allocating %lu bytes for sha1_ctx.\n", -+ (unsigned long) sizeof(struct sha1_ctx)); -+ if((ipsp->ips_key_a = (caddr_t) -+ kmalloc(sizeof(struct sha1_ctx), GFP_ATOMIC)) == NULL) { -+ ipsp->ips_key_a = akp; -+ SENDERR(ENOMEM); -+ } -+ ipsp->ips_key_a_size = sizeof(struct sha1_ctx); -+ -+ for (i = 0; i < DIVUP(ipsp->ips_key_bits_a, 8); i++) { -+ kb[i] = akp[i] ^ HMAC_IPAD; -+ } -+ for (; i < AHMD596_BLKLEN; i++) { -+ kb[i] = HMAC_IPAD; -+ } -+ -+ ictx = &(((struct sha1_ctx*)(ipsp->ips_key_a))->ictx); -+ SHA1Init(ictx); -+ SHA1Update(ictx, kb, AHSHA196_BLKLEN); -+ -+ for (i = 0; i < AHSHA196_BLKLEN; i++) { -+ kb[i] ^= (HMAC_IPAD ^ HMAC_OPAD); -+ } -+ -+ octx = &(((struct sha1_ctx*)(ipsp->ips_key_a))->octx); -+ SHA1Init(octx); -+ SHA1Update(octx, kb, AHSHA196_BLKLEN); -+ -+# if KLIPS_DIVULGE_HMAC_KEY -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "ipsec_sa_init: " -+ "SHA1 ictx=0x%08x %08x %08x %08x octx=0x%08x %08x %08x %08x\n", -+ ((__u32*)ictx)[0], -+ ((__u32*)ictx)[1], -+ ((__u32*)ictx)[2], -+ ((__u32*)ictx)[3], -+ ((__u32*)octx)[0], -+ ((__u32*)octx)[1], -+ ((__u32*)octx)[2], -+ ((__u32*)octx)[3] ); -+# endif /* KLIPS_DIVULGE_HMAC_KEY */ -+ /* zero key buffer -- paranoid */ -+ memset(akp, 0, aks); -+ kfree(akp); -+ } -+ break; -+# endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ default: -+ KLIPS_PRINT(debug_pfkey, -+ "ipsec_sa_init: " -+ "authalg=%d support not available in the kernel", -+ ipsp->ips_authalg); -+ SENDERR(EINVAL); -+ } -+ break; -+#endif /* CONFIG_KLIPS_AH */ -+ -+#ifdef CONFIG_KLIPS_ESP -+ case IPPROTO_ESP: -+ ipsp->ips_xformfuncs = esp_xform_funcs; -+ { -+#if defined (CONFIG_KLIPS_AUTH_HMAC_MD5) || defined (CONFIG_KLIPS_AUTH_HMAC_SHA1) -+ unsigned char *akp; -+ unsigned int aks; -+#endif -+ -+#ifdef CONFIG_KLIPS_OCF -+ if (ipsec_ocf_sa_init(ipsp, 
ipsp->ips_authalg, ipsp->ips_encalg)) -+ break; -+#endif -+ -+#ifdef CONFIG_KLIPS_ALG -+ ipsec_alg_sa_init(ipsp); -+ ixt_e=ipsp->ips_alg_enc; -+ -+ if (ixt_e == NULL) { -+ if(printk_ratelimit()) { -+ printk(KERN_ERR -+ "ipsec_sa_init: " -+ "encalg=%d support not available in the kernel", -+ ipsp->ips_encalg); -+ } -+ SENDERR(ENOENT); -+ } -+ -+ ipsp->ips_iv_size = ixt_e->ixt_common.ixt_support.ias_ivlen/8; -+ -+ /* Create IV */ -+ if (ipsp->ips_iv_size) { -+ if((ipsp->ips_iv = (caddr_t) -+ kmalloc(ipsp->ips_iv_size, GFP_ATOMIC)) == NULL) { -+ SENDERR(ENOMEM); -+ } -+ prng_bytes(&ipsec_prng, -+ (char *)ipsp->ips_iv, -+ ipsp->ips_iv_size); -+ ipsp->ips_iv_bits = ipsp->ips_iv_size * 8; -+ } -+ -+ if ((error=ipsec_alg_enc_key_create(ipsp)) < 0) -+ SENDERR(-error); -+ -+ if ((ixt_a=ipsp->ips_alg_auth)) { -+ if ((error=ipsec_alg_auth_key_create(ipsp)) < 0) -+ SENDERR(-error); -+ } else -+#endif /* CONFIG_KLIPS_ALG */ -+ -+ switch(ipsp->ips_authalg) { -+# ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ case AH_MD5: { -+ MD5_CTX *ictx; -+ MD5_CTX *octx; -+ -+ if(ipsp->ips_key_bits_a != (AHMD596_KLEN * 8)) { -+ KLIPS_PRINT(debug_pfkey, -+ "ipsec_sa_init: " -+ "incorrect authorisation key size: %d bits -- must be %d bits\n"/*octets (bytes)\n"*/, -+ ipsp->ips_key_bits_a, -+ AHMD596_KLEN * 8); -+ SENDERR(EINVAL); -+ } -+ -+# if KLIPS_DIVULGE_HMAC_KEY -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "ipsec_sa_init: " -+ "hmac md5-96 key is 0x%08x %08x %08x %08x\n", -+ ntohl(*(((__u32 *)(ipsp->ips_key_a))+0)), -+ ntohl(*(((__u32 *)(ipsp->ips_key_a))+1)), -+ ntohl(*(((__u32 *)(ipsp->ips_key_a))+2)), -+ ntohl(*(((__u32 *)(ipsp->ips_key_a))+3))); -+# endif /* KLIPS_DIVULGE_HMAC_KEY */ -+ ipsp->ips_auth_bits = AHMD596_ALEN * 8; -+ -+ /* save the pointer to the key material */ -+ akp = ipsp->ips_key_a; -+ aks = ipsp->ips_key_a_size; -+ -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "ipsec_sa_init: " -+ "allocating %lu bytes for md5_ctx.\n", -+ (unsigned long) 
sizeof(struct md5_ctx)); -+ if((ipsp->ips_key_a = (caddr_t) -+ kmalloc(sizeof(struct md5_ctx), GFP_ATOMIC)) == NULL) { -+ ipsp->ips_key_a = akp; -+ SENDERR(ENOMEM); -+ } -+ ipsp->ips_key_a_size = sizeof(struct md5_ctx); -+ -+ for (i = 0; i < DIVUP(ipsp->ips_key_bits_a, 8); i++) { -+ kb[i] = akp[i] ^ HMAC_IPAD; -+ } -+ for (; i < AHMD596_BLKLEN; i++) { -+ kb[i] = HMAC_IPAD; -+ } -+ -+ ictx = &(((struct md5_ctx*)(ipsp->ips_key_a))->ictx); -+ osMD5Init(ictx); -+ osMD5Update(ictx, kb, AHMD596_BLKLEN); -+ -+ for (i = 0; i < AHMD596_BLKLEN; i++) { -+ kb[i] ^= (HMAC_IPAD ^ HMAC_OPAD); -+ } -+ -+ octx = &(((struct md5_ctx*)(ipsp->ips_key_a))->octx); -+ osMD5Init(octx); -+ osMD5Update(octx, kb, AHMD596_BLKLEN); -+ -+# if KLIPS_DIVULGE_HMAC_KEY -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "ipsec_sa_init: " -+ "MD5 ictx=0x%08x %08x %08x %08x octx=0x%08x %08x %08x %08x\n", -+ ((__u32*)ictx)[0], -+ ((__u32*)ictx)[1], -+ ((__u32*)ictx)[2], -+ ((__u32*)ictx)[3], -+ ((__u32*)octx)[0], -+ ((__u32*)octx)[1], -+ ((__u32*)octx)[2], -+ ((__u32*)octx)[3] ); -+# endif /* KLIPS_DIVULGE_HMAC_KEY */ -+ /* paranoid */ -+ memset(akp, 0, aks); -+ kfree(akp); -+ break; -+ } -+# endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+# ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ case AH_SHA: { -+ SHA1_CTX *ictx; -+ SHA1_CTX *octx; -+ -+ if(ipsp->ips_key_bits_a != (AHSHA196_KLEN * 8)) { -+ KLIPS_PRINT(debug_pfkey, -+ "ipsec_sa_init: " -+ "incorrect authorisation key size: %d bits -- must be %d bits\n"/*octets (bytes)\n"*/, -+ ipsp->ips_key_bits_a, -+ AHSHA196_KLEN * 8); -+ SENDERR(EINVAL); -+ } -+ -+# if KLIPS_DIVULGE_HMAC_KEY -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "ipsec_sa_init: " -+ "hmac sha1-96 key is 0x%08x %08x %08x %08x\n", -+ ntohl(*(((__u32 *)ipsp->ips_key_a)+0)), -+ ntohl(*(((__u32 *)ipsp->ips_key_a)+1)), -+ ntohl(*(((__u32 *)ipsp->ips_key_a)+2)), -+ ntohl(*(((__u32 *)ipsp->ips_key_a)+3))); -+# endif /* KLIPS_DIVULGE_HMAC_KEY */ -+ ipsp->ips_auth_bits = AHSHA196_ALEN * 
8; -+ -+ /* save the pointer to the key material */ -+ akp = ipsp->ips_key_a; -+ aks = ipsp->ips_key_a_size; -+ -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "ipsec_sa_init: " -+ "allocating %lu bytes for sha1_ctx.\n", -+ (unsigned long) sizeof(struct sha1_ctx)); -+ if((ipsp->ips_key_a = (caddr_t) -+ kmalloc(sizeof(struct sha1_ctx), GFP_ATOMIC)) == NULL) { -+ ipsp->ips_key_a = akp; -+ SENDERR(ENOMEM); -+ } -+ ipsp->ips_key_a_size = sizeof(struct sha1_ctx); -+ -+ for (i = 0; i < DIVUP(ipsp->ips_key_bits_a, 8); i++) { -+ kb[i] = akp[i] ^ HMAC_IPAD; -+ } -+ for (; i < AHMD596_BLKLEN; i++) { -+ kb[i] = HMAC_IPAD; -+ } -+ -+ ictx = &(((struct sha1_ctx*)(ipsp->ips_key_a))->ictx); -+ SHA1Init(ictx); -+ SHA1Update(ictx, kb, AHSHA196_BLKLEN); -+ -+ for (i = 0; i < AHSHA196_BLKLEN; i++) { -+ kb[i] ^= (HMAC_IPAD ^ HMAC_OPAD); -+ } -+ -+ octx = &((struct sha1_ctx*)(ipsp->ips_key_a))->octx; -+ SHA1Init(octx); -+ SHA1Update(octx, kb, AHSHA196_BLKLEN); -+ -+# if KLIPS_DIVULGE_HMAC_KEY -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "ipsec_sa_init: " -+ "SHA1 ictx=0x%08x %08x %08x %08x octx=0x%08x %08x %08x %08x\n", -+ ((__u32*)ictx)[0], -+ ((__u32*)ictx)[1], -+ ((__u32*)ictx)[2], -+ ((__u32*)ictx)[3], -+ ((__u32*)octx)[0], -+ ((__u32*)octx)[1], -+ ((__u32*)octx)[2], -+ ((__u32*)octx)[3] ); -+# endif /* KLIPS_DIVULGE_HMAC_KEY */ -+ memset(akp, 0, aks); -+ kfree(akp); -+ break; -+ } -+# endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ case AH_NONE: -+ break; -+ default: -+ KLIPS_PRINT(debug_pfkey, -+ "ipsec_sa_init: " -+ "authalg=%d support not available in the kernel.\n", -+ ipsp->ips_authalg); -+ SENDERR(EINVAL); -+ } -+ } -+ break; -+#endif /* !CONFIG_KLIPS_ESP */ -+#ifdef CONFIG_KLIPS_IPCOMP -+ case IPPROTO_COMP: -+ ipsp->ips_xformfuncs = ipcomp_xform_funcs; -+ ipsp->ips_comp_adapt_tries = 0; -+ ipsp->ips_comp_adapt_skip = 0; -+ ipsp->ips_comp_ratio_cbytes = 0; -+ ipsp->ips_comp_ratio_dbytes = 0; -+ break; -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ default: -+ 
printk(KERN_ERR "KLIPS sa initialization: " -+ "proto=%d unknown.\n", -+ ipsp->ips_said.proto); -+ SENDERR(EINVAL); -+ } -+ -+ errlab: -+ return(error); -+} -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_sha1.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,177 @@ -+/* -+ * RCSID $Id: ipsec_sha1.c,v 1.9 2004/04/06 02:49:26 mcr Exp $ -+ */ -+ -+/* -+ * The rest of the code is derived from sha1.c by Steve Reid, which is -+ * public domain. -+ * Minor cosmetic changes to accomodate it in the Linux kernel by ji. -+ */ -+ -+#include -+#include -+ -+#include "openswan/ipsec_sha1.h" -+ -+#if defined(rol) -+#undef rol -+#endif -+ -+#define SHA1HANDSOFF -+ -+#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) -+ -+/* blk0() and blk() perform the initial expand. */ -+/* I got the idea of expanding during the round function from SSLeay */ -+#ifdef __LITTLE_ENDIAN -+#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ -+ |(rol(block->l[i],8)&0x00FF00FF)) -+#else -+#define blk0(i) block->l[i] -+#endif -+#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \ -+ ^block->l[(i+2)&15]^block->l[i&15],1)) -+ -+/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ -+#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30); -+#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30); -+#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30); -+#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30); -+#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30); -+ -+ -+/* Hash a single 512-bit block. This is the core of the algorithm. 
*/ -+ -+void SHA1Transform(__u32 state[5], __u8 buffer[64]) -+{ -+__u32 a, b, c, d, e; -+typedef union { -+ unsigned char c[64]; -+ __u32 l[16]; -+} CHAR64LONG16; -+CHAR64LONG16* block; -+#ifdef SHA1HANDSOFF -+static unsigned char workspace[64]; -+ block = (CHAR64LONG16*)workspace; -+ memcpy(block, buffer, 64); -+#else -+ block = (CHAR64LONG16*)buffer; -+#endif -+ /* Copy context->state[] to working vars */ -+ a = state[0]; -+ b = state[1]; -+ c = state[2]; -+ d = state[3]; -+ e = state[4]; -+ /* 4 rounds of 20 operations each. Loop unrolled. */ -+ R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3); -+ R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7); -+ R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11); -+ R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15); -+ R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19); -+ R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23); -+ R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27); -+ R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31); -+ R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35); -+ R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39); -+ R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43); -+ R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47); -+ R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51); -+ R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55); -+ R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59); -+ R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63); -+ R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67); -+ R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71); -+ R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75); -+ R4(e,a,b,c,d,76); 
R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79); -+ /* Add the working vars back into context.state[] */ -+ state[0] += a; -+ state[1] += b; -+ state[2] += c; -+ state[3] += d; -+ state[4] += e; -+ /* Wipe variables */ -+ a = b = c = d = e = 0; -+} -+ -+ -+/* SHA1Init - Initialize new context */ -+ -+void SHA1Init(void *vcontext) -+{ -+ SHA1_CTX* context = vcontext; -+ -+ /* SHA1 initialization constants */ -+ context->state[0] = 0x67452301; -+ context->state[1] = 0xEFCDAB89; -+ context->state[2] = 0x98BADCFE; -+ context->state[3] = 0x10325476; -+ context->state[4] = 0xC3D2E1F0; -+ context->count[0] = context->count[1] = 0; -+} -+ -+ -+/* Run your data through this. */ -+ -+void SHA1Update(void *vcontext, unsigned char* data, __u32 len) -+{ -+ SHA1_CTX* context = vcontext; -+ __u32 i, j; -+ -+ j = context->count[0]; -+ if ((context->count[0] += len << 3) < j) -+ context->count[1]++; -+ context->count[1] += (len>>29); -+ j = (j >> 3) & 63; -+ if ((j + len) > 63) { -+ memcpy(&context->buffer[j], data, (i = 64-j)); -+ SHA1Transform(context->state, context->buffer); -+ for ( ; i + 63 < len; i += 64) { -+ SHA1Transform(context->state, &data[i]); -+ } -+ j = 0; -+ } -+ else i = 0; -+ memcpy(&context->buffer[j], &data[i], len - i); -+} -+ -+ -+/* Add padding and return the message digest. */ -+ -+void SHA1Final(unsigned char digest[20], void *vcontext) -+{ -+ __u32 i, j; -+ unsigned char finalcount[8]; -+ SHA1_CTX* context = vcontext; -+ -+ for (i = 0; i < 8; i++) { -+ finalcount[i] = (unsigned char)((context->count[(i >= 4 ? 
0 : 1)] -+ >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */ -+ } -+ SHA1Update(context, (unsigned char *)"\200", 1); -+ while ((context->count[0] & 504) != 448) { -+ SHA1Update(context, (unsigned char *)"\0", 1); -+ } -+ SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */ -+ for (i = 0; i < 20; i++) { -+ digest[i] = (unsigned char) -+ ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255); -+ } -+ /* Wipe variables */ -+ i = j = 0; -+ memset(context->buffer, 0, 64); -+ memset(context->state, 0, 20); -+ memset(context->count, 0, 8); -+ memset(&finalcount, 0, 8); -+#ifdef SHA1HANDSOFF /* make SHA1Transform overwrite its own static vars */ -+ SHA1Transform(context->state, context->buffer); -+#endif -+} -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_snprintf.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,121 @@ -+/* -+ * @(#) ipsec_snprintf() function -+ * -+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs -+ * 2001 Michael Richardson -+ * Copyright (C) 2005 Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * Split out from ipsec_proc.c. 
-+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+#define __NO_VERSION__ -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_kversion.h" -+#include "openswan/ipsec_param.h" -+ -+#include -+ -+#include "openswan/radij.h" -+ -+#include "openswan/ipsec_life.h" -+#include "openswan/ipsec_stats.h" -+#include "openswan/ipsec_sa.h" -+ -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_xmit.h" -+ -+#include "openswan/ipsec_rcv.h" -+#include "openswan/ipsec_ah.h" -+#include "openswan/ipsec_esp.h" -+#include "openswan/ipsec_kern24.h" -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+#include "openswan/ipcomp.h" -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ -+#include "openswan/ipsec_proto.h" -+ -+#include -+#include -+ -+/* ipsec_snprintf: like snprintf except -+ * - size is signed and a negative value is treated as if it were 0 -+ * - the returned result is never negative -- -+ * an error generates a "?" or null output (depending on space). -+ * (Our callers are too lazy to check for an error return.) -+ * -+ * @param buf String buffer -+ * @param size Size of the string -+ * @param fmt printf string -+ * @param ... Variables to be displayed in fmt -+ * @return int Return code -+ */ -+int ipsec_snprintf(char *buf, ssize_t size, const char *fmt, ...) -+{ -+ va_list args; -+ int i; -+ size_t possize = size < 0? 
0 : size; -+ va_start(args, fmt); -+ i = vsnprintf(buf,possize,fmt,args); -+ va_end(args); -+ if (i < 0) { -+ /* create empty output in place of error */ -+ i = 0; -+ if (size > 0) { -+ *buf = '\0'; -+ } -+ } -+ return i; -+} -+ -+ -+void ipsec_dmp_block(char *s, caddr_t bb, int len) -+{ -+ int i; -+ unsigned char *b = bb; -+ -+ printk(KERN_INFO "klips_dmp: " -+ "at %s, len=%d:\n", s, len); -+ -+ for(i = 0; i < len; i++ /*, c++*/) { -+ if(!(i % 16)) { -+ printk(KERN_INFO -+ "klips_debug: @%03x:", -+ i); -+ } -+ printk(" %02x", b[i]); -+ if(!((i + 1) % 16)) { -+ printk("\n"); -+ } -+ } -+ if(i % 16) { -+ printk("\n"); -+ } -+} -+ -+/* -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_tunnel.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,2004 @@ -+/* -+ * IPSEC Tunneling code. Heavily based on drivers/net/new_tunnel.c -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 Richard Guy Briggs. -+ * -+ * OCF/receive state machine written by -+ * David McCullough -+ * Copyright (C) 2004-2005 Intel Corporation. All Rights Reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ */ -+ -+#define __NO_VERSION__ -+#include -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif /* for CONFIG_IP_FORWARD */ -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include -+#include -+#include -+ -+#include /* struct device, struct net_device_stats, dev_queue_xmit() and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+#include -+ -+#include -+ -+#ifdef NET_21 -+# include -+# define IS_MYADDR RTN_LOCAL -+# include -+# undef dev_kfree_skb -+# define dev_kfree_skb(a,b) kfree_skb(a) -+# define PHYSDEV_TYPE -+#endif /* NET_21 */ -+ -+#ifndef NETDEV_TX_BUSY -+# ifdef NETDEV_XMIT_CN -+# define NETDEV_TX_BUSY NETDEV_XMIT_CN -+# else -+# define NETDEV_TX_BUSY 1 -+# endif -+#endif -+ -+#include /* icmp_send() */ -+#include -+#include -+#ifdef NETDEV_23 -+# include -+#endif /* NETDEV_23 */ -+ -+#include -+#include -+ -+#include "openswan/ipsec_kversion.h" -+#include "openswan/radij.h" -+#include "openswan/ipsec_life.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_eroute.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_sa.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_xmit.h" -+#include "openswan/ipsec_ipe4.h" -+#include "openswan/ipsec_ah.h" -+#include "openswan/ipsec_esp.h" -+#include "openswan/ipsec_kern24.h" -+ -+#include -+#include -+ -+#include "openswan/ipsec_proto.h" -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+#include -+#endif -+ -+static __u32 zeroes[64]; -+ -+DEBUG_NO_STATIC int -+ipsec_tunnel_open(struct net_device *dev) -+{ -+ struct ipsecpriv *prv = dev->priv; -+ -+ /* -+ * Can't open until attached. 
-+ */ -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_open: " -+ "dev = %s, prv->dev = %s\n", -+ dev->name, prv->dev?prv->dev->name:"NONE"); -+ -+ if (prv->dev == NULL) -+ return -ENODEV; -+ -+ KLIPS_INC_USE; -+ return 0; -+} -+ -+DEBUG_NO_STATIC int -+ipsec_tunnel_close(struct net_device *dev) -+{ -+ KLIPS_DEC_USE; -+ return 0; -+} -+ -+static inline int ipsec_tunnel_xmit2(struct sk_buff *skb) -+{ -+ -+#ifdef NETDEV_25 /* 2.6 kernels */ -+ return dst_output(skb); -+#else -+ return ip_send(skb); -+#endif -+} -+ -+enum ipsec_xmit_value -+ipsec_tunnel_strip_hard_header(struct ipsec_xmit_state *ixs) -+{ -+ /* ixs->physdev->hard_header_len is unreliable and should not be used */ -+ ixs->hard_header_len = (unsigned char *)(ixs->iph) - ixs->skb->data; -+ -+ if(ixs->hard_header_len < 0) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_error:ipsec_xmit_strip_hard_header: " -+ "Negative hard_header_len (%d)?!\n", ixs->hard_header_len); -+ ixs->stats->tx_dropped++; -+ return IPSEC_XMIT_BADHHLEN; -+ } -+ -+ /* while ixs->physdev->hard_header_len is unreliable and -+ * should not be trusted, it accurate and required for ATM, GRE and -+ * some other interfaces to work. Thanks to Willy Tarreau -+ * . 
-+ */ -+ if(ixs->hard_header_len == 0) { /* no hard header present */ -+ ixs->hard_header_stripped = 1; -+ ixs->hard_header_len = ixs->physdev->hard_header_len; -+ } -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if (debug_tunnel & DB_TN_XMIT) { -+ int i; -+ char c; -+ -+ printk(KERN_INFO "klips_debug:ipsec_xmit_strip_hard_header: " -+ ">>> skb->len=%ld hard_header_len:%d", -+ (unsigned long int)ixs->skb->len, ixs->hard_header_len); -+ c = ' '; -+ for (i=0; i < ixs->hard_header_len; i++) { -+ printk("%c%02x", c, ixs->skb->data[i]); -+ c = ':'; -+ } -+ printk(" \n"); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ KLIPS_IP_PRINT(debug_tunnel & DB_TN_XMIT, ixs->iph); -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_strip_hard_header: " -+ "Original head,tailroom: %d,%d\n", -+ skb_headroom(ixs->skb), skb_tailroom(ixs->skb)); -+ -+ return IPSEC_XMIT_OK; -+} -+ -+enum ipsec_xmit_value -+ipsec_tunnel_SAlookup(struct ipsec_xmit_state *ixs) -+{ -+ unsigned int bypass; -+ -+ bypass = FALSE; -+ -+ /* -+ * First things first -- look us up in the erouting tables. -+ */ -+ ixs->matcher.sen_len = sizeof (struct sockaddr_encap); -+ ixs->matcher.sen_family = AF_ENCAP; -+ ixs->matcher.sen_type = SENT_IP4; -+ ixs->matcher.sen_ip_src.s_addr = ixs->iph->saddr; -+ ixs->matcher.sen_ip_dst.s_addr = ixs->iph->daddr; -+ ixs->matcher.sen_proto = ixs->iph->protocol; -+ ipsec_extract_ports(ixs->iph, &ixs->matcher); -+ -+ /* -+ * The spinlock is to prevent any other process from accessing or deleting -+ * the eroute while we are using and updating it. 
-+ */ -+ spin_lock_bh(&eroute_lock); -+ -+ ixs->eroute = ipsec_findroute(&ixs->matcher); -+ -+ if(ixs->iph->protocol == IPPROTO_UDP) { -+ struct udphdr *t = NULL; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:udp port check: " -+ "fragoff: %d len: %d>%ld \n", -+ ntohs(ixs->iph->frag_off) & IP_OFFSET, -+ (ixs->skb->len - ixs->hard_header_len), -+ (unsigned long int) ((ixs->iph->ihl << 2) + sizeof(struct udphdr))); -+ -+ if((ntohs(ixs->iph->frag_off) & IP_OFFSET) == 0 && -+ ((ixs->skb->len - ixs->hard_header_len) >= -+ ((ixs->iph->ihl << 2) + sizeof(struct udphdr)))) -+ { -+ t =((struct udphdr*)((caddr_t)ixs->iph+(ixs->iph->ihl<<2))); -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:udp port in packet: " -+ "port %d -> %d\n", -+ ntohs(t->source), ntohs(t->dest)); -+ } -+ -+ ixs->sport=0; ixs->dport=0; -+ -+ if(ixs->skb->sk) { -+#ifdef NET_26 -+#ifdef HAVE_INET_SK_SPORT -+ ixs->sport = ntohs(inet_sk(ixs->skb->sk)->sport); -+ ixs->dport = ntohs(inet_sk(ixs->skb->sk)->dport); -+#else -+ struct udp_sock *us; -+ -+ us = (struct udp_sock *)ixs->skb->sk; -+ -+ ixs->sport = ntohs(us->inet.sport); -+ ixs->dport = ntohs(us->inet.dport); -+#endif -+#else -+ ixs->sport = ntohs(ixs->skb->sk->sport); -+ ixs->dport = ntohs(ixs->skb->sk->dport); -+#endif -+ -+ } -+ -+ if(t != NULL) { -+ if(ixs->sport == 0) { -+ ixs->sport = ntohs(t->source); -+ } -+ if(ixs->dport == 0) { -+ ixs->dport = ntohs(t->dest); -+ } -+ } -+ } -+ -+ /* -+ * practically identical to above, but let's be careful about -+ * tcp vs udp headers -+ */ -+ if(ixs->iph->protocol == IPPROTO_TCP) { -+ struct tcphdr *t = NULL; -+ -+ if((ntohs(ixs->iph->frag_off) & IP_OFFSET) == 0 && -+ ((ixs->skb->len - ixs->hard_header_len) >= -+ ((ixs->iph->ihl << 2) + sizeof(struct tcphdr)))) { -+ t =((struct tcphdr*)((caddr_t)ixs->iph+(ixs->iph->ihl<<2))); -+ } -+ -+ ixs->sport=0; ixs->dport=0; -+ -+ if(ixs->skb->sk) { -+#ifdef NET_26 -+#ifdef HAVE_INET_SK_SPORT -+ ixs->sport = 
ntohs(inet_sk(ixs->skb->sk)->sport); -+ ixs->dport = ntohs(inet_sk(ixs->skb->sk)->dport); -+#else -+ struct tcp_tw_bucket *tw; -+ tw = (struct tcp_tw_bucket *)ixs->skb->sk; -+ ixs->sport = ntohs(tw->tw_sport); -+ ixs->dport = ntohs(tw->tw_dport); -+#endif -+#else -+ ixs->sport = ntohs(ixs->skb->sk->sport); -+ ixs->dport = ntohs(ixs->skb->sk->dport); -+#endif -+ } -+ -+ if(t != NULL) { -+ if(ixs->sport == 0) { -+ ixs->sport = ntohs(t->source); -+ } -+ if(ixs->dport == 0) { -+ ixs->dport = ntohs(t->dest); -+ } -+ } -+ } -+ -+ /* default to a %drop eroute */ -+ ixs->outgoing_said.proto = IPPROTO_INT; -+ ixs->outgoing_said.spi = htonl(SPI_DROP); -+ ixs->outgoing_said.dst.u.v4.sin_addr.s_addr = INADDR_ANY; -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_SAlookup: " -+ "checking for local udp/500 IKE packet " -+ "saddr=%x, er=0p%p, daddr=%x, er_dst=%x, proto=%d sport=%d dport=%d\n", -+ ntohl((unsigned int)ixs->iph->saddr), -+ ixs->eroute, -+ ntohl((unsigned int)ixs->iph->daddr), -+ ixs->eroute ? ntohl((unsigned int)ixs->eroute->er_said.dst.u.v4.sin_addr.s_addr) : 0, -+ ixs->iph->protocol, -+ ixs->sport, -+ ixs->dport); -+ -+ /* -+ * cheat for now...are we udp/500? If so, let it through -+ * without interference since it is most likely an IKE packet. -+ */ -+ -+ if (ip_chk_addr((unsigned long)ixs->iph->saddr) == IS_MYADDR -+ && (ixs->eroute==NULL -+ || ixs->iph->daddr == ixs->eroute->er_said.dst.u.v4.sin_addr.s_addr -+ || INADDR_ANY == ixs->eroute->er_said.dst.u.v4.sin_addr.s_addr) -+ && (ixs->iph->protocol == IPPROTO_UDP && -+ (ixs->sport == 500 || ixs->sport == 4500))) { -+ /* Whatever the eroute, this is an IKE message -+ * from us (i.e. not being forwarded). -+ * Furthermore, if there is a tunnel eroute, -+ * the destination is the peer for this eroute. -+ * So %pass the packet: modify the default %drop. 
-+ */ -+ -+ ixs->outgoing_said.spi = htonl(SPI_PASS); -+ if(!(ixs->skb->sk) && ((ntohs(ixs->iph->frag_off) & IP_MF) != 0)) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_SAlookup: " -+ "local UDP/500 (probably IKE) passthrough: base fragment, rest of fragments will probably get filtered.\n"); -+ } -+ bypass = TRUE; -+ } -+ -+#ifdef KLIPS_EXCEPT_DNS53 -+ /* -+ * -+ * if we are udp/53 or tcp/53, also let it through a %trap or %hold, -+ * since it is DNS, but *also* follow the %trap. -+ * -+ * we do not do this for tunnels, only %trap's and %hold's. -+ * -+ */ -+ -+ if (ip_chk_addr((unsigned long)ixs->iph->saddr) == IS_MYADDR -+ && (ixs->eroute==NULL -+ || ixs->iph->daddr == ixs->eroute->er_said.dst.u.v4.sin_addr.s_addr -+ || INADDR_ANY == ixs->eroute->er_said.dst.u.v4.sin_addr.s_addr) -+ && ((ixs->iph->protocol == IPPROTO_UDP -+ || ixs->iph->protocol == IPPROTO_TCP) -+ && ixs->dport == 53)) { -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_SAlookup: " -+ "possible DNS packet\n"); -+ -+ if(ixs->eroute) -+ { -+ if(ixs->eroute->er_said.spi == htonl(SPI_TRAP) -+ || ixs->eroute->er_said.spi == htonl(SPI_HOLD)) -+ { -+ ixs->outgoing_said.spi = htonl(SPI_PASSTRAP); -+ bypass = TRUE; -+ } -+ } -+ else -+ { -+ ixs->outgoing_said.spi = htonl(SPI_PASSTRAP); -+ bypass = TRUE; -+ } -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_SAlookup: " -+ "bypass = %d\n", bypass); -+ -+ if(bypass -+ && !(ixs->skb->sk) -+ && ((ntohs(ixs->iph->frag_off) & IP_MF) != 0)) -+ { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_SAlookup: " -+ "local port 53 (probably DNS) passthrough:" -+ "base fragment, rest of fragments will " -+ "probably get filtered.\n"); -+ } -+ } -+#endif -+ -+ if (bypass==FALSE && ixs->eroute) { -+ ixs->eroute->er_count++; -+ ixs->eroute->er_lasttime = jiffies/HZ; -+ if(ixs->eroute->er_said.proto==IPPROTO_INT -+ && ixs->eroute->er_said.spi==htonl(SPI_HOLD)) -+ { -+ 
KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_SAlookup: " -+ "shunt SA of HOLD: skb stored in HOLD.\n"); -+ if(ixs->eroute->er_last != NULL) { -+ kfree_skb(ixs->eroute->er_last); -+ } -+ ixs->eroute->er_last = ixs->skb; -+ ixs->skb = NULL; -+ ixs->stats->tx_dropped++; -+ spin_unlock_bh(&eroute_lock); -+ return IPSEC_XMIT_STOLEN; -+ } -+ ixs->outgoing_said = ixs->eroute->er_said; -+ ixs->eroute_pid = ixs->eroute->er_pid; -+ -+ /* Copy of the ident for the TRAP/TRAPSUBNET eroutes */ -+ if(ixs->outgoing_said.proto==IPPROTO_INT -+ && (ixs->outgoing_said.spi==htonl(SPI_TRAP) -+ || (ixs->outgoing_said.spi==htonl(SPI_TRAPSUBNET)))) { -+ int len; -+ -+ ixs->ips.ips_ident_s.type = ixs->eroute->er_ident_s.type; -+ ixs->ips.ips_ident_s.id = ixs->eroute->er_ident_s.id; -+ ixs->ips.ips_ident_s.len = ixs->eroute->er_ident_s.len; -+ if (ixs->ips.ips_ident_s.len) -+ { -+ len = ixs->ips.ips_ident_s.len * IPSEC_PFKEYv2_ALIGN - sizeof(struct sadb_ident); -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_SAlookup: " -+ "allocating %d bytes for ident_s shunt SA of HOLD: skb stored in HOLD.\n", -+ len); -+ if ((ixs->ips.ips_ident_s.data = kmalloc(len, GFP_ATOMIC)) == NULL) { -+ printk(KERN_WARNING "klips_debug:ipsec_xmit_SAlookup: " -+ "Failed, tried to allocate %d bytes for source ident.\n", -+ len); -+ ixs->stats->tx_dropped++; -+ spin_unlock_bh(&eroute_lock); -+ return IPSEC_XMIT_ERRMEMALLOC; -+ } -+ memcpy(ixs->ips.ips_ident_s.data, ixs->eroute->er_ident_s.data, len); -+ } -+ ixs->ips.ips_ident_d.type = ixs->eroute->er_ident_d.type; -+ ixs->ips.ips_ident_d.id = ixs->eroute->er_ident_d.id; -+ ixs->ips.ips_ident_d.len = ixs->eroute->er_ident_d.len; -+ if (ixs->ips.ips_ident_d.len) -+ { -+ len = ixs->ips.ips_ident_d.len * IPSEC_PFKEYv2_ALIGN - sizeof(struct sadb_ident); -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_SAlookup: " -+ "allocating %d bytes for ident_d shunt SA of HOLD: skb stored in HOLD.\n", -+ len); -+ if 
((ixs->ips.ips_ident_d.data = kmalloc(len, GFP_ATOMIC)) == NULL) { -+ printk(KERN_WARNING "klips_debug:ipsec_xmit_SAlookup: " -+ "Failed, tried to allocate %d bytes for dest ident.\n", -+ len); -+ ixs->stats->tx_dropped++; -+ spin_unlock_bh(&eroute_lock); -+ return IPSEC_XMIT_ERRMEMALLOC; -+ } -+ memcpy(ixs->ips.ips_ident_d.data, ixs->eroute->er_ident_d.data, len); -+ } -+ } -+ } -+ -+ spin_unlock_bh(&eroute_lock); -+ return IPSEC_XMIT_OK; -+} -+ -+ -+enum ipsec_xmit_value -+ipsec_tunnel_restore_hard_header(struct ipsec_xmit_state*ixs) -+{ -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_restore_hard_header: " -+ "After recursive xforms -- head,tailroom: %d,%d\n", -+ skb_headroom(ixs->skb), -+ skb_tailroom(ixs->skb)); -+ -+ if(ixs->saved_header) { -+ if(skb_headroom(ixs->skb) < ixs->hard_header_len) { -+ printk(KERN_WARNING -+ "klips_error:ipsec_xmit_restore_hard_header: " -+ "tried to skb_push hhlen=%d, %d available. This should never happen, please report.\n", -+ ixs->hard_header_len, -+ skb_headroom(ixs->skb)); -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_PUSHPULLERR; -+ -+ } -+ skb_push(ixs->skb, ixs->hard_header_len); -+ { -+ int i; -+ for (i = 0; i < ixs->hard_header_len; i++) { -+ ixs->skb->data[i] = ixs->saved_header[i]; -+ } -+ } -+ } -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_restore_hard_header: " -+ "With hard_header, final head,tailroom: %d,%d\n", -+ skb_headroom(ixs->skb), -+ skb_tailroom(ixs->skb)); -+ -+ return IPSEC_XMIT_OK; -+} -+ -+ -+/* -+ * when encap processing is complete it call this for us to continue -+ */ -+ -+void -+ipsec_tunnel_xsm_complete( -+ struct ipsec_xmit_state *ixs, -+ enum ipsec_xmit_value stat) -+{ -+ if(stat != IPSEC_XMIT_OK) { -+ if(stat == IPSEC_XMIT_PASS) { -+ goto bypass; -+ } -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_tunnel_start_xmit: encap_bundle failed: %d\n", -+ stat); -+ goto cleanup; -+ } -+ -+ ixs->matcher.sen_ip_src.s_addr = 
ixs->iph->saddr; -+ ixs->matcher.sen_ip_dst.s_addr = ixs->iph->daddr; -+ ixs->matcher.sen_proto = ixs->iph->protocol; -+ ipsec_extract_ports(ixs->iph, &ixs->matcher); -+ -+ spin_lock_bh(&eroute_lock); -+ ixs->eroute = ipsec_findroute(&ixs->matcher); -+ if(ixs->eroute) { -+ ixs->outgoing_said = ixs->eroute->er_said; -+ ixs->eroute_pid = ixs->eroute->er_pid; -+ ixs->eroute->er_count++; -+ ixs->eroute->er_lasttime = jiffies/HZ; -+ } -+ spin_unlock_bh(&eroute_lock); -+ -+ KLIPS_PRINT((debug_tunnel & DB_TN_XMIT) && -+ /* ((ixs->orgdst != ixs->newdst) || (ixs->orgsrc != ixs->newsrc)) */ -+ (ixs->orgedst != ixs->outgoing_said.dst.u.v4.sin_addr.s_addr) && -+ ixs->outgoing_said.dst.u.v4.sin_addr.s_addr && -+ ixs->eroute, -+ "klips_debug:ipsec_tunnel_start_xmit: " -+ "We are recursing here.\n"); -+ -+ if (/*((ixs->orgdst != ixs->newdst) || (ixs->orgsrc != ixs->newsrc))*/ -+ (ixs->orgedst != ixs->outgoing_said.dst.u.v4.sin_addr.s_addr) && -+ ixs->outgoing_said.dst.u.v4.sin_addr.s_addr && -+ ixs->eroute) { -+ ipsec_xsm(ixs); -+ return; -+ } -+ -+ stat = ipsec_nat_encap(ixs); -+ if(stat != IPSEC_XMIT_OK) { -+ goto cleanup; -+ } -+ -+ stat = ipsec_tunnel_restore_hard_header(ixs); -+ if(stat != IPSEC_XMIT_OK) { -+ goto cleanup; -+ } -+ -+bypass: -+ stat = ipsec_tunnel_send(ixs); -+ -+cleanup: -+ ipsec_xmit_cleanup(ixs); -+ ipsec_xmit_state_delete(ixs); -+} -+ -+ -+/* -+ * This function assumes it is being called from dev_queue_xmit() -+ * and that skb is filled properly by that function. -+ */ -+int -+ipsec_tunnel_start_xmit(struct sk_buff *skb, struct net_device *dev) -+{ -+ struct ipsec_xmit_state *ixs = NULL; -+ enum ipsec_xmit_value stat; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "\n\nipsec_tunnel_start_xmit: STARTING"); -+ -+ stat = IPSEC_XMIT_ERRMEMALLOC; -+ ixs = ipsec_xmit_state_new(); -+ if (! 
ixs) { -+ goto alloc_error; -+ } -+ -+ ixs->dev = dev; -+ ixs->skb = skb; -+ -+ stat = ipsec_xmit_sanity_check_dev(ixs); -+ if(stat != IPSEC_XMIT_OK) { -+ goto cleanup; -+ } -+ -+ stat = ipsec_xmit_sanity_check_skb(ixs); -+ if(stat != IPSEC_XMIT_OK) { -+ goto cleanup; -+ } -+ -+ stat = ipsec_tunnel_strip_hard_header(ixs); -+ if(stat != IPSEC_XMIT_OK) { -+ goto cleanup; -+ } -+ -+ stat = ipsec_tunnel_SAlookup(ixs); -+ if(stat != IPSEC_XMIT_OK) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_tunnel_start_xmit: SAlookup failed: %d\n", -+ stat); -+ goto cleanup; -+ } -+ -+ ixs->innersrc = ixs->iph->saddr; -+ -+ ixs->xsm_complete = ipsec_tunnel_xsm_complete; -+ -+ ipsec_xsm(ixs); -+ return 0; -+ -+ cleanup: -+ ipsec_xmit_cleanup(ixs); -+ ipsec_xmit_state_delete(ixs); -+alloc_error: -+ return 0; -+} -+ -+DEBUG_NO_STATIC struct net_device_stats * -+ipsec_tunnel_get_stats(struct net_device *dev) -+{ -+ return &(((struct ipsecpriv *)(dev->priv))->mystats); -+} -+ -+/* -+ * Revectored calls. -+ * For each of these calls, a field exists in our private structure. -+ */ -+ -+DEBUG_NO_STATIC int -+ipsec_tunnel_hard_header(struct sk_buff *skb, struct net_device *dev, -+ unsigned short type, const void *daddr, const void *saddr, unsigned len) -+{ -+ struct ipsecpriv *prv = dev->priv; -+ struct net_device *tmp; -+ int ret; -+ struct net_device_stats *stats; /* This device's statistics */ -+ -+ if(skb == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_hard_header: " -+ "no skb...\n"); -+ return -ENODATA; -+ } -+ -+ if(dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_hard_header: " -+ "no device...\n"); -+ return -ENODEV; -+ } -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_hard_header: " -+ "skb->dev=%s dev=%s.\n", -+ skb->dev ? 
skb->dev->name : "NULL", -+ dev->name); -+ -+ if(prv == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_hard_header: " -+ "no private space associated with dev=%s\n", -+ dev->name ? dev->name : "NULL"); -+ return -ENODEV; -+ } -+ -+ stats = (struct net_device_stats *) &(prv->mystats); -+ -+ if(prv->dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_hard_header: " -+ "no physical device associated with dev=%s\n", -+ dev->name ? dev->name : "NULL"); -+ stats->tx_dropped++; -+ return -ENODEV; -+ } -+ -+ /* check if we have to send a IPv6 packet. It might be a Router -+ Solicitation, where the building of the packet happens in -+ reverse order: -+ 1. ll hdr, -+ 2. IPv6 hdr, -+ 3. ICMPv6 hdr -+ -> skb->nh.raw is still uninitialized when this function is -+ called!! If this is no IPv6 packet, we can print debugging -+ messages, otherwise we skip all debugging messages and just -+ build the ll header */ -+ if(type != ETH_P_IPV6) { -+ /* execute this only, if we don't have to build the -+ header for a IPv6 packet */ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+ if(!prv->header_ops->create) -+#else -+ if(!prv->hard_header) -+#endif -+ { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_hard_header: " -+ "physical device has been detached, packet dropped 0p%p->0p%p len=%d type=%d dev=%s->NULL ", -+ saddr, -+ daddr, -+ len, -+ type, -+ dev->name); -+#ifdef NET_21 -+ KLIPS_PRINTMORE(debug_tunnel & DB_TN_REVEC, -+ "ip=%08x->%08x\n", -+ (__u32)ntohl(ip_hdr(skb)->saddr), -+ (__u32)ntohl(ip_hdr(skb)->daddr) ); -+#else /* NET_21 */ -+ KLIPS_PRINTMORE(debug_tunnel & DB_TN_REVEC, -+ "ip=%08x->%08x\n", -+ (__u32)ntohl(skb->ip_hdr->saddr), -+ (__u32)ntohl(skb->ip_hdr->daddr) ); -+#endif /* NET_21 */ -+ stats->tx_dropped++; -+ return -ENODEV; -+ } -+ -+#define da ((struct net_device *)(prv->dev))->dev_addr -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_hard_header: 
" -+ "Revectored 0p%p->0p%p len=%d type=%d dev=%s->%s dev_addr=%02x:%02x:%02x:%02x:%02x:%02x ", -+ saddr, -+ daddr, -+ len, -+ type, -+ dev->name, -+ prv->dev->name, -+ da[0], da[1], da[2], da[3], da[4], da[5]); -+#ifdef NET_21 -+ KLIPS_PRINTMORE(debug_tunnel & DB_TN_REVEC, -+ "ip=%08x->%08x\n", -+ (__u32)ntohl(ip_hdr(skb)->saddr), -+ (__u32)ntohl(ip_hdr(skb)->daddr) ); -+#else /* NET_21 */ -+ KLIPS_PRINTMORE(debug_tunnel & DB_TN_REVEC, -+ "ip=%08x->%08x\n", -+ (__u32)ntohl(skb->ip_hdr->saddr), -+ (__u32)ntohl(skb->ip_hdr->daddr) ); -+#endif /* NET_21 */ -+ } else { -+ KLIPS_PRINT(debug_tunnel, -+ "klips_debug:ipsec_tunnel_hard_header: " -+ "is IPv6 packet, skip debugging messages, only revector and build linklocal header.\n"); -+ } -+ tmp = skb->dev; -+ skb->dev = prv->dev; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+ ret = prv->header_ops->create(skb, prv->dev, type, (void *)daddr, (void *)saddr, len); -+#else -+ ret = prv->hard_header(skb, prv->dev, type, (void *)daddr, (void *)saddr, len); -+#endif -+ skb->dev = tmp; -+ return ret; -+} -+ -+DEBUG_NO_STATIC int -+#ifdef NET_21 -+ipsec_tunnel_rebuild_header(struct sk_buff *skb) -+#else /* NET_21 */ -+ipsec_tunnel_rebuild_header(void *buff, struct net_device *dev, -+ unsigned long raddr, struct sk_buff *skb) -+#endif /* NET_21 */ -+{ -+ struct ipsecpriv *prv = skb->dev->priv; -+ struct net_device *tmp; -+ int ret; -+ struct net_device_stats *stats; /* This device's statistics */ -+ -+ if(skb->dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_rebuild_header: " -+ "no device..."); -+ return -ENODEV; -+ } -+ -+ if(prv == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_rebuild_header: " -+ "no private space associated with dev=%s", -+ skb->dev->name ? 
skb->dev->name : "NULL"); -+ return -ENODEV; -+ } -+ -+ stats = (struct net_device_stats *) &(prv->mystats); -+ -+ if(prv->dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_rebuild_header: " -+ "no physical device associated with dev=%s", -+ skb->dev->name ? skb->dev->name : "NULL"); -+ stats->tx_dropped++; -+ return -ENODEV; -+ } -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+ if(!prv->header_ops->rebuild) -+#else -+ if(!prv->rebuild_header) -+#endif -+ { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_rebuild_header: " -+ "physical device has been detached, packet dropped skb->dev=%s->NULL ", -+ skb->dev->name); -+#ifdef NET_21 -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "ip=%08x->%08x\n", -+ (__u32)ntohl(ip_hdr(skb)->saddr), -+ (__u32)ntohl(ip_hdr(skb)->daddr) ); -+#else /* NET_21 */ -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "ip=%08x->%08x\n", -+ (__u32)ntohl(skb->ip_hdr->saddr), -+ (__u32)ntohl(skb->ip_hdr->daddr) ); -+#endif /* NET_21 */ -+ stats->tx_dropped++; -+ return -ENODEV; -+ } -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel: " -+ "Revectored rebuild_header dev=%s->%s ", -+ skb->dev->name, prv->dev->name); -+#ifdef NET_21 -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "ip=%08x->%08x\n", -+ (__u32)ntohl(ip_hdr(skb)->saddr), -+ (__u32)ntohl(ip_hdr(skb)->daddr) ); -+#else /* NET_21 */ -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "ip=%08x->%08x\n", -+ (__u32)ntohl(skb->ip_hdr->saddr), -+ (__u32)ntohl(skb->ip_hdr->daddr) ); -+#endif /* NET_21 */ -+ tmp = skb->dev; -+ skb->dev = prv->dev; -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+ ret = prv->header_ops->rebuild(skb); -+#else -+#ifdef NET_21 -+ ret = prv->rebuild_header(skb); -+#else /* NET_21 */ -+ ret = prv->rebuild_header(buff, prv->dev, raddr, skb); -+#endif /* NET_21 */ -+#endif -+ skb->dev = tmp; -+ return ret; -+} -+ -+DEBUG_NO_STATIC int -+ipsec_tunnel_set_mac_address(struct net_device 
*dev, void *addr) -+{ -+ struct ipsecpriv *prv = dev->priv; -+ -+ struct net_device_stats *stats; /* This device's statistics */ -+ -+ if(dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_set_mac_address: " -+ "no device..."); -+ return -ENODEV; -+ } -+ -+ if(prv == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_set_mac_address: " -+ "no private space associated with dev=%s", -+ dev->name ? dev->name : "NULL"); -+ return -ENODEV; -+ } -+ -+ stats = (struct net_device_stats *) &(prv->mystats); -+ -+ if(prv->dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_set_mac_address: " -+ "no physical device associated with dev=%s", -+ dev->name ? dev->name : "NULL"); -+ stats->tx_dropped++; -+ return -ENODEV; -+ } -+ -+ if(!prv->set_mac_address) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_set_mac_address: " -+ "physical device has been detached, cannot set - skb->dev=%s->NULL\n", -+ dev->name); -+ return -ENODEV; -+ } -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_set_mac_address: " -+ "Revectored dev=%s->%s addr=0p%p\n", -+ dev->name, prv->dev->name, addr); -+ return prv->set_mac_address(prv->dev, addr); -+ -+} -+ -+#ifndef NET_21 -+DEBUG_NO_STATIC void -+ipsec_tunnel_cache_bind(struct hh_cache **hhp, struct net_device *dev, -+ unsigned short htype, __u32 daddr) -+{ -+ struct ipsecpriv *prv = dev->priv; -+ -+ struct net_device_stats *stats; /* This device's statistics */ -+ -+ if(dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_cache_bind: " -+ "no device..."); -+ return; -+ } -+ -+ if(prv == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_cache_bind: " -+ "no private space associated with dev=%s", -+ dev->name ? 
dev->name : "NULL"); -+ return; -+ } -+ -+ stats = (struct net_device_stats *) &(prv->mystats); -+ -+ if(prv->dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_cache_bind: " -+ "no physical device associated with dev=%s", -+ dev->name ? dev->name : "NULL"); -+ stats->tx_dropped++; -+ return; -+ } -+ -+ if(!prv->header_cache_bind) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_cache_bind: " -+ "physical device has been detached, cannot set - skb->dev=%s->NULL\n", -+ dev->name); -+ stats->tx_dropped++; -+ return; -+ } -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_cache_bind: " -+ "Revectored \n"); -+ prv->header_cache_bind(hhp, prv->dev, htype, daddr); -+ return; -+} -+#endif /* !NET_21 */ -+ -+ -+DEBUG_NO_STATIC void -+ipsec_tunnel_cache_update(struct hh_cache *hh, const struct net_device *dev, -+ const unsigned char * haddr) -+{ -+ struct ipsecpriv *prv = dev->priv; -+ -+ struct net_device_stats *stats; /* This device's statistics */ -+ -+ if(dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_cache_update: " -+ "no device..."); -+ return; -+ } -+ -+ if(prv == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_cache_update: " -+ "no private space associated with dev=%s", -+ dev->name ? dev->name : "NULL"); -+ return; -+ } -+ -+ stats = (struct net_device_stats *) &(prv->mystats); -+ -+ if(prv->dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_cache_update: " -+ "no physical device associated with dev=%s", -+ dev->name ? 
dev->name : "NULL"); -+ stats->tx_dropped++; -+ return; -+ } -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+ if(!prv->header_ops->cache_update) -+#else -+ if(!prv->header_cache_update) -+#endif -+ { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_cache_update: " -+ "physical device has been detached, cannot set - skb->dev=%s->NULL\n", -+ dev->name); -+ return; -+ } -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel: " -+ "Revectored cache_update\n"); -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+ prv->header_ops->cache_update(hh, prv->dev, haddr); -+#else -+ prv->header_cache_update(hh, prv->dev, haddr); -+#endif -+ return; -+} -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+const struct header_ops ipsec_tunnel_header_ops = { -+ .create = ipsec_tunnel_hard_header, -+ .rebuild = ipsec_tunnel_rebuild_header, -+ .cache_update = ipsec_tunnel_cache_update, -+}; -+#endif -+ -+#ifdef NET_21 -+DEBUG_NO_STATIC int -+ipsec_tunnel_neigh_setup(struct neighbour *n) -+{ -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_neigh_setup:\n"); -+ -+ if (n->nud_state == NUD_NONE) { -+ n->ops = &arp_broken_ops; -+ n->output = n->ops->output; -+ } -+ return 0; -+} -+ -+DEBUG_NO_STATIC int -+ipsec_tunnel_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p) -+{ -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_neigh_setup_dev: " -+ "setting up %s\n", -+ dev ? dev->name : "NULL"); -+ -+ if (p->tbl->family == AF_INET) { -+ p->neigh_setup = ipsec_tunnel_neigh_setup; -+ p->ucast_probes = 0; -+ p->mcast_probes = 0; -+ } -+ return 0; -+} -+#endif /* NET_21 */ -+ -+/* -+ * We call the attach routine to attach another device. 
-+ */ -+ -+DEBUG_NO_STATIC int -+ipsec_tunnel_attach(struct net_device *dev, struct net_device *physdev) -+{ -+ int i; -+ struct ipsecpriv *prv = dev->priv; -+ -+ if(dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_attach: " -+ "no device..."); -+ return -ENODEV; -+ } -+ -+ if(prv == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_attach: " -+ "no private space associated with dev=%s", -+ dev->name ? dev->name : "NULL"); -+ return -ENODATA; -+ } -+ -+ prv->dev = physdev; -+ prv->hard_start_xmit = physdev->hard_start_xmit; -+ prv->get_stats = physdev->get_stats; -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+ if (physdev->header_ops) { -+ prv->header_ops = physdev->header_ops; -+ dev->header_ops = &ipsec_tunnel_header_ops; -+ } else -+ dev->header_ops = NULL; -+#else -+ if (physdev->hard_header) { -+ prv->hard_header = physdev->hard_header; -+ dev->hard_header = &ipsec_tunnel_hard_header; -+ } else -+ dev->hard_header = NULL; -+ -+ if (physdev->rebuild_header) { -+ prv->rebuild_header = physdev->rebuild_header; -+ dev->rebuild_header = ipsec_tunnel_rebuild_header; -+ } else -+ dev->rebuild_header = NULL; -+ -+#ifndef NET_21 -+ if (physdev->header_cache_bind) { -+ prv->header_cache_bind = physdev->header_cache_bind; -+ dev->header_cache_bind = ipsec_tunnel_cache_bind; -+ } else -+ dev->header_cache_bind = NULL; -+#endif /* !NET_21 */ -+ -+ if (physdev->header_cache_update) { -+ prv->header_cache_update = physdev->header_cache_update; -+ dev->header_cache_update = ipsec_tunnel_cache_update; -+ } else -+ dev->header_cache_update = NULL; -+#endif -+ -+ if (physdev->set_mac_address) { -+ prv->set_mac_address = physdev->set_mac_address; -+ dev->set_mac_address = ipsec_tunnel_set_mac_address; -+ } else -+ dev->set_mac_address = NULL; -+ -+ dev->hard_header_len = physdev->hard_header_len; -+ -+#ifdef NET_21 -+/* prv->neigh_setup = physdev->neigh_setup; */ -+ dev->neigh_setup = 
ipsec_tunnel_neigh_setup_dev; -+#endif /* NET_21 */ -+ dev->mtu = 16260; /* 0xfff0; */ /* dev->mtu; */ -+ prv->mtu = physdev->mtu; -+ -+#ifdef PHYSDEV_TYPE -+ dev->type = physdev->type; /* ARPHRD_TUNNEL; */ -+#endif /* PHYSDEV_TYPE */ -+ -+ dev->addr_len = physdev->addr_len; -+ for (i=0; iaddr_len; i++) { -+ dev->dev_addr[i] = physdev->dev_addr[i]; -+ } -+#ifdef CONFIG_KLIPS_DEBUG -+ if(debug_tunnel & DB_TN_INIT) { -+ printk(KERN_INFO "klips_debug:ipsec_tunnel_attach: " -+ "physical device %s being attached has HW address: %2x", -+ physdev->name, physdev->dev_addr[0]); -+ for (i=1; i < physdev->addr_len; i++) { -+ printk(":%02x", physdev->dev_addr[i]); -+ } -+ printk("\n"); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ return 0; -+} -+ -+/* -+ * We call the detach routine to detach the ipsec tunnel from another device. -+ */ -+ -+DEBUG_NO_STATIC int -+ipsec_tunnel_detach(struct net_device *dev) -+{ -+ int i; -+ struct ipsecpriv *prv = dev->priv; -+ -+ if(dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_detach: " -+ "no device..."); -+ return -ENODEV; -+ } -+ -+ if(prv == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_REVEC, -+ "klips_debug:ipsec_tunnel_detach: " -+ "no private space associated with dev=%s", -+ dev->name ? dev->name : "NULL"); -+ return -ENODATA; -+ } -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_detach: " -+ "physical device %s being detached from virtual device %s\n", -+ prv->dev ? 
prv->dev->name : "NULL", -+ dev->name); -+ -+ ipsec_dev_put(prv->dev); -+ prv->dev = NULL; -+ prv->hard_start_xmit = NULL; -+ prv->get_stats = NULL; -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+ prv->header_ops = NULL; -+#else -+ prv->hard_header = NULL; -+ prv->rebuild_header = NULL; -+ prv->header_cache_update = NULL; -+#ifndef NET_21 -+ prv->header_cache_bind = NULL; -+#else -+/* prv->neigh_setup = NULL; */ -+#endif -+#endif -+ prv->set_mac_address = NULL; -+ dev->hard_header_len = 0; -+ -+#ifdef DETACH_AND_DOWN -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+ dev->header_ops = NULL; -+#else -+ dev->hard_header = NULL; -+ dev->rebuild_header = NULL; -+ dev->header_cache_update = NULL; -+#ifndef NET_21 -+ dev->header_cache_bind = NULL; -+#else -+ dev->neigh_setup = NULL; -+#endif -+#endif -+ dev->set_mac_address = NULL; -+ dev->mtu = 0; -+#endif /* DETACH_AND_DOWN */ -+ -+ prv->mtu = 0; -+ for (i=0; idev_addr[i] = 0; -+ } -+ dev->addr_len = 0; -+#ifdef PHYSDEV_TYPE -+ dev->type = ARPHRD_VOID; /* ARPHRD_TUNNEL; */ -+#endif /* PHYSDEV_TYPE */ -+ -+ return 0; -+} -+ -+/* -+ * We call the clear routine to detach all ipsec tunnels from other devices. 
-+ */ -+DEBUG_NO_STATIC int -+ipsec_tunnel_clear(void) -+{ -+ int i; -+ struct net_device *ipsecdev = NULL, *prvdev; -+ struct ipsecpriv *prv; -+ int ret; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_clear: .\n"); -+ -+ for(i = 0; i < IPSEC_NUM_IF; i++) { -+ ipsecdev = ipsecdevices[i]; -+ if(ipsecdev != NULL) { -+ if((prv = (struct ipsecpriv *)(ipsecdev->priv))) { -+ prvdev = (struct net_device *)(prv->dev); -+ if(prvdev) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_clear: " -+ "physical device for device %s is %s\n", -+ ipsecdev->name, prvdev->name); -+ if((ret = ipsec_tunnel_detach(ipsecdev))) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_clear: " -+ "error %d detatching device %s from device %s.\n", -+ ret, ipsecdev->name, prvdev->name); -+ return ret; -+ } -+ } -+ } -+ } -+ } -+ return 0; -+} -+ -+DEBUG_NO_STATIC int -+ipsec_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -+{ -+ struct ipsectunnelconf *cf = (struct ipsectunnelconf *)&ifr->ifr_data; -+ struct ipsecpriv *prv = dev->priv; -+ struct net_device *them; /* physical device */ -+#ifdef CONFIG_IP_ALIAS -+ char *colon; -+ char realphysname[IFNAMSIZ]; -+#endif /* CONFIG_IP_ALIAS */ -+ -+ if(dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_ioctl: " -+ "device not supplied.\n"); -+ return -ENODEV; -+ } -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_ioctl: " -+ "tncfg service call #%d for dev=%s\n", -+ cmd, -+ dev->name ? dev->name : "NULL"); -+ switch (cmd) { -+ /* attach a virtual ipsec? 
device to a physical device */ -+ case IPSEC_SET_DEV: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_ioctl: " -+ "calling ipsec_tunnel_attatch...\n"); -+#ifdef CONFIG_IP_ALIAS -+ /* If this is an IP alias interface, get its real physical name */ -+ strncpy(realphysname, cf->cf_name, IFNAMSIZ); -+ realphysname[IFNAMSIZ-1] = 0; -+ colon = strchr(realphysname, ':'); -+ if (colon) *colon = 0; -+ them = ipsec_dev_get(realphysname); -+#else /* CONFIG_IP_ALIAS */ -+ them = ipsec_dev_get(cf->cf_name); -+#endif /* CONFIG_IP_ALIAS */ -+ -+ if (them == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_ioctl: " -+ "physical device %s requested is null\n", -+ cf->cf_name); -+ return -ENXIO; -+ } -+ -+#if 0 -+ if (them->flags & IFF_UP) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_ioctl: " -+ "physical device %s requested is not up.\n", -+ cf->cf_name); -+ ipsec_dev_put(them); -+ return -ENXIO; -+ } -+#endif -+ -+ if (prv && prv->dev) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_ioctl: " -+ "virtual device is already connected to %s.\n", -+ prv->dev->name ? prv->dev->name : "NULL"); -+ ipsec_dev_put(them); -+ return -EBUSY; -+ } -+ return ipsec_tunnel_attach(dev, them); -+ -+ case IPSEC_DEL_DEV: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_ioctl: " -+ "calling ipsec_tunnel_detatch.\n"); -+ if (! 
prv->dev) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_ioctl: " -+ "physical device not connected.\n"); -+ return -ENODEV; -+ } -+ return ipsec_tunnel_detach(dev); -+ -+ case IPSEC_CLR_DEV: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_ioctl: " -+ "calling ipsec_tunnel_clear.\n"); -+ return ipsec_tunnel_clear(); -+ -+#ifdef HAVE_UDP_ENCAP_CONVERT -+ case IPSEC_UDP_ENCAP_CONVERT: -+ { -+ unsigned int *socknum =(unsigned int *)&ifr->ifr_data; -+ struct socket *sock; -+ int err, fput_needed; -+ -+ /* that's a static function in socket.c -+ * sock = sockfd_lookup_light(*socknum, &err, &fput_needed); */ -+ sock = sockfd_lookup(*socknum, &err); -+ if (!sock) -+ goto encap_out; -+ -+ /* check that it's a UDP socket */ -+ udp_sk(sk)->encap_type = UDP_ENCAP_ESPINUDP_NON_IKE; -+ udp_sk(sk)->encap_rcv = klips26_udp_encap_rcv; -+ -+ KLIPS_PRINT(debug_tunnel -+ , "UDP socket: %u set to NON-IKE encap mode\n" -+ , socknum); -+ -+ err = 0; -+ -+ encap_output: -+ fput_light(sock->file, fput_needed); -+ encap_out: -+ return err; -+#endif -+ -+ default: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_ioctl: " -+ "unknown command %d.\n", -+ cmd); -+ return -EOPNOTSUPP; -+ } -+} -+ -+struct net_device *ipsec_get_device(int inst) -+{ -+ struct net_device *ipsec_dev; -+ -+ ipsec_dev = NULL; -+ -+ if(inst < IPSEC_NUM_IF) { -+ ipsec_dev = ipsecdevices[inst]; -+ } -+ -+ return ipsec_dev; -+} -+ -+int -+ipsec_device_event(struct notifier_block *unused, unsigned long event, void *ptr) -+{ -+ struct net_device *dev = ptr; -+ struct net_device *ipsec_dev; -+ struct ipsecpriv *priv; -+ int i; -+ -+ if (dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "dev=NULL for event type %ld.\n", -+ event); -+ return(NOTIFY_DONE); -+ } -+ -+ /* check for loopback devices */ -+ if (dev && (dev->flags & IFF_LOOPBACK)) { -+ return(NOTIFY_DONE); -+ } -+ -+ switch (event) { -+ case 
NETDEV_DOWN: -+ /* look very carefully at the scope of these compiler -+ directives before changing anything... -- RGB */ -+#ifdef NET_21 -+ case NETDEV_UNREGISTER: -+ switch (event) { -+ case NETDEV_DOWN: -+#endif /* NET_21 */ -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "NETDEV_DOWN dev=%s flags=%x\n", -+ dev->name, -+ dev->flags); -+ if(strncmp(dev->name, "ipsec", strlen("ipsec")) == 0) { -+ printk(KERN_CRIT "IPSEC EVENT: KLIPS device %s shut down.\n", -+ dev->name); -+ } -+#ifdef NET_21 -+ break; -+ case NETDEV_UNREGISTER: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "NETDEV_UNREGISTER dev=%s flags=%x\n", -+ dev->name, -+ dev->flags); -+ break; -+ } -+#endif /* NET_21 */ -+ -+ /* find the attached physical device and detach it. */ -+ for(i = 0; i < IPSEC_NUM_IF; i++) { -+ ipsec_dev = ipsecdevices[i]; -+ -+ if(ipsec_dev) { -+ priv = (struct ipsecpriv *)(ipsec_dev->priv); -+ if(priv) { -+ ; -+ if(((struct net_device *)(priv->dev)) == dev) { -+ /* dev_close(ipsec_dev); */ -+ /* return */ ipsec_tunnel_detach(ipsec_dev); -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "device '%s' has been detached.\n", -+ ipsec_dev->name); -+ break; -+ } -+ } else { -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "device '%s' has no private data space!\n", -+ ipsec_dev->name); -+ } -+ } -+ } -+ break; -+ case NETDEV_UP: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "NETDEV_UP dev=%s\n", -+ dev->name); -+ break; -+#ifdef NET_21 -+ case NETDEV_REBOOT: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "NETDEV_REBOOT dev=%s\n", -+ dev->name); -+ break; -+ case NETDEV_CHANGE: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "NETDEV_CHANGE dev=%s flags=%x\n", -+ dev->name, -+ dev->flags); -+ break; -+ case NETDEV_REGISTER: -+ 
KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "NETDEV_REGISTER dev=%s\n", -+ dev->name); -+ break; -+ case NETDEV_CHANGEMTU: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "NETDEV_CHANGEMTU dev=%s to mtu=%d\n", -+ dev->name, -+ dev->mtu); -+ break; -+ case NETDEV_CHANGEADDR: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "NETDEV_CHANGEADDR dev=%s\n", -+ dev->name); -+ break; -+ case NETDEV_GOING_DOWN: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "NETDEV_GOING_DOWN dev=%s\n", -+ dev->name); -+ break; -+ case NETDEV_CHANGENAME: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "NETDEV_CHANGENAME dev=%s\n", -+ dev->name); -+ break; -+#endif /* NET_21 */ -+ default: -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_device_event: " -+ "event type %ld unrecognised for dev=%s\n", -+ event, -+ dev->name); -+ break; -+ } -+ return NOTIFY_DONE; -+} -+ -+/* -+ * Called when an ipsec tunnel device is initialized. -+ * The ipsec tunnel device structure is passed to us. -+ */ -+ -+int -+ipsec_tunnel_init(struct net_device *dev) -+{ -+ int i; -+ -+ KLIPS_PRINT(debug_tunnel, -+ "klips_debug:ipsec_tunnel_init: " -+ "allocating %lu bytes initialising device: %s\n", -+ (unsigned long) sizeof(struct ipsecpriv), -+ dev->name ? 
dev->name : "NULL"); -+ -+ /* Add our tunnel functions to the device */ -+ dev->open = ipsec_tunnel_open; -+ dev->stop = ipsec_tunnel_close; -+ dev->hard_start_xmit = ipsec_tunnel_start_xmit; -+ dev->get_stats = ipsec_tunnel_get_stats; -+ -+ dev->priv = kmalloc(sizeof(struct ipsecpriv), GFP_KERNEL); -+ if (dev->priv == NULL) -+ return -ENOMEM; -+ memset((caddr_t)(dev->priv), 0, sizeof(struct ipsecpriv)); -+ -+ for(i = 0; i < sizeof(zeroes); i++) { -+ ((__u8*)(zeroes))[i] = 0; -+ } -+ -+#ifndef NET_21 -+ /* Initialize the tunnel device structure */ -+ for (i = 0; i < DEV_NUMBUFFS; i++) -+ skb_queue_head_init(&dev->buffs[i]); -+#endif /* !NET_21 */ -+ -+ dev->set_multicast_list = NULL; -+ dev->do_ioctl = ipsec_tunnel_ioctl; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+ dev->header_ops = NULL; -+#else -+ dev->hard_header = NULL; -+ dev->rebuild_header = NULL; -+ dev->set_mac_address = NULL; -+#ifndef NET_21 -+ dev->header_cache_bind = NULL; -+#endif /* !NET_21 */ -+ dev->header_cache_update= NULL; -+#endif -+ -+#ifdef NET_21 -+/* prv->neigh_setup = NULL; */ -+ dev->neigh_setup = ipsec_tunnel_neigh_setup_dev; -+#endif /* NET_21 */ -+ dev->hard_header_len = 0; -+ dev->mtu = 0; -+ dev->addr_len = 0; -+ dev->type = ARPHRD_VOID; /* ARPHRD_TUNNEL; */ /* ARPHRD_ETHER; */ -+ dev->tx_queue_len = 10; /* Small queue */ -+ memset((caddr_t)(dev->broadcast),0xFF, ETH_ALEN); /* what if this is not attached to ethernet? */ -+ -+ /* New-style flags. */ -+ dev->flags = IFF_NOARP /* 0 */ /* Petr Novak */; -+ -+ /* We're done. Have I forgotten anything? 
*/ -+ return 0; -+} -+ -+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -+/* Module specific interface (but it links with the rest of IPSEC) */ -+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -+ -+int -+ipsec_tunnel_probe(struct net_device *dev) -+{ -+ ipsec_tunnel_init(dev); -+ return 0; -+} -+ -+#ifdef alloc_netdev -+static void ipsec_tunnel_netdev_setup(struct net_device *dev) -+{ -+} -+#endif -+ -+struct net_device *ipsecdevices[IPSEC_NUM_IFMAX]; -+int ipsecdevices_max=-1; -+ -+int -+ipsec_tunnel_createnum(int ifnum) -+{ -+ char name[IFNAMSIZ]; -+ struct net_device *dev_ipsec; -+ int vifentry; -+ -+ if(ifnum > IPSEC_NUM_IFMAX) { -+ return -ENOENT; -+ } -+ -+ if(ipsecdevices[ifnum]!=NULL) { -+ return -EEXIST; -+ } -+ -+ /* no identical device */ -+ if(ifnum > ipsecdevices_max) { -+ ipsecdevices_max=ifnum; -+ } -+ vifentry = ifnum; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_init_devices: " -+ "creating and registering IPSEC_NUM_IF=%u device\n", -+ ifnum); -+ -+ sprintf(name, IPSEC_DEV_FORMAT, ifnum); -+#ifdef alloc_netdev -+ dev_ipsec = alloc_netdev(0, name, ipsec_tunnel_netdev_setup); -+#else -+ dev_ipsec = (struct net_device*)kmalloc(sizeof(struct net_device), GFP_KERNEL); -+#endif -+ if (dev_ipsec == NULL) { -+ printk(KERN_ERR "klips_debug:ipsec_tunnel_init_devices: " -+ "failed to allocate memory for device %s, quitting device init.\n", -+ name); -+ return -ENOMEM; -+ } -+#ifndef alloc_netdev -+ memset((caddr_t)dev_ipsec, 0, sizeof(struct net_device)); -+#ifdef NETDEV_23 -+ strncpy(dev_ipsec->name, name, sizeof(dev_ipsec->name)); -+#else /* NETDEV_23 */ -+ dev_ipsec->name = (char*)kmalloc(IFNAMSIZ, GFP_KERNEL); -+ if (dev_ipsec->name == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_init_devices: " -+ "failed to allocate memory for device %s name, quitting device init.\n", -+ name); -+ return -ENOMEM; -+ } -+ memset((caddr_t)dev_ipsec->name, 0, 
IFNAMSIZ); -+ strncpy(dev_ipsec->name, name, IFNAMSIZ); -+#endif /* NETDEV_23 */ -+#ifdef PAUL_FIXME -+ dev_ipsec->next = NULL; -+#endif -+#endif /* alloc_netdev */ -+ dev_ipsec->init = &ipsec_tunnel_probe; -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_init_devices: " -+ "registering device %s\n", -+ dev_ipsec->name); -+ -+ /* reference and hold the device reference */ -+ dev_hold(dev_ipsec); -+ ipsecdevices[vifentry]=dev_ipsec; -+ -+ if (register_netdev(dev_ipsec) != 0) { -+ KLIPS_PRINT(1 || debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_init_devices: " -+ "registering device %s failed, quitting device init.\n", -+ dev_ipsec->name); -+ return -EIO; -+ } else { -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_init_devices: " -+ "registering device %s succeeded, continuing...\n", -+ dev_ipsec->name); -+ } -+ return 0; -+} -+ -+ -+int -+ipsec_tunnel_init_devices(void) -+{ -+ int i; -+ int error; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_INIT, -+ "klips_debug:ipsec_tunnel_init_devices: " -+ "creating and registering IPSEC_NUM_IF=%u devices, allocating %lu per device, IFNAMSIZ=%u.\n", -+ IPSEC_NUM_IF, -+ (unsigned long) (sizeof(struct net_device) + IFNAMSIZ), -+ IFNAMSIZ); -+ -+ for(i = 0; i < IPSEC_NUM_IF; i++) { -+ error = ipsec_tunnel_createnum(i); -+ -+ if(error) break; -+ } -+ return 0; -+} -+ -+int -+ipsec_tunnel_deletenum(int vifnum) -+{ -+ struct net_device *dev_ipsec; -+ -+ if(vifnum > IPSEC_NUM_IFMAX) { -+ return -ENOENT; -+ } -+ -+ dev_ipsec = ipsecdevices[vifnum]; -+ if(dev_ipsec == NULL) { -+ return -ENOENT; -+ } -+ -+ /* release reference */ -+ ipsecdevices[vifnum]=NULL; -+ ipsec_dev_put(dev_ipsec); -+ -+ KLIPS_PRINT(debug_tunnel, "Unregistering %s (refcnt=%d)\n", -+ dev_ipsec->name, -+ atomic_read(&dev_ipsec->refcnt)); -+ unregister_netdev(dev_ipsec); -+ KLIPS_PRINT(debug_tunnel, "Unregisted %s\n", dev_ipsec->name); -+#ifdef alloc_netdev -+ free_netdev(dev_ipsec); -+#else -+#ifndef NETDEV_23 -+ 
kfree(dev_ipsec->name); -+ dev_ipsec->name=NULL; -+#endif /* !NETDEV_23 */ -+ kfree(dev_ipsec->priv); -+#endif /* alloc_netdev */ -+ dev_ipsec->priv=NULL; -+ -+ return 0; -+} -+ -+ -+struct net_device * -+ipsec_tunnel_get_device(int vifnum) -+{ -+ struct net_device *nd; -+ -+ if(vifnum < ipsecdevices_max) { -+ nd = ipsecdevices[vifnum]; -+ -+ if(nd) dev_hold(nd); -+ return nd; -+ } else { -+ return NULL; -+ } -+} -+ -+/* void */ -+int -+ipsec_tunnel_cleanup_devices(void) -+{ -+ int error = 0; -+ int i; -+ struct net_device *dev_ipsec; -+ -+ for(i = 0; i < IPSEC_NUM_IF; i++) { -+ dev_ipsec = ipsecdevices[i]; -+ if(dev_ipsec == NULL) { -+ continue; -+ } -+ -+ /* release reference */ -+ ipsecdevices[i]=NULL; -+ ipsec_dev_put(dev_ipsec); -+ -+ KLIPS_PRINT(debug_tunnel, "Unregistering %s (refcnt=%d)\n", -+ dev_ipsec->name, -+ atomic_read(&dev_ipsec->refcnt)); -+ unregister_netdev(dev_ipsec); -+ KLIPS_PRINT(debug_tunnel, "Unregisted %s\n", dev_ipsec->name); -+#ifdef alloc_netdev -+ free_netdev(dev_ipsec); -+#else -+#ifndef NETDEV_23 -+ kfree(dev_ipsec->name); -+ dev_ipsec->name=NULL; -+#endif /* !NETDEV_23 */ -+ kfree(dev_ipsec->priv); -+#endif /* alloc_netdev */ -+ dev_ipsec->priv=NULL; -+ } -+ return error; -+} -+ -+// ------------------------------------------------------------------------ -+// this handles creating and managing state for xmit path -+ -+static spinlock_t ixs_cache_lock = SPIN_LOCK_UNLOCKED; -+#ifdef HAVE_KMEM_CACHE_MACRO -+static struct kmem_cache *ixs_cache_allocator = NULL; -+#else -+static kmem_cache_t *ixs_cache_allocator = NULL; -+#endif -+static unsigned ixs_cache_allocated_count = 0; -+ -+#if !defined(MODULE_PARM) && defined(module_param) -+/* -+ * As of 2.6.17 MODULE_PARM no longer exists, use module_param instead. 
-+ */ -+#define MODULE_PARM(a,b) module_param(a,int,0644) -+#endif -+ -+int ipsec_ixs_cache_allocated_count_max = 1000; -+MODULE_PARM(ipsec_ixs_cache_allocated_count_max, "i"); -+MODULE_PARM_DESC(ipsec_ixs_cache_allocated_count_max, -+ "Maximum outstanding transmit packets"); -+ -+int -+ipsec_xmit_state_cache_init (void) -+{ -+ if (ixs_cache_allocator) -+ return -EBUSY; -+ -+ spin_lock_init(&ixs_cache_lock); -+#ifdef HAVE_KMEM_CACHE_MACRO -+ /* ixs_cache_allocator = KMEM_CACHE(ipsec_ixs,0); */ -+ ixs_cache_allocator = kmem_cache_create ("ipsec_ixs", -+ sizeof (struct ipsec_xmit_state), 0, -+ 0, NULL); -+#else -+ ixs_cache_allocator = kmem_cache_create ("ipsec_ixs", -+ sizeof (struct ipsec_xmit_state), 0, -+ 0, NULL, NULL); -+#endif -+ if (! ixs_cache_allocator) -+ return -ENOMEM; -+ -+ return 0; -+} -+ -+void -+ipsec_xmit_state_cache_cleanup (void) -+{ -+ if (unlikely (ixs_cache_allocated_count)) -+ printk ("ipsec: deleting ipsec_ixs kmem_cache while in use\n"); -+ -+ if (ixs_cache_allocator) { -+ kmem_cache_destroy (ixs_cache_allocator); -+ ixs_cache_allocator = NULL; -+ } -+ ixs_cache_allocated_count = 0; -+} -+ -+struct ipsec_xmit_state * -+ipsec_xmit_state_new (void) -+{ -+ struct ipsec_xmit_state *ixs; -+ -+ spin_lock_bh (&ixs_cache_lock); -+ -+ if (ixs_cache_allocated_count >= ipsec_ixs_cache_allocated_count_max) { -+ spin_unlock_bh (&ixs_cache_lock); -+ KLIPS_PRINT(debug_tunnel, -+ "klips_debug:ipsec_xmit_state_new: " -+ "exceeded maximum outstanding TX packet cnt %d\n", -+ ixs_cache_allocated_count); -+ return NULL; -+ } -+ -+ ixs = kmem_cache_alloc (ixs_cache_allocator, GFP_ATOMIC); -+ -+ if (likely (ixs != NULL)) -+ ixs_cache_allocated_count++; -+ -+ spin_unlock_bh (&ixs_cache_lock); -+ -+ if (unlikely (NULL == ixs)) -+ goto bail; -+ -+ // initialize the object -+#if 1 /* optimised to only clear the required bits */ -+ memset((caddr_t)ixs, 0, sizeof(*ixs)); -+#else -+ ixs->pass = 0; -+ ixs->state = 0; -+ ixs->next_state = 0; -+ ixs->ipsp = NULL; -+ 
ixs->sa_len = 0; -+ ixs->stats = NULL; -+ ixs->ips.ips_ident_s.data = NULL; -+ ixs->ips.ips_ident_d.data = NULL; -+ ixs->outgoing_said.proto = 0; -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ ixs->natt_type = 0, ixs->natt_head = 0; -+ ixs->natt_sport = 0, ixs->natt_dport = 0; -+#endif -+ ixs->tot_headroom = 0; -+ ixs->tot_tailroom = 0; -+ ixs->eroute = NULL; -+ ixs->hard_header_stripped = 0; -+ ixs->hard_header_len = 0; -+ ixs->cur_mtu = 0; /* FIXME: can we do something better ? */ -+ -+ ixs->oskb = NULL; -+ ixs->saved_header = NULL; /* saved copy of the hard header */ -+ ixs->route = NULL; -+#endif /* memset */ -+ -+bail: -+ return ixs; -+} -+ -+void -+ipsec_xmit_state_delete (struct ipsec_xmit_state *ixs) -+{ -+ if (unlikely (! ixs)) -+ return; -+ -+ spin_lock_bh (&ixs_cache_lock); -+ -+ ixs_cache_allocated_count--; -+ kmem_cache_free (ixs_cache_allocator, ixs); -+ -+ spin_unlock_bh (&ixs_cache_lock); -+} -+ -+/* -+ * Local Variables: -+ * c-style: linux -+ * End: -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_xform.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,69 @@ -+/* -+ * Common routines for IPSEC transformations. -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ * RCSID $Id: ipsec_xform.c,v 1.65 2005/04/29 05:10:22 mcr Exp $ -+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+#include /* printk() */ -+ -+#include "freeswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+#include /* get_random_bytes() */ -+#include -+#ifdef SPINLOCK -+# ifdef SPINLOCK_23 -+# include /* *lock* */ -+# else /* SPINLOCK_23 */ -+# include /* *lock* */ -+# endif /* SPINLOCK_23 */ -+#endif /* SPINLOCK */ -+ -+#include -+ -+#include "freeswan/radij.h" -+#include "freeswan/ipsec_encap.h" -+#include "freeswan/ipsec_radij.h" -+#include "freeswan/ipsec_xform.h" -+#include "freeswan/ipsec_ipe4.h" -+#include "freeswan/ipsec_ah.h" -+#include "freeswan/ipsec_esp.h" -+ -+#include -+#include -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ipsec_xmit.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,2273 @@ -+/* -+ * IPSEC Transmit code. -+ * Copyright (C) 1996, 1997 John Ioannidis. -+ * Copyright (C) 1998-2003 Richard Guy Briggs. -+ * Copyright (C) 2004-2005 Michael Richardson -+ * -+ * OCF/receive state machine written by -+ * David McCullough -+ * Copyright (C) 2004-2005 Intel Corporation. All Rights Reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . 
-+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ */ -+ -+#define __NO_VERSION__ -+#include -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif /* for CONFIG_IP_FORWARD */ -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, struct net_device_stats, dev_queue_xmit() and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+ -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#ifdef NET_21 -+# define MSS_HACK_ /* experimental */ -+# include -+# include -+# define proto_priv cb -+#endif /* NET_21 */ -+ -+#include /* icmp_send() */ -+#include -+#ifdef NETDEV_23 -+# include -+#endif /* NETDEV_23 */ -+ -+#include -+#ifdef MSS_HACK -+# include /* TCP options */ -+#endif /* MSS_HACK */ -+ -+#include "openswan/ipsec_kern24.h" -+#include "openswan/radij.h" -+#include "openswan/ipsec_life.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_eroute.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_xmit.h" -+#include "openswan/ipsec_sa.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_ipe4.h" -+#include "openswan/ipsec_ah.h" -+#include "openswan/ipsec_esp.h" -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+#include "openswan/ipcomp.h" -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ -+#include -+#include -+ -+#include "openswan/ipsec_proto.h" -+#include "openswan/ipsec_alg.h" -+#include "ipsec_ocf.h" -+ -+ -+/* -+ * Stupid kernel API differences in APIs. 
Not only do some -+ * kernels not have ip_select_ident, but some have differing APIs, -+ * and SuSE has one with one parameter, but no way of checking to -+ * see what is really what. -+ */ -+ -+#ifdef SUSE_LINUX_2_4_19_IS_STUPID -+#define KLIPS_IP_SELECT_IDENT(iph, skb) ip_select_ident(iph) -+#else -+ -+/* simplest case, nothing */ -+#if !defined(IP_SELECT_IDENT) -+#define KLIPS_IP_SELECT_IDENT(iph, skb) do { iph->id = htons(ip_id_count++); } while(0) -+#endif -+ -+/* kernels > 2.3.37-ish */ -+#if defined(IP_SELECT_IDENT) && !defined(IP_SELECT_IDENT_NEW) -+#define KLIPS_IP_SELECT_IDENT(iph, skb) ip_select_ident(iph, skb->dst) -+#endif -+ -+/* kernels > 2.4.2 */ -+#if defined(IP_SELECT_IDENT) && defined(IP_SELECT_IDENT_NEW) -+#define KLIPS_IP_SELECT_IDENT(iph, skb) ip_select_ident(iph, skb->dst, NULL) -+#endif -+ -+#endif /* SUSE_LINUX_2_4_19_IS_STUPID */ -+ -+ -+ -+#if defined(CONFIG_KLIPS_AH) -+#if defined(CONFIG_KLIPS_AUTH_HMAC_MD5) || defined(CONFIG_KLIPS_AUTH_HMAC_SHA1) -+static __u32 zeroes[64]; -+#endif -+#endif -+ -+int ipsec_xmit_trap_count = 0; -+int ipsec_xmit_trap_sendcount = 0; -+ -+#ifdef CONFIG_KLIPS_DEBUG -+#define dmp(_x,_y,_z) if(debug_xmit && sysctl_ipsec_debug_verbose) ipsec_dmp_block(_x,_y,_z) -+#else /* CONFIG_KLIPS_DEBUG */ -+#define dmp(_x, _y, _z) -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ -+#if !defined(SKB_COPY_EXPAND) || defined(KLIPS_UNIT_TESTS) -+/* -+ * This is mostly skbuff.c:skb_copy(). 
-+ */ -+struct sk_buff * -+skb_copy_expand(const struct sk_buff *skb, int headroom, -+ int tailroom, int priority) -+{ -+ struct sk_buff *n; -+ unsigned long offset; -+ -+ /* -+ * Do sanity checking -+ */ -+ if((headroom < 0) || (tailroom < 0) || ((headroom+tailroom) < 0)) { -+ printk(KERN_WARNING -+ "klips_error:skb_copy_expand: " -+ "Illegal negative head,tailroom %d,%d\n", -+ headroom, -+ tailroom); -+ return NULL; -+ } -+ /* -+ * Allocate the copy buffer -+ */ -+ -+#ifndef NET_21 -+ IS_SKB(skb); -+#endif /* !NET_21 */ -+ -+ -+ n=alloc_skb(skb->end - skb->head + headroom + tailroom, priority); -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:skb_copy_expand: " -+ "allocating %d bytes, head=0p%p data=0p%p tail=0p%p end=0p%p end-head=%d tail-data=%d\n", -+ skb->end - skb->head + headroom + tailroom, -+ skb->head, -+ skb->data, -+ skb->tail, -+ skb->end, -+ skb->end - skb->head, -+ skb->tail - skb->data); -+ -+ if(n==NULL) -+ return NULL; -+ -+ /* -+ * Shift between the two data areas in bytes -+ */ -+ -+ /* Set the data pointer */ -+ skb_reserve(n,skb->data-skb->head+headroom); -+ /* Set the tail pointer and length */ -+ if(skb_tailroom(n) < skb->len) { -+ printk(KERN_WARNING "klips_error:skb_copy_expand: " -+ "tried to skb_put %ld, %d available. 
This should never happen, please report.\n", -+ (unsigned long int)skb->len, -+ skb_tailroom(n)); -+ ipsec_kfree_skb(n); -+ return NULL; -+ } -+ skb_put(n,skb->len); -+ -+ offset=n->head + headroom - skb->head; -+ -+ /* Copy the bytes */ -+ memcpy(n->head + headroom, skb->head,skb->end-skb->head); -+#ifdef NET_21 -+ n->csum=skb->csum; -+ n->priority=skb->priority; -+ n->dst=dst_clone(skb->dst); -+ if(skb->nh.raw) -+ n->nh.raw=skb->nh.raw+offset; -+#ifndef NETDEV_23 -+ n->is_clone=0; -+#endif /* NETDEV_23 */ -+ atomic_set(&n->users, 1); -+ n->destructor = NULL; -+#ifdef HAVE_SOCK_SECURITY -+ n->security=skb->security; -+#endif -+#else /* NET_21 */ -+ n->link3=NULL; -+ n->when=skb->when; -+ if(skb->ip_hdr) -+ n->ip_hdr=(struct iphdr *)(((char *)skb->ip_hdr)+offset); -+ n->saddr=skb->saddr; -+ n->daddr=skb->daddr; -+ n->raddr=skb->raddr; -+ n->seq=skb->seq; -+ n->end_seq=skb->end_seq; -+ n->ack_seq=skb->ack_seq; -+ n->acked=skb->acked; -+ n->free=1; -+ n->arp=skb->arp; -+ n->tries=0; -+ n->lock=0; -+ n->users=0; -+#endif /* NET_21 */ -+ n->protocol=skb->protocol; -+ n->list=NULL; -+ n->sk=NULL; -+ n->dev=skb->dev; -+ if(skb->h.raw) -+ n->h.raw=skb->h.raw+offset; -+ if(skb->mac.raw) -+ n->mac.raw=skb->mac.raw+offset; -+ memcpy(n->proto_priv, skb->proto_priv, sizeof(skb->proto_priv)); -+#ifndef NETDEV_23 -+ n->used=skb->used; -+#endif /* !NETDEV_23 */ -+ n->pkt_type=skb->pkt_type; -+ n->stamp=skb->stamp; -+ -+#ifndef NET_21 -+ IS_SKB(n); -+#endif /* !NET_21 */ -+ return n; -+} -+#endif /* !SKB_COPY_EXPAND */ -+ -+#ifdef CONFIG_KLIPS_DEBUG -+void -+ipsec_print_ip(struct iphdr *ip) -+{ -+ char buf[ADDRTOA_BUF]; -+ -+ printk(KERN_INFO "klips_debug: IP:"); -+ printk(" ihl:%d", ip->ihl << 2); -+ printk(" ver:%d", ip->version); -+ printk(" tos:%d", ip->tos); -+ printk(" tlen:%d", ntohs(ip->tot_len)); -+ printk(" id:%d", ntohs(ip->id)); -+ printk(" %s%s%sfrag_off:%d", -+ ip->frag_off & __constant_htons(IP_CE) ? "CE " : "", -+ ip->frag_off & __constant_htons(IP_DF) ? 
"DF " : "", -+ ip->frag_off & __constant_htons(IP_MF) ? "MF " : "", -+ (ntohs(ip->frag_off) & IP_OFFSET) << 3); -+ printk(" ttl:%d", ip->ttl); -+ printk(" proto:%d", ip->protocol); -+ if(ip->protocol == IPPROTO_UDP) -+ printk(" (UDP)"); -+ if(ip->protocol == IPPROTO_TCP) -+ printk(" (TCP)"); -+ if(ip->protocol == IPPROTO_ICMP) -+ printk(" (ICMP)"); -+ if(ip->protocol == IPPROTO_ESP) -+ printk(" (ESP)"); -+ if(ip->protocol == IPPROTO_AH) -+ printk(" (AH)"); -+ if(ip->protocol == IPPROTO_COMP) -+ printk(" (COMP)"); -+ printk(" chk:%d", ntohs(ip->check)); -+ addrtoa(*((struct in_addr*)(&ip->saddr)), 0, buf, sizeof(buf)); -+ printk(" saddr:%s", buf); -+ if(ip->protocol == IPPROTO_UDP) -+ printk(":%d", -+ ntohs(((struct udphdr*)((caddr_t)ip + (ip->ihl << 2)))->source)); -+ if(ip->protocol == IPPROTO_TCP) -+ printk(":%d", -+ ntohs(((struct tcphdr*)((caddr_t)ip + (ip->ihl << 2)))->source)); -+ addrtoa(*((struct in_addr*)(&ip->daddr)), 0, buf, sizeof(buf)); -+ printk(" daddr:%s", buf); -+ if(ip->protocol == IPPROTO_UDP) -+ printk(":%d", -+ ntohs(((struct udphdr*)((caddr_t)ip + (ip->ihl << 2)))->dest)); -+ if(ip->protocol == IPPROTO_TCP) -+ printk(":%d", -+ ntohs(((struct tcphdr*)((caddr_t)ip + (ip->ihl << 2)))->dest)); -+ if(ip->protocol == IPPROTO_ICMP) -+ printk(" type:code=%d:%d", -+ ((struct icmphdr*)((caddr_t)ip + (ip->ihl << 2)))->type, -+ ((struct icmphdr*)((caddr_t)ip + (ip->ihl << 2)))->code); -+ printk("\n"); -+ -+ if(sysctl_ipsec_debug_verbose) { -+ __u8 *c; -+ int len = ntohs(ip->tot_len) - ip->ihl*4; -+ -+ c = ((__u8*)ip) + ip->ihl*4; -+ ipsec_dmp_block("ip_print", c, len); -+ } -+} -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+#ifdef MSS_HACK -+/* -+ * Issues: -+ * 1) Fragments arriving in the tunnel should probably be rejected. -+ * 2) How does this affect syncookies, mss_cache, dst cache ? -+ * 3) Path MTU discovery handling needs to be reviewed. 
For example, -+ * if we receive an ICMP 'packet too big' message from an intermediate -+ * router specifying it's next hop MTU, our stack may process this and -+ * adjust the MSS without taking our AH/ESP overheads into account. -+ */ -+ -+ -+/* -+ * Recaclulate checksum using differences between changed datum, -+ * borrowed from netfilter. -+ */ -+DEBUG_NO_STATIC u_int16_t -+ipsec_fast_csum(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck) -+{ -+ u_int32_t diffs[] = { oldvalinv, newval }; -+ return csum_fold(csum_partial((char *)diffs, sizeof(diffs), -+ oldcheck^0xFFFF)); -+} -+ -+/* -+ * Determine effective MSS. -+ * -+ * Note that we assume that there is always an MSS option for our own -+ * SYN segments, which is mentioned in tcp_syn_build_options(), kernel 2.2.x. -+ * This could change, and we should probably parse TCP options instead. -+ * -+ */ -+DEBUG_NO_STATIC u_int8_t -+ipsec_adjust_mss(struct sk_buff *skb, struct tcphdr *tcph, u_int16_t mtu) -+{ -+ u_int16_t oldmss, newmss; -+ u_int32_t *mssp; -+ struct sock *sk = skb->sk; -+ -+ newmss = tcp_sync_mss(sk, mtu); -+ printk(KERN_INFO "klips: setting mss to %u\n", newmss); -+ mssp = (u_int32_t *)tcph + sizeof(struct tcphdr) / sizeof(u_int32_t); -+ oldmss = ntohl(*mssp) & 0x0000FFFF; -+ *mssp = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | newmss); -+ tcph->check = ipsec_fast_csum(htons(~oldmss), -+ htons(newmss), tcph->check); -+ return 1; -+} -+#endif /* MSS_HACK */ -+ -+#ifdef CONFIG_KLIPS_DEBUG -+DEBUG_NO_STATIC char * -+ipsec_xmit_err(int err) -+{ -+ static char tmp[32]; -+ switch ((int) err) { -+ case IPSEC_XMIT_STOLEN: return("IPSEC_XMIT_STOLEN"); -+ case IPSEC_XMIT_PASS: return("IPSEC_XMIT_PASS"); -+ case IPSEC_XMIT_OK: return("IPSEC_XMIT_OK"); -+ case IPSEC_XMIT_ERRMEMALLOC: return("IPSEC_XMIT_ERRMEMALLOC"); -+ case IPSEC_XMIT_ESP_BADALG: return("IPSEC_XMIT_ESP_BADALG"); -+ case IPSEC_XMIT_BADPROTO: return("IPSEC_XMIT_BADPROTO"); -+ case 
IPSEC_XMIT_ESP_PUSHPULLERR:return("IPSEC_XMIT_ESP_PUSHPULLERR"); -+ case IPSEC_XMIT_BADLEN: return("IPSEC_XMIT_BADLEN"); -+ case IPSEC_XMIT_AH_BADALG: return("IPSEC_XMIT_AH_BADALG"); -+ case IPSEC_XMIT_SAIDNOTFOUND: return("IPSEC_XMIT_SAIDNOTFOUND"); -+ case IPSEC_XMIT_SAIDNOTLIVE: return("IPSEC_XMIT_SAIDNOTLIVE"); -+ case IPSEC_XMIT_REPLAYROLLED: return("IPSEC_XMIT_REPLAYROLLED"); -+ case IPSEC_XMIT_LIFETIMEFAILED: return("IPSEC_XMIT_LIFETIMEFAILED"); -+ case IPSEC_XMIT_CANNOTFRAG: return("IPSEC_XMIT_CANNOTFRAG"); -+ case IPSEC_XMIT_MSSERR: return("IPSEC_XMIT_MSSERR"); -+ case IPSEC_XMIT_ERRSKBALLOC: return("IPSEC_XMIT_ERRSKBALLOC"); -+ case IPSEC_XMIT_ENCAPFAIL: return("IPSEC_XMIT_ENCAPFAIL"); -+ case IPSEC_XMIT_NODEV: return("IPSEC_XMIT_NODEV"); -+ case IPSEC_XMIT_NOPRIVDEV: return("IPSEC_XMIT_NOPRIVDEV"); -+ case IPSEC_XMIT_NOPHYSDEV: return("IPSEC_XMIT_NOPHYSDEV"); -+ case IPSEC_XMIT_NOSKB: return("IPSEC_XMIT_NOSKB"); -+ case IPSEC_XMIT_NOIPV6: return("IPSEC_XMIT_NOIPV6"); -+ case IPSEC_XMIT_NOIPOPTIONS: return("IPSEC_XMIT_NOIPOPTIONS"); -+ case IPSEC_XMIT_TTLEXPIRED: return("IPSEC_XMIT_TTLEXPIRED"); -+ case IPSEC_XMIT_BADHHLEN: return("IPSEC_XMIT_BADHHLEN"); -+ case IPSEC_XMIT_PUSHPULLERR: return("IPSEC_XMIT_PUSHPULLERR"); -+ case IPSEC_XMIT_ROUTEERR: return("IPSEC_XMIT_ROUTEERR"); -+ case IPSEC_XMIT_RECURSDETECT: return("IPSEC_XMIT_RECURSDETECT"); -+ case IPSEC_XMIT_IPSENDFAILURE: return("IPSEC_XMIT_IPSENDFAILURE"); -+ case IPSEC_XMIT_ESPUDP: return("IPSEC_XMIT_ESPUDP"); -+ case IPSEC_XMIT_ESPUDP_BADTYPE: return("IPSEC_XMIT_ESPUDP_BADTYPE"); -+ case IPSEC_XMIT_PENDING: return("IPSEC_XMIT_PENDING"); -+ } -+ snprintf(tmp, sizeof(tmp), "%d", err); -+ return tmp; -+} -+#endif -+ -+/* -+ * Sanity checks -+ */ -+enum ipsec_xmit_value -+ipsec_xmit_sanity_check_dev(struct ipsec_xmit_state *ixs) -+{ -+ -+ if (ixs->dev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_error:ipsec_xmit_sanity_check_dev: " -+ "No device associated with skb!\n" ); -+ return 
IPSEC_XMIT_NODEV; -+ } -+ -+ ixs->prv = ixs->dev->priv; -+ if (ixs->prv == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_error:ipsec_xmit_sanity_check_dev: " -+ "Device has no private structure!\n" ); -+ return IPSEC_XMIT_NOPRIVDEV; -+ } -+ -+ ixs->physdev = ixs->prv->dev; -+ if (ixs->physdev == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_error:ipsec_xmit_sanity_check_dev: " -+ "Device is not attached to physical device!\n" ); -+ return IPSEC_XMIT_NOPHYSDEV; -+ } -+ -+ ixs->physmtu = ixs->physdev->mtu; -+ ixs->cur_mtu = ixs->physdev->mtu; -+ ixs->stats = (struct net_device_stats *) &(ixs->prv->mystats); -+ -+ return IPSEC_XMIT_OK; -+} -+ -+enum ipsec_xmit_value -+ipsec_xmit_sanity_check_skb(struct ipsec_xmit_state *ixs) -+{ -+ /* -+ * Return if there is nothing to do. (Does this ever happen?) XXX -+ */ -+ if (ixs->skb == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_error:ipsec_xmit_sanity_check_skb: " -+ "Nothing to do!\n" ); -+ return IPSEC_XMIT_NOSKB; -+ } -+ -+ /* if skb was cloned (most likely due to a packet sniffer such as -+ tcpdump being momentarily attached to the interface), make -+ a copy of our own to modify */ -+ if(skb_cloned(ixs->skb)) { -+ if -+#ifdef SKB_COW_NEW -+ (skb_cow(ixs->skb, skb_headroom(ixs->skb)) != 0) -+#else /* SKB_COW_NEW */ -+ ((ixs->skb = skb_cow(ixs->skb, skb_headroom(ixs->skb))) == NULL) -+#endif /* SKB_COW_NEW */ -+ { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_error:ipsec_xmit_sanity_check_skb: " -+ "skb_cow failed to allocate buffer, dropping.\n" ); -+ ixs->stats->tx_dropped++; -+ return IPSEC_XMIT_ERRSKBALLOC; -+ } -+ } -+ -+ ixs->iph = ip_hdr(ixs->skb); -+ -+ /* sanity check for IP version as we can't handle IPv6 right now */ -+ if (ixs->iph->version != 4) { -+ KLIPS_PRINT(debug_tunnel, -+ "klips_debug:ipsec_xmit_sanity_check_skb: " -+ "found IP Version %d but cannot process other IP versions than v4.\n", -+ ixs->iph->version); /* XXX */ -+ ixs->stats->tx_dropped++; -+ 
return IPSEC_XMIT_NOIPV6; -+ } -+ -+#if IPSEC_DISALLOW_IPOPTIONS -+ if ((ixs->iph->ihl << 2) != sizeof (struct iphdr)) { -+ KLIPS_PRINT(debug_tunnel, -+ "klips_debug:ipsec_xmit_sanity_check_skb: " -+ "cannot process IP header options yet. May be mal-formed packet.\n"); /* XXX */ -+ ixs->stats->tx_dropped++; -+ return IPSEC_XMIT_NOIPOPTIONS; -+ } -+#endif /* IPSEC_DISALLOW_IPOPTIONS */ -+ -+#ifndef NET_21 -+ if (ixs->iph->ttl <= 0) { -+ /* Tell the sender its packet died... */ -+ ICMP_SEND(ixs->skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0, ixs->physdev); -+ -+ KLIPS_PRINT(debug_tunnel, "klips_debug:ipsec_xmit_sanity_check_skb: " -+ "TTL=0, too many hops!\n"); -+ ixs->stats->tx_dropped++; -+ return IPSEC_XMIT_TTLEXPIRED; -+ } -+#endif /* !NET_21 */ -+ -+ return IPSEC_XMIT_OK; -+} -+ -+ -+enum ipsec_xmit_value -+ipsec_xmit_encap_init(struct ipsec_xmit_state *ixs) -+{ -+ ixs->blocksize = 8; -+ ixs->headroom = 0; -+ ixs->tailroom = 0; -+ ixs->authlen = 0; -+ -+#ifdef CONFIG_KLIPS_ALG -+ ixs->ixt_e = NULL; -+ ixs->ixt_a = NULL; -+#endif /* CONFIG_KLIPS_ALG */ -+ -+ ixs->iphlen = ixs->iph->ihl << 2; -+ ixs->pyldsz = ntohs(ixs->iph->tot_len) - ixs->iphlen; -+ ixs->sa_len = KLIPS_SATOT(debug_tunnel, &ixs->ipsp->ips_said, 0, ixs->sa_txt, SATOT_BUF); -+ KLIPS_PRINT(debug_tunnel & DB_TN_OXFS, -+ "klips_debug:ipsec_xmit_encap_once: " -+ "calling output for <%s%s%s>, SA:%s\n", -+ IPS_XFORM_NAME(ixs->ipsp), -+ ixs->sa_len ? 
ixs->sa_txt : " (error)"); -+ switch(ixs->ipsp->ips_said.proto) { -+#ifdef CONFIG_KLIPS_AH -+ case IPPROTO_AH: -+ ixs->headroom += sizeof(struct ahhdr); -+ break; -+#endif /* CONFIG_KLIPS_AH */ -+#ifdef CONFIG_KLIPS_ESP -+ case IPPROTO_ESP: -+#ifdef CONFIG_KLIPS_OCF -+ /* -+ * this needs cleaning up for sure - DM -+ */ -+ if (ixs->ipsp->ocf_in_use) { -+ switch (ixs->ipsp->ips_encalg) { -+ case ESP_DES: -+ case ESP_3DES: -+ ixs->blocksize = 8; -+ ixs->headroom += ESP_HEADER_LEN + 8 /* ivsize */; -+ break; -+ case ESP_AES: -+ ixs->blocksize = 16; -+ ixs->headroom += ESP_HEADER_LEN + 16 /* ivsize */; -+ break; -+ default: -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_ESP_BADALG; -+ } -+ } else -+#endif -+#ifdef CONFIG_KLIPS_ALG -+ if ((ixs->ixt_e=ixs->ipsp->ips_alg_enc)) { -+ ixs->blocksize = ixs->ixt_e->ixt_common.ixt_blocksize; -+ ixs->headroom += ESP_HEADER_LEN + ixs->ixt_e->ixt_common.ixt_support.ias_ivlen/8; -+ } else -+#endif /* CONFIG_KLIPS_ALG */ -+ { -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_ESP_BADALG; -+ } -+#ifdef CONFIG_KLIPS_OCF -+ if (ixs->ipsp->ocf_in_use) { -+ switch (ixs->ipsp->ips_authalg) { -+ case AH_MD5: -+ case AH_SHA: -+ ixs->authlen = AHHMAC_HASHLEN; -+ break; -+ case AH_NONE: -+ break; -+ } -+ } else -+#endif /* CONFIG_KLIPS_OCF */ -+#ifdef CONFIG_KLIPS_ALG -+ -+ ixs->ixt_a=ixs->ipsp->ips_alg_auth; -+ if (ixs->ixt_a) { -+ ixs->tailroom += AHHMAC_HASHLEN; -+ ixs->authlen = AHHMAC_HASHLEN; -+ } else -+#endif /* CONFIG_KLIPS_ALG */ -+ switch(ixs->ipsp->ips_authalg) { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ case AH_MD5: -+ ixs->authlen = AHHMAC_HASHLEN; -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ case AH_SHA: -+ ixs->authlen = AHHMAC_HASHLEN; -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ case AH_NONE: -+ break; -+ default: -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_ESP_BADALG; -+ } -+ ixs->tailroom += ixs->blocksize != 1 ? 
-+ ((ixs->blocksize - ((ixs->pyldsz + 2) % ixs->blocksize)) % ixs->blocksize) + 2 : -+ ((4 - ((ixs->pyldsz + 2) % 4)) % 4) + 2; -+ ixs->tailroom += ixs->authlen; -+ break; -+#endif /* !CONFIG_KLIPS_ESP */ -+#ifdef CONFIG_KLIPS_IPIP -+ case IPPROTO_IPIP: -+ ixs->headroom += sizeof(struct iphdr); -+ ixs->iphlen = sizeof(struct iphdr); -+ break; -+#endif /* !CONFIG_KLIPS_IPIP */ -+#ifdef CONFIG_KLIPS_IPCOMP -+ case IPPROTO_COMP: -+ break; -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ default: -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_BADPROTO; -+ } -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_once: " -+ "pushing %d bytes, putting %d, proto %d.\n", -+ ixs->headroom, ixs->tailroom, ixs->ipsp->ips_said.proto); -+ if(skb_headroom(ixs->skb) < ixs->headroom) { -+ printk(KERN_WARNING -+ "klips_error:ipsec_xmit_encap_once: " -+ "tried to skb_push headroom=%d, %d available. This should never happen, please report.\n", -+ ixs->headroom, skb_headroom(ixs->skb)); -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_ESP_PUSHPULLERR; -+ } -+ -+ ixs->dat = skb_push(ixs->skb, ixs->headroom); -+ ixs->ilen = ixs->skb->len - ixs->tailroom; -+ if(skb_tailroom(ixs->skb) < ixs->tailroom) { -+ printk(KERN_WARNING -+ "klips_error:ipsec_xmit_encap_once: " -+ "tried to skb_put %d, %d available. This should never happen, please report.\n", -+ ixs->tailroom, skb_tailroom(ixs->skb)); -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_ESP_PUSHPULLERR; -+ } -+ skb_put(ixs->skb, ixs->tailroom); -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_once: " -+ "head,tailroom: %d,%d before xform.\n", -+ skb_headroom(ixs->skb), skb_tailroom(ixs->skb)); -+ ixs->len = ixs->skb->len; -+ if(ixs->len > 0xfff0) { -+ printk(KERN_WARNING "klips_error:ipsec_xmit_encap_once: " -+ "tot_len (%d) > 65520. 
This should never happen, please report.\n", -+ ixs->len); -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_BADLEN; -+ } -+ memmove((void *)ixs->dat, (void *)(ixs->dat + ixs->headroom), ixs->iphlen); -+ ixs->iph = (struct iphdr *)ixs->dat; -+ ixs->iph->tot_len = htons(ixs->skb->len); -+ -+ return IPSEC_XMIT_OK; -+} -+ -+ -+/* -+ * work out which state to proceed to next -+ */ -+ -+enum ipsec_xmit_value -+ipsec_xmit_encap_select(struct ipsec_xmit_state *ixs) -+{ -+ switch (ixs->ipsp->ips_said.proto) { -+#ifdef CONFIG_KLIPS_ESP -+ case IPPROTO_ESP: -+ ixs->next_state = IPSEC_XSM_ESP; -+ break; -+#endif -+#ifdef CONFIG_KLIPS_AH -+ case IPPROTO_AH: -+ ixs->next_state = IPSEC_XSM_AH; -+ break; -+#endif -+#ifdef CONFIG_KLIPS_IPIP -+ case IPPROTO_IPIP: -+ ixs->next_state = IPSEC_XSM_IPIP; -+ break; -+#endif -+#ifdef CONFIG_KLIPS_IPCOMP -+ case IPPROTO_COMP: -+ ixs->next_state = IPSEC_XSM_IPCOMP; -+ break; -+#endif -+ default: -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_BADPROTO; -+ } -+ return IPSEC_XMIT_OK; -+} -+ -+ -+#ifdef CONFIG_KLIPS_ESP -+ -+enum ipsec_xmit_value -+ipsec_xmit_esp(struct ipsec_xmit_state *ixs) -+{ -+ int i; -+ unsigned char *pad; -+ int padlen = 0; -+ -+ ixs->espp = (struct esphdr *)(ixs->dat + ixs->iphlen); -+#ifdef NET_21 -+ skb_set_transport_header(ixs->skb, ipsec_skb_offset(ixs->skb, ixs->espp)); -+#endif /* NET_21 */ -+ ixs->espp->esp_spi = ixs->ipsp->ips_said.spi; -+ ixs->espp->esp_rpl = htonl(++(ixs->ipsp->ips_replaywin_lastseq)); -+ -+ ixs->idat = ixs->dat + ixs->iphlen + ixs->headroom; -+ ixs->ilen = ixs->len - (ixs->iphlen + ixs->headroom + ixs->authlen); -+ -+ /* Self-describing padding */ -+ pad = &ixs->dat[ixs->len - ixs->tailroom]; -+ padlen = ixs->tailroom - 2 - ixs->authlen; -+ for (i = 0; i < padlen; i++) { -+ pad[i] = i + 1; -+ } -+ ixs->dat[ixs->len - ixs->authlen - 2] = padlen; -+ -+ ixs->dat[ixs->len - ixs->authlen - 1] = ixs->iph->protocol; -+ ixs->iph->protocol = IPPROTO_ESP; -+ -+#ifdef CONFIG_KLIPS_OCF -+ if 
(ixs->ipsp->ocf_in_use) -+ return(ipsec_ocf_xmit(ixs)); -+#endif -+ -+#ifdef CONFIG_KLIPS_ALG -+ if (!ixs->ixt_e) { -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_ESP_BADALG; -+ } -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if(debug_tunnel & DB_TN_ENCAP) { -+ dmp("pre-encrypt", ixs->dat, ixs->len); -+ } -+#endif -+ -+ /* -+ * Do all operations here: -+ * copy IV->ESP, encrypt, update ips IV -+ * -+ */ -+ { -+ int ret; -+ memcpy(ixs->espp->esp_iv, -+ ixs->ipsp->ips_iv, -+ ixs->ipsp->ips_iv_size); -+ ret=ipsec_alg_esp_encrypt(ixs->ipsp, -+ ixs->idat, ixs->ilen, ixs->espp->esp_iv, -+ IPSEC_ALG_ENCRYPT); -+ -+ prng_bytes(&ipsec_prng, -+ (char *)ixs->ipsp->ips_iv, -+ ixs->ipsp->ips_iv_size); -+ } -+ return IPSEC_XMIT_OK; -+#else -+ return IPSEC_XMIT_ESP_BADALG; -+#endif /* CONFIG_KLIPS_ALG */ -+} -+ -+ -+enum ipsec_xmit_value -+ipsec_xmit_esp_ah(struct ipsec_xmit_state *ixs) -+{ -+#if defined(CONFIG_KLIPS_AUTH_HMAC_MD5) || defined(CONFIG_KLIPS_AUTH_HMAC_SHA1) -+ __u8 hash[AH_AMAX]; -+#endif -+#if defined(CONFIG_KLIPS_AUTH_HMAC_MD5) || defined(CONFIG_KLIPS_AUTH_HMAC_SHA1) -+ union { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ MD5_CTX md5; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ SHA1_CTX sha1; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ } tctx; -+#endif /* defined(CONFIG_KLIPS_AUTH_HMAC_MD5) || defined(CONFIG_KLIPS_AUTH_HMAC_SHA1) */ -+ -+#ifdef CONFIG_KLIPS_OCF -+ if (ixs->ipsp->ocf_in_use) { -+ /* we should never be here using OCF */ -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_AH_BADALG; -+ } else -+#endif -+#ifdef CONFIG_KLIPS_ALG -+ if (ixs->ixt_a) { -+ ipsec_alg_sa_esp_hash(ixs->ipsp, -+ (caddr_t)ixs->espp, ixs->len - ixs->iphlen - ixs->authlen, -+ &(ixs->dat[ixs->len - ixs->authlen]), ixs->authlen); -+ -+ } else -+#endif /* CONFIG_KLIPS_ALG */ -+ switch(ixs->ipsp->ips_authalg) { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ case AH_MD5: -+ dmp("espp", (char*)ixs->espp, ixs->len - ixs->iphlen - ixs->authlen); -+ tctx.md5 = ((struct 
md5_ctx*)(ixs->ipsp->ips_key_a))->ictx; -+ dmp("ictx", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, (caddr_t)ixs->espp, ixs->len - ixs->iphlen - ixs->authlen); -+ dmp("ictx+dat", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Final(hash, &tctx.md5); -+ dmp("ictx hash", (char*)&hash, sizeof(hash)); -+ tctx.md5 = ((struct md5_ctx*)(ixs->ipsp->ips_key_a))->octx; -+ dmp("octx", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, hash, AHMD596_ALEN); -+ dmp("octx+hash", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Final(hash, &tctx.md5); -+ dmp("octx hash", (char*)&hash, sizeof(hash)); -+ memcpy(&(ixs->dat[ixs->len - ixs->authlen]), hash, ixs->authlen); -+ -+ /* paranoid: scrub the context copy and the whole hash[] array (sizeof(*hash) would clear one byte only) */ -+ memset((caddr_t)&tctx.md5, 0, sizeof(tctx.md5)); -+ memset((caddr_t)hash, 0, sizeof(hash)); -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ case AH_SHA: -+ tctx.sha1 = ((struct sha1_ctx*)(ixs->ipsp->ips_key_a))->ictx; -+ SHA1Update(&tctx.sha1, (caddr_t)ixs->espp, ixs->len - ixs->iphlen - ixs->authlen); -+ SHA1Final(hash, &tctx.sha1); -+ tctx.sha1 = ((struct sha1_ctx*)(ixs->ipsp->ips_key_a))->octx; -+ SHA1Update(&tctx.sha1, hash, AHSHA196_ALEN); -+ SHA1Final(hash, &tctx.sha1); -+ memcpy(&(ixs->dat[ixs->len - ixs->authlen]), hash, ixs->authlen); -+ -+ /* paranoid: scrub the context copy and the whole hash[] array (sizeof(*hash) would clear one byte only) */ -+ memset((caddr_t)&tctx.sha1, 0, sizeof(tctx.sha1)); -+ memset((caddr_t)hash, 0, sizeof(hash)); -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ case AH_NONE: -+ break; -+ default: -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_AH_BADALG; -+ } -+ return IPSEC_XMIT_OK; -+} -+ -+#endif /* CONFIG_KLIPS_ESP */ -+ -+ -+ -+#ifdef CONFIG_KLIPS_AH -+ -+enum ipsec_xmit_value -+ipsec_xmit_ah(struct ipsec_xmit_state *ixs) -+{ -+ struct iphdr ipo; -+ struct ahhdr *ahp; -+#if defined(CONFIG_KLIPS_AUTH_HMAC_MD5) || defined(CONFIG_KLIPS_AUTH_HMAC_SHA1) -+ __u8 hash[AH_AMAX]; -+#endif -+#if defined(CONFIG_KLIPS_AUTH_HMAC_MD5) || defined(CONFIG_KLIPS_AUTH_HMAC_SHA1) -+
union { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ MD5_CTX md5; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ SHA1_CTX sha1; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ } tctx; -+#endif /* defined(CONFIG_KLIPS_AUTH_HMAC_MD5) || defined(CONFIG_KLIPS_AUTH_HMAC_SHA1) */ -+ -+ ahp = (struct ahhdr *)(ixs->dat + ixs->iphlen); -+#ifdef NET_21 -+ skb_set_transport_header(ixs->skb, ipsec_skb_offset(ixs->skb, ahp)); -+#endif /* NET_21 */ -+ ahp->ah_spi = ixs->ipsp->ips_said.spi; -+ ahp->ah_rpl = htonl(++(ixs->ipsp->ips_replaywin_lastseq)); -+ ahp->ah_rv = 0; -+ ahp->ah_nh = ixs->iph->protocol; -+ ahp->ah_hl = (ixs->headroom >> 2) - sizeof(__u64)/sizeof(__u32); -+ ixs->iph->protocol = IPPROTO_AH; -+ dmp("ahp", (char*)ahp, sizeof(*ahp)); -+ -+#ifdef CONFIG_KLIPS_OCF -+ if (ixs->ipsp->ocf_in_use) -+ return(ipsec_ocf_xmit(ixs)); -+#endif -+ -+ ipo = *ixs->iph; -+ ipo.tos = 0; -+ ipo.frag_off = 0; -+ ipo.ttl = 0; -+ ipo.check = 0; -+ dmp("ipo", (char*)&ipo, sizeof(ipo)); -+ -+ switch(ixs->ipsp->ips_authalg) { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ case AH_MD5: -+ tctx.md5 = ((struct md5_ctx*)(ixs->ipsp->ips_key_a))->ictx; -+ dmp("ictx", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, (unsigned char *)&ipo, sizeof (struct iphdr)); -+ dmp("ictx+ipo", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, (unsigned char *)ahp, ixs->headroom - sizeof(ahp->ah_data)); -+ dmp("ictx+ahp", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, (unsigned char *)zeroes, AHHMAC_HASHLEN); -+ dmp("ictx+zeroes", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, ixs->dat + ixs->iphlen + ixs->headroom, ixs->len - ixs->iphlen - ixs->headroom); -+ dmp("ictx+dat", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Final(hash, &tctx.md5); -+ dmp("ictx hash", (char*)&hash, sizeof(hash)); -+ tctx.md5 = ((struct md5_ctx*)(ixs->ipsp->ips_key_a))->octx; -+ dmp("octx", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Update(&tctx.md5, 
hash, AHMD596_ALEN); -+ dmp("octx+hash", (char*)&tctx.md5, sizeof(tctx.md5)); -+ osMD5Final(hash, &tctx.md5); -+ dmp("octx hash", (char*)&hash, sizeof(hash)); -+ -+ memcpy(ahp->ah_data, hash, AHHMAC_HASHLEN); -+ -+ /* paranoid: scrub the context copy and the whole hash[] array (sizeof(*hash) would clear one byte only) */ -+ memset((caddr_t)&tctx.md5, 0, sizeof(tctx.md5)); -+ memset((caddr_t)hash, 0, sizeof(hash)); -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ case AH_SHA: -+ tctx.sha1 = ((struct sha1_ctx*)(ixs->ipsp->ips_key_a))->ictx; -+ SHA1Update(&tctx.sha1, (unsigned char *)&ipo, sizeof (struct iphdr)); -+ SHA1Update(&tctx.sha1, (unsigned char *)ahp, ixs->headroom - sizeof(ahp->ah_data)); -+ SHA1Update(&tctx.sha1, (unsigned char *)zeroes, AHHMAC_HASHLEN); -+ SHA1Update(&tctx.sha1, ixs->dat + ixs->iphlen + ixs->headroom, ixs->len - ixs->iphlen - ixs->headroom); -+ SHA1Final(hash, &tctx.sha1); -+ tctx.sha1 = ((struct sha1_ctx*)(ixs->ipsp->ips_key_a))->octx; -+ SHA1Update(&tctx.sha1, hash, AHSHA196_ALEN); -+ SHA1Final(hash, &tctx.sha1); -+ -+ memcpy(ahp->ah_data, hash, AHHMAC_HASHLEN); -+ -+ /* paranoid: scrub the context copy and the whole hash[] array (sizeof(*hash) would clear one byte only) */ -+ memset((caddr_t)&tctx.sha1, 0, sizeof(tctx.sha1)); -+ memset((caddr_t)hash, 0, sizeof(hash)); -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ default: -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_AH_BADALG; -+ } -+ return IPSEC_XMIT_OK; -+} -+ -+#endif /* CONFIG_KLIPS_AH */ -+ -+ -+#ifdef CONFIG_KLIPS_IPIP -+ -+enum ipsec_xmit_value -+ipsec_xmit_ipip(struct ipsec_xmit_state *ixs) -+{ -+ ixs->iph->version = 4; -+ switch(sysctl_ipsec_tos) { -+ case 0: -+#ifdef NET_21 -+ ixs->iph->tos = ip_hdr(ixs->skb)->tos; -+#else /* NET_21 */ -+ ixs->iph->tos = ixs->skb->ip_hdr->tos; -+#endif /* NET_21 */ -+ break; -+ case 1: -+ ixs->iph->tos = 0; -+ break; -+ default: -+ break; -+ } -+ ixs->iph->ttl = SYSCTL_IPSEC_DEFAULT_TTL; -+ ixs->iph->frag_off = 0; -+ ixs->iph->saddr = ((struct sockaddr_in*)(ixs->ipsp->ips_addr_s))->sin_addr.s_addr; -+ ixs->iph->daddr = ((struct
sockaddr_in*)(ixs->ipsp->ips_addr_d))->sin_addr.s_addr; -+ ixs->iph->protocol = IPPROTO_IPIP; -+ ixs->iph->ihl = sizeof(struct iphdr) >> 2; -+ -+ KLIPS_IP_SELECT_IDENT(ixs->iph, ixs->skb); -+ -+ ixs->newdst = (__u32)ixs->iph->daddr; -+ ixs->newsrc = (__u32)ixs->iph->saddr; -+ -+#ifdef NET_21 -+ skb_set_transport_header(ixs->skb, ipsec_skb_offset(ixs->skb, ip_hdr(ixs->skb))); -+#endif /* NET_21 */ -+ return IPSEC_XMIT_OK; -+} -+ -+#endif /* CONFIG_KLIPS_IPIP */ -+ -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+ -+enum ipsec_xmit_value -+ipsec_xmit_ipcomp(struct ipsec_xmit_state *ixs) -+{ -+#ifdef CONFIG_KLIPS_DEBUG -+ unsigned int old_tot_len; -+#endif -+ int flags = 0; -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ old_tot_len = ntohs(ixs->iph->tot_len); -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ ixs->ipsp->ips_comp_ratio_dbytes += ntohs(ixs->iph->tot_len); -+ ixs->skb = skb_compress(ixs->skb, ixs->ipsp, &flags); -+ -+#ifdef NET_21 -+ ixs->iph = ip_hdr(ixs->skb); -+#else /* NET_21 */ -+ ixs->iph = ixs->skb->ip_hdr; -+#endif /* NET_21 */ -+ -+ ixs->ipsp->ips_comp_ratio_cbytes += ntohs(ixs->iph->tot_len); -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if (debug_tunnel & DB_TN_CROUT) -+ { -+ if (old_tot_len > ntohs(ixs->iph->tot_len)) -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_once: " -+ "packet shrunk from %d to %d bytes after compression, cpi=%04x (should be from spi=%08x, spi&0xffff=%04x.\n", -+ old_tot_len, ntohs(ixs->iph->tot_len), -+ ntohs(((struct ipcomphdr*)(((char*)ixs->iph) + ((ixs->iph->ihl) << 2)))->ipcomp_cpi), -+ ntohl(ixs->ipsp->ips_said.spi), -+ (__u16)(ntohl(ixs->ipsp->ips_said.spi) & 0x0000ffff)); -+ else -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_once: " -+ "packet did not compress (flags = %d).\n", -+ flags); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ return IPSEC_XMIT_OK; -+} -+ -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ -+ -+ -+/* -+ * upon entry to this function, ixs->skb should be setup -+ * as follows: -+ * -+ * data = 
beginning of IP packet <- differs from ipsec_rcv(). -+ * nh.raw = beginning of IP packet. -+ * h.raw = data after the IP packet. -+ * -+ */ -+enum ipsec_xmit_value -+ipsec_xmit_cont(struct ipsec_xmit_state *ixs) -+{ -+#ifdef NET_21 -+ skb_set_network_header(ixs->skb, ipsec_skb_offset(ixs->skb, ixs->skb->data)); -+#else /* NET_21 */ -+ ixs->skb->ip_hdr = ixs->skb->h.iph = (struct iphdr *) ixs->skb->data; -+#endif /* NET_21 */ -+ ixs->iph->check = 0; -+ ixs->iph->check = ip_fast_csum((unsigned char *)ixs->iph, ixs->iph->ihl); -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_encap_once: " -+ "after <%s%s%s>, SA:%s:\n", -+ IPS_XFORM_NAME(ixs->ipsp), -+ ixs->sa_len ? ixs->sa_txt : " (error)"); -+ KLIPS_IP_PRINT(debug_tunnel & DB_TN_XMIT, ixs->iph); -+ -+ ixs->ipsp->ips_life.ipl_bytes.ipl_count += ixs->len; -+ ixs->ipsp->ips_life.ipl_bytes.ipl_last = ixs->len; -+ -+ if(!ixs->ipsp->ips_life.ipl_usetime.ipl_count) { -+ ixs->ipsp->ips_life.ipl_usetime.ipl_count = jiffies / HZ; -+ } -+ ixs->ipsp->ips_life.ipl_usetime.ipl_last = jiffies / HZ; -+ ixs->ipsp->ips_life.ipl_packets.ipl_count++; -+ -+ ixs->ipsp = ixs->ipsp->ips_next; -+ -+ /* -+ * start again if we have more work to do -+ */ -+ if (ixs->ipsp) -+ ixs->next_state = IPSEC_XSM_ENCAP_INIT; -+ -+ return IPSEC_XMIT_OK; -+} -+ -+ -+/* -+ * If the IP packet (iph) is a carrying TCP/UDP, then set the encaps -+ * source and destination ports to those from the TCP/UDP header. -+ */ -+void ipsec_extract_ports(struct iphdr * iph, struct sockaddr_encap * er) -+{ -+ struct udphdr *udp; -+ -+ switch (iph->protocol) { -+ case IPPROTO_UDP: -+ case IPPROTO_TCP: -+ /* -+ * The ports are at the same offsets in a TCP and UDP -+ * header so hack it ... 
-+ */ -+ udp = (struct udphdr*)(((char*)iph)+(iph->ihl<<2)); -+ er->sen_sport = udp->source; -+ er->sen_dport = udp->dest; -+ break; -+ default: -+ er->sen_sport = 0; -+ er->sen_dport = 0; -+ break; -+ } -+} -+ -+/* -+ * A TRAP eroute is installed and we want to replace it with a HOLD -+ * eroute. -+ */ -+static int create_hold_eroute(struct eroute *origtrap, -+ struct sk_buff * skb, struct iphdr * iph, -+ uint32_t eroute_pid) -+{ -+ struct eroute hold_eroute; -+ ip_said hold_said; -+ struct sk_buff *first, *last; -+ int error; -+ -+ first = last = NULL; -+ memset((caddr_t)&hold_eroute, 0, sizeof(hold_eroute)); -+ memset((caddr_t)&hold_said, 0, sizeof(hold_said)); -+ -+ hold_said.proto = IPPROTO_INT; -+ hold_said.spi = htonl(SPI_HOLD); -+ hold_said.dst.u.v4.sin_addr.s_addr = INADDR_ANY; -+ -+ hold_eroute.er_eaddr.sen_len = sizeof(struct sockaddr_encap); -+ hold_eroute.er_emask.sen_len = sizeof(struct sockaddr_encap); -+ hold_eroute.er_eaddr.sen_family = AF_ENCAP; -+ hold_eroute.er_emask.sen_family = AF_ENCAP; -+ hold_eroute.er_eaddr.sen_type = SENT_IP4; -+ hold_eroute.er_emask.sen_type = 255; -+ -+ hold_eroute.er_eaddr.sen_ip_src.s_addr = iph->saddr; -+ hold_eroute.er_eaddr.sen_ip_dst.s_addr = iph->daddr; -+ hold_eroute.er_emask.sen_ip_src.s_addr = INADDR_BROADCAST; -+ hold_eroute.er_emask.sen_ip_dst.s_addr = INADDR_BROADCAST; -+ hold_eroute.er_emask.sen_sport = 0; -+ hold_eroute.er_emask.sen_dport = 0; -+ hold_eroute.er_pid = eroute_pid; -+ hold_eroute.er_count = 0; -+ hold_eroute.er_lasttime = jiffies/HZ; -+ -+ /* -+ * if it wasn't captured by a wildcard, then don't record it as -+ * a wildcard. 
-+ */ -+ if(origtrap->er_eaddr.sen_proto != 0) { -+ hold_eroute.er_eaddr.sen_proto = iph->protocol; -+ -+ if((iph->protocol == IPPROTO_TCP || -+ iph->protocol == IPPROTO_UDP) && -+ (origtrap->er_eaddr.sen_sport != 0 || -+ origtrap->er_eaddr.sen_dport != 0)) { -+ -+ if(origtrap->er_eaddr.sen_sport != 0) -+ hold_eroute.er_emask.sen_sport = ~0; -+ -+ if(origtrap->er_eaddr.sen_dport != 0) -+ hold_eroute.er_emask.sen_dport = ~0; -+ -+ ipsec_extract_ports(iph, &hold_eroute.er_eaddr); -+ } -+ } -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if (debug_pfkey) { -+ char buf1[64], buf2[64]; -+ subnettoa(hold_eroute.er_eaddr.sen_ip_src, -+ hold_eroute.er_emask.sen_ip_src, 0, buf1, sizeof(buf1)); -+ subnettoa(hold_eroute.er_eaddr.sen_ip_dst, -+ hold_eroute.er_emask.sen_ip_dst, 0, buf2, sizeof(buf2)); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_tunnel_start_xmit: " -+ "calling breakeroute and makeroute for %s:%d->%s:%d %d HOLD eroute.\n", -+ buf1, ntohs(hold_eroute.er_eaddr.sen_sport), -+ buf2, ntohs(hold_eroute.er_eaddr.sen_dport), -+ hold_eroute.er_eaddr.sen_proto); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ if (ipsec_breakroute(&(hold_eroute.er_eaddr), &(hold_eroute.er_emask), -+ &first, &last)) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_tunnel_start_xmit: " -+ "HOLD breakeroute found nothing.\n"); -+ } else { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_tunnel_start_xmit: " -+ "HOLD breakroute deleted %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u %u\n", -+ NIPQUAD(hold_eroute.er_eaddr.sen_ip_src), -+ ntohs(hold_eroute.er_eaddr.sen_sport), -+ NIPQUAD(hold_eroute.er_eaddr.sen_ip_dst), -+ ntohs(hold_eroute.er_eaddr.sen_dport), -+ hold_eroute.er_eaddr.sen_proto); -+ } -+ if (first != NULL) -+ kfree_skb(first); -+ if (last != NULL) -+ kfree_skb(last); -+ -+ error = ipsec_makeroute(&(hold_eroute.er_eaddr), -+ &(hold_eroute.er_emask), -+ hold_said, eroute_pid, skb, NULL, NULL); -+ if (error) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_tunnel_start_xmit: " -+ "HOLD makeroute 
returned %d, failed.\n", error); -+ } else { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:ipsec_tunnel_start_xmit: " -+ "HOLD makeroute call successful.\n"); -+ } -+ return (error == 0); -+} -+ -+/* -+ * upon entry to this function, ixs->skb should be setup -+ * as follows: -+ * -+ * data = beginning of IP packet <- differs from ipsec_rcv(). -+ * nh.raw = beginning of IP packet. -+ * h.raw = data after the IP packet. -+ * -+ */ -+enum ipsec_xmit_value -+ipsec_xmit_init1(struct ipsec_xmit_state *ixs) -+{ -+ ixs->newdst = ixs->orgdst = ixs->iph->daddr; -+ ixs->newsrc = ixs->orgsrc = ixs->iph->saddr; -+ ixs->orgedst = ixs->outgoing_said.dst.u.v4.sin_addr.s_addr; -+ ixs->iphlen = ixs->iph->ihl << 2; -+ ixs->pyldsz = ntohs(ixs->iph->tot_len) - ixs->iphlen; -+ ixs->max_headroom = ixs->max_tailroom = 0; -+ -+ if (ixs->outgoing_said.proto == IPPROTO_INT) { -+ switch (ntohl(ixs->outgoing_said.spi)) { -+ case SPI_DROP: -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_encap_bundle: " -+ "shunt SA of DROP or no eroute: dropping.\n"); -+ ixs->stats->tx_dropped++; -+ break; -+ -+ case SPI_REJECT: -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_encap_bundle: " -+ "shunt SA of REJECT: notifying and dropping.\n"); -+ ICMP_SEND(ixs->skb, -+ ICMP_DEST_UNREACH, -+ ICMP_PKT_FILTERED, -+ 0, -+ ixs->physdev); -+ ixs->stats->tx_dropped++; -+ break; -+ -+ case SPI_PASS: -+#ifdef NET_21 -+ ixs->pass = 1; -+#endif /* NET_21 */ -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_encap_bundle: " -+ "PASS: calling dev_queue_xmit\n"); -+ return IPSEC_XMIT_PASS; -+ -+ case SPI_HOLD: -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_encap_bundle: " -+ "shunt SA of HOLD: this does not make sense here, dropping.\n"); -+ ixs->stats->tx_dropped++; -+ break; -+ -+ case SPI_TRAP: -+ case SPI_TRAPSUBNET: -+ { -+ struct sockaddr_in src, dst; -+#ifdef CONFIG_KLIPS_DEBUG -+ char bufsrc[ADDRTOA_BUF], bufdst[ADDRTOA_BUF]; -+#endif /* 
CONFIG_KLIPS_DEBUG */ -+ -+ /* Signal all listening KMds with a PF_KEY ACQUIRE */ -+ -+ memset(&src, 0, sizeof(src)); -+ memset(&dst, 0, sizeof(dst)); -+ src.sin_family = AF_INET; -+ dst.sin_family = AF_INET; -+ src.sin_addr.s_addr = ixs->iph->saddr; -+ dst.sin_addr.s_addr = ixs->iph->daddr; -+ -+ ixs->ips.ips_transport_protocol = 0; -+ src.sin_port = 0; -+ dst.sin_port = 0; -+ -+ if(ixs->eroute->er_eaddr.sen_proto != 0) { -+ ixs->ips.ips_transport_protocol = ixs->iph->protocol; -+ -+ if(ixs->eroute->er_eaddr.sen_sport != 0) { -+ src.sin_port = -+ (ixs->iph->protocol == IPPROTO_UDP -+ ? ((struct udphdr*) (((caddr_t)ixs->iph) + (ixs->iph->ihl << 2)))->source -+ : (ixs->iph->protocol == IPPROTO_TCP -+ ? ((struct tcphdr*)((caddr_t)ixs->iph + (ixs->iph->ihl << 2)))->source -+ : 0)); -+ } -+ if(ixs->eroute->er_eaddr.sen_dport != 0) { -+ dst.sin_port = -+ (ixs->iph->protocol == IPPROTO_UDP -+ ? ((struct udphdr*) (((caddr_t)ixs->iph) + (ixs->iph->ihl << 2)))->dest -+ : (ixs->iph->protocol == IPPROTO_TCP -+ ? ((struct tcphdr*)((caddr_t)ixs->iph + (ixs->iph->ihl << 2)))->dest -+ : 0)); -+ } -+ } -+ -+ ixs->ips.ips_addr_s = (struct sockaddr*)(&src); -+ ixs->ips.ips_addr_d = (struct sockaddr*)(&dst); -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_encap_bundle: " -+ "SADB_ACQUIRE sent with src=%s:%d, dst=%s:%d, proto=%d.\n", -+ addrtoa(((struct sockaddr_in*)(ixs->ips.ips_addr_s))->sin_addr, 0, bufsrc, sizeof(bufsrc)) <= ADDRTOA_BUF ? bufsrc : "BAD_ADDR", -+ ntohs(((struct sockaddr_in*)(ixs->ips.ips_addr_s))->sin_port), -+ addrtoa(((struct sockaddr_in*)(ixs->ips.ips_addr_d))->sin_addr, 0, bufdst, sizeof(bufdst)) <= ADDRTOA_BUF ? 
bufdst : "BAD_ADDR", -+ ntohs(((struct sockaddr_in*)(ixs->ips.ips_addr_d))->sin_port), -+ ixs->ips.ips_said.proto); -+ -+ /* increment count of total traps needed */ -+ ipsec_xmit_trap_count++; -+ -+ if (pfkey_acquire(&ixs->ips) == 0) { -+ -+ /* note that we succeeded */ -+ ipsec_xmit_trap_sendcount++; -+ -+ if (ixs->outgoing_said.spi==htonl(SPI_TRAPSUBNET)) { -+ /* -+ * The spinlock is to prevent any other -+ * process from accessing or deleting -+ * the eroute while we are using and -+ * updating it. -+ */ -+ spin_lock_bh(&eroute_lock); -+ ixs->eroute = ipsec_findroute(&ixs->matcher); -+ if(ixs->eroute) { -+ ixs->eroute->er_said.spi = htonl(SPI_HOLD); -+ ixs->eroute->er_first = ixs->skb; -+ ixs->skb = NULL; -+ } -+ spin_unlock_bh(&eroute_lock); -+ } else if (create_hold_eroute(ixs->eroute, -+ ixs->skb, -+ ixs->iph, -+ ixs->eroute_pid)) { -+ ixs->skb = NULL; -+ } -+ /* whether or not the above succeeded, we continue */ -+ -+ } -+ ixs->stats->tx_dropped++; -+ } -+ default: -+ /* XXX what do we do with an unknown shunt spi? */ -+ break; -+ } /* switch (ntohl(ixs->outgoing_said.spi)) */ -+ return IPSEC_XMIT_STOLEN; -+ } /* if (ixs->outgoing_said.proto == IPPROTO_INT) */ -+ -+ ixs->ipsp = ipsec_sa_getbyid(&ixs->outgoing_said); -+ ixs->sa_len = KLIPS_SATOT(debug_tunnel, &ixs->outgoing_said, 0, ixs->sa_txt, sizeof(ixs->sa_txt)); -+ -+ if (ixs->ipsp == NULL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_encap_bundle: " -+ "no ipsec_sa for SA%s: outgoing packet with no SA, dropped.\n", -+ ixs->sa_len ? 
ixs->sa_txt : " (error)"); -+ if(ixs->stats) { -+ ixs->stats->tx_dropped++; -+ } -+ return IPSEC_XMIT_SAIDNOTFOUND; -+ } -+ -+ return IPSEC_XMIT_OK; -+} -+ -+enum ipsec_xmit_value -+ipsec_xmit_init2(struct ipsec_xmit_state *ixs) -+{ -+ enum ipsec_xmit_value bundle_stat = IPSEC_XMIT_OK; -+ struct ipsec_sa *saved_ipsp; -+#ifdef CONFIG_KLIPS_ALG -+ ixs->blocksize = 8; -+ ixs->ixt_e = NULL; -+ ixs->ixt_a = NULL; -+#endif /* CONFIG_KLIPS_ALG */ -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "found ipsec_sa -- SA:<%s%s%s> %s\n", -+ IPS_XFORM_NAME(ixs->ipsp), -+ ixs->sa_len ? ixs->sa_txt : " (error)"); -+ -+ /* -+ * How much headroom do we need to be able to apply -+ * all the grouped transforms? -+ */ -+ saved_ipsp = ixs->ipsp; /* save the head of the ipsec_sa chain */ -+ while (ixs->ipsp) { -+ if (debug_tunnel & DB_TN_XMIT) { -+ ixs->sa_len = KLIPS_SATOT(debug_tunnel, &ixs->ipsp->ips_said, 0, ixs->sa_txt, sizeof(ixs->sa_txt)); -+ if(ixs->sa_len == 0) { -+ strcpy(ixs->sa_txt, "(error)"); -+ } -+ } else { -+ *ixs->sa_txt = 0; -+ ixs->sa_len = 0; -+ } -+ -+ /* If it is in larval state, drop the packet, we cannot process yet. */ -+ if(ixs->ipsp->ips_state == K_SADB_SASTATE_LARVAL) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "ipsec_sa in larval state for SA:<%s%s%s> %s, cannot be used yet, dropping packet.\n", -+ IPS_XFORM_NAME(ixs->ipsp), -+ ixs->sa_len ? ixs->sa_txt : " (error)"); -+ if(ixs->stats) { -+ ixs->stats->tx_errors++; -+ } -+ bundle_stat = IPSEC_XMIT_SAIDNOTLIVE; -+ goto cleanup; -+ } -+ -+ if(ixs->ipsp->ips_state == K_SADB_SASTATE_DEAD) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "ipsec_sa in dead state for SA:<%s%s%s> %s, can no longer be used, dropping packet.\n", -+ IPS_XFORM_NAME(ixs->ipsp), -+ ixs->sa_len ? 
ixs->sa_txt : " (error)"); -+ ixs->stats->tx_errors++; -+ bundle_stat = IPSEC_XMIT_SAIDNOTLIVE; -+ goto cleanup; -+ } -+ -+ /* If the replay window counter == -1, expire SA, it will roll */ -+ if(ixs->ipsp->ips_replaywin && ixs->ipsp->ips_replaywin_lastseq == -1) { -+ pfkey_expire(ixs->ipsp, 1); -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "replay window counter rolled for SA:<%s%s%s> %s, packet dropped, expiring SA.\n", -+ IPS_XFORM_NAME(ixs->ipsp), -+ ixs->sa_len ? ixs->sa_txt : " (error)"); -+ ipsec_sa_rm(ixs->ipsp); -+ ixs->stats->tx_errors++; -+ bundle_stat = IPSEC_XMIT_REPLAYROLLED; -+ goto cleanup; -+ } -+ -+ /* -+ * if this is the first time we are using this SA, mark start time, -+ * and offset hard/soft counters by "now" for later checking. -+ */ -+#if 0 -+ if(ixs->ipsp->ips_life.ipl_usetime.count == 0) { -+ ixs->ipsp->ips_life.ipl_usetime.count = jiffies; -+ ixs->ipsp->ips_life.ipl_usetime.hard += jiffies; -+ ixs->ipsp->ips_life.ipl_usetime.soft += jiffies; -+ } -+#endif -+ -+ -+ if(ipsec_lifetime_check(&ixs->ipsp->ips_life.ipl_bytes, "bytes", ixs->sa_txt, -+ ipsec_life_countbased, ipsec_outgoing, ixs->ipsp) == ipsec_life_harddied || -+ ipsec_lifetime_check(&ixs->ipsp->ips_life.ipl_addtime, "addtime",ixs->sa_txt, -+ ipsec_life_timebased, ipsec_outgoing, ixs->ipsp) == ipsec_life_harddied || -+ ipsec_lifetime_check(&ixs->ipsp->ips_life.ipl_usetime, "usetime",ixs->sa_txt, -+ ipsec_life_timebased, ipsec_outgoing, ixs->ipsp) == ipsec_life_harddied || -+ ipsec_lifetime_check(&ixs->ipsp->ips_life.ipl_packets, "packets",ixs->sa_txt, -+ ipsec_life_countbased, ipsec_outgoing, ixs->ipsp) == ipsec_life_harddied) { -+ -+ ipsec_sa_rm(ixs->ipsp); -+ ixs->stats->tx_errors++; -+ bundle_stat = IPSEC_XMIT_LIFETIMEFAILED; -+ goto cleanup; -+ } -+ -+ -+ ixs->headroom = ixs->tailroom = 0; -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "calling room for <%s%s%s>, SA:%s\n", -+ 
IPS_XFORM_NAME(ixs->ipsp), -+ ixs->sa_len ? ixs->sa_txt : " (error)"); -+ switch(ixs->ipsp->ips_said.proto) { -+#ifdef CONFIG_KLIPS_AH -+ case IPPROTO_AH: -+ ixs->headroom += sizeof(struct ahhdr); -+ break; -+#endif /* CONFIG_KLIPS_AH */ -+ -+#ifdef CONFIG_KLIPS_ESP -+ case IPPROTO_ESP: -+#ifdef CONFIG_KLIPS_OCF -+ /* -+ * this needs cleaning up for sure - DM -+ */ -+ if (ixs->ipsp->ocf_in_use) { -+ switch (ixs->ipsp->ips_encalg) { -+ case ESP_DES: -+ case ESP_3DES: -+ ixs->blocksize = 8; -+ ixs->headroom += ESP_HEADER_LEN + 8 /* ivsize */; -+ break; -+ case ESP_AES: -+ ixs->blocksize = 16; -+ ixs->headroom += ESP_HEADER_LEN + 16 /* ivsize */; -+ break; -+ default: -+ ixs->stats->tx_errors++; -+ bundle_stat = IPSEC_XMIT_ESP_BADALG; -+ goto cleanup; -+ } -+ } else -+#endif /* CONFIG_KLIPS_OCF */ -+#ifdef CONFIG_KLIPS_ALG -+ ixs->ixt_e=ixs->ipsp->ips_alg_enc; -+ if (ixs->ixt_e) { -+ ixs->blocksize = ixs->ixt_e->ixt_common.ixt_blocksize; -+ ixs->headroom += ESP_HEADER_LEN + ixs->ixt_e->ixt_common.ixt_support.ias_ivlen/8; -+ } else -+#endif /* CONFIG_KLIPS_ALG */ -+ { -+ ixs->stats->tx_errors++; -+ bundle_stat = IPSEC_XMIT_ESP_BADALG; -+ goto cleanup; -+ } -+#ifdef CONFIG_KLIPS_OCF -+ if (ixs->ipsp->ocf_in_use) { -+ switch (ixs->ipsp->ips_authalg) { -+ case AH_MD5: -+ case AH_SHA: -+ ixs->tailroom += AHHMAC_HASHLEN; -+ break; -+ case AH_NONE: -+ break; -+ } -+ } else -+#endif /* CONFIG_KLIPS_OCF */ -+#ifdef CONFIG_KLIPS_ALG -+ if ((ixs->ixt_a=ixs->ipsp->ips_alg_auth)) { -+ ixs->tailroom += AHHMAC_HASHLEN; -+ } else -+#endif /* CONFIG_KLIPS_ALG */ -+ switch(ixs->ipsp->ips_authalg) { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ case AH_MD5: -+ ixs->tailroom += AHHMAC_HASHLEN; -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ case AH_SHA: -+ ixs->tailroom += AHHMAC_HASHLEN; -+ break; -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ case AH_NONE: -+ break; -+ default: -+ ixs->stats->tx_errors++; -+ bundle_stat = IPSEC_XMIT_AH_BADALG; -+ 
goto cleanup; -+ } -+ ixs->tailroom += ixs->blocksize != 1 ? -+ ((ixs->blocksize - ((ixs->pyldsz + 2) % ixs->blocksize)) % ixs->blocksize) + 2 : -+ ((4 - ((ixs->pyldsz + 2) % 4)) % 4) + 2; -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ if ((ixs->ipsp->ips_natt_type) && (!ixs->natt_type)) { -+ ixs->natt_type = ixs->ipsp->ips_natt_type; -+ ixs->natt_sport = ixs->ipsp->ips_natt_sport; -+ ixs->natt_dport = ixs->ipsp->ips_natt_dport; -+ switch (ixs->natt_type) { -+ case ESPINUDP_WITH_NON_IKE: -+ ixs->natt_head = sizeof(struct udphdr)+(2*sizeof(__u32)); -+ break; -+ -+ case ESPINUDP_WITH_NON_ESP: -+ ixs->natt_head = sizeof(struct udphdr); -+ break; -+ -+ default: -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT -+ , "klips_xmit: invalid nat-t type %d" -+ , ixs->natt_type); -+ bundle_stat = IPSEC_XMIT_ESPUDP_BADTYPE; -+ goto cleanup; -+ -+ break; -+ } -+ ixs->tailroom += ixs->natt_head; -+ } -+#endif -+ break; -+#endif /* CONFIG_KLIPS_ESP */ -+#ifdef CONFIG_KLIPS_IPIP -+ case IPPROTO_IPIP: -+ ixs->headroom += sizeof(struct iphdr); -+ break; -+#endif /* !CONFIG_KLIPS_IPIP */ -+ case IPPROTO_COMP: -+#ifdef CONFIG_KLIPS_IPCOMP -+ /* -+ We can't predict how much the packet will -+ shrink without doing the actual compression. -+ We could do it here, if we were the first -+ encapsulation in the chain. That might save -+ us a skb_copy_expand, since we might fit -+ into the existing skb then. However, this -+ would be a bit unclean (and this hack has -+ bit us once), so we better not do it. After -+ all, the skb_copy_expand is cheap in -+ comparison to the actual compression. -+ At least we know the packet will not grow. 
-+ */ -+ break; -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ default: -+ ixs->stats->tx_errors++; -+ bundle_stat = IPSEC_XMIT_BADPROTO; -+ goto cleanup; -+ } -+ ixs->ipsp = ixs->ipsp->ips_next; -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "Required head,tailroom: %d,%d\n", -+ ixs->headroom, ixs->tailroom); -+ ixs->max_headroom += ixs->headroom; -+ ixs->max_tailroom += ixs->tailroom; -+ ixs->pyldsz += (ixs->headroom + ixs->tailroom); -+ } -+ ixs->ipsp = saved_ipsp; /* restore the head of the ipsec_sa chain */ -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "existing head,tailroom: %d,%d before applying xforms with head,tailroom: %d,%d .\n", -+ skb_headroom(ixs->skb), skb_tailroom(ixs->skb), -+ ixs->max_headroom, ixs->max_tailroom); -+ -+ ixs->tot_headroom += ixs->max_headroom; -+ ixs->tot_tailroom += ixs->max_tailroom; -+ -+ ixs->mtudiff = ixs->cur_mtu + ixs->tot_headroom + ixs->tot_tailroom - ixs->physmtu; -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "mtu:%d physmtu:%d tothr:%d tottr:%d mtudiff:%d ippkttotlen:%d\n", -+ ixs->cur_mtu, ixs->physmtu, -+ ixs->tot_headroom, ixs->tot_tailroom, ixs->mtudiff, ntohs(ixs->iph->tot_len)); -+ if(ixs->cur_mtu == 0 || ixs->mtudiff > 0) { -+ int newmtu = ixs->physmtu - (ixs->tot_headroom + ((ixs->tot_tailroom + 2) & ~7) + 5); -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_info:ipsec_xmit_encap_bundle_2: " -+ "dev %s mtu of %d decreased by %d to %d\n", -+ ixs->dev ? 
ixs->dev->name : "ifX", -+ ixs->cur_mtu, -+ ixs->cur_mtu - newmtu, -+ newmtu); -+ ixs->cur_mtu = newmtu; -+ -+ /* this would seem to adjust the MTU of the route as well */ -+#if 0 -+ ixs->skb->dst->pmtu = ixs->prv->mtu; /* RGB */ -+#endif /* 0 */ -+ } -+ -+ /* -+ If the sender is doing PMTU discovery, and the -+ packet doesn't fit within ixs->prv->mtu, notify him -+ (unless it was an ICMP packet, or it was not the -+ zero-offset packet) and send it anyways. -+ -+ Note: buggy firewall configuration may prevent the -+ ICMP packet from getting back. -+ */ -+ if(sysctl_ipsec_icmp -+ && ixs->cur_mtu < ntohs(ixs->iph->tot_len) -+ && (ixs->iph->frag_off & __constant_htons(IP_DF)) ) { -+ int notify = ixs->iph->protocol != IPPROTO_ICMP -+ && (ixs->iph->frag_off & __constant_htons(IP_OFFSET)) == 0; -+ -+#ifdef IPSEC_obey_DF -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "fragmentation needed and DF set; %sdropping packet\n", -+ notify ? "sending ICMP and " : ""); -+ if (notify) -+ ICMP_SEND(ixs->skb, -+ ICMP_DEST_UNREACH, -+ ICMP_FRAG_NEEDED, -+ ixs->cur_mtu, -+ ixs->physdev); -+ ixs->stats->tx_errors++; -+ bundle_stat = IPSEC_XMIT_CANNOTFRAG; -+ goto cleanup; -+#else /* IPSEC_obey_DF */ -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "fragmentation needed and DF set; %spassing packet\n", -+ notify ? "sending ICMP and " : ""); -+ if (notify) -+ ICMP_SEND(ixs->skb, -+ ICMP_DEST_UNREACH, -+ ICMP_FRAG_NEEDED, -+ ixs->cur_mtu, -+ ixs->physdev); -+#endif /* IPSEC_obey_DF */ -+ } -+ -+#ifdef MSS_HACK -+ /* -+ * If this is a transport mode TCP packet with -+ * SYN set, determine an effective MSS based on -+ * AH/ESP overheads determined above. 
-+ */ -+ if (ixs->iph->protocol == IPPROTO_TCP -+ && ixs->outgoing_said.proto != IPPROTO_IPIP) { -+ struct tcphdr *tcph = ixs->skb->h.th; -+ if (tcph->syn && !tcph->ack) { -+ if(!ipsec_adjust_mss(ixs->skb, tcph, ixs->cur_mtu)) { -+ printk(KERN_WARNING -+ "klips_warning:ipsec_xmit_encap_bundle_2: " -+ "ipsec_adjust_mss() failed\n"); -+ ixs->stats->tx_errors++; -+ bundle_stat = IPSEC_XMIT_MSSERR; -+ goto cleanup; -+ } -+ } -+ } -+#endif /* MSS_HACK */ -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ if ((ixs->natt_type) && (ixs->outgoing_said.proto != IPPROTO_IPIP)) { -+ /** -+ * NAT-Traversal and Transport Mode: -+ * we need to force destination address to sane value -+ */ -+ -+ struct sockaddr_in *sv4=(struct sockaddr_in *)ixs->ipsp->ips_addr_d; -+ __u32 natt_d = sv4->sin_addr.s_addr; -+ struct iphdr *ipp = ixs->iph; -+ -+ /* set the destination address to what it needs to be for the -+ * NAT encapsulation. -+ */ -+ KLIPS_PRINT(debug_tunnel, -+ "xmit: setting ND=%08x\n", natt_d); -+ ipp->daddr = natt_d; -+ ipp->check = 0; -+ ipp->check = ip_fast_csum((unsigned char *)ipp, ipp->ihl); -+ } -+#endif /* CONFIG_IPSEC_NAT_TRAVERSAL */ -+ -+ if(!ixs->hard_header_stripped && ixs->hard_header_len>0) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "allocating %d bytes for hardheader.\n", -+ ixs->hard_header_len); -+ if((ixs->saved_header = kmalloc(ixs->hard_header_len, GFP_ATOMIC)) == NULL) { -+ printk(KERN_WARNING "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "Failed, tried to allocate %d bytes for temp hard_header.\n", -+ ixs->hard_header_len); -+ ixs->stats->tx_errors++; -+ bundle_stat = IPSEC_XMIT_ERRMEMALLOC; -+ goto cleanup; -+ } -+ { -+ int i; -+ for (i = 0; i < ixs->hard_header_len; i++) { -+ ixs->saved_header[i] = ixs->skb->data[i]; -+ } -+ } -+ if(ixs->skb->len < ixs->hard_header_len) { -+ printk(KERN_WARNING "klips_error:ipsec_xmit_encap_bundle_2: " -+ "tried to skb_pull hhlen=%d, %d available. 
This should never happen, please report.\n", -+ ixs->hard_header_len, (int)(ixs->skb->len)); -+ ixs->stats->tx_errors++; -+ bundle_stat = IPSEC_XMIT_ESP_PUSHPULLERR; -+ goto cleanup; -+ } -+ skb_pull(ixs->skb, ixs->hard_header_len); -+ ixs->hard_header_stripped = 1; -+ -+/* ixs->iph = (struct iphdr *) (ixs->skb->data); */ -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "head,tailroom: %d,%d after hard_header stripped.\n", -+ skb_headroom(ixs->skb), skb_tailroom(ixs->skb)); -+ KLIPS_IP_PRINT(debug_tunnel & DB_TN_CROUT, ixs->iph); -+ } else { -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "hard header already stripped.\n"); -+ } -+ -+ ixs->ll_headroom = (ixs->hard_header_len + 15) & ~15; -+ -+ if ((skb_headroom(ixs->skb) >= ixs->max_headroom + 2 * ixs->ll_headroom) && -+ (skb_tailroom(ixs->skb) >= ixs->max_tailroom) -+#ifndef NET_21 -+ && ixs->skb->free -+#endif /* !NET_21 */ -+ ) { -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "data fits in existing skb\n"); -+ } else { -+ struct sk_buff* tskb; -+ -+ if(!ixs->oskb) { -+ ixs->oskb = ixs->skb; -+ } -+ -+ tskb = skb_copy_expand(ixs->skb, -+ /* The need for 2 * link layer length here remains unexplained...RGB */ -+ ixs->max_headroom + 2 * ixs->ll_headroom, -+ ixs->max_tailroom, -+ GFP_ATOMIC); -+ -+ if(tskb && ixs->skb->sk) { -+ skb_set_owner_w(tskb, ixs->skb->sk); -+ } -+ -+ if(ixs->skb != ixs->oskb) { -+ ipsec_kfree_skb(ixs->skb); -+ } -+ ixs->skb = tskb; -+ if (!ixs->skb) { -+ printk(KERN_WARNING -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "Failed, tried to allocate %d head and %d tailroom\n", -+ ixs->max_headroom, ixs->max_tailroom); -+ ixs->stats->tx_errors++; -+ bundle_stat = IPSEC_XMIT_ERRSKBALLOC; -+ goto cleanup; -+ } -+ KLIPS_PRINT(debug_tunnel & DB_TN_CROUT, -+ "klips_debug:ipsec_xmit_encap_bundle_2: " -+ "head,tailroom: %d,%d after allocation\n", -+ skb_headroom(ixs->skb), 
skb_tailroom(ixs->skb)); -+ } -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if(debug_tunnel & DB_TN_ENCAP) { -+ ipsec_print_ip(ixs->iph); -+ } -+#endif -+ -+cleanup: -+ return bundle_stat; -+} -+ -+void -+ipsec_xmit_cleanup(struct ipsec_xmit_state*ixs) -+{ -+ if(ixs->dev) { -+#if defined(HAS_NETIF_QUEUE) || defined (HAVE_NETIF_QUEUE) -+ netif_wake_queue(ixs->dev); -+#else /* defined(HAS_NETIF_QUEUE) || defined (HAVE_NETIF_QUEUE) */ -+ ixs->dev->tbusy = 0; -+#endif /* defined(HAS_NETIF_QUEUE) || defined (HAVE_NETIF_QUEUE) */ -+ } -+ -+ if(ixs->saved_header) { -+ kfree(ixs->saved_header); -+ ixs->saved_header = NULL; -+ } -+ if(ixs->skb) { -+ dev_kfree_skb(ixs->skb); -+ ixs->skb=NULL; -+ } -+ if(ixs->oskb) { -+ dev_kfree_skb(ixs->oskb); -+ ixs->oskb=NULL; -+ } -+ if (ixs->ips.ips_ident_s.data) { -+ kfree(ixs->ips.ips_ident_s.data); -+ ixs->ips.ips_ident_s.data=NULL; -+ } -+ if (ixs->ips.ips_ident_d.data) { -+ kfree(ixs->ips.ips_ident_d.data); -+ ixs->ips.ips_ident_d.data=NULL; -+ } -+} -+ -+#ifdef NETDEV_23 -+static inline int ipsec_xmit_send2(struct sk_buff *skb) -+{ -+#ifdef NETDEV_25 /* 2.6 kernels */ -+ return dst_output(skb); -+#else -+ return ip_send(skb); -+#endif -+} -+#endif /* NETDEV_23 */ -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+enum ipsec_xmit_value ipsec_nat_encap(struct ipsec_xmit_state *ixs) -+{ -+ if (ixs->natt_type && ixs->natt_head) { -+ struct iphdr *ipp = ip_hdr(ixs->skb); -+ struct udphdr *udp; -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_tunnel_start_xmit: " -+ "encapsuling packet into UDP (NAT-Traversal) (%d %d)\n", -+ ixs->natt_type, ixs->natt_head); -+ -+ ixs->iphlen = ipp->ihl << 2; -+ ipp->tot_len = -+ htons(ntohs(ipp->tot_len) + ixs->natt_head); -+ if(skb_tailroom(ixs->skb) < ixs->natt_head) { -+ printk(KERN_WARNING "klips_error:ipsec_tunnel_start_xmit: " -+ "tried to skb_put %d, %d available. 
" -+ "This should never happen, please report.\n", -+ ixs->natt_head, -+ skb_tailroom(ixs->skb)); -+ ixs->stats->tx_errors++; -+ return IPSEC_XMIT_ESPUDP; -+ } -+ skb_put(ixs->skb, ixs->natt_head); -+ -+ udp = (struct udphdr *)((char *)ipp + ixs->iphlen); -+ -+ /* move ESP hdr after UDP hdr */ -+ memmove((void *)((char *)udp + ixs->natt_head), -+ (void *)(udp), -+ ntohs(ipp->tot_len) - ixs->iphlen - ixs->natt_head); -+ -+#if 0 -+ /* set IP destination address (matters in transport mode) */ -+ { -+ struct sockaddr_in *d = (struct sockaddr_in *)ixs->ipsp->ips_addr_d; -+ ipp->daddr = d->sin_addr.s_addr; -+ } -+#endif -+ -+ /* clear UDP & Non-IKE Markers (if any) */ -+ memset(udp, 0, ixs->natt_head); -+ -+ /* fill UDP with usefull informations ;-) */ -+ udp->source = htons(ixs->natt_sport); -+ udp->dest = htons(ixs->natt_dport); -+ udp->len = htons(ntohs(ipp->tot_len) - ixs->iphlen); -+ -+ /* set protocol */ -+ ipp->protocol = IPPROTO_UDP; -+ -+ /* fix IP checksum */ -+ ipp->check = 0; -+ ipp->check = ip_fast_csum((unsigned char *)ipp, ipp->ihl); -+ } -+ return IPSEC_XMIT_OK; -+} -+#endif -+ -+ -+/* avoid forward reference complain on <2.5 */ -+struct flowi; -+ -+enum ipsec_xmit_value -+ipsec_xmit_send(struct ipsec_xmit_state*ixs, struct flowi *fl) -+{ -+ int error; -+ -+#ifdef NETDEV_25 -+ fl->nl_u.ip4_u.daddr = ip_hdr(ixs->skb)->daddr; -+ fl->nl_u.ip4_u.saddr = ixs->pass ? 0 : ip_hdr(ixs->skb)->saddr; -+ fl->nl_u.ip4_u.tos = RT_TOS(ip_hdr(ixs->skb)->tos); -+ fl->proto = ip_hdr(ixs->skb)->protocol; -+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,24) -+ error = ip_route_output_key(&ixs->route, &fl); -+#else -+ error = ip_route_output_key(&init_net, &ixs->route, fl); -+#endif -+ if (error) { -+ -+#else -+ /*skb_orphan(ixs->skb);*/ -+ if((error = ip_route_output(&ixs->route, -+ ip_hdr(ixs->skb)->daddr, -+ ixs->pass ? 0 : ip_hdr(ixs->skb)->saddr, -+ RT_TOS(ip_hdr(ixs->skb)->tos), -+ /* mcr->rgb: should this be 0 instead? 
*/ -+ ixs->physdev->iflink))) { -+#endif -+ ixs->stats->tx_errors++; -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_send: " -+ "ip_route_output failed with error code %d, rt->u.dst.dev=%s, dropped\n", -+ error, -+ ixs->route->u.dst.dev->name); -+ return IPSEC_XMIT_ROUTEERR; -+ } -+ -+ if(ixs->dev == ixs->route->u.dst.dev) { -+ ip_rt_put(ixs->route); -+ /* This is recursion, drop it. */ -+ ixs->stats->tx_errors++; -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_send: " -+ "suspect recursion, dev=rt->u.dst.dev=%s, dropped\n", -+ ixs->dev->name); -+ return IPSEC_XMIT_RECURSDETECT; -+ } -+ -+ dst_release(ixs->skb->dst); -+ ixs->skb->dst = &ixs->route->u.dst; -+ if(ixs->stats) { -+ ixs->stats->tx_bytes += ixs->skb->len; -+ } -+ -+ if(ixs->skb->len < skb_network_header(ixs->skb) - ixs->skb->data) { -+ if(ixs->stats) { -+ ixs->stats->tx_errors++; -+ } -+ printk(KERN_WARNING -+ "klips_error:ipsec_xmit_send: " -+ "tried to __skb_pull nh-data=%ld, %d available. This should never happen, please report.\n", -+ (unsigned long)(skb_network_header(ixs->skb) - ixs->skb->data), -+ ixs->skb->len); -+ return IPSEC_XMIT_PUSHPULLERR; -+ } -+ __skb_pull(ixs->skb, skb_network_header(ixs->skb) - ixs->skb->data); -+ if(!ixs->pass) { -+ ipsec_nf_reset(ixs->skb); -+ } -+ -+ KLIPS_PRINT(debug_tunnel & DB_TN_XMIT, -+ "klips_debug:ipsec_xmit_send: " -+ "...done, calling ip_send() on device:%s\n", -+ ixs->skb->dev ? 
ixs->skb->dev->name : "NULL"); -+ KLIPS_IP_PRINT(debug_tunnel & DB_TN_XMIT, ip_hdr(ixs->skb)); -+#ifdef NETDEV_23 /* 2.4 kernels */ -+ { -+ int err; -+ -+/* XXX huh, we include linux/netfilter_ipv4.h where NF_IP_LOCAL_OUT is defined as 3 */ -+#ifndef NF_IP_LOCAL_OUT -+#warning I dont understand why NF_IP_LOCAL_OUT is undefined when including linux/netfilter_ipv4.h -+#define NF_IP_LOCAL_OUT 3 -+#endif -+ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, ixs->skb, NULL, -+ ixs->route->u.dst.dev, -+ ipsec_xmit_send2); -+ if(err != NET_XMIT_SUCCESS && err != NET_XMIT_CN) { -+ if(net_ratelimit()) -+ printk(KERN_ERR -+ "klips_error:ipsec_xmit_send: " -+ "ip_send() failed, err=%d\n", -+ -err); -+ if(ixs->stats) { -+ ixs->stats->tx_errors++; -+ ixs->stats->tx_aborted_errors++; -+ } -+ ixs->skb = NULL; -+ return IPSEC_XMIT_IPSENDFAILURE; -+ } -+ } -+#else /* NETDEV_23 */ /* 2.2 kernels */ -+ ip_send(ixs->skb); -+#endif /* NETDEV_23 */ -+ if(ixs->stats) { -+ ixs->stats->tx_packets++; -+ } -+ -+ ixs->skb = NULL; -+ -+ return IPSEC_XMIT_OK; -+} -+ -+#ifdef NETDEV_25 -+enum ipsec_xmit_value -+ipsec_tunnel_send(struct ipsec_xmit_state *ixs) -+{ -+ struct flowi fl; -+ memset(&fl, 0, sizeof(fl)); -+ -+ /* new route/dst cache code from James Morris */ -+ ixs->skb->dev = ixs->physdev; -+ fl.oif = ixs->physdev->iflink; -+ -+ return ipsec_xmit_send(ixs, &fl); -+} -+#else -+enum ipsec_xmit_value -+ipsec_tunnel_send(struct ipsec_xmit_state *ixs) -+{ -+ return ipsec_xmit_send(ixs, NULL); -+} -+#endif -+ -+ -+/* -+ * here is a state machine to handle encapsulation -+ * basically we keep getting re-entered until processing is -+ * complete. For the simple case we step down the states and finish. -+ * each state is ideally some logical part of the process. 
If a state -+ * can pend (ie., require async processing to complete), then this -+ * should be the part of last action before it returns IPSEC_RCV_PENDING -+ * -+ * Any particular action may alter the next_state in ixs to move us to -+ * a state other than the preferred "next_state", but this is the -+ * exception and is highlighted when it is done. -+ * -+ * prototypes for state action -+ */ -+ -+struct { -+ enum ipsec_xmit_value (*action)(struct ipsec_xmit_state *ixs); -+ int next_state; -+} xmit_state_table[] = { -+ [IPSEC_XSM_INIT1] = {ipsec_xmit_init1, IPSEC_XSM_INIT2 }, -+ [IPSEC_XSM_INIT2] = {ipsec_xmit_init2, IPSEC_XSM_ENCAP_INIT }, -+ [IPSEC_XSM_ENCAP_INIT] = {ipsec_xmit_encap_init, IPSEC_XSM_ENCAP_SELECT }, -+ [IPSEC_XSM_ENCAP_SELECT]= {ipsec_xmit_encap_select,IPSEC_XSM_DONE }, -+ -+#ifdef CONFIG_KLIPS_ESP -+ [IPSEC_XSM_ESP] = {ipsec_xmit_esp, IPSEC_XSM_ESP_AH }, -+ [IPSEC_XSM_ESP_AH] = {ipsec_xmit_esp_ah, IPSEC_XSM_CONT }, -+#endif -+ -+#ifdef CONFIG_KLIPS_AH -+ [IPSEC_XSM_AH] = {ipsec_xmit_ah, IPSEC_XSM_CONT }, -+#endif -+ -+#ifdef CONFIG_KLIPS_IPIP -+ [IPSEC_XSM_IPIP] = {ipsec_xmit_ipip, IPSEC_XSM_CONT }, -+#endif -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+ [IPSEC_XSM_IPCOMP] = {ipsec_xmit_ipcomp, IPSEC_XSM_CONT }, -+#endif -+ -+ [IPSEC_XSM_CONT] = {ipsec_xmit_cont, IPSEC_XSM_DONE }, -+ [IPSEC_XSM_DONE] = {NULL, IPSEC_XSM_DONE}, -+}; -+ -+ -+ -+void -+ipsec_xsm(struct ipsec_xmit_state *ixs) -+{ -+ enum ipsec_xmit_value stat = IPSEC_XMIT_ENCAPFAIL; -+ -+ if (ixs == NULL) { -+ KLIPS_PRINT(debug_tunnel, "klips_debug:ipsec_xsm: ixs == NULL.\n"); -+ return; -+ } -+ -+ /* -+ * make sure nothing is removed from underneath us -+ */ -+ spin_lock_bh(&tdb_lock); -+ -+ /* -+ * if we have a valid said, then we must check it here to ensure it -+ * hasn't gone away while we were waiting for a task to complete -+ */ -+ -+ if (ixs->ipsp && ipsec_sa_getbyid(&ixs->outgoing_said) == NULL) { -+ KLIPS_PRINT(debug_tunnel, -+ "klips_debug:ipsec_xsm: " -+ "no ipsec_sa for SA:%s: 
outgoing packet with no SA dropped\n", -+ ixs->sa_len ? ixs->sa_txt : " (error)"); -+ if (ixs->stats) -+ ixs->stats->tx_dropped++; -+ -+ /* drop through and cleanup */ -+ stat = IPSEC_XMIT_SAIDNOTFOUND; -+ ixs->state = IPSEC_XSM_DONE; -+ } -+ -+ while (ixs->state != IPSEC_XSM_DONE) { -+ -+ ixs->next_state = xmit_state_table[ixs->state].next_state; -+ -+ stat = xmit_state_table[ixs->state].action(ixs); -+ -+ if (stat == IPSEC_XMIT_OK) { -+ /* some functions change the next state, see the state table */ -+ ixs->state = ixs->next_state; -+ } else if (stat == IPSEC_XMIT_PENDING) { -+ /* -+ * things are on hold until we return here in the next/new state -+ * we check our SA is valid when we return -+ */ -+ spin_unlock_bh(&tdb_lock); -+ return; -+ } else { -+ /* bad result, force state change to done */ -+ KLIPS_PRINT(debug_tunnel, -+ "klips_debug:ipsec_xsm: " -+ "processing completed due to %s.\n", -+ ipsec_xmit_err(stat)); -+ ixs->state = IPSEC_XSM_DONE; -+ } -+ } -+ -+ /* -+ * all done with anything needing locks -+ */ -+ spin_unlock_bh(&tdb_lock); -+ -+ /* we are done with this SA */ -+ if (ixs->ipsp) { -+ ipsec_sa_put(ixs->ipsp); -+ ixs->ipsp = NULL; -+ } -+ -+ /* -+ * let the caller continue with their processing -+ */ -+ ixs->xsm_complete(ixs, stat); -+} -+ -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/match586.S Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,357 @@ -+/* match.s -- Pentium-optimized version of longest_match() -+ * Written for zlib 1.1.2 -+ * Copyright (C) 1998 Brian Raiter -+ * -+ * This is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License. 
-+ */ -+ -+#ifndef NO_UNDERLINE -+#define match_init _ipcomp_match_init -+#define longest_match _ipcomp_longest_match -+#else -+#define match_init ipcomp_match_init -+#define longest_match ipcomp_longest_match -+#endif -+ -+#define MAX_MATCH (258) -+#define MIN_MATCH (3) -+#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) -+#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) -+ -+/* stack frame offsets */ -+ -+#define wmask 0 /* local copy of s->wmask */ -+#define window 4 /* local copy of s->window */ -+#define windowbestlen 8 /* s->window + bestlen */ -+#define chainlenscanend 12 /* high word: current chain len */ -+ /* low word: last bytes sought */ -+#define scanstart 16 /* first two bytes of string */ -+#define scanalign 20 /* dword-misalignment of string */ -+#define nicematch 24 /* a good enough match size */ -+#define bestlen 28 /* size of best match so far */ -+#define scan 32 /* ptr to string wanting match */ -+ -+#define LocalVarsSize (36) -+/* saved ebx 36 */ -+/* saved edi 40 */ -+/* saved esi 44 */ -+/* saved ebp 48 */ -+/* return address 52 */ -+#define deflatestate 56 /* the function arguments */ -+#define curmatch 60 -+ -+/* Offsets for fields in the deflate_state structure. These numbers -+ * are calculated from the definition of deflate_state, with the -+ * assumption that the compiler will dword-align the fields. (Thus, -+ * changing the definition of deflate_state could easily cause this -+ * program to crash horribly, without so much as a warning at -+ * compile time. Sigh.) 
-+ */ -+#define dsWSize 36 -+#define dsWMask 44 -+#define dsWindow 48 -+#define dsPrev 56 -+#define dsMatchLen 88 -+#define dsPrevMatch 92 -+#define dsStrStart 100 -+#define dsMatchStart 104 -+#define dsLookahead 108 -+#define dsPrevLen 112 -+#define dsMaxChainLen 116 -+#define dsGoodMatch 132 -+#define dsNiceMatch 136 -+ -+ -+.file "match.S" -+ -+.globl match_init, longest_match -+ -+.text -+ -+/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ -+ -+longest_match: -+ -+/* Save registers that the compiler may be using, and adjust %esp to */ -+/* make room for our stack frame. */ -+ -+ pushl %ebp -+ pushl %edi -+ pushl %esi -+ pushl %ebx -+ subl $LocalVarsSize, %esp -+ -+/* Retrieve the function arguments. %ecx will hold cur_match */ -+/* throughout the entire function. %edx will hold the pointer to the */ -+/* deflate_state structure during the function's setup (before */ -+/* entering the main loop). */ -+ -+ movl deflatestate(%esp), %edx -+ movl curmatch(%esp), %ecx -+ -+/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ -+ -+ movl dsNiceMatch(%edx), %eax -+ movl dsLookahead(%edx), %ebx -+ cmpl %eax, %ebx -+ jl LookaheadLess -+ movl %eax, %ebx -+LookaheadLess: movl %ebx, nicematch(%esp) -+ -+/* register Bytef *scan = s->window + s->strstart; */ -+ -+ movl dsWindow(%edx), %esi -+ movl %esi, window(%esp) -+ movl dsStrStart(%edx), %ebp -+ lea (%esi,%ebp), %edi -+ movl %edi, scan(%esp) -+ -+/* Determine how many bytes the scan ptr is off from being */ -+/* dword-aligned. */ -+ -+ movl %edi, %eax -+ negl %eax -+ andl $3, %eax -+ movl %eax, scanalign(%esp) -+ -+/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 
*/ -+/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ -+ -+ movl dsWSize(%edx), %eax -+ subl $MIN_LOOKAHEAD, %eax -+ subl %eax, %ebp -+ jg LimitPositive -+ xorl %ebp, %ebp -+LimitPositive: -+ -+/* unsigned chain_length = s->max_chain_length; */ -+/* if (s->prev_length >= s->good_match) { */ -+/* chain_length >>= 2; */ -+/* } */ -+ -+ movl dsPrevLen(%edx), %eax -+ movl dsGoodMatch(%edx), %ebx -+ cmpl %ebx, %eax -+ movl dsMaxChainLen(%edx), %ebx -+ jl LastMatchGood -+ shrl $2, %ebx -+LastMatchGood: -+ -+/* chainlen is decremented once beforehand so that the function can */ -+/* use the sign flag instead of the zero flag for the exit test. */ -+/* It is then shifted into the high word, to make room for the scanend */ -+/* scanend value, which it will always accompany. */ -+ -+ decl %ebx -+ shll $16, %ebx -+ -+/* int best_len = s->prev_length; */ -+ -+ movl dsPrevLen(%edx), %eax -+ movl %eax, bestlen(%esp) -+ -+/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ -+ -+ addl %eax, %esi -+ movl %esi, windowbestlen(%esp) -+ -+/* register ush scan_start = *(ushf*)scan; */ -+/* register ush scan_end = *(ushf*)(scan+best_len-1); */ -+ -+ movw (%edi), %bx -+ movw %bx, scanstart(%esp) -+ movw -1(%edi,%eax), %bx -+ movl %ebx, chainlenscanend(%esp) -+ -+/* Posf *prev = s->prev; */ -+/* uInt wmask = s->w_mask; */ -+ -+ movl dsPrev(%edx), %edi -+ movl dsWMask(%edx), %edx -+ mov %edx, wmask(%esp) -+ -+/* Jump into the main loop. */ -+ -+ jmp LoopEntry -+ -+.balign 16 -+ -+/* do { -+ * match = s->window + cur_match; -+ * if (*(ushf*)(match+best_len-1) != scan_end || -+ * *(ushf*)match != scan_start) continue; -+ * [...] -+ * } while ((cur_match = prev[cur_match & wmask]) > limit -+ * && --chain_length != 0); -+ * -+ * Here is the inner loop of the function. The function will spend the -+ * majority of its time in this loop, and majority of that time will -+ * be spent in the first ten instructions. 
-+ * -+ * Within this loop: -+ * %ebx = chainlenscanend - i.e., ((chainlen << 16) | scanend) -+ * %ecx = curmatch -+ * %edx = curmatch & wmask -+ * %esi = windowbestlen - i.e., (window + bestlen) -+ * %edi = prev -+ * %ebp = limit -+ * -+ * Two optimization notes on the choice of instructions: -+ * -+ * The first instruction uses a 16-bit address, which costs an extra, -+ * unpairable cycle. This is cheaper than doing a 32-bit access and -+ * zeroing the high word, due to the 3-cycle misalignment penalty which -+ * would occur half the time. This also turns out to be cheaper than -+ * doing two separate 8-bit accesses, as the memory is so rarely in the -+ * L1 cache. -+ * -+ * The window buffer, however, apparently spends a lot of time in the -+ * cache, and so it is faster to retrieve the word at the end of the -+ * match string with two 8-bit loads. The instructions that test the -+ * word at the beginning of the match string, however, are executed -+ * much less frequently, and there it was cheaper to use 16-bit -+ * instructions, which avoided the necessity of saving off and -+ * subsequently reloading one of the other registers. -+ */ -+LookupLoop: -+ /* 1 U & V */ -+ movw (%edi,%edx,2), %cx /* 2 U pipe */ -+ movl wmask(%esp), %edx /* 2 V pipe */ -+ cmpl %ebp, %ecx /* 3 U pipe */ -+ jbe LeaveNow /* 3 V pipe */ -+ subl $0x00010000, %ebx /* 4 U pipe */ -+ js LeaveNow /* 4 V pipe */ -+LoopEntry: movb -1(%esi,%ecx), %al /* 5 U pipe */ -+ andl %ecx, %edx /* 5 V pipe */ -+ cmpb %bl, %al /* 6 U pipe */ -+ jnz LookupLoop /* 6 V pipe */ -+ movb (%esi,%ecx), %ah -+ cmpb %bh, %ah -+ jnz LookupLoop -+ movl window(%esp), %eax -+ movw (%eax,%ecx), %ax -+ cmpw scanstart(%esp), %ax -+ jnz LookupLoop -+ -+/* Store the current value of chainlen. */ -+ -+ movl %ebx, chainlenscanend(%esp) -+ -+/* Point %edi to the string under scrutiny, and %esi to the string we */ -+/* are hoping to match it up with. 
In actuality, %esi and %edi are */ -+/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ -+/* initialized to -(MAX_MATCH_8 - scanalign). */ -+ -+ movl window(%esp), %esi -+ movl scan(%esp), %edi -+ addl %ecx, %esi -+ movl scanalign(%esp), %eax -+ movl $(-MAX_MATCH_8), %edx -+ lea MAX_MATCH_8(%edi,%eax), %edi -+ lea MAX_MATCH_8(%esi,%eax), %esi -+ -+/* Test the strings for equality, 8 bytes at a time. At the end, -+ * adjust %edx so that it is offset to the exact byte that mismatched. -+ * -+ * We already know at this point that the first three bytes of the -+ * strings match each other, and they can be safely passed over before -+ * starting the compare loop. So what this code does is skip over 0-3 -+ * bytes, as much as necessary in order to dword-align the %edi -+ * pointer. (%esi will still be misaligned three times out of four.) -+ * -+ * It should be confessed that this loop usually does not represent -+ * much of the total running time. Replacing it with a more -+ * straightforward "rep cmpsb" would not drastically degrade -+ * performance. -+ */ -+LoopCmps: -+ movl (%esi,%edx), %eax -+ movl (%edi,%edx), %ebx -+ xorl %ebx, %eax -+ jnz LeaveLoopCmps -+ movl 4(%esi,%edx), %eax -+ movl 4(%edi,%edx), %ebx -+ xorl %ebx, %eax -+ jnz LeaveLoopCmps4 -+ addl $8, %edx -+ jnz LoopCmps -+ jmp LenMaximum -+LeaveLoopCmps4: addl $4, %edx -+LeaveLoopCmps: testl $0x0000FFFF, %eax -+ jnz LenLower -+ addl $2, %edx -+ shrl $16, %eax -+LenLower: subb $1, %al -+ adcl $0, %edx -+ -+/* Calculate the length of the match. If it is longer than MAX_MATCH, */ -+/* then automatically accept it as the best possible match and leave. */ -+ -+ lea (%edi,%edx), %eax -+ movl scan(%esp), %edi -+ subl %edi, %eax -+ cmpl $MAX_MATCH, %eax -+ jge LenMaximum -+ -+/* If the length of the match is not longer than the best match we */ -+/* have so far, then forget it and return to the lookup loop. 
*/ -+ -+ movl deflatestate(%esp), %edx -+ movl bestlen(%esp), %ebx -+ cmpl %ebx, %eax -+ jg LongerMatch -+ movl chainlenscanend(%esp), %ebx -+ movl windowbestlen(%esp), %esi -+ movl dsPrev(%edx), %edi -+ movl wmask(%esp), %edx -+ andl %ecx, %edx -+ jmp LookupLoop -+ -+/* s->match_start = cur_match; */ -+/* best_len = len; */ -+/* if (len >= nice_match) break; */ -+/* scan_end = *(ushf*)(scan+best_len-1); */ -+ -+LongerMatch: movl nicematch(%esp), %ebx -+ movl %eax, bestlen(%esp) -+ movl %ecx, dsMatchStart(%edx) -+ cmpl %ebx, %eax -+ jge LeaveNow -+ movl window(%esp), %esi -+ addl %eax, %esi -+ movl %esi, windowbestlen(%esp) -+ movl chainlenscanend(%esp), %ebx -+ movw -1(%edi,%eax), %bx -+ movl dsPrev(%edx), %edi -+ movl %ebx, chainlenscanend(%esp) -+ movl wmask(%esp), %edx -+ andl %ecx, %edx -+ jmp LookupLoop -+ -+/* Accept the current string, with the maximum possible length. */ -+ -+LenMaximum: movl deflatestate(%esp), %edx -+ movl $MAX_MATCH, bestlen(%esp) -+ movl %ecx, dsMatchStart(%edx) -+ -+/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ -+/* return s->lookahead; */ -+ -+LeaveNow: -+ movl deflatestate(%esp), %edx -+ movl bestlen(%esp), %ebx -+ movl dsLookahead(%edx), %eax -+ cmpl %eax, %ebx -+ jg LookaheadRet -+ movl %ebx, %eax -+LookaheadRet: -+ -+/* Restore the stack and return from whence we came. */ -+ -+ addl $LocalVarsSize, %esp -+ popl %ebx -+ popl %esi -+ popl %edi -+ popl %ebp -+match_init: ret ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/match686.S Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,330 @@ -+/* match.s -- Pentium-Pro-optimized version of longest_match() -+ * Written for zlib 1.1.2 -+ * Copyright (C) 1998 Brian Raiter -+ * -+ * This is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License. 
-+ */ -+ -+#ifndef NO_UNDERLINE -+#define match_init _ipcomp_match_init -+#define longest_match _ipcomp_longest_match -+#else -+#define match_init ipcomp_match_init -+#define longest_match ipcomp_longest_match -+#endif -+ -+#define MAX_MATCH (258) -+#define MIN_MATCH (3) -+#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) -+#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) -+ -+/* stack frame offsets */ -+ -+#define chainlenwmask 0 /* high word: current chain len */ -+ /* low word: s->wmask */ -+#define window 4 /* local copy of s->window */ -+#define windowbestlen 8 /* s->window + bestlen */ -+#define scanstart 16 /* first two bytes of string */ -+#define scanend 12 /* last two bytes of string */ -+#define scanalign 20 /* dword-misalignment of string */ -+#define nicematch 24 /* a good enough match size */ -+#define bestlen 28 /* size of best match so far */ -+#define scan 32 /* ptr to string wanting match */ -+ -+#define LocalVarsSize (36) -+/* saved ebx 36 */ -+/* saved edi 40 */ -+/* saved esi 44 */ -+/* saved ebp 48 */ -+/* return address 52 */ -+#define deflatestate 56 /* the function arguments */ -+#define curmatch 60 -+ -+/* Offsets for fields in the deflate_state structure. These numbers -+ * are calculated from the definition of deflate_state, with the -+ * assumption that the compiler will dword-align the fields. (Thus, -+ * changing the definition of deflate_state could easily cause this -+ * program to crash horribly, without so much as a warning at -+ * compile time. Sigh.) 
-+ */ -+#define dsWSize 36 -+#define dsWMask 44 -+#define dsWindow 48 -+#define dsPrev 56 -+#define dsMatchLen 88 -+#define dsPrevMatch 92 -+#define dsStrStart 100 -+#define dsMatchStart 104 -+#define dsLookahead 108 -+#define dsPrevLen 112 -+#define dsMaxChainLen 116 -+#define dsGoodMatch 132 -+#define dsNiceMatch 136 -+ -+ -+.file "match.S" -+ -+.globl match_init, longest_match -+ -+.text -+ -+/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ -+ -+longest_match: -+ -+/* Save registers that the compiler may be using, and adjust %esp to */ -+/* make room for our stack frame. */ -+ -+ pushl %ebp -+ pushl %edi -+ pushl %esi -+ pushl %ebx -+ subl $LocalVarsSize, %esp -+ -+/* Retrieve the function arguments. %ecx will hold cur_match */ -+/* throughout the entire function. %edx will hold the pointer to the */ -+/* deflate_state structure during the function's setup (before */ -+/* entering the main loop). */ -+ -+ movl deflatestate(%esp), %edx -+ movl curmatch(%esp), %ecx -+ -+/* uInt wmask = s->w_mask; */ -+/* unsigned chain_length = s->max_chain_length; */ -+/* if (s->prev_length >= s->good_match) { */ -+/* chain_length >>= 2; */ -+/* } */ -+ -+ movl dsPrevLen(%edx), %eax -+ movl dsGoodMatch(%edx), %ebx -+ cmpl %ebx, %eax -+ movl dsWMask(%edx), %eax -+ movl dsMaxChainLen(%edx), %ebx -+ jl LastMatchGood -+ shrl $2, %ebx -+LastMatchGood: -+ -+/* chainlen is decremented once beforehand so that the function can */ -+/* use the sign flag instead of the zero flag for the exit test. */ -+/* It is then shifted into the high word, to make room for the wmask */ -+/* value, which it will always accompany. 
*/ -+ -+ decl %ebx -+ shll $16, %ebx -+ orl %eax, %ebx -+ movl %ebx, chainlenwmask(%esp) -+ -+/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ -+ -+ movl dsNiceMatch(%edx), %eax -+ movl dsLookahead(%edx), %ebx -+ cmpl %eax, %ebx -+ jl LookaheadLess -+ movl %eax, %ebx -+LookaheadLess: movl %ebx, nicematch(%esp) -+ -+/* register Bytef *scan = s->window + s->strstart; */ -+ -+ movl dsWindow(%edx), %esi -+ movl %esi, window(%esp) -+ movl dsStrStart(%edx), %ebp -+ lea (%esi,%ebp), %edi -+ movl %edi, scan(%esp) -+ -+/* Determine how many bytes the scan ptr is off from being */ -+/* dword-aligned. */ -+ -+ movl %edi, %eax -+ negl %eax -+ andl $3, %eax -+ movl %eax, scanalign(%esp) -+ -+/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ -+/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ -+ -+ movl dsWSize(%edx), %eax -+ subl $MIN_LOOKAHEAD, %eax -+ subl %eax, %ebp -+ jg LimitPositive -+ xorl %ebp, %ebp -+LimitPositive: -+ -+/* int best_len = s->prev_length; */ -+ -+ movl dsPrevLen(%edx), %eax -+ movl %eax, bestlen(%esp) -+ -+/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ -+ -+ addl %eax, %esi -+ movl %esi, windowbestlen(%esp) -+ -+/* register ush scan_start = *(ushf*)scan; */ -+/* register ush scan_end = *(ushf*)(scan+best_len-1); */ -+/* Posf *prev = s->prev; */ -+ -+ movzwl (%edi), %ebx -+ movl %ebx, scanstart(%esp) -+ movzwl -1(%edi,%eax), %ebx -+ movl %ebx, scanend(%esp) -+ movl dsPrev(%edx), %edi -+ -+/* Jump into the main loop. */ -+ -+ movl chainlenwmask(%esp), %edx -+ jmp LoopEntry -+ -+.balign 16 -+ -+/* do { -+ * match = s->window + cur_match; -+ * if (*(ushf*)(match+best_len-1) != scan_end || -+ * *(ushf*)match != scan_start) continue; -+ * [...] -+ * } while ((cur_match = prev[cur_match & wmask]) > limit -+ * && --chain_length != 0); -+ * -+ * Here is the inner loop of the function. 
The function will spend the -+ * majority of its time in this loop, and majority of that time will -+ * be spent in the first ten instructions. -+ * -+ * Within this loop: -+ * %ebx = scanend -+ * %ecx = curmatch -+ * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) -+ * %esi = windowbestlen - i.e., (window + bestlen) -+ * %edi = prev -+ * %ebp = limit -+ */ -+LookupLoop: -+ andl %edx, %ecx -+ movzwl (%edi,%ecx,2), %ecx -+ cmpl %ebp, %ecx -+ jbe LeaveNow -+ subl $0x00010000, %edx -+ js LeaveNow -+LoopEntry: movzwl -1(%esi,%ecx), %eax -+ cmpl %ebx, %eax -+ jnz LookupLoop -+ movl window(%esp), %eax -+ movzwl (%eax,%ecx), %eax -+ cmpl scanstart(%esp), %eax -+ jnz LookupLoop -+ -+/* Store the current value of chainlen. */ -+ -+ movl %edx, chainlenwmask(%esp) -+ -+/* Point %edi to the string under scrutiny, and %esi to the string we */ -+/* are hoping to match it up with. In actuality, %esi and %edi are */ -+/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ -+/* initialized to -(MAX_MATCH_8 - scanalign). */ -+ -+ movl window(%esp), %esi -+ movl scan(%esp), %edi -+ addl %ecx, %esi -+ movl scanalign(%esp), %eax -+ movl $(-MAX_MATCH_8), %edx -+ lea MAX_MATCH_8(%edi,%eax), %edi -+ lea MAX_MATCH_8(%esi,%eax), %esi -+ -+/* Test the strings for equality, 8 bytes at a time. At the end, -+ * adjust %edx so that it is offset to the exact byte that mismatched. -+ * -+ * We already know at this point that the first three bytes of the -+ * strings match each other, and they can be safely passed over before -+ * starting the compare loop. So what this code does is skip over 0-3 -+ * bytes, as much as necessary in order to dword-align the %edi -+ * pointer. (%esi will still be misaligned three times out of four.) -+ * -+ * It should be confessed that this loop usually does not represent -+ * much of the total running time. Replacing it with a more -+ * straightforward "rep cmpsb" would not drastically degrade -+ * performance. 
-+ */ -+LoopCmps: -+ movl (%esi,%edx), %eax -+ xorl (%edi,%edx), %eax -+ jnz LeaveLoopCmps -+ movl 4(%esi,%edx), %eax -+ xorl 4(%edi,%edx), %eax -+ jnz LeaveLoopCmps4 -+ addl $8, %edx -+ jnz LoopCmps -+ jmp LenMaximum -+LeaveLoopCmps4: addl $4, %edx -+LeaveLoopCmps: testl $0x0000FFFF, %eax -+ jnz LenLower -+ addl $2, %edx -+ shrl $16, %eax -+LenLower: subb $1, %al -+ adcl $0, %edx -+ -+/* Calculate the length of the match. If it is longer than MAX_MATCH, */ -+/* then automatically accept it as the best possible match and leave. */ -+ -+ lea (%edi,%edx), %eax -+ movl scan(%esp), %edi -+ subl %edi, %eax -+ cmpl $MAX_MATCH, %eax -+ jge LenMaximum -+ -+/* If the length of the match is not longer than the best match we */ -+/* have so far, then forget it and return to the lookup loop. */ -+ -+ movl deflatestate(%esp), %edx -+ movl bestlen(%esp), %ebx -+ cmpl %ebx, %eax -+ jg LongerMatch -+ movl windowbestlen(%esp), %esi -+ movl dsPrev(%edx), %edi -+ movl scanend(%esp), %ebx -+ movl chainlenwmask(%esp), %edx -+ jmp LookupLoop -+ -+/* s->match_start = cur_match; */ -+/* best_len = len; */ -+/* if (len >= nice_match) break; */ -+/* scan_end = *(ushf*)(scan+best_len-1); */ -+ -+LongerMatch: movl nicematch(%esp), %ebx -+ movl %eax, bestlen(%esp) -+ movl %ecx, dsMatchStart(%edx) -+ cmpl %ebx, %eax -+ jge LeaveNow -+ movl window(%esp), %esi -+ addl %eax, %esi -+ movl %esi, windowbestlen(%esp) -+ movzwl -1(%edi,%eax), %ebx -+ movl dsPrev(%edx), %edi -+ movl %ebx, scanend(%esp) -+ movl chainlenwmask(%esp), %edx -+ jmp LookupLoop -+ -+/* Accept the current string, with the maximum possible length. 
*/ -+ -+LenMaximum: movl deflatestate(%esp), %edx -+ movl $MAX_MATCH, bestlen(%esp) -+ movl %ecx, dsMatchStart(%edx) -+ -+/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ -+/* return s->lookahead; */ -+ -+LeaveNow: -+ movl deflatestate(%esp), %edx -+ movl bestlen(%esp), %ebx -+ movl dsLookahead(%edx), %eax -+ cmpl %eax, %ebx -+ jg LookaheadRet -+ movl %ebx, %eax -+LookaheadRet: -+ -+/* Restore the stack and return from whence we came. */ -+ -+ addl $LocalVarsSize, %esp -+ popl %ebx -+ popl %esi -+ popl %edi -+ popl %ebp -+match_init: ret ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/pfkey_v2.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,1587 @@ -+/* -+ * @(#) RFC2367 PF_KEYv2 Key management API domain socket I/F -+ * Copyright (C) 1999, 2000, 2001 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ */ -+ -+/* -+ * Template from /usr/src/linux-2.0.36/net/unix/af_unix.c. -+ * Hints from /usr/src/linux-2.0.36/net/ipv4/udp.c. 
-+ */ -+ -+#define __NO_VERSION__ -+#include -+#include -+#ifndef AUTOCONF_INCLUDED -+# include -+#endif -+#include -+ -+#include "openswan/ipsec_param.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include /* struct socket */ -+#include -+#include -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#ifdef CONFIG_X86 -+# include -+#endif -+#include -+#include -+#include /* struct sock */ -+#include -+/* #include */ -+#include -+#ifdef CONFIG_PROC_FS -+# include -+#endif /* CONFIG_PROC_FS */ -+#ifdef HAVE_SEQ_FILE -+# include -+#endif -+ -+#include -+ -+#include -+ -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_sa.h" -+ -+#include -+#include -+ -+#include "openswan/ipsec_proto.h" -+#include "openswan/ipsec_kern24.h" -+#include "openswan/ipsec_sysctl.h" -+ -+#define SENDERR(_x) do { error = -(_x); goto errlab; } while (0) -+ -+#if 0 -+#ifndef SOCKOPS_WRAPPED -+#define SOCKOPS_WRAPPED(name) name -+#endif /* SOCKOPS_WRAPPED */ -+#endif -+ -+extern struct proto_ops SOCKOPS_WRAPPED(pfkey_ops); -+ -+#ifdef NET_26 -+static rwlock_t pfkey_sock_lock = RW_LOCK_UNLOCKED; -+HLIST_HEAD(pfkey_sock_list); -+static DECLARE_WAIT_QUEUE_HEAD(pfkey_sock_wait); -+static atomic_t pfkey_sock_users = ATOMIC_INIT(0); -+#else -+struct sock *pfkey_sock_list = NULL; -+#endif -+ -+struct supported_list *pfkey_supported_list[K_SADB_SATYPE_MAX+1]; -+ -+struct socket_list *pfkey_open_sockets = NULL; -+struct socket_list *pfkey_registered_sockets[K_SADB_SATYPE_MAX+1]; -+ -+int pfkey_msg_interp(struct sock *, struct sadb_msg *); -+ -+#ifdef NET_26_24_SKALLOC -+DEBUG_NO_STATIC int pfkey_create(struct net *net, struct socket *sock, int protocol); -+#else -+DEBUG_NO_STATIC int pfkey_create(struct socket *sock, int protocol); -+#endif -+DEBUG_NO_STATIC int pfkey_shutdown(struct socket 
*sock, int mode); -+DEBUG_NO_STATIC int pfkey_release(struct socket *sock); -+ -+#ifdef NET_26 -+DEBUG_NO_STATIC int pfkey_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len); -+DEBUG_NO_STATIC int pfkey_recvmsg(struct kiocb *kiocb, struct socket *sock, struct msghdr *msg -+ , size_t size, int flags); -+#else -+DEBUG_NO_STATIC int pfkey_sendmsg(struct socket *sock, struct msghdr *msg, int len, struct scm_cookie *scm); -+DEBUG_NO_STATIC int pfkey_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags, struct scm_cookie *scm); -+#endif -+ -+struct net_proto_family pfkey_family_ops = { -+ .owner = THIS_MODULE, -+ .family = PF_KEY, -+ .create = pfkey_create -+}; -+ -+struct proto_ops SOCKOPS_WRAPPED(pfkey_ops) = { -+ owner: THIS_MODULE, -+ family: PF_KEY, -+ release: pfkey_release, -+ bind: sock_no_bind, -+ connect: sock_no_connect, -+ socketpair: sock_no_socketpair, -+ accept: sock_no_accept, -+ getname: sock_no_getname, -+ poll: datagram_poll, -+ ioctl: sock_no_ioctl, -+ listen: sock_no_listen, -+ shutdown: pfkey_shutdown, -+ setsockopt: sock_no_setsockopt, -+ getsockopt: sock_no_getsockopt, -+ sendmsg: pfkey_sendmsg, -+ recvmsg: pfkey_recvmsg, -+ mmap: sock_no_mmap, -+}; -+ -+#include -+SOCKOPS_WRAP(pfkey, PF_KEY); -+ -+#ifdef NET_26 -+static void pfkey_sock_list_grab(void) -+{ -+ write_lock_bh(&pfkey_sock_lock); -+ -+ if (atomic_read(&pfkey_sock_users)) { -+ DECLARE_WAITQUEUE(wait, current); -+ -+ add_wait_queue_exclusive(&pfkey_sock_wait, &wait); -+ for(;;) { -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ if (atomic_read(&pfkey_sock_users) == 0) -+ break; -+ write_unlock_bh(&pfkey_sock_lock); -+ schedule(); -+ write_lock_bh(&pfkey_sock_lock); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+ remove_wait_queue(&pfkey_sock_wait, &wait); -+ } -+} -+ -+static __inline__ void pfkey_sock_list_ungrab(void) -+{ -+ write_unlock_bh(&pfkey_sock_lock); -+ wake_up(&pfkey_sock_wait); -+} -+ -+static __inline__ void 
pfkey_lock_sock_list(void) -+{ -+ /* read_lock() synchronizes us to pfkey_table_grab */ -+ -+ read_lock(&pfkey_sock_lock); -+ atomic_inc(&pfkey_sock_users); -+ read_unlock(&pfkey_sock_lock); -+} -+ -+static __inline__ void pfkey_unlock_sock_list(void) -+{ -+ if (atomic_dec_and_test(&pfkey_sock_users)) -+ wake_up(&pfkey_sock_wait); -+} -+#endif -+ -+int -+pfkey_list_remove_socket(struct socket *socketp, struct socket_list **sockets) -+{ -+ struct socket_list *socket_listp,*prev; -+ -+ if(!socketp) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_remove_socket: " -+ "NULL socketp handed in, failed.\n"); -+ return -EINVAL; -+ } -+ -+ if(!sockets) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_remove_socket: " -+ "NULL sockets list handed in, failed.\n"); -+ return -EINVAL; -+ } -+ -+ socket_listp = *sockets; -+ prev = NULL; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_remove_socket: " -+ "removing sock=0p%p\n", -+ socketp); -+ -+ while(socket_listp != NULL) { -+ if(socket_listp->socketp == socketp) { -+ if(prev != NULL) { -+ prev->next = socket_listp->next; -+ } else { -+ *sockets = socket_listp->next; -+ } -+ -+ kfree((void*)socket_listp); -+ -+ break; -+ } -+ prev = socket_listp; -+ socket_listp = socket_listp->next; -+ } -+ -+ return 0; -+} -+ -+int -+pfkey_list_insert_socket(struct socket *socketp, struct socket_list **sockets) -+{ -+ struct socket_list *socket_listp; -+ -+ if(!socketp) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_insert_socket: " -+ "NULL socketp handed in, failed.\n"); -+ return -EINVAL; -+ } -+ -+ if(!sockets) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_insert_socket: " -+ "NULL sockets list handed in, failed.\n"); -+ return -EINVAL; -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_insert_socket: " -+ "allocating %lu bytes for socketp=0p%p\n", -+ (unsigned long) sizeof(struct socket_list), -+ socketp); -+ -+ if((socket_listp = (struct socket_list *)kmalloc(sizeof(struct 
socket_list), GFP_KERNEL)) == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_insert_socket: " -+ "memory allocation error.\n"); -+ return -ENOMEM; -+ } -+ -+ socket_listp->socketp = socketp; -+ socket_listp->next = *sockets; -+ *sockets = socket_listp; -+ -+ return 0; -+} -+ -+int -+pfkey_list_remove_supported(struct ipsec_alg_supported *supported, struct supported_list **supported_list) -+{ -+ struct supported_list *supported_listp = *supported_list, *prev = NULL; -+ -+ if(!supported) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_remove_supported: " -+ "NULL supported handed in, failed.\n"); -+ return -EINVAL; -+ } -+ -+ if(!supported_list) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_remove_supported: " -+ "NULL supported_list handed in, failed.\n"); -+ return -EINVAL; -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_remove_supported: " -+ "removing supported=0p%p\n", -+ supported); -+ -+ while(supported_listp != NULL) { -+ if(supported_listp->supportedp == supported) { -+ if(prev != NULL) { -+ prev->next = supported_listp->next; -+ } else { -+ *supported_list = supported_listp->next; -+ } -+ -+ kfree((void*)supported_listp); -+ -+ break; -+ } -+ prev = supported_listp; -+ supported_listp = supported_listp->next; -+ } -+ -+ return 0; -+} -+ -+int -+pfkey_list_insert_supported(struct ipsec_alg_supported *supported -+ , struct supported_list **supported_list) -+{ -+ struct supported_list *supported_listp; -+ -+ if(!supported) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_insert_supported: " -+ "NULL supported handed in, failed.\n"); -+ return -EINVAL; -+ } -+ -+ if(!supported_list) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_insert_supported: " -+ "NULL supported_list handed in, failed.\n"); -+ return -EINVAL; -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_insert_supported: " -+ "allocating %lu bytes for incoming, supported=0p%p, supported_list=0p%p\n", -+ (unsigned long) 
sizeof(struct supported_list), -+ supported, -+ supported_list); -+ -+ supported_listp = (struct supported_list *)kmalloc(sizeof(struct supported_list), GFP_KERNEL); -+ -+ if(supported_listp == NULL) -+ { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_insert_supported: " -+ "memory allocation error.\n"); -+ return -ENOMEM; -+ } -+ -+ supported_listp->supportedp = supported; -+ supported_listp->next = *supported_list; -+ *supported_list = supported_listp; -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_list_insert_supported: " -+ "outgoing, supported=0p%p, supported_list=0p%p\n", -+ supported, -+ supported_list); -+ -+ return 0; -+} -+ -+#ifdef NET_26 -+DEBUG_NO_STATIC void -+pfkey_insert_socket(struct sock *sk) -+{ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_insert_socket: " -+ "sk=0p%p\n", -+ sk); -+ pfkey_sock_list_grab(); -+ sk_add_node(sk, &pfkey_sock_list); -+ pfkey_sock_list_ungrab(); -+} -+ -+DEBUG_NO_STATIC void -+pfkey_remove_socket(struct sock *sk) -+{ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_remove_socket: 0p%p\n", sk); -+ pfkey_sock_list_grab(); -+ sk_del_node_init(sk); -+ pfkey_sock_list_ungrab(); -+ return; -+} -+#else -+ -+DEBUG_NO_STATIC void -+pfkey_insert_socket(struct sock *sk) -+{ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_insert_socket: " -+ "sk=0p%p\n", -+ sk); -+ cli(); -+ sk->next=pfkey_sock_list; -+ pfkey_sock_list=sk; -+ sti(); -+} -+DEBUG_NO_STATIC void -+pfkey_remove_socket(struct sock *sk) -+{ -+ struct sock **s; -+ -+ s = NULL; -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_remove_socket: .\n"); -+ -+ cli(); -+ s=&pfkey_sock_list; -+ -+ while(*s!=NULL) { -+ if(*s==sk) { -+ *s=sk->next; -+ sk->next=NULL; -+ sti(); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_remove_socket: " -+ "succeeded.\n"); -+ return; -+ } -+ s=&((*s)->next); -+ } -+ sti(); -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_remove_socket: " -+ "not found.\n"); -+ return; -+} -+#endif -+ -+DEBUG_NO_STATIC void 
-+pfkey_destroy_socket(struct sock *sk) -+{ -+ struct sk_buff *skb; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_destroy_socket: 0p%p\n",sk); -+ pfkey_remove_socket(sk); -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_destroy_socket: " -+ "pfkey_remove_socket called, sk=0p%p\n",sk); -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_destroy_socket: " -+ "sk(0p%p)->(&0p%p)receive_queue.{next=0p%p,prev=0p%p}.\n", -+ sk, -+ &(sk->sk_receive_queue), -+ sk->sk_receive_queue.next, -+ sk->sk_receive_queue.prev); -+ -+ while(sk && ((skb=skb_dequeue(&(sk->sk_receive_queue)))!=NULL)) { -+#ifdef CONFIG_KLIPS_DEBUG -+ if(debug_pfkey && sysctl_ipsec_debug_verbose) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_destroy_socket: " -+ "skb=0p%p dequeued.\n", skb); -+ printk(KERN_INFO "klips_debug:pfkey_destroy_socket: " -+ "pfkey_skb contents:"); -+ printk(" next:0p%p", skb->next); -+ printk(" prev:0p%p", skb->prev); -+ printk(" sk:0p%p", skb->sk); -+ printk(" dev:0p%p", skb->dev); -+ if(skb->dev) { -+ if(skb->dev->name) { -+ printk(" dev->name:%s", skb->dev->name); -+ } else { -+ printk(" dev->name:NULL?"); -+ } -+ } else { -+ printk(" dev:NULL"); -+ } -+ printk(" h:0p%p", skb_transport_header(skb)); -+ printk(" nh:0p%p", skb_network_header(skb)); -+ printk(" mac:0p%p", skb_mac_header(skb)); -+ printk(" dst:0p%p", skb->dst); -+ if(sysctl_ipsec_debug_verbose) { -+ int i; -+ -+ printk(" cb"); -+ for(i=0; i<48; i++) { -+ printk(":%2x", skb->cb[i]); -+ } -+ } -+ printk(" len:%d", skb->len); -+ printk(" csum:%d", skb->csum); -+#ifndef NETDEV_23 -+ printk(" used:%d", skb->used); -+ printk(" is_clone:%d", skb->is_clone); -+#endif /* NETDEV_23 */ -+ printk(" cloned:%d", skb->cloned); -+ printk(" pkt_type:%d", skb->pkt_type); -+ printk(" ip_summed:%d", skb->ip_summed); -+ printk(" priority:%d", skb->priority); -+ printk(" protocol:%d", skb->protocol); -+#ifdef HAVE_SOCK_SECURITY -+ printk(" security:%d", skb->security); -+#endif -+ printk(" truesize:%d", skb->truesize); 
-+ printk(" head:0p%p", skb->head); -+ printk(" data:0p%p", skb->data); -+ printk(" tail:0p%p", skb_tail_pointer(skb)); -+ printk(" end:0p%p", skb_end_pointer(skb)); -+ if(sysctl_ipsec_debug_verbose) { -+ unsigned char* i; -+ printk(" data"); -+ for(i = skb->head; i < skb_end_pointer(skb); i++) { -+ printk(":%2x", (unsigned char)(*(i))); -+ } -+ } -+ printk(" destructor:0p%p", skb->destructor); -+ printk("\n"); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_destroy_socket: " -+ "skb=0p%p freed.\n", -+ skb); -+ ipsec_kfree_skb(skb); -+ } -+ -+#ifdef NET_26 -+ sock_set_flag(sk, SOCK_DEAD); -+#else -+ sk->dead = 1; -+#endif -+ sk_free(sk); -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_destroy_socket: destroyed.\n"); -+} -+ -+int -+pfkey_upmsg(struct socket *sock, struct sadb_msg *pfkey_msg) -+{ -+ struct sock *sk; -+ -+ if(sock == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_upmsg: " -+ "NULL socket passed in.\n"); -+ return -EINVAL; -+ } -+ -+ if(pfkey_msg == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_upmsg: " -+ "NULL pfkey_msg passed in.\n"); -+ return -EINVAL; -+ } -+ -+ sk = sock->sk; -+ return pfkey_upmsgsk(sk, pfkey_msg); -+} -+int -+pfkey_upmsgsk(struct sock *sk, struct sadb_msg *pfkey_msg) -+{ -+ int error = 0; -+ struct sk_buff * skb = NULL; -+ -+ if(sk == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_upmsg: " -+ "NULL sock passed in.\n"); -+ return -EINVAL; -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_upmsg: " -+ "allocating %d bytes...\n", -+ (int)(pfkey_msg->sadb_msg_len * IPSEC_PFKEYv2_ALIGN)); -+ if(!(skb = alloc_skb(pfkey_msg->sadb_msg_len * IPSEC_PFKEYv2_ALIGN, GFP_ATOMIC) )) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_upmsg: " -+ "no buffers left to send up a message.\n"); -+ return -ENOBUFS; -+ } -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_upmsg: " -+ "...allocated at 0p%p.\n", -+ skb); -+ -+ skb->dev = NULL; -+ -+ if(skb_tailroom(skb) 
< pfkey_msg->sadb_msg_len * IPSEC_PFKEYv2_ALIGN) { -+ printk(KERN_WARNING "klips_error:pfkey_upmsg: " -+ "tried to skb_put %ld, %d available. This should never happen, please report.\n", -+ (unsigned long int)pfkey_msg->sadb_msg_len * IPSEC_PFKEYv2_ALIGN, -+ skb_tailroom(skb)); -+ ipsec_kfree_skb(skb); -+ return -ENOBUFS; -+ } -+ skb_set_transport_header(skb, ipsec_skb_offset(skb, skb_put(skb, pfkey_msg->sadb_msg_len * IPSEC_PFKEYv2_ALIGN))); -+ memcpy(skb_transport_header(skb), pfkey_msg, pfkey_msg->sadb_msg_len * IPSEC_PFKEYv2_ALIGN); -+ -+ if((error = sock_queue_rcv_skb(sk, skb)) < 0) { -+ skb->sk=NULL; -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_upmsg: " -+ "error=%d calling sock_queue_rcv_skb with skb=0p%p.\n", -+ error, -+ skb); -+ ipsec_kfree_skb(skb); -+ return error; -+ } -+ return error; -+} -+ -+#if defined(NET_26_12_SKALLOC) || defined(NET_26_24_SKALLOC) -+ -+static struct proto key_proto = { -+ .name = "KEY", -+ .owner = THIS_MODULE, -+ .obj_size = sizeof(struct sock), -+ -+}; -+#endif -+#ifdef NET_26_24_SKALLOC -+DEBUG_NO_STATIC int -+pfkey_create(struct net *net, struct socket *sock, int protocol) -+#else -+DEBUG_NO_STATIC int -+pfkey_create(struct socket *sock, int protocol) -+#endif -+{ -+ struct sock *sk; -+ -+ if(sock == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_create: " -+ "socket NULL.\n"); -+ return -EINVAL; -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_create: " -+ "sock=0p%p type:%d state:%d flags:%ld protocol:%d\n", -+ sock, -+ sock->type, -+ (unsigned int)(sock->state), -+ sock->flags, protocol); -+ -+ if(sock->type != SOCK_RAW) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_create: " -+ "only SOCK_RAW supported.\n"); -+ return -ESOCKTNOSUPPORT; -+ } -+ -+ if(protocol != PF_KEY_V2) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_create: " -+ "protocol not PF_KEY_V2.\n"); -+ return -EPROTONOSUPPORT; -+ } -+ -+ if((current->uid != 0)) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_create: " -+ 
"must be root to open pfkey sockets.\n"); -+ return -EACCES; -+ } -+ -+ sock->state = SS_UNCONNECTED; -+ -+ KLIPS_INC_USE; -+ -+#ifdef NET_26 -+#ifdef NET_26_24_SKALLOC -+ sk=(struct sock *)sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto); -+#else -+#ifdef NET_26_12_SKALLOC -+ sk=(struct sock *)sk_alloc(PF_KEY, GFP_KERNEL, &key_proto, 1); -+#else -+ sk=(struct sock *)sk_alloc(PF_KEY, GFP_KERNEL, 1, NULL); -+#endif -+#endif -+#else -+ /* 2.4 interface */ -+ sk=(struct sock *)sk_alloc(PF_KEY, GFP_KERNEL, 1); -+#endif -+ -+ if(sk == NULL) -+ { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_create: " -+ "Out of memory trying to allocate.\n"); -+ KLIPS_DEC_USE; -+ return -ENOMEM; -+ } -+ -+ sock_init_data(sock, sk); -+ -+ sk->sk_destruct = NULL; -+ sk->sk_reuse = 1; -+ sock->ops = &SOCKOPS_WRAPPED(pfkey_ops); -+ -+ sk->sk_family = PF_KEY; -+/* sk->num = protocol; */ -+ sk->sk_protocol = protocol; -+ key_pid(sk) = current->pid; -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_create: " -+ "sock->fasync_list=0p%p sk->sleep=0p%p.\n", -+ sock->fasync_list, -+ sk->sk_sleep); -+ -+ pfkey_insert_socket(sk); -+ pfkey_list_insert_socket(sock, &pfkey_open_sockets); -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_create: " -+ "Socket sock=0p%p sk=0p%p initialised.\n", sock, sk); -+ return 0; -+} -+ -+DEBUG_NO_STATIC int -+#ifdef NETDEV_23 -+pfkey_release(struct socket *sock) -+#else /* NETDEV_23 */ -+pfkey_release(struct socket *sock, struct socket *peersock) -+#endif /* NETDEV_23 */ -+{ -+ struct sock *sk; -+ int i; -+ -+ if(sock==NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_release: " -+ "No socket attached.\n"); -+ return 0; /* -EINVAL; */ -+ } -+ -+ sk=sock->sk; -+ -+ /* May not have data attached */ -+ if(sk==NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_release: " -+ "No sk attached to sock=0p%p.\n", sock); -+ return 0; /* -EINVAL; */ -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_release: " -+ "sock=0p%p sk=0p%p\n", sock, sk); -+ -+ 
if(sock_flag(sk, SOCK_DEAD)) -+ if(sk->sk_state_change) { -+ sk->sk_state_change(sk); -+ } -+ -+ sock->sk = NULL; -+ -+ /* Try to flush out this socket. Throw out buffers at least */ -+ pfkey_destroy_socket(sk); -+ pfkey_list_remove_socket(sock, &pfkey_open_sockets); -+ for(i = K_SADB_SATYPE_UNSPEC; i <= K_SADB_SATYPE_MAX; i++) { -+ pfkey_list_remove_socket(sock, &(pfkey_registered_sockets[i])); -+ } -+ -+ KLIPS_DEC_USE; -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_release: " -+ "succeeded.\n"); -+ -+ return 0; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_shutdown(struct socket *sock, int mode) -+{ -+ struct sock *sk; -+ -+ if(sock == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_shutdown: " -+ "NULL socket passed in.\n"); -+ return -EINVAL; -+ } -+ -+ sk=sock->sk; -+ -+ if(sk == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_shutdown: " -+ "No sock attached to socket.\n"); -+ return -EINVAL; -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_shutdown: " -+ "mode=%x.\n", mode); -+ mode++; -+ -+ if(mode&SEND_SHUTDOWN) { -+ sk->sk_shutdown|=SEND_SHUTDOWN; -+ sk->sk_state_change(sk); -+ } -+ -+ if(mode&RCV_SHUTDOWN) { -+ sk->sk_shutdown|=RCV_SHUTDOWN; -+ sk->sk_state_change(sk); -+ } -+ return 0; -+} -+ -+/* -+ * Send PF_KEY data down. 
-+ */ -+ -+DEBUG_NO_STATIC int -+#ifdef NET_26 -+pfkey_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len) -+#else -+pfkey_sendmsg(struct socket *sock, struct msghdr *msg, int len, struct scm_cookie *scm) -+#endif -+{ -+ struct sock *sk; -+ int error = 0; -+ struct sadb_msg *pfkey_msg = NULL, *pfkey_reply = NULL; -+ -+ if(sock == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "Null socket passed in.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ sk = sock->sk; -+ -+ if(sk == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "Null sock passed in.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(msg == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "Null msghdr passed in.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: .\n"); -+ if(sk->sk_err) { -+ error = sock_error(sk); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "sk->err is non-zero, returns %d.\n", -+ error); -+ SENDERR(-error); -+ } -+ -+ if((current->uid != 0)) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "must be root to send messages to pfkey sockets.\n"); -+ SENDERR(EACCES); -+ } -+ -+ if(msg->msg_control) -+ { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "can't set flags or set msg_control.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(sk->sk_shutdown & SEND_SHUTDOWN) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "shutdown.\n"); -+ send_sig(SIGPIPE, current, 0); -+ SENDERR(EPIPE); -+ } -+ -+ if(len < sizeof(struct sadb_msg)) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "bogus msg len of %d, too small.\n", (int)len); -+ SENDERR(EMSGSIZE); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "allocating %d bytes for downward message.\n", -+ (int)len); -+ if((pfkey_msg = (struct sadb_msg*)kmalloc(len, GFP_KERNEL)) == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ 
"klips_debug:pfkey_sendmsg: " -+ "memory allocation error.\n"); -+ SENDERR(ENOBUFS); -+ } -+ -+ memcpy_fromiovec((void *)pfkey_msg, msg->msg_iov, len); -+ -+ if(pfkey_msg->sadb_msg_version != PF_KEY_V2) { -+ KLIPS_PRINT(1 || debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "not PF_KEY_V2 msg, found %d, should be %d.\n", -+ pfkey_msg->sadb_msg_version, -+ PF_KEY_V2); -+ kfree((void*)pfkey_msg); -+ return -EINVAL; -+ } -+ -+ if(len != pfkey_msg->sadb_msg_len * IPSEC_PFKEYv2_ALIGN) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "bogus msg len of %d, not %d byte aligned.\n", -+ (int)len, (int)IPSEC_PFKEYv2_ALIGN); -+ SENDERR(EMSGSIZE); -+ } -+ -+ if(pfkey_msg->sadb_msg_reserved) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "reserved field must be zero, set to %d.\n", -+ pfkey_msg->sadb_msg_reserved); -+ SENDERR(EINVAL); -+ } -+ -+ if((pfkey_msg->sadb_msg_type > K_SADB_MAX) || (!pfkey_msg->sadb_msg_type)){ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "msg type too large or small:%d.\n", -+ pfkey_msg->sadb_msg_type); -+ SENDERR(EINVAL); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "msg sent for parsing.\n"); -+ -+ if((error = pfkey_msg_interp(sk, pfkey_msg))) { -+ struct socket_list *pfkey_socketsp; -+ -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_sendmsg: " -+ "pfkey_msg_parse returns %d.\n", -+ error); -+ -+ if((pfkey_reply = (struct sadb_msg*)kmalloc(sizeof(struct sadb_msg), GFP_KERNEL)) == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "memory allocation error.\n"); -+ SENDERR(ENOBUFS); -+ } -+ memcpy((void*)pfkey_reply, (void*)pfkey_msg, sizeof(struct sadb_msg)); -+ pfkey_reply->sadb_msg_errno = -error; -+ pfkey_reply->sadb_msg_len = sizeof(struct sadb_msg) / IPSEC_PFKEYv2_ALIGN; -+ -+ for(pfkey_socketsp = pfkey_open_sockets; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ int error_upmsg = 0; -+ KLIPS_PRINT(debug_pfkey, 
"klips_debug:pfkey_sendmsg: " -+ "sending up error=%d message=0p%p to socket=0p%p.\n", -+ error, -+ pfkey_reply, -+ pfkey_socketsp->socketp); -+ if((error_upmsg = pfkey_upmsg(pfkey_socketsp->socketp, pfkey_reply))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_sendmsg: " -+ "sending up error message to socket=0p%p failed with error=%d.\n", -+ pfkey_socketsp->socketp, -+ error_upmsg); -+ /* pfkey_msg_free(&pfkey_reply); */ -+ /* SENDERR(-error); */ -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_sendmsg: " -+ "sending up error message to socket=0p%p succeeded.\n", -+ pfkey_socketsp->socketp); -+ } -+ -+ pfkey_msg_free(&pfkey_reply); -+ -+ SENDERR(-error); -+ } -+ -+ errlab: -+ if (pfkey_msg) { -+ kfree((void*)pfkey_msg); -+ } -+ -+ if(error) { -+ return error; -+ } else { -+ return len; -+ } -+} -+ -+/* -+ * Receive PF_KEY data up. -+ */ -+ -+DEBUG_NO_STATIC int -+#ifdef NET_26 -+pfkey_recvmsg(struct kiocb *kiocb -+ , struct socket *sock -+ , struct msghdr *msg -+ , size_t size -+ , int flags) -+#else -+pfkey_recvmsg(struct socket *sock -+ , struct msghdr *msg -+ , int size, int flags -+ , struct scm_cookie *scm) -+#endif -+{ -+ struct sock *sk; -+ int noblock = flags & MSG_DONTWAIT; -+ struct sk_buff *skb; -+ int error; -+ -+ if(sock == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_recvmsg: " -+ "Null socket passed in.\n"); -+ return -EINVAL; -+ } -+ -+ sk = sock->sk; -+ -+ if(sk == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_recvmsg: " -+ "Null sock passed in for sock=0p%p.\n", sock); -+ return -EINVAL; -+ } -+ -+ if(msg == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_recvmsg: " -+ "Null msghdr passed in for sock=0p%p, sk=0p%p.\n", -+ sock, sk); -+ return -EINVAL; -+ } -+ -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "klips_debug:pfkey_recvmsg: sock=0p%p sk=0p%p msg=0p%p size=%d.\n", -+ sock, sk, msg, (int)size); -+ if(flags & ~MSG_PEEK) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ 
"flags (%d) other than MSG_PEEK not supported.\n", -+ flags); -+ return -EOPNOTSUPP; -+ } -+ -+ msg->msg_namelen = 0; /* sizeof(*ska); */ -+ -+ if(sk->sk_err) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sendmsg: " -+ "sk->sk_err=%d.\n", sk->sk_err); -+ return sock_error(sk); -+ } -+ -+ if((skb = skb_recv_datagram(sk, flags, noblock, &error) ) == NULL) { -+ return error; -+ } -+ -+ if(size > skb->len) { -+ size = skb->len; -+ } -+ else if(size len) { -+ msg->msg_flags |= MSG_TRUNC; -+ } -+ -+ skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size); -+#ifdef HAVE_KERNEL_TSTAMP -+ sk->sk_stamp = skb->tstamp; -+#elif defined(HAVE_TSTAMP) -+ sk->sk_stamp.tv_sec = skb->tstamp.off_sec; -+ sk->sk_stamp.tv_usec = skb->tstamp.off_usec; -+#else -+ sk->sk_stamp=skb->stamp; -+#endif -+ -+ skb_free_datagram(sk, skb); -+ return size; -+} -+ -+#ifdef CONFIG_PROC_FS -+#ifndef PROC_FS_2325 -+DEBUG_NO_STATIC -+#endif /* PROC_FS_2325 */ -+int -+pfkey_get_info(char *buffer, char **start, off_t offset, int length -+#ifndef PROC_NO_DUMMY -+, int dummy -+#endif /* !PROC_NO_DUMMY */ -+#ifdef PROC_EOF_DATA -+, int *eof -+, void *data -+#endif -+) -+{ -+ const int max_content = length > 0? 
length-1 : 0; /* limit of useful snprintf output */ -+#ifdef NET_26 -+ struct hlist_node *node; -+#endif -+ off_t begin=0; -+ int len=0; -+ struct sock *sk; -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if(!sysctl_ipsec_debug_verbose) { -+#endif /* CONFIG_KLIPS_DEBUG */ -+ len += ipsec_snprintf(buffer, length, -+ " sock pid socket next prev e n p sndbf Flags Type St\n"); -+#ifdef CONFIG_KLIPS_DEBUG -+ } else { -+ len += ipsec_snprintf(buffer, length, -+ " sock pid d sleep socket next prev e r z n p sndbf stamp Flags Type St\n"); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ sk_for_each(sk, node, &pfkey_sock_list) { -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if(!sysctl_ipsec_debug_verbose) { -+#endif /* CONFIG_KLIPS_DEBUG */ -+ len += ipsec_snprintf(buffer+len, length-len, -+ "%8p %5d %8p %d %d %5d %08lX %8X %2X\n", -+ sk, -+ key_pid(sk), -+ sk->sk_socket, -+ sk->sk_err, -+ sk->sk_protocol, -+ sk->sk_sndbuf, -+ sk->sk_socket->flags, -+ sk->sk_socket->type, -+ sk->sk_socket->state); -+#ifdef CONFIG_KLIPS_DEBUG -+ } else { -+ struct timeval t; -+ grab_socket_timeval(t, *sk); -+ len += ipsec_snprintf(buffer+len, length-len, -+ "%8p %5d %d %8p %8p %d %d %d %d %5d %d.%06d %08lX %8X %2X\n", -+ sk, -+ key_pid(sk), -+ sock_flag(sk, SOCK_DEAD), -+ sk->sk_sleep, -+ sk->sk_socket, -+ sk->sk_err, -+ sk->sk_reuse, -+#ifdef HAVE_SOCK_ZAPPED -+ sock_flag(sk, SOCK_ZAPPED), -+#else -+ sk->sk_zapped, -+#endif -+ sk->sk_protocol, -+ sk->sk_sndbuf, -+ (unsigned int)t.tv_sec, -+ (unsigned int)t.tv_usec, -+ sk->sk_socket->flags, -+ sk->sk_socket->type, -+ sk->sk_socket->state); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ if (len >= max_content) { -+ /* we've done all that can fit -- stop loop */ -+ len = max_content; /* truncate crap */ -+ break; -+ } else { -+ const off_t pos = begin + len; /* file position of end of what we've generated */ -+ -+ if (pos <= offset) { -+ /* all is before first interesting character: -+ * discard, but note where we are. 
-+ */ -+ len = 0; -+ begin = pos; -+ } -+ } -+ } -+ -+ *start = buffer + (offset - begin); /* Start of wanted data */ -+ return len - (offset - begin); -+} -+ -+#ifndef PROC_FS_2325 -+DEBUG_NO_STATIC -+#endif /* PROC_FS_2325 */ -+int -+pfkey_supported_get_info(char *buffer, char **start, off_t offset, int length -+#ifndef PROC_NO_DUMMY -+, int dummy -+#endif /* !PROC_NO_DUMMY */ -+#ifdef PROC_EOF_DATA -+, int *eof -+, void *data -+#endif -+) -+{ -+ /* limit of useful snprintf output */ -+ const int max_content = length > 0? length-1 : 0; -+ off_t begin=0; -+ int len=0; -+ int satype; -+ struct supported_list *ps; -+ -+ len += ipsec_snprintf(buffer, length, -+ "satype exttype alg_id ivlen minbits maxbits name\n"); -+ -+ for(satype = K_SADB_SATYPE_UNSPEC; satype <= K_SADB_SATYPE_MAX; satype++) { -+ ps = pfkey_supported_list[satype]; -+ while(ps) { -+ struct ipsec_alg_supported *alg = ps->supportedp; -+ const char *n = alg->ias_name; -+ if(n == NULL) n = "unknown"; -+ -+ len += ipsec_snprintf(buffer+len, length-len, -+ " %2d %2d %2d %3d %3d %3d %20s\n", -+ satype, -+ alg->ias_exttype, -+ alg->ias_id, -+ alg->ias_ivlen, -+ alg->ias_keyminbits, -+ alg->ias_keymaxbits, -+ n); -+ -+ if (len >= max_content) { -+ /* we've done all that can fit -- stop loop */ -+ len = max_content; /* truncate crap */ -+ break; -+ } else { -+ const off_t pos = begin + len; /* file position of end of what we've generated */ -+ -+ if (pos <= offset) { -+ /* all is before first interesting character: -+ * discard, but note where we are. 
-+ */ -+ len = 0; -+ begin = pos; -+ } -+ } -+ -+ ps = ps->next; -+ } -+ } -+ *start = buffer + (offset - begin); /* Start of wanted data */ -+ return len - (offset - begin); -+} -+ -+#ifndef PROC_FS_2325 -+DEBUG_NO_STATIC -+#endif /* PROC_FS_2325 */ -+int -+pfkey_registered_get_info(char *buffer, char **start, off_t offset, int length -+#ifndef PROC_NO_DUMMY -+, int dummy -+#endif /* !PROC_NO_DUMMY */ -+#ifdef PROC_EOF_DATA -+, int *eof -+, void *data -+#endif -+) -+{ -+ const int max_content = length > 0? length-1 : 0; /* limit of useful snprintf output */ -+ off_t begin=0; -+ int len=0; -+ int satype; -+ struct socket_list *pfkey_sockets; -+ -+ len += ipsec_snprintf(buffer, length, -+ "satype socket pid sk\n"); -+ -+ for(satype = K_SADB_SATYPE_UNSPEC; satype <= K_SADB_SATYPE_MAX; satype++) { -+ pfkey_sockets = pfkey_registered_sockets[satype]; -+ while(pfkey_sockets) { -+ len += ipsec_snprintf(buffer+len, length-len, -+ " %2d %8p %5d %8p\n", -+ satype, -+ pfkey_sockets->socketp, -+ key_pid(pfkey_sockets->socketp->sk), -+ pfkey_sockets->socketp->sk); -+ -+ if (len >= max_content) { -+ /* we've done all that can fit -- stop loop (could stop two) */ -+ len = max_content; /* truncate crap */ -+ break; -+ } else { -+ const off_t pos = begin + len; /* file position of end of what we've generated */ -+ -+ if (pos <= offset) { -+ /* all is before first interesting character: -+ * discard, but note where we are. 
-+ */ -+ len = 0; -+ begin = pos; -+ } -+ } -+ -+ pfkey_sockets = pfkey_sockets->next; -+ } -+ } -+ *start = buffer + (offset - begin); /* Start of wanted data */ -+ return len - (offset - begin); -+} -+ -+#ifndef PROC_FS_2325 -+struct proc_dir_entry proc_net_pfkey = -+{ -+ 0, -+ 6, "pf_key", -+ S_IFREG | S_IRUGO, 1, 0, 0, -+ 0, &proc_net_inode_operations, -+ pfkey_get_info -+}; -+struct proc_dir_entry proc_net_pfkey_supported = -+{ -+ 0, -+ 16, "pf_key_supported", -+ S_IFREG | S_IRUGO, 1, 0, 0, -+ 0, &proc_net_inode_operations, -+ pfkey_supported_get_info -+}; -+struct proc_dir_entry proc_net_pfkey_registered = -+{ -+ 0, -+ 17, "pf_key_registered", -+ S_IFREG | S_IRUGO, 1, 0, 0, -+ 0, &proc_net_inode_operations, -+ pfkey_registered_get_info -+}; -+#endif /* !PROC_FS_2325 */ -+#endif /* CONFIG_PROC_FS */ -+ -+DEBUG_NO_STATIC int -+supported_add_all(int satype, struct ipsec_alg_supported supported[], int size) -+{ -+ int i; -+ int error = 0; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:init_pfkey: " -+ "sizeof(supported_init_)[%d]/sizeof(struct ipsec_alg_supported)[%d]=%d.\n", -+ satype, -+ size, -+ (int)sizeof(struct ipsec_alg_supported), -+ (int)(size/sizeof(struct ipsec_alg_supported))); -+ -+ for(i = 0; i < size / sizeof(struct ipsec_alg_supported); i++) { -+ -+ const char *n = supported[i].ias_name; -+ if(n == NULL) n="unknown"; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:init_pfkey: " -+ "i=%d inserting satype=%d exttype=%d id=%d ivlen=%d minbits=%d maxbits=%d name=%s.\n", -+ i, -+ satype, -+ supported[i].ias_exttype, -+ supported[i].ias_id, -+ supported[i].ias_ivlen, -+ supported[i].ias_keyminbits, -+ supported[i].ias_keymaxbits, -+ n); -+ -+ error |= pfkey_list_insert_supported(&(supported[i]), -+ &(pfkey_supported_list[satype])); -+ } -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+supported_remove_all(int satype) -+{ -+ int error = 0; -+ struct ipsec_alg_supported*supportedp; -+ -+ while(pfkey_supported_list[satype]) { -+ const char *n; -+ supportedp 
= pfkey_supported_list[satype]->supportedp; -+ -+ n = supportedp->ias_name; -+ if(n == NULL) n="unknown"; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:init_pfkey: " -+ "removing satype=%d exttype=%d id=%d ivlen=%d minbits=%d maxbits=%d name=%s.\n", -+ satype, -+ supportedp->ias_exttype, -+ supportedp->ias_id, -+ supportedp->ias_ivlen, -+ supportedp->ias_keyminbits, -+ supportedp->ias_keymaxbits, n); -+ -+ error |= pfkey_list_remove_supported(supportedp, -+ &(pfkey_supported_list[satype])); -+ } -+ return error; -+} -+ -+int -+pfkey_init(void) -+{ -+ int error = 0; -+ int i; -+#ifdef HAVE_PROC_DIR_ENTRY -+ struct proc_dir_entry* entry; -+#endif -+ -+ -+ static struct ipsec_alg_supported supported_init_ah[] = { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ {K_SADB_EXT_SUPPORTED_AUTH, K_SADB_AALG_MD5HMAC, 0, 128, 128}, -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ {K_SADB_EXT_SUPPORTED_AUTH, K_SADB_AALG_SHA1HMAC, 0, 160, 160} -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+ }; -+ static struct ipsec_alg_supported supported_init_esp[] = { -+#ifdef CONFIG_KLIPS_AUTH_HMAC_MD5 -+ {K_SADB_EXT_SUPPORTED_AUTH, K_SADB_AALG_MD5HMAC, 0, 128, 128}, -+#endif /* CONFIG_KLIPS_AUTH_HMAC_MD5 */ -+#ifdef CONFIG_KLIPS_AUTH_HMAC_SHA1 -+ {K_SADB_EXT_SUPPORTED_AUTH, K_SADB_AALG_SHA1HMAC, 0, 160, 160}, -+#endif /* CONFIG_KLIPS_AUTH_HMAC_SHA1 */ -+#ifdef CONFIG_KLIPS_ENC_3DES -+ {K_SADB_EXT_SUPPORTED_ENCRYPT, K_SADB_EALG_3DESCBC, 64, 168, 168}, -+#endif /* CONFIG_KLIPS_ENC_3DES */ -+ }; -+ static struct ipsec_alg_supported supported_init_ipip[] = { -+ {K_SADB_EXT_SUPPORTED_ENCRYPT, K_SADB_X_TALG_IPv4_in_IPv4, 0, 32, 32} -+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -+ , {K_SADB_EXT_SUPPORTED_ENCRYPT, K_SADB_X_TALG_IPv6_in_IPv4, 0, 128, 32} -+ , {K_SADB_EXT_SUPPORTED_ENCRYPT, K_SADB_X_TALG_IPv4_in_IPv6, 0, 32, 128} -+ , {K_SADB_EXT_SUPPORTED_ENCRYPT, K_SADB_X_TALG_IPv6_in_IPv6, 0, 128, 128} -+#endif /* defined(CONFIG_IPV6) || 
defined(CONFIG_IPV6_MODULE) */ -+ }; -+#ifdef CONFIG_KLIPS_IPCOMP -+ static struct ipsec_alg_supported supported_init_ipcomp[] = { -+ {K_SADB_EXT_SUPPORTED_ENCRYPT, SADB_X_CALG_DEFLATE, 0, 1, 1} -+ }; -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ -+#if 0 -+ printk(KERN_INFO -+ "klips_info:pfkey_init: " -+ "FreeS/WAN: initialising PF_KEYv2 domain sockets.\n"); -+#endif -+ -+ for(i = K_SADB_SATYPE_UNSPEC; i <= K_SADB_SATYPE_MAX; i++) { -+ pfkey_registered_sockets[i] = NULL; -+ pfkey_supported_list[i] = NULL; -+ } -+ -+ error |= supported_add_all(K_SADB_SATYPE_AH, supported_init_ah, sizeof(supported_init_ah)); -+ error |= supported_add_all(K_SADB_SATYPE_ESP, supported_init_esp, sizeof(supported_init_esp)); -+#ifdef CONFIG_KLIPS_IPCOMP -+ error |= supported_add_all(K_SADB_X_SATYPE_COMP, supported_init_ipcomp, sizeof(supported_init_ipcomp)); -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ error |= supported_add_all(K_SADB_X_SATYPE_IPIP, supported_init_ipip, sizeof(supported_init_ipip)); -+ -+ error |= sock_register(&pfkey_family_ops); -+ -+#ifdef CONFIG_PROC_FS -+# ifndef PROC_FS_2325 -+# ifdef PROC_FS_21 -+ error |= proc_register(proc_net, &proc_net_pfkey); -+ error |= proc_register(proc_net, &proc_net_pfkey_supported); -+ error |= proc_register(proc_net, &proc_net_pfkey_registered); -+# else /* PROC_FS_21 */ -+ error |= proc_register_dynamic(&proc_net, &proc_net_pfkey); -+ error |= proc_register_dynamic(&proc_net, &proc_net_pfkey_supported); -+ error |= proc_register_dynamic(&proc_net, &proc_net_pfkey_registered); -+# endif /* PROC_FS_21 */ -+# else /* !PROC_FS_2325 */ -+# if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) -+ proc_net_create ("pf_key", 0, pfkey_get_info); -+ proc_net_create ("pf_key_supported", 0, pfkey_supported_get_info); -+ proc_net_create ("pf_key_registered", 0, pfkey_registered_get_info); -+# else -+ entry = create_proc_entry ("pf_key", 0, init_net.proc_net); -+ entry->read_proc = pfkey_get_info; -+ entry = create_proc_entry ("pf_key_supported", 0, 
init_net.proc_net); -+ entry->read_proc = pfkey_supported_get_info; -+ entry = create_proc_entry ("pf_key_registered", 0, init_net.proc_net); -+ entry->read_proc = pfkey_registered_get_info; -+# endif -+# endif /* !PROC_FS_2325 */ -+#endif /* CONFIG_PROC_FS */ -+ -+ return error; -+} -+ -+int -+pfkey_cleanup(void) -+{ -+ int error = 0; -+ -+ printk(KERN_INFO "klips_info:pfkey_cleanup: " -+ "shutting down PF_KEY domain sockets.\n"); -+#ifdef VOID_SOCK_UNREGISTER -+ sock_unregister(PF_KEY); -+#else -+ error |= sock_unregister(PF_KEY); -+#endif -+ -+ error |= supported_remove_all(K_SADB_SATYPE_AH); -+ error |= supported_remove_all(K_SADB_SATYPE_ESP); -+#ifdef CONFIG_KLIPS_IPCOMP -+ error |= supported_remove_all(K_SADB_X_SATYPE_COMP); -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ error |= supported_remove_all(K_SADB_X_SATYPE_IPIP); -+ -+#ifdef CONFIG_PROC_FS -+# ifndef PROC_FS_2325 -+ if (proc_net_unregister(proc_net_pfkey.low_ino) != 0) -+ printk("klips_debug:pfkey_cleanup: " -+ "cannot unregister /proc/net/pf_key\n"); -+ if (proc_net_unregister(proc_net_pfkey_supported.low_ino) != 0) -+ printk("klips_debug:pfkey_cleanup: " -+ "cannot unregister /proc/net/pf_key_supported\n"); -+ if (proc_net_unregister(proc_net_pfkey_registered.low_ino) != 0) -+ printk("klips_debug:pfkey_cleanup: " -+ "cannot unregister /proc/net/pf_key_registered\n"); -+# else /* !PROC_FS_2325 */ -+# if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) -+ proc_net_remove ("pf_key"); -+ proc_net_remove ("pf_key_supported"); -+ proc_net_remove ("pf_key_registered"); -+# else -+ proc_net_remove (&init_net, "pf_key"); -+ proc_net_remove (&init_net, "pf_key_supported"); -+ proc_net_remove (&init_net, "pf_key_registered"); -+# endif -+ -+# endif /* !PROC_FS_2325 */ -+#endif /* CONFIG_PROC_FS */ -+ -+ /* other module unloading cleanup happens here */ -+ return error; -+} -+ -+#ifdef MODULE -+#if 0 -+int -+init_module(void) -+{ -+ pfkey_init(); -+ return 0; -+} -+ -+void -+cleanup_module(void) -+{ -+ pfkey_cleanup(); -+} 
-+#endif /* 0 */ -+#else /* MODULE */ -+struct net_protocol; -+void pfkey_proto_init(struct net_protocol *pro) -+{ -+ pfkey_init(); -+} -+#endif /* MODULE */ -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/pfkey_v2_build.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,1452 @@ -+/* -+ * RFC2367 PF_KEYv2 Key management API message parser -+ * Copyright (C) 1999, 2000, 2001 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: pfkey_v2_build.c,v 1.53 2005/11/09 00:30:37 mcr Exp $ -+ */ -+ -+/* -+ * Template from klips/net/ipsec/ipsec/ipsec_parser.c. 
-+ */ -+ -+char pfkey_v2_build_c_version[] = "$Id: pfkey_v2_build.c,v 1.53 2005/11/09 00:30:37 mcr Exp $"; -+ -+/* -+ * Some ugly stuff to allow consistent debugging code for use in the -+ * kernel and in user space -+*/ -+ -+#if defined(__KERNEL__) && defined(linux) -+ -+# include /* for printk */ -+ -+# include "openswan/ipsec_kversion.h" /* for malloc switch */ -+# ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+# else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+# endif /* MALLOC_SLAB */ -+# include /* error codes */ -+# include /* size_t */ -+# include /* mark_bh */ -+ -+# include /* struct device, and other headers */ -+# include /* eth_type_trans */ -+# include /* struct iphdr */ -+# if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -+# include /* struct ipv6hdr */ -+# endif /* if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ -+ -+# define MALLOC(size) kmalloc(size, GFP_ATOMIC) -+# define FREE(obj) kfree(obj) -+# include -+#else /* __KERNEL__ */ -+ -+# include -+# include -+# include -+# include -+# include -+# include /* memset */ -+ -+# include -+ -+#endif /* __KERNEL__ */ -+ -+#include -+#include -+ -+#ifdef __KERNEL__ -+#include "openswan/radij.h" /* rd_nodes */ -+#include "openswan/ipsec_encap.h" /* sockaddr_encap */ -+#endif /* __KERNEL__ */ -+ -+ -+#include "openswan/ipsec_sa.h" /* IPSEC_SAREF_NULL, IPSEC_SA_REF_TABLE_IDX_WIDTH */ -+#include "openswan/pfkey_debug.h" -+ -+ -+#define SENDERR(_x) do { error = -(_x); goto errlab; } while (0) -+ -+void -+pfkey_extensions_init(struct sadb_ext *extensions[K_SADB_EXT_MAX + 1]) -+{ -+ int i; -+ -+ for (i = 0; i != K_SADB_EXT_MAX + 1; i++) { -+ extensions[i] = NULL; -+ } -+} -+ -+void -+pfkey_extensions_free(struct sadb_ext *extensions[K_SADB_EXT_MAX + 1]) -+{ -+ int i; -+ -+ if(!extensions) { -+ return; -+ } -+ -+ if(extensions[0]) { -+ memset(extensions[0], 0, sizeof(struct sadb_msg)); -+ FREE(extensions[0]); -+ extensions[0] = NULL; -+ } -+ -+ for (i = 1; i != K_SADB_EXT_MAX + 1; i++) { -+ 
if(extensions[i]) { -+ memset(extensions[i], 0, extensions[i]->sadb_ext_len * IPSEC_PFKEYv2_ALIGN); -+ FREE(extensions[i]); -+ extensions[i] = NULL; -+ } -+ } -+} -+ -+void -+pfkey_msg_free(struct sadb_msg **pfkey_msg) -+{ -+ if(*pfkey_msg) { -+ memset(*pfkey_msg, 0, (*pfkey_msg)->sadb_msg_len * IPSEC_PFKEYv2_ALIGN); -+ FREE(*pfkey_msg); -+ *pfkey_msg = NULL; -+ } -+} -+ -+/* Default extension builders taken from the KLIPS code */ -+ -+int -+pfkey_msg_hdr_build(struct sadb_ext** pfkey_ext, -+ uint8_t msg_type, -+ uint8_t satype, -+ uint8_t msg_errno, -+ uint32_t seq, -+ uint32_t pid) -+{ -+ int error = 0; -+ struct sadb_msg *pfkey_msg = (struct sadb_msg *)*pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_msg_hdr_build:\n"); -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_msg_hdr_build: " -+ "on_entry &pfkey_ext=0p%p pfkey_ext=0p%p *pfkey_ext=0p%p.\n", -+ &pfkey_ext, -+ pfkey_ext, -+ *pfkey_ext); -+ /* sanity checks... */ -+ if(pfkey_msg) { -+ ERROR("pfkey_msg_hdr_build: " -+ "why is pfkey_msg already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(!msg_type) { -+ ERROR("pfkey_msg_hdr_build: " -+ "msg type not set, must be non-zero..\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(msg_type > K_SADB_MAX) { -+ ERROR("pfkey_msg_hdr_build: " -+ "msg type too large:%d.\n", -+ msg_type); -+ SENDERR(EINVAL); -+ } -+ -+ if(satype > K_SADB_SATYPE_MAX) { -+ ERROR("pfkey_msg_hdr_build: " -+ "satype %d > max %d\n", -+ satype, SADB_SATYPE_MAX); -+ SENDERR(EINVAL); -+ } -+ -+ pfkey_msg = (struct sadb_msg*)MALLOC(sizeof(struct sadb_msg)); -+ *pfkey_ext = (struct sadb_ext*)pfkey_msg; -+ -+ if(pfkey_msg == NULL) { -+ ERROR("pfkey_msg_hdr_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ memset(pfkey_msg, 0, sizeof(struct sadb_msg)); -+ -+ pfkey_msg->sadb_msg_len = sizeof(struct sadb_msg) / IPSEC_PFKEYv2_ALIGN; -+ -+ pfkey_msg->sadb_msg_type = msg_type; -+ pfkey_msg->sadb_msg_satype = satype; -+ -+ pfkey_msg->sadb_msg_version = PF_KEY_V2; -+ 
pfkey_msg->sadb_msg_errno = msg_errno; -+ pfkey_msg->sadb_msg_reserved = 0; -+ pfkey_msg->sadb_msg_seq = seq; -+ pfkey_msg->sadb_msg_pid = pid; -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_msg_hdr_build: " -+ "on_exit &pfkey_ext=0p%p pfkey_ext=0p%p *pfkey_ext=0p%p.\n", -+ &pfkey_ext, -+ pfkey_ext, -+ *pfkey_ext); -+errlab: -+ return error; -+} -+ -+ -+int -+pfkey_sa_builds(struct sadb_ext **pfkey_ext, -+ struct sadb_builds sab) -+{ -+ int error = 0; -+ struct k_sadb_sa *pfkey_sa = (struct k_sadb_sa *)*pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_sa_build: " -+ "spi=%08x replay=%d sa_state=%d auth=%d encrypt=%d flags=%d\n", -+ ntohl(sab.sa_base.sadb_sa_spi), /* in network order */ -+ sab.sa_base.sadb_sa_replay, -+ sab.sa_base.sadb_sa_state, -+ sab.sa_base.sadb_sa_auth, -+ sab.sa_base.sadb_sa_encrypt, -+ sab.sa_base.sadb_sa_flags); -+ /* sanity checks... */ -+ if(pfkey_sa) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_sa_build: " -+ "why is pfkey_sa already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(sab.sa_base.sadb_sa_exttype != SADB_EXT_SA && -+ sab.sa_base.sadb_sa_exttype != K_SADB_X_EXT_SA2) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_sa_build: " -+ "invalid exttype=%d.\n", -+ sab.sa_base.sadb_sa_exttype); -+ SENDERR(EINVAL); -+ } -+ -+ if(sab.sa_base.sadb_sa_replay > 64) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_sa_build: " -+ "replay window size: %d -- must be 0 <= size <= 64\n", -+ sab.sa_base.sadb_sa_replay); -+ SENDERR(EINVAL); -+ } -+ -+ if(sab.sa_base.sadb_sa_auth > SADB_AALG_MAX) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_sa_build: " -+ "auth=%d > SADB_AALG_MAX=%d.\n", -+ sab.sa_base.sadb_sa_auth, -+ SADB_AALG_MAX); -+ SENDERR(EINVAL); -+ } -+ -+#if K_SADB_EALG_MAX < 255 -+ if(sab.sa_base.sadb_sa_encrypt > K_SADB_EALG_MAX) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_sa_build: " -+ "encrypt=%d > K_SADB_EALG_MAX=%d.\n", -+ sab.sa_base.sadb_sa_encrypt, -+ K_SADB_EALG_MAX); -+ SENDERR(EINVAL); -+ } -+#endif -+ -+ 
if(sab.sa_base.sadb_sa_state > K_SADB_SASTATE_MAX) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_sa_build: " -+ "sa_state=%d exceeds MAX=%d.\n", -+ sab.sa_base.sadb_sa_state, -+ K_SADB_SASTATE_MAX); -+ SENDERR(EINVAL); -+ } -+ -+ if(sab.sa_base.sadb_sa_state == K_SADB_SASTATE_DEAD) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_sa_build: " -+ "sa_state=%d is DEAD=%d is not allowed.\n", -+ sab.sa_base.sadb_sa_state, -+ K_SADB_SASTATE_DEAD); -+ SENDERR(EINVAL); -+ } -+ -+ if((IPSEC_SAREF_NULL != sab.sa_base.sadb_x_sa_ref) && (sab.sa_base.sadb_x_sa_ref >= (1 << IPSEC_SA_REF_TABLE_IDX_WIDTH))) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_sa_build: " -+ "SAref=%d must be (SAref == IPSEC_SAREF_NULL(%d) || SAref < IPSEC_SA_REF_TABLE_NUM_ENTRIES(%d)).\n", -+ sab.sa_base.sadb_x_sa_ref, -+ IPSEC_SAREF_NULL, -+ IPSEC_SA_REF_TABLE_NUM_ENTRIES); -+ SENDERR(EINVAL); -+ } -+ -+ pfkey_sa = (struct k_sadb_sa*)MALLOC(sizeof(struct k_sadb_sa)); -+ *pfkey_ext = (struct sadb_ext*)pfkey_sa; -+ -+ if(pfkey_sa == NULL) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_sa_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ memset(pfkey_sa, 0, sizeof(struct k_sadb_sa)); -+ -+ *pfkey_sa = sab.sa_base; -+ pfkey_sa->sadb_sa_len = sizeof(*pfkey_sa) / IPSEC_PFKEYv2_ALIGN; -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_sa_build(struct sadb_ext ** pfkey_ext, -+ uint16_t exttype, -+ uint32_t spi, -+ uint8_t replay_window, -+ uint8_t sa_state, -+ uint8_t auth, -+ uint8_t encrypt, -+ uint32_t flags) -+{ -+ struct sadb_builds sab; -+ -+ memset(&sab, 0, sizeof(sab)); -+ sab.sa_base.sadb_sa_exttype = exttype; -+ sab.sa_base.sadb_sa_spi = spi; -+ sab.sa_base.sadb_sa_replay = replay_window; -+ sab.sa_base.sadb_sa_state = sa_state; -+ sab.sa_base.sadb_sa_auth = auth; -+ sab.sa_base.sadb_sa_encrypt = encrypt; -+ sab.sa_base.sadb_sa_flags = flags; -+ sab.sa_base.sadb_x_sa_ref = IPSEC_SAREF_NULL; -+ -+ return pfkey_sa_builds(pfkey_ext, sab); -+} -+ -+int -+pfkey_lifetime_build(struct sadb_ext 
** pfkey_ext, -+ uint16_t exttype, -+ uint32_t allocations, -+ uint64_t bytes, -+ uint64_t addtime, -+ uint64_t usetime, -+ uint32_t packets) -+{ -+ int error = 0; -+ struct sadb_lifetime *pfkey_lifetime = (struct sadb_lifetime *)*pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_lifetime_build:\n"); -+ /* sanity checks... */ -+ if(pfkey_lifetime) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_lifetime_build: " -+ "why is pfkey_lifetime already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(exttype != SADB_EXT_LIFETIME_CURRENT && -+ exttype != SADB_EXT_LIFETIME_HARD && -+ exttype != SADB_EXT_LIFETIME_SOFT) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_lifetime_build: " -+ "invalid exttype=%d.\n", -+ exttype); -+ SENDERR(EINVAL); -+ } -+ -+ pfkey_lifetime = (struct sadb_lifetime*)MALLOC(sizeof(struct sadb_lifetime)); -+ *pfkey_ext = (struct sadb_ext*) pfkey_lifetime; -+ -+ if(pfkey_lifetime == NULL) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_lifetime_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ memset(pfkey_lifetime, 0, sizeof(struct sadb_lifetime)); -+ -+ pfkey_lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime) / IPSEC_PFKEYv2_ALIGN; -+ pfkey_lifetime->sadb_lifetime_exttype = exttype; -+ pfkey_lifetime->sadb_lifetime_allocations = allocations; -+ pfkey_lifetime->sadb_lifetime_bytes = bytes; -+ pfkey_lifetime->sadb_lifetime_addtime = addtime; -+ pfkey_lifetime->sadb_lifetime_usetime = usetime; -+#ifdef NOT_YET -+ /* XXX it is defined in struct sadb_lifetime, but not found?? 
*/ -+ pfkey_lifetime->sadb_x_lifetime_packets = packets; -+#endif -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_address_build(struct sadb_ext** pfkey_ext, -+ uint16_t exttype, -+ uint8_t proto, -+ uint8_t prefixlen, -+ struct sockaddr* address) -+{ -+ int error = 0; -+ int saddr_len = 0; -+ char ipaddr_txt[ADDRTOT_BUF + 6/*extra for port number*/]; -+ struct sadb_address *pfkey_address = (struct sadb_address *)*pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_address_build: " -+ "exttype=%d proto=%d prefixlen=%d\n", -+ exttype, -+ proto, -+ prefixlen); -+ /* sanity checks... */ -+ if(pfkey_address) { -+ ERROR("pfkey_address_build: " -+ "why is pfkey_address already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if (!address) { -+ ERROR("pfkey_address_build: " "address is NULL\n"); -+ SENDERR(EINVAL); -+ } -+ -+ switch(exttype) { -+ case SADB_EXT_ADDRESS_SRC: -+ case SADB_EXT_ADDRESS_DST: -+ case SADB_EXT_ADDRESS_PROXY: -+ case K_SADB_X_EXT_ADDRESS_DST2: -+ case K_SADB_X_EXT_ADDRESS_SRC_FLOW: -+ case K_SADB_X_EXT_ADDRESS_DST_FLOW: -+ case K_SADB_X_EXT_ADDRESS_SRC_MASK: -+ case K_SADB_X_EXT_ADDRESS_DST_MASK: -+#ifdef NAT_TRAVERSAL -+ case K_SADB_X_EXT_NAT_T_OA: -+#endif -+ break; -+ default: -+ ERROR("pfkey_address_build: " -+ "unrecognised ext_type=%d.\n", -+ exttype); -+ SENDERR(EINVAL); -+ } -+ -+ switch(address->sa_family) { -+ case AF_INET: -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_address_build: " -+ "found address family AF_INET.\n"); -+ saddr_len = sizeof(struct sockaddr_in); -+ sprintf(ipaddr_txt, "%d.%d.%d.%d:%d" -+ , (((struct sockaddr_in*)address)->sin_addr.s_addr >> 0) & 0xFF -+ , (((struct sockaddr_in*)address)->sin_addr.s_addr >> 8) & 0xFF -+ , (((struct sockaddr_in*)address)->sin_addr.s_addr >> 16) & 0xFF -+ , (((struct sockaddr_in*)address)->sin_addr.s_addr >> 24) & 0xFF -+ , ntohs(((struct sockaddr_in*)address)->sin_port)); -+ break; -+ case AF_INET6: -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_address_build: " -+ "found 
address family AF_INET6.\n"); -+ saddr_len = sizeof(struct sockaddr_in6); -+ sprintf(ipaddr_txt, "%x:%x:%x:%x:%x:%x:%x:%x-%x" -+ , ntohs(((struct sockaddr_in6*)address)->sin6_addr.s6_addr16[0]) -+ , ntohs(((struct sockaddr_in6*)address)->sin6_addr.s6_addr16[1]) -+ , ntohs(((struct sockaddr_in6*)address)->sin6_addr.s6_addr16[2]) -+ , ntohs(((struct sockaddr_in6*)address)->sin6_addr.s6_addr16[3]) -+ , ntohs(((struct sockaddr_in6*)address)->sin6_addr.s6_addr16[4]) -+ , ntohs(((struct sockaddr_in6*)address)->sin6_addr.s6_addr16[5]) -+ , ntohs(((struct sockaddr_in6*)address)->sin6_addr.s6_addr16[6]) -+ , ntohs(((struct sockaddr_in6*)address)->sin6_addr.s6_addr16[7]) -+ , ntohs(((struct sockaddr_in6*)address)->sin6_port)); -+ break; -+ default: -+ ERROR("pfkey_address_build: " -+ "address->sa_family=%d not supported.\n", -+ address->sa_family); -+ SENDERR(EPFNOSUPPORT); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_address_build: " -+ "found address=%s.\n", -+ ipaddr_txt); -+ if(prefixlen != 0) { -+ ERROR("pfkey_address_build: " -+ "address prefixes not supported yet.\n"); -+ SENDERR(EAFNOSUPPORT); /* not supported yet */ -+ } -+ -+ /* allocate some memory for the extension */ -+ pfkey_address = (struct sadb_address*) -+ MALLOC(ALIGN_N(sizeof(struct sadb_address) + saddr_len, IPSEC_PFKEYv2_ALIGN)); -+ *pfkey_ext = (struct sadb_ext*)pfkey_address; -+ -+ if(pfkey_address == NULL ) { -+ ERROR("pfkey_lifetime_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ memset(pfkey_address, -+ 0, -+ ALIGN_N(sizeof(struct sadb_address) + saddr_len, -+ IPSEC_PFKEYv2_ALIGN)); -+ -+ pfkey_address->sadb_address_len = DIVUP(sizeof(struct sadb_address) + saddr_len, -+ IPSEC_PFKEYv2_ALIGN); -+ -+ pfkey_address->sadb_address_exttype = exttype; -+ pfkey_address->sadb_address_proto = proto; -+ pfkey_address->sadb_address_prefixlen = prefixlen; -+ pfkey_address->sadb_address_reserved = 0; -+ -+ memcpy((char*)pfkey_address + sizeof(struct sadb_address), -+ address, -+ 
saddr_len); -+ -+#if 0 -+ for(i = 0; i < sizeof(struct sockaddr_in) - offsetof(struct sockaddr_in, sin_zero); i++) { -+ pfkey_address_s_ska.sin_zero[i] = 0; -+ } -+#endif -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_address_build: " -+ "successful created len: %d.\n", pfkey_address->sadb_address_len); -+ -+ errlab: -+ return error; -+} -+ -+int -+pfkey_key_build(struct sadb_ext** pfkey_ext, -+ uint16_t exttype, -+ uint16_t key_bits, -+ unsigned char * key) -+{ -+ int error = 0; -+ struct sadb_key *pfkey_key = (struct sadb_key *)*pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_key_build:\n"); -+ /* sanity checks... */ -+ if(pfkey_key) { -+ ERROR("pfkey_key_build: " -+ "why is pfkey_key already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(!key_bits) { -+ ERROR("pfkey_key_build: " -+ "key_bits is zero, it must be non-zero.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if( !((exttype == SADB_EXT_KEY_AUTH) || (exttype == SADB_EXT_KEY_ENCRYPT))) { -+ ERROR("pfkey_key_build: " -+ "unsupported extension type=%d.\n", -+ exttype); -+ SENDERR(EINVAL); -+ } -+ -+ pfkey_key = (struct sadb_key*) -+ MALLOC(sizeof(struct sadb_key) + -+ DIVUP(key_bits, 64) * IPSEC_PFKEYv2_ALIGN); -+ -+ *pfkey_ext = (struct sadb_ext*)pfkey_key; -+ -+ if(pfkey_key == NULL) { -+ ERROR("pfkey_key_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ memset(pfkey_key, -+ 0, -+ sizeof(struct sadb_key) + -+ DIVUP(key_bits, 64) * IPSEC_PFKEYv2_ALIGN); -+ -+ pfkey_key->sadb_key_len = DIVUP(sizeof(struct sadb_key) * IPSEC_PFKEYv2_ALIGN + key_bits, -+ 64); -+ pfkey_key->sadb_key_exttype = exttype; -+ pfkey_key->sadb_key_bits = key_bits; -+ pfkey_key->sadb_key_reserved = 0; -+ memcpy((char*)pfkey_key + sizeof(struct sadb_key), -+ key, -+ DIVUP(key_bits, 8)); -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_ident_build(struct sadb_ext** pfkey_ext, -+ uint16_t exttype, -+ uint16_t ident_type, -+ uint64_t ident_id, -+ uint8_t ident_len, -+ char* ident_string) -+{ -+ int error = 0; 
-+ struct sadb_ident *pfkey_ident = (struct sadb_ident *)*pfkey_ext; -+ int data_len = ident_len * IPSEC_PFKEYv2_ALIGN - sizeof(struct sadb_ident); -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_ident_build:\n"); -+ /* sanity checks... */ -+ if(pfkey_ident) { -+ ERROR("pfkey_ident_build: " -+ "why is pfkey_ident already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if( ! ((exttype == SADB_EXT_IDENTITY_SRC) || -+ (exttype == SADB_EXT_IDENTITY_DST))) { -+ ERROR("pfkey_ident_build: " -+ "unsupported extension type=%d.\n", -+ exttype); -+ SENDERR(EINVAL); -+ } -+ -+ if((ident_type == SADB_IDENTTYPE_RESERVED)) { -+ ERROR("pfkey_ident_build: " -+ "ident_type must be non-zero.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(ident_type > SADB_IDENTTYPE_MAX) { -+ ERROR("pfkey_ident_build: " -+ "identtype=%d out of range.\n", -+ ident_type); -+ SENDERR(EINVAL); -+ } -+ -+ if(((ident_type == SADB_IDENTTYPE_PREFIX) || -+ (ident_type == SADB_IDENTTYPE_FQDN)) && -+ !ident_string) { -+ ERROR("pfkey_ident_build: " -+ "string required to allocate size of extension.\n"); -+ SENDERR(EINVAL); -+ } -+ -+#if 0 -+ if((ident_type == SADB_IDENTTYPE_USERFQDN) ) { -+ } -+#endif -+ -+ pfkey_ident = (struct sadb_ident*) -+ MALLOC(ident_len * IPSEC_PFKEYv2_ALIGN); -+ -+ *pfkey_ext = (struct sadb_ext*)pfkey_ident; -+ -+ if(pfkey_ident == NULL) { -+ ERROR("pfkey_ident_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ memset(pfkey_ident, 0, ident_len * IPSEC_PFKEYv2_ALIGN); -+ -+ pfkey_ident->sadb_ident_len = ident_len; -+ pfkey_ident->sadb_ident_exttype = exttype; -+ pfkey_ident->sadb_ident_type = ident_type; -+ pfkey_ident->sadb_ident_reserved = 0; -+ pfkey_ident->sadb_ident_id = ident_id; -+ memcpy((char*)pfkey_ident + sizeof(struct sadb_ident), -+ ident_string, -+ data_len); -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_sens_build(struct sadb_ext** pfkey_ext, -+ uint32_t dpd, -+ uint8_t sens_level, -+ uint8_t sens_len, -+ uint64_t* sens_bitmap, -+ uint8_t 
integ_level, -+ uint8_t integ_len, -+ uint64_t* integ_bitmap) -+{ -+ int error = 0; -+ struct sadb_sens *pfkey_sens = (struct sadb_sens *)*pfkey_ext; -+ int i; -+ uint64_t* bitmap; -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_sens_build:\n"); -+ /* sanity checks... */ -+ if(pfkey_sens) { -+ ERROR("pfkey_sens_build: " -+ "why is pfkey_sens already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_sens_build: " -+ "Sorry, I can't build exttype=%d yet.\n", -+ (*pfkey_ext)->sadb_ext_type); -+ SENDERR(EINVAL); /* don't process these yet */ -+ -+ pfkey_sens = (struct sadb_sens*) -+ MALLOC(sizeof(struct sadb_sens) + -+ (sens_len + integ_len) * sizeof(uint64_t)); -+ -+ *pfkey_ext = (struct sadb_ext*)pfkey_sens; -+ -+ if(pfkey_sens == NULL) { -+ ERROR("pfkey_sens_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ memset(pfkey_sens, -+ 0, -+ sizeof(struct sadb_sens) + -+ (sens_len + integ_len) * sizeof(uint64_t)); -+ -+ pfkey_sens->sadb_sens_len = (sizeof(struct sadb_sens) + -+ (sens_len + integ_len) * sizeof(uint64_t)) / IPSEC_PFKEYv2_ALIGN; -+ pfkey_sens->sadb_sens_exttype = SADB_EXT_SENSITIVITY; -+ pfkey_sens->sadb_sens_dpd = dpd; -+ pfkey_sens->sadb_sens_sens_level = sens_level; -+ pfkey_sens->sadb_sens_sens_len = sens_len; -+ pfkey_sens->sadb_sens_integ_level = integ_level; -+ pfkey_sens->sadb_sens_integ_len = integ_len; -+ pfkey_sens->sadb_sens_reserved = 0; -+ -+ bitmap = (uint64_t*)((char*)pfkey_ext + sizeof(struct sadb_sens)); -+ for(i = 0; i < sens_len; i++) { -+ *bitmap = sens_bitmap[i]; -+ bitmap++; -+ } -+ for(i = 0; i < integ_len; i++) { -+ *bitmap = integ_bitmap[i]; -+ bitmap++; -+ } -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_prop_build(struct sadb_ext** pfkey_ext, -+ uint8_t replay, -+ unsigned int comb_num, -+ struct sadb_comb* comb) -+{ -+ int error = 0; -+ int i; -+ struct sadb_prop *pfkey_prop = (struct sadb_prop *)*pfkey_ext; -+ struct sadb_comb *combp; -+ -+ 
DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_prop_build:\n"); -+ /* sanity checks... */ -+ if(pfkey_prop) { -+ ERROR("pfkey_prop_build: " -+ "why is pfkey_prop already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ pfkey_prop = (struct sadb_prop*) -+ MALLOC(sizeof(struct sadb_prop) + -+ comb_num * sizeof(struct sadb_comb)); -+ -+ *pfkey_ext = (struct sadb_ext*)pfkey_prop; -+ -+ if(pfkey_prop == NULL) { -+ ERROR("pfkey_prop_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ memset(pfkey_prop, -+ 0, -+ sizeof(struct sadb_prop) + -+ comb_num * sizeof(struct sadb_comb)); -+ -+ pfkey_prop->sadb_prop_len = (sizeof(struct sadb_prop) + -+ comb_num * sizeof(struct sadb_comb)) / IPSEC_PFKEYv2_ALIGN; -+ -+ pfkey_prop->sadb_prop_exttype = SADB_EXT_PROPOSAL; -+ pfkey_prop->sadb_prop_replay = replay; -+ -+ for(i=0; i<3; i++) { -+ pfkey_prop->sadb_prop_reserved[i] = 0; -+ } -+ -+ combp = (struct sadb_comb*)((char*)*pfkey_ext + sizeof(struct sadb_prop)); -+ for(i = 0; i < comb_num; i++) { -+ memcpy (combp, &(comb[i]), sizeof(struct sadb_comb)); -+ combp++; -+ } -+ -+#if 0 -+ uint8_t sadb_comb_auth; -+ uint8_t sadb_comb_encrypt; -+ uint16_t sadb_comb_flags; -+ uint16_t sadb_comb_auth_minbits; -+ uint16_t sadb_comb_auth_maxbits; -+ uint16_t sadb_comb_encrypt_minbits; -+ uint16_t sadb_comb_encrypt_maxbits; -+ uint32_t sadb_comb_reserved; -+ uint32_t sadb_comb_soft_allocations; -+ uint32_t sadb_comb_hard_allocations; -+ uint64_t sadb_comb_soft_bytes; -+ uint64_t sadb_comb_hard_bytes; -+ uint64_t sadb_comb_soft_addtime; -+ uint64_t sadb_comb_hard_addtime; -+ uint64_t sadb_comb_soft_usetime; -+ uint64_t sadb_comb_hard_usetime; -+ uint32_t sadb_comb_soft_packets; -+ uint32_t sadb_comb_hard_packets; -+#endif -+errlab: -+ return error; -+} -+ -+int -+pfkey_supported_build(struct sadb_ext** pfkey_ext, -+ uint16_t exttype, -+ unsigned int alg_num, -+ struct sadb_alg* alg) -+{ -+ int error = 0; -+ unsigned int i; -+ struct sadb_supported *pfkey_supported = (struct 
sadb_supported *)*pfkey_ext; -+ struct sadb_alg *pfkey_alg; -+ -+ /* sanity checks... */ -+ if(pfkey_supported) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_supported_build: " -+ "why is pfkey_supported already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if( !((exttype == SADB_EXT_SUPPORTED_AUTH) || (exttype == SADB_EXT_SUPPORTED_ENCRYPT))) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_supported_build: " -+ "unsupported extension type=%d.\n", -+ exttype); -+ SENDERR(EINVAL); -+ } -+ -+ pfkey_supported = (struct sadb_supported*) -+ MALLOC(sizeof(struct sadb_supported) + -+ alg_num * -+ sizeof(struct sadb_alg)); -+ -+ *pfkey_ext = (struct sadb_ext*)pfkey_supported; -+ -+ if(pfkey_supported == NULL) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_supported_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ memset(pfkey_supported, -+ 0, -+ sizeof(struct sadb_supported) + -+ alg_num * -+ sizeof(struct sadb_alg)); -+ -+ pfkey_supported->sadb_supported_len = (sizeof(struct sadb_supported) + -+ alg_num * -+ sizeof(struct sadb_alg)) / -+ IPSEC_PFKEYv2_ALIGN; -+ pfkey_supported->sadb_supported_exttype = exttype; -+ pfkey_supported->sadb_supported_reserved = 0; -+ -+ pfkey_alg = (struct sadb_alg*)((char*)pfkey_supported + sizeof(struct sadb_supported)); -+ for(i = 0; i < alg_num; i++) { -+ memcpy (pfkey_alg, &(alg[i]), sizeof(struct sadb_alg)); -+ pfkey_alg->sadb_alg_reserved = 0; -+ pfkey_alg++; -+ } -+ -+#if 0 -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_supported_build: " -+ "Sorry, I can't build exttype=%d yet.\n", -+ (*pfkey_ext)->sadb_ext_type); -+ SENDERR(EINVAL); /* don't process these yet */ -+ -+ uint8_t sadb_alg_id; -+ uint8_t sadb_alg_ivlen; -+ uint16_t sadb_alg_minbits; -+ uint16_t sadb_alg_maxbits; -+ uint16_t sadb_alg_reserved; -+#endif -+errlab: -+ return error; -+} -+ -+int -+pfkey_spirange_build(struct sadb_ext** pfkey_ext, -+ uint16_t exttype, -+ uint32_t min, /* in network order */ -+ uint32_t max) /* in network order */ -+{ -+ 
int error = 0; -+ struct sadb_spirange *pfkey_spirange = (struct sadb_spirange *)*pfkey_ext; -+ -+ /* sanity checks... */ -+ if(pfkey_spirange) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_spirange_build: " -+ "why is pfkey_spirange already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(ntohl(max) < ntohl(min)) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_spirange_build: " -+ "minspi=%08x must be < maxspi=%08x.\n", -+ ntohl(min), -+ ntohl(max)); -+ SENDERR(EINVAL); -+ } -+ -+ if(ntohl(min) <= 255) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_spirange_build: " -+ "minspi=%08x must be > 255.\n", -+ ntohl(min)); -+ SENDERR(EEXIST); -+ } -+ -+ pfkey_spirange = (struct sadb_spirange*) -+ MALLOC(sizeof(struct sadb_spirange)); -+ -+ *pfkey_ext = (struct sadb_ext*)pfkey_spirange; -+ -+ if(pfkey_spirange == NULL) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_spirange_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ memset(pfkey_spirange, -+ 0, -+ sizeof(struct sadb_spirange)); -+ -+ pfkey_spirange->sadb_spirange_len = sizeof(struct sadb_spirange) / IPSEC_PFKEYv2_ALIGN; -+ -+ pfkey_spirange->sadb_spirange_exttype = SADB_EXT_SPIRANGE; -+ pfkey_spirange->sadb_spirange_min = min; -+ pfkey_spirange->sadb_spirange_max = max; -+ pfkey_spirange->sadb_spirange_reserved = 0; -+ errlab: -+ return error; -+} -+ -+int -+pfkey_x_kmprivate_build(struct sadb_ext** pfkey_ext) -+{ -+ int error = 0; -+ struct sadb_x_kmprivate *pfkey_x_kmprivate = (struct sadb_x_kmprivate *)*pfkey_ext; -+ -+ /* sanity checks... 
*/ -+ if(pfkey_x_kmprivate) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_kmprivate_build: " -+ "why is pfkey_x_kmprivate already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ pfkey_x_kmprivate->sadb_x_kmprivate_reserved = 0; -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_kmprivate_build: " -+ "Sorry, I can't build exttype=%d yet.\n", -+ (*pfkey_ext)->sadb_ext_type); -+ SENDERR(EINVAL); /* don't process these yet */ -+ -+ pfkey_x_kmprivate = (struct sadb_x_kmprivate*) -+ MALLOC(sizeof(struct sadb_x_kmprivate)); -+ -+ *pfkey_ext = (struct sadb_ext*)pfkey_x_kmprivate; -+ -+ if(pfkey_x_kmprivate == NULL) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_kmprivate_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ memset(pfkey_x_kmprivate, -+ 0, -+ sizeof(struct sadb_x_kmprivate)); -+ -+ pfkey_x_kmprivate->sadb_x_kmprivate_len = -+ sizeof(struct sadb_x_kmprivate) / IPSEC_PFKEYv2_ALIGN; -+ -+ pfkey_x_kmprivate->sadb_x_kmprivate_exttype = K_SADB_X_EXT_KMPRIVATE; -+ pfkey_x_kmprivate->sadb_x_kmprivate_reserved = 0; -+errlab: -+ return error; -+} -+ -+int -+pfkey_x_satype_build(struct sadb_ext** pfkey_ext, -+ uint8_t satype) -+{ -+ int error = 0; -+ int i; -+ struct sadb_x_satype *pfkey_x_satype = (struct sadb_x_satype *)*pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_satype_build:\n"); -+ /* sanity checks... 
*/ -+ if(pfkey_x_satype) { -+ ERROR("pfkey_x_satype_build: " -+ "why is pfkey_x_satype already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(!satype) { -+ ERROR("pfkey_x_satype_build: " -+ "SA type not set, must be non-zero.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(satype > K_SADB_SATYPE_MAX) { -+ ERROR("pfkey_x_satype_build: " -+ "satype %d > max %d\n", -+ satype, K_SADB_SATYPE_MAX); -+ SENDERR(EINVAL); -+ } -+ -+ pfkey_x_satype = (struct sadb_x_satype*) -+ MALLOC(sizeof(struct sadb_x_satype)); -+ -+ *pfkey_ext = (struct sadb_ext*)pfkey_x_satype; -+ if(pfkey_x_satype == NULL) { -+ ERROR("pfkey_x_satype_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ memset(pfkey_x_satype, -+ 0, -+ sizeof(struct sadb_x_satype)); -+ -+ pfkey_x_satype->sadb_x_satype_len = sizeof(struct sadb_x_satype) / IPSEC_PFKEYv2_ALIGN; -+ -+ pfkey_x_satype->sadb_x_satype_exttype = K_SADB_X_EXT_SATYPE2; -+ pfkey_x_satype->sadb_x_satype_satype = satype; -+ for(i=0; i<3; i++) { -+ pfkey_x_satype->sadb_x_satype_reserved[i] = 0; -+ } -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_x_debug_build(struct sadb_ext** pfkey_ext, -+ uint32_t tunnel, -+ uint32_t netlink, -+ uint32_t xform, -+ uint32_t eroute, -+ uint32_t spi, -+ uint32_t radij, -+ uint32_t esp, -+ uint32_t ah, -+ uint32_t rcv, -+ uint32_t pfkey, -+ uint32_t ipcomp, -+ uint32_t verbose) -+{ -+ int error = 0; -+ int i; -+ struct sadb_x_debug *pfkey_x_debug = (struct sadb_x_debug *)*pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_debug_build:\n"); -+ /* sanity checks... 
*/ -+ if(pfkey_x_debug) { -+ ERROR("pfkey_x_debug_build: " -+ "why is pfkey_x_debug already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_debug_build: " -+ "tunnel=%x netlink=%x xform=%x eroute=%x spi=%x radij=%x esp=%x ah=%x rcv=%x pfkey=%x ipcomp=%x verbose=%x?\n", -+ tunnel, netlink, xform, eroute, spi, radij, esp, ah, rcv, pfkey, ipcomp, verbose); -+ -+ pfkey_x_debug = (struct sadb_x_debug*) -+ MALLOC(sizeof(struct sadb_x_debug)); -+ -+ *pfkey_ext = (struct sadb_ext*)pfkey_x_debug; -+ -+ if(pfkey_x_debug == NULL) { -+ ERROR("pfkey_x_debug_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+#if 0 -+ memset(pfkey_x_debug, -+ 0, -+ sizeof(struct sadb_x_debug)); -+#endif -+ -+ pfkey_x_debug->sadb_x_debug_len = sizeof(struct sadb_x_debug) / IPSEC_PFKEYv2_ALIGN; -+ pfkey_x_debug->sadb_x_debug_exttype = K_SADB_X_EXT_DEBUG; -+ -+ pfkey_x_debug->sadb_x_debug_tunnel = tunnel; -+ pfkey_x_debug->sadb_x_debug_netlink = netlink; -+ pfkey_x_debug->sadb_x_debug_xform = xform; -+ pfkey_x_debug->sadb_x_debug_eroute = eroute; -+ pfkey_x_debug->sadb_x_debug_spi = spi; -+ pfkey_x_debug->sadb_x_debug_radij = radij; -+ pfkey_x_debug->sadb_x_debug_esp = esp; -+ pfkey_x_debug->sadb_x_debug_ah = ah; -+ pfkey_x_debug->sadb_x_debug_rcv = rcv; -+ pfkey_x_debug->sadb_x_debug_pfkey = pfkey; -+ pfkey_x_debug->sadb_x_debug_ipcomp = ipcomp; -+ pfkey_x_debug->sadb_x_debug_verbose = verbose; -+ -+ for(i=0; i<4; i++) { -+ pfkey_x_debug->sadb_x_debug_reserved[i] = 0; -+ } -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_x_nat_t_type_build(struct sadb_ext** pfkey_ext, -+ uint8_t type) -+{ -+ int error = 0; -+ int i; -+ struct sadb_x_nat_t_type *pfkey_x_nat_t_type = (struct sadb_x_nat_t_type *)*pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_nat_t_type_build:\n"); -+ /* sanity checks... 
*/ -+ if(pfkey_x_nat_t_type) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_nat_t_type_build: " -+ "why is pfkey_x_nat_t_type already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_nat_t_type_build: " -+ "type=%d\n", type); -+ -+ pfkey_x_nat_t_type = (struct sadb_x_nat_t_type*) -+ MALLOC(sizeof(struct sadb_x_nat_t_type)); -+ -+ *pfkey_ext = (struct sadb_ext*)pfkey_x_nat_t_type; -+ -+ if(pfkey_x_nat_t_type == NULL) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_nat_t_type_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ -+ pfkey_x_nat_t_type->sadb_x_nat_t_type_len = sizeof(struct sadb_x_nat_t_type) / IPSEC_PFKEYv2_ALIGN; -+ pfkey_x_nat_t_type->sadb_x_nat_t_type_exttype = K_SADB_X_EXT_NAT_T_TYPE; -+ pfkey_x_nat_t_type->sadb_x_nat_t_type_type = type; -+ for(i=0; i<3; i++) { -+ pfkey_x_nat_t_type->sadb_x_nat_t_type_reserved[i] = 0; -+ } -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_x_nat_t_port_build(struct sadb_ext** pfkey_ext, -+ uint16_t exttype, -+ uint16_t port) -+{ -+ int error = 0; -+ struct sadb_x_nat_t_port *pfkey_x_nat_t_port = (struct sadb_x_nat_t_port *)*pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_nat_t_port_build:\n"); -+ /* sanity checks... 
*/ -+ if(pfkey_x_nat_t_port) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_nat_t_port_build: " -+ "why is pfkey_x_nat_t_port already pointing to something?\n"); -+ SENDERR(EINVAL); -+ } -+ -+ switch(exttype) { -+ case K_SADB_X_EXT_NAT_T_SPORT: -+ case K_SADB_X_EXT_NAT_T_DPORT: -+ break; -+ default: -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_nat_t_port_build: " -+ "unrecognised ext_type=%d.\n", -+ exttype); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_nat_t_port_build: " -+ "ext=%d, port=%d\n", exttype, port); -+ -+ pfkey_x_nat_t_port = (struct sadb_x_nat_t_port*) -+ MALLOC(sizeof(struct sadb_x_nat_t_port)); -+ -+ *pfkey_ext = (struct sadb_ext*)pfkey_x_nat_t_port; -+ -+ if(pfkey_x_nat_t_port == NULL) { -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_x_nat_t_port_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ -+ pfkey_x_nat_t_port->sadb_x_nat_t_port_len = sizeof(struct sadb_x_nat_t_port) / IPSEC_PFKEYv2_ALIGN; -+ pfkey_x_nat_t_port->sadb_x_nat_t_port_exttype = exttype; -+ pfkey_x_nat_t_port->sadb_x_nat_t_port_port = port; -+ pfkey_x_nat_t_port->sadb_x_nat_t_port_reserved = 0; -+ -+errlab: -+ return error; -+} -+ -+int pfkey_x_protocol_build(struct sadb_ext **pfkey_ext, -+ uint8_t protocol) -+{ -+ int error = 0; -+ struct sadb_protocol * p = (struct sadb_protocol *)*pfkey_ext; -+ DEBUGGING(PF_KEY_DEBUG_BUILD,"pfkey_x_protocol_build: protocol=%u\n", protocol); -+ /* sanity checks... 
*/ -+ if (p != 0) { -+ ERROR("pfkey_x_protocol_build: bogus protocol pointer\n"); -+ SENDERR(EINVAL); -+ } -+ if ((p = (struct sadb_protocol*)MALLOC(sizeof(*p))) == 0) { -+ ERROR("pfkey_build: memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ *pfkey_ext = (struct sadb_ext *)p; -+ p->sadb_protocol_len = sizeof(*p) / IPSEC_PFKEYv2_ALIGN; -+ p->sadb_protocol_exttype = K_SADB_X_EXT_PROTOCOL; -+ p->sadb_protocol_proto = protocol; -+ p->sadb_protocol_flags = 0; -+ p->sadb_protocol_reserved2 = 0; -+ errlab: -+ return error; -+} -+ -+int pfkey_outif_build(struct sadb_ext **pfkey_ext, -+ uint16_t outif) -+{ -+ int error = 0; -+ struct sadb_x_plumbif * p = (struct sadb_x_plumbif *)*pfkey_ext; -+ -+ if ((p = (struct sadb_x_plumbif*)MALLOC(sizeof(*p))) == 0) { -+ ERROR("pfkey_build: memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ *pfkey_ext = (struct sadb_ext *)p; -+ -+ p->sadb_x_outif_len = IPSEC_PFKEYv2_WORDS(sizeof(*p)); -+ p->sadb_x_outif_exttype = K_SADB_X_EXT_PLUMBIF; -+ p->sadb_x_outif_ifnum = outif; -+ -+ errlab: -+ return error; -+} -+ -+ -+int pfkey_saref_build(struct sadb_ext **pfkey_ext, -+ IPsecSAref_t in, IPsecSAref_t out) -+{ -+ int error = 0; -+ struct sadb_x_saref* s; -+ -+ /* +4 because sadb_x_saref is not a multiple of 8 bytes */ -+ -+ if ((s = (struct sadb_x_saref*)MALLOC(sizeof(*s)+4)) == 0) { -+ ERROR("pfkey_build: memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ *pfkey_ext = (struct sadb_ext *)s; -+ -+ s->sadb_x_saref_len = IPSEC_PFKEYv2_WORDS(sizeof(*s)); -+ s->sadb_x_saref_exttype = K_SADB_X_EXT_SAREF; -+ s->sadb_x_saref_me = in; -+ s->sadb_x_saref_him = out; -+ -+ errlab: -+ return error; -+} -+ -+ -+#if defined(I_DONT_THINK_THIS_WILL_BE_USEFUL) && I_DONT_THINK_THIS_WILL_BE_USEFUL -+int (*ext_default_builders[K_SADB_EXT_MAX +1])(struct sadb_msg*, struct sadb_ext*) -+ = -+{ -+ NULL, /* pfkey_msg_build, */ -+ pfkey_sa_build, -+ pfkey_lifetime_build, -+ pfkey_lifetime_build, -+ pfkey_lifetime_build, -+ pfkey_address_build, -+ 
pfkey_address_build, -+ pfkey_address_build, -+ pfkey_key_build, -+ pfkey_key_build, -+ pfkey_ident_build, -+ pfkey_ident_build, -+ pfkey_sens_build, -+ pfkey_prop_build, -+ pfkey_supported_build, -+ pfkey_supported_build, -+ pfkey_spirange_build, -+ pfkey_x_kmprivate_build, -+ pfkey_x_satype_build, -+ pfkey_sa_build, -+ pfkey_address_build, -+ pfkey_address_build, -+ pfkey_address_build, -+ pfkey_address_build, -+ pfkey_address_build, -+ pfkey_x_ext_debug_build -+}; -+#endif -+ -+int -+pfkey_msg_build(struct sadb_msg **pfkey_msg, struct sadb_ext *extensions[], int dir) -+{ -+ int error = 0; -+ unsigned ext; -+ unsigned total_size; -+ struct sadb_ext *pfkey_ext; -+ pfkey_ext_track extensions_seen = 0; -+#ifndef __KERNEL__ -+ struct sadb_ext *extensions_check[K_SADB_EXT_MAX + 1]; -+#endif -+ -+ if(!extensions[0]) { -+ ERROR("pfkey_msg_build: " -+ "extensions[0] must be specified (struct sadb_msg).\n"); -+ SENDERR(EINVAL); -+ } -+ -+ /* figure out the total size for all the requested extensions */ -+ total_size = IPSEC_PFKEYv2_WORDS(sizeof(struct sadb_msg)); -+ for(ext = 1; ext <= K_SADB_EXT_MAX; ext++) { -+ if(extensions[ext]) { -+ total_size += (extensions[ext])->sadb_ext_len; -+ } -+ } -+ -+ /* allocate that much space */ -+ *pfkey_msg = (struct sadb_msg*)MALLOC(total_size * IPSEC_PFKEYv2_ALIGN); -+ if(*pfkey_msg == NULL) { -+ ERROR("pfkey_msg_build: " -+ "memory allocation failed\n"); -+ SENDERR(ENOMEM); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_msg_build: " -+ "pfkey_msg=0p%p allocated %lu bytes, &(extensions[0])=0p%p\n", -+ *pfkey_msg, -+ (unsigned long)(total_size * IPSEC_PFKEYv2_ALIGN), -+ &(extensions[0])); -+ -+ memcpy(*pfkey_msg, -+ extensions[0], -+ sizeof(struct sadb_msg)); -+ (*pfkey_msg)->sadb_msg_len = total_size; -+ (*pfkey_msg)->sadb_msg_reserved = 0; -+ extensions_seen = 1 ; -+ -+ /* -+ * point pfkey_ext to immediately after the space for the header, -+ * i.e. at the first extension location. 
-+ */ -+ pfkey_ext = (struct sadb_ext*)(((char*)(*pfkey_msg)) + sizeof(struct sadb_msg)); -+ -+ for(ext = 1; ext <= K_SADB_EXT_MAX; ext++) { -+ /* copy from extension[ext] to buffer */ -+ if(extensions[ext]) { -+ /* Is this type of extension permitted for this type of message? */ -+ if(!pfkey_permitted_extension(dir,(*pfkey_msg)->sadb_msg_type,ext)) { -+ ERROR("ext type %d not permitted for %d/%d (build)\n", -+ ext, -+ dir,(*pfkey_msg)->sadb_msg_type); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_BUILD, -+ "pfkey_msg_build: " -+ "copying %lu bytes from extensions[%u] (type=%d)\n", -+ (unsigned long)(extensions[ext]->sadb_ext_len * IPSEC_PFKEYv2_ALIGN), -+ ext, -+ extensions[ext]->sadb_ext_type); -+ -+ { -+ char *pfkey_ext_c = (char *)pfkey_ext; -+ -+ pfkey_ext_c += (extensions[ext])->sadb_ext_len * IPSEC_PFKEYv2_ALIGN; -+ -+#if 0 -+ printf("memcpy(%p,%p,%d) -> %p %p:%p\n", pfkey_ext, -+ extensions[ext], -+ (extensions[ext])->sadb_ext_len * IPSEC_PFKEYv2_ALIGN, -+ pfkey_ext_c, (*pfkey_msg), (char *)(*pfkey_msg)+(total_size*IPSEC_PFKEYv2_ALIGN)); -+#endif -+ memcpy(pfkey_ext, -+ extensions[ext], -+ (extensions[ext])->sadb_ext_len * IPSEC_PFKEYv2_ALIGN); -+ pfkey_ext = (struct sadb_ext *)pfkey_ext_c; -+ } -+ -+ /* Mark that we have seen this extension */ -+ pfkey_mark_extension(ext,&extensions_seen); -+ } -+ } -+ -+ if(pfkey_extensions_missing(dir,(*pfkey_msg)->sadb_msg_type,extensions_seen)) { -+ ERROR("required extensions missing. seen=%08llx\n", (unsigned long long)extensions_seen); -+ SENDERR(EINVAL); -+ } -+ -+#ifndef __KERNEL__ -+/* -+ * this is silly, there is no need to reparse the message that we just built. 
-+ * -+ */ -+ if((error = pfkey_msg_parse(*pfkey_msg,NULL,extensions_check, dir))) { -+ ERROR("pfkey_msg_build: " -+ "Trouble parsing newly built pfkey message, error=%d.\n", -+ error); -+ SENDERR(-error); -+ } -+#endif -+ -+errlab: -+ -+ return error; -+} -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/pfkey_v2_debug.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,144 @@ -+/* -+ * @(#) pfkey version 2 debugging messages -+ * -+ * Copyright (C) 2001 Richard Guy Briggs -+ * and Michael Richardson -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ * RCSID $Id: pfkey_v2_debug.c,v 1.11 2005/04/06 17:45:16 mcr Exp $ -+ * -+ */ -+ -+#ifdef __KERNEL__ -+ -+# include /* for printk */ -+ -+# include "openswan/ipsec_kversion.h" /* for malloc switch */ -+# ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+# else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+# endif /* MALLOC_SLAB */ -+# include /* error codes */ -+# include /* size_t */ -+# include /* mark_bh */ -+ -+# include /* struct device, and other headers */ -+# include /* eth_type_trans */ -+extern int debug_pfkey; -+ -+#else /* __KERNEL__ */ -+ -+#if defined(macintosh) || (defined(__MACH__) && defined(__APPLE__)) -+# include -+#else -+# include -+# include -+# include -+#endif -+ -+#endif /* __KERNEL__ */ -+ -+#include "openswan.h" -+#include "openswan/pfkeyv2.h" -+#include "openswan/pfkey.h" -+ -+/* -+ * This file provides ASCII translations of PF_KEY magic numbers. -+ * -+ */ -+ -+static char *pfkey_sadb_ext_strings[]={ -+ "reserved", /* K_SADB_EXT_RESERVED 0 */ -+ "security-association", /* K_SADB_EXT_SA 1 */ -+ "lifetime-current", /* K_SADB_EXT_LIFETIME_CURRENT 2 */ -+ "lifetime-hard", /* K_SADB_EXT_LIFETIME_HARD 3 */ -+ "lifetime-soft", /* K_SADB_EXT_LIFETIME_SOFT 4 */ -+ "source-address", /* K_SADB_EXT_ADDRESS_SRC 5 */ -+ "destination-address", /* K_SADB_EXT_ADDRESS_DST 6 */ -+ "proxy-address", /* K_SADB_EXT_ADDRESS_PROXY 7 */ -+ "authentication-key", /* K_SADB_EXT_KEY_AUTH 8 */ -+ "cipher-key", /* K_SADB_EXT_KEY_ENCRYPT 9 */ -+ "source-identity", /* K_SADB_EXT_IDENTITY_SRC 10 */ -+ "destination-identity", /* K_SADB_EXT_IDENTITY_DST 11 */ -+ "sensitivity-label", /* K_SADB_EXT_SENSITIVITY 12 */ -+ "proposal", /* K_SADB_EXT_PROPOSAL 13 */ -+ "supported-auth", /* K_SADB_EXT_SUPPORTED_AUTH 14 */ -+ "supported-cipher", /* K_SADB_EXT_SUPPORTED_ENCRYPT 15 */ -+ "spi-range", /* K_SADB_EXT_SPIRANGE 16 */ -+ "X-kmpprivate", /* K_SADB_X_EXT_KMPRIVATE 17 */ -+ "X-satype2", /* K_SADB_X_EXT_SATYPE2 18 */ -+ "X-security-association", /* K_SADB_X_EXT_SA2 19 */ 
-+ "X-destination-address2", /* K_SADB_X_EXT_ADDRESS_DST2 20 */ -+ "X-source-flow-address", /* K_SADB_X_EXT_ADDRESS_SRC_FLOW 21 */ -+ "X-dest-flow-address", /* K_SADB_X_EXT_ADDRESS_DST_FLOW 22 */ -+ "X-source-mask", /* K_SADB_X_EXT_ADDRESS_SRC_MASK 23 */ -+ "X-dest-mask", /* K_SADB_X_EXT_ADDRESS_DST_MASK 24 */ -+ "X-set-debug", /* K_SADB_X_EXT_DEBUG 25 */ -+ /* NAT_TRAVERSAL */ -+ "X-ext-protocol", /* K_SADB_X_EXT_PROTOCOL 26 */ -+ "X-NAT-T-type", /* K_SADB_X_EXT_NAT_T_TYPE 27 */ -+ "X-NAT-T-sport", /* K_SADB_X_EXT_NAT_T_SPORT 28 */ -+ "X-NAT-T-dport", /* K_SADB_X_EXT_NAT_T_DPORT 29 */ -+ "X-NAT-T-OA", /* K_SADB_X_EXT_NAT_T_OA 30 */ -+ "X-plumbif", /* K_SADB_X_EXT_PLUMBIF 31 */ -+ "X-saref", /* K_SADB_X_EXT_SAREF 32 */ -+}; -+ -+const char * -+pfkey_v2_sadb_ext_string(int ext) -+{ -+ if(ext <= K_SADB_EXT_MAX) { -+ return pfkey_sadb_ext_strings[ext]; -+ } else { -+ return "unknown-ext"; -+ } -+} -+ -+ -+static char *pfkey_sadb_type_strings[]={ -+ "reserved", /* K_SADB_RESERVED */ -+ "getspi", /* K_SADB_GETSPI */ -+ "update", /* K_SADB_UPDATE */ -+ "add", /* K_SADB_ADD */ -+ "delete", /* K_SADB_DELETE */ -+ "get", /* K_SADB_GET */ -+ "acquire", /* K_SADB_ACQUIRE */ -+ "register", /* K_SADB_REGISTER */ -+ "expire", /* K_SADB_EXPIRE */ -+ "flush", /* K_SADB_FLUSH */ -+ "dump", /* K_SADB_DUMP */ -+ "x-promisc", /* K_SADB_X_PROMISC */ -+ "x-pchange", /* K_SADB_X_PCHANGE */ -+ "x-groupsa", /* K_SADB_X_GRPSA */ -+ "x-addflow(eroute)", /* K_SADB_X_ADDFLOW */ -+ "x-delflow(eroute)", /* K_SADB_X_DELFLOW */ -+ "x-debug", /* K_SADB_X_DEBUG */ -+}; -+ -+const char * -+pfkey_v2_sadb_type_string(int sadb_type) -+{ -+ if(sadb_type <= K_SADB_MAX) { -+ return pfkey_sadb_type_strings[sadb_type]; -+ } else { -+ return "unknown-sadb-type"; -+ } -+} -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/pfkey_v2_ext_bits.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,814 @@ -+/* -+ * RFC2367 PF_KEYv2 Key 
management API message parser -+ * Copyright (C) 1999, 2000, 2001 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: pfkey_v2_ext_bits.c,v 1.22 2005/05/11 01:45:31 mcr Exp $ -+ */ -+ -+/* -+ * Template from klips/net/ipsec/ipsec/ipsec_parse.c. -+ */ -+ -+char pfkey_v2_ext_bits_c_version[] = "$Id: pfkey_v2_ext_bits.c,v 1.22 2005/05/11 01:45:31 mcr Exp $"; -+ -+/* -+ * Some ugly stuff to allow consistent debugging code for use in the -+ * kernel and in user space -+*/ -+ -+#if defined(__KERNEL__) && defined(linux) -+ -+# include /* for printk */ -+ -+# include "openswan/ipsec_kversion.h" /* for malloc switch */ -+# ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+# else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+# endif /* MALLOC_SLAB */ -+# include /* error codes */ -+# include /* size_t */ -+# include /* mark_bh */ -+ -+# include /* struct device, and other headers */ -+# include /* eth_type_trans */ -+# include /* struct iphdr */ -+# if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -+# include -+# endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ -+ -+#else /* __KERNEL__ */ -+ -+# include -+# include -+# include -+#endif -+ -+#include -+#include -+#include -+ -+#include "openswan/pfkey_debug.h" -+ -+ -+pfkey_ext_track extensions_bitmaps[2/*in/out*/][2/*perm/req*/][K_SADB_MAX+1]={ -+ -+/* INBOUND EXTENSIONS */ -+{ -+ -+/* PERMITTED IN */ -+{ -+/* K_SADB_RESERVED */ -+0 -+, -+/* SADB_GETSPI */ -+1ULL< -+ * -+ * This program is free 
software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ */ -+ -+/* -+ * Template from klips/net/ipsec/ipsec/ipsec_netlink.c. -+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+ -+#include -+ -+#include -+ -+#ifdef SPINLOCK -+# ifdef SPINLOCK_23 -+# include /* *lock* */ -+# else /* SPINLOCK_23 */ -+# include /* *lock* */ -+# endif /* SPINLOCK_23 */ -+#endif /* SPINLOCK */ -+#ifdef NET_21 -+# include -+# define IS_MYADDR RTN_LOCAL -+#endif -+ -+#include -+#ifdef NETLINK_SOCK -+# include -+#else -+# include -+#endif -+ -+#include /* get_random_bytes() */ -+ -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_sa.h" -+ -+#include "openswan/ipsec_radij.h" -+#include "openswan/ipsec_xform.h" -+#include "openswan/ipsec_ah.h" -+#include "openswan/ipsec_esp.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_rcv.h" -+#include "openswan/ipcomp.h" -+ -+#include -+#include -+ -+#include "openswan/ipsec_proto.h" -+#include "openswan/ipsec_alg.h" -+ -+#ifdef CONFIG_KLIPS_OCF -+#include "ipsec_ocf.h" -+#endif -+ -+#define SENDERR(_x) do { error = -(_x); goto errlab; } 
while (0) -+ -+/* returns 0 on success */ -+int -+pfkey_sa_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ struct k_sadb_sa *k_pfkey_sa = (struct k_sadb_sa *)pfkey_ext; -+ struct sadb_sa *pfkey_sa = (struct sadb_sa *)pfkey_ext; -+ int error = 0; -+ struct ipsec_sa* ipsp; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sa_process: .\n"); -+ -+ if(!extr || !extr->ips) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sa_process: " -+ "extr or extr->ips is NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ switch(pfkey_ext->sadb_ext_type) { -+ case K_SADB_EXT_SA: -+ ipsp = extr->ips; -+ break; -+ case K_SADB_X_EXT_SA2: -+ if(extr->ips2 == NULL) { -+ extr->ips2 = ipsec_sa_alloc(&error); /* pass error var by pointer */ -+ } -+ if(extr->ips2 == NULL) { -+ SENDERR(-error); -+ } -+ ipsp = extr->ips2; -+ break; -+ default: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sa_process: " -+ "invalid exttype=%d.\n", -+ pfkey_ext->sadb_ext_type); -+ SENDERR(EINVAL); -+ } -+ -+ ipsp->ips_said.spi = pfkey_sa->sadb_sa_spi; -+ ipsp->ips_replaywin = pfkey_sa->sadb_sa_replay; -+ ipsp->ips_state = pfkey_sa->sadb_sa_state; -+ ipsp->ips_flags = pfkey_sa->sadb_sa_flags; -+ ipsp->ips_replaywin_lastseq = ipsp->ips_replaywin_bitmap = 0; -+ -+ if(k_pfkey_sa->sadb_sa_len > sizeof(struct sadb_sa)/IPSEC_PFKEYv2_ALIGN) { -+ ipsp->ips_ref = k_pfkey_sa->sadb_x_sa_ref; -+ } -+ -+ switch(ipsp->ips_said.proto) { -+ case IPPROTO_AH: -+ ipsp->ips_authalg = pfkey_sa->sadb_sa_auth; -+ ipsp->ips_encalg = K_SADB_EALG_NONE; -+#ifdef CONFIG_KLIPS_OCF -+ if (ipsec_ocf_sa_init(ipsp, ipsp->ips_authalg, 0)) -+ break; -+#endif -+ break; -+ case IPPROTO_ESP: -+ ipsp->ips_authalg = pfkey_sa->sadb_sa_auth; -+ ipsp->ips_encalg = pfkey_sa->sadb_sa_encrypt; -+#ifdef CONFIG_KLIPS_OCF -+ if (ipsec_ocf_sa_init(ipsp, ipsp->ips_authalg, ipsp->ips_encalg)) -+ break; -+#endif -+#ifdef CONFIG_KLIPS_ALG -+ ipsec_alg_sa_init(ipsp); -+#endif -+ break; -+ case IPPROTO_IPIP: -+ ipsp->ips_authalg = 
AH_NONE; -+ ipsp->ips_encalg = ESP_NONE; -+ break; -+#ifdef CONFIG_KLIPS_IPCOMP -+ case IPPROTO_COMP: -+ ipsp->ips_authalg = AH_NONE; -+ ipsp->ips_encalg = pfkey_sa->sadb_sa_encrypt; -+ break; -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ case IPPROTO_INT: -+ ipsp->ips_authalg = AH_NONE; -+ ipsp->ips_encalg = ESP_NONE; -+ break; -+ case 0: -+ break; -+ default: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sa_process: " -+ "unknown proto=%d.\n", -+ ipsp->ips_said.proto); -+ SENDERR(EINVAL); -+ } -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_lifetime_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ struct sadb_lifetime *pfkey_lifetime = (struct sadb_lifetime *)pfkey_ext; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_lifetime_process: .\n"); -+ -+ if(!extr || !extr->ips) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_lifetime_process: " -+ "extr or extr->ips is NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ switch(pfkey_lifetime->sadb_lifetime_exttype) { -+ case K_SADB_EXT_LIFETIME_CURRENT: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_lifetime_process: " -+ "lifetime_current not supported yet.\n"); -+ SENDERR(EINVAL); -+ break; -+ case K_SADB_EXT_LIFETIME_HARD: -+ ipsec_lifetime_update_hard(&extr->ips->ips_life.ipl_allocations, -+ pfkey_lifetime->sadb_lifetime_allocations); -+ -+ ipsec_lifetime_update_hard(&extr->ips->ips_life.ipl_bytes, -+ pfkey_lifetime->sadb_lifetime_bytes); -+ -+ ipsec_lifetime_update_hard(&extr->ips->ips_life.ipl_addtime, -+ pfkey_lifetime->sadb_lifetime_addtime); -+ -+ ipsec_lifetime_update_hard(&extr->ips->ips_life.ipl_usetime, -+ pfkey_lifetime->sadb_lifetime_usetime); -+ -+ break; -+ -+ case K_SADB_EXT_LIFETIME_SOFT: -+ ipsec_lifetime_update_soft(&extr->ips->ips_life.ipl_allocations, -+ pfkey_lifetime->sadb_lifetime_allocations); -+ -+ ipsec_lifetime_update_soft(&extr->ips->ips_life.ipl_bytes, -+ pfkey_lifetime->sadb_lifetime_bytes); -+ -+ 
ipsec_lifetime_update_soft(&extr->ips->ips_life.ipl_addtime, -+ pfkey_lifetime->sadb_lifetime_addtime); -+ -+ ipsec_lifetime_update_soft(&extr->ips->ips_life.ipl_usetime, -+ pfkey_lifetime->sadb_lifetime_usetime); -+ -+ break; -+ default: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_lifetime_process: " -+ "invalid exttype=%d.\n", -+ pfkey_ext->sadb_ext_type); -+ SENDERR(EINVAL); -+ } -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_address_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ int saddr_len = 0; -+ char ipaddr_txt[ADDRTOA_BUF]; -+ unsigned char **sap; -+ unsigned short * portp = 0; -+ struct sadb_address *pfkey_address = (struct sadb_address *)pfkey_ext; -+ struct sockaddr* s = (struct sockaddr*)((char*)pfkey_address + sizeof(*pfkey_address)); -+ struct ipsec_sa* ipsp; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process:\n"); -+ -+ if(!extr || !extr->ips) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "extr or extr->ips is NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ switch(s->sa_family) { -+ case AF_INET: -+ saddr_len = sizeof(struct sockaddr_in); -+ if (debug_pfkey) -+ addrtoa(((struct sockaddr_in*)s)->sin_addr, 0, ipaddr_txt, sizeof(ipaddr_txt)); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "found address family=%d, AF_INET, %s.\n", -+ s->sa_family, -+ ipaddr_txt); -+ break; -+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -+ case AF_INET6: -+ saddr_len = sizeof(struct sockaddr_in6); -+ break; -+#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ -+ default: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "s->sa_family=%d not supported.\n", -+ s->sa_family); -+ SENDERR(EPFNOSUPPORT); -+ } -+ -+ switch(pfkey_address->sadb_address_exttype) { -+ case K_SADB_EXT_ADDRESS_SRC: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "found src address.\n"); -+ sap = 
(unsigned char **)&(extr->ips->ips_addr_s); -+ extr->ips->ips_addr_s_size = saddr_len; -+ break; -+ case K_SADB_EXT_ADDRESS_DST: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "found dst address.\n"); -+ sap = (unsigned char **)&(extr->ips->ips_addr_d); -+ extr->ips->ips_addr_d_size = saddr_len; -+ break; -+ case K_SADB_EXT_ADDRESS_PROXY: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "found proxy address.\n"); -+ sap = (unsigned char **)&(extr->ips->ips_addr_p); -+ extr->ips->ips_addr_p_size = saddr_len; -+ break; -+ case K_SADB_X_EXT_ADDRESS_DST2: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "found 2nd dst address.\n"); -+ if(extr->ips2 == NULL) { -+ extr->ips2 = ipsec_sa_alloc(&error); /* pass error var by pointer */ -+ } -+ if(extr->ips2 == NULL) { -+ SENDERR(-error); -+ } -+ sap = (unsigned char **)&(extr->ips2->ips_addr_d); -+ extr->ips2->ips_addr_d_size = saddr_len; -+ break; -+ case K_SADB_X_EXT_ADDRESS_SRC_FLOW: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "found src flow address.\n"); -+ if(pfkey_alloc_eroute(&(extr->eroute)) == ENOMEM) { -+ SENDERR(ENOMEM); -+ } -+ sap = (unsigned char **)&(extr->eroute->er_eaddr.sen_ip_src); -+ portp = &(extr->eroute->er_eaddr.sen_sport); -+ break; -+ case K_SADB_X_EXT_ADDRESS_DST_FLOW: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "found dst flow address.\n"); -+ if(pfkey_alloc_eroute(&(extr->eroute)) == ENOMEM) { -+ SENDERR(ENOMEM); -+ } -+ sap = (unsigned char **)&(extr->eroute->er_eaddr.sen_ip_dst); -+ portp = &(extr->eroute->er_eaddr.sen_dport); -+ break; -+ case K_SADB_X_EXT_ADDRESS_SRC_MASK: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "found src mask address.\n"); -+ if(pfkey_alloc_eroute(&(extr->eroute)) == ENOMEM) { -+ SENDERR(ENOMEM); -+ } -+ sap = (unsigned char **)&(extr->eroute->er_emask.sen_ip_src); -+ portp = &(extr->eroute->er_emask.sen_sport); -+ 
break; -+ case K_SADB_X_EXT_ADDRESS_DST_MASK: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "found dst mask address.\n"); -+ if(pfkey_alloc_eroute(&(extr->eroute)) == ENOMEM) { -+ SENDERR(ENOMEM); -+ } -+ sap = (unsigned char **)&(extr->eroute->er_emask.sen_ip_dst); -+ portp = &(extr->eroute->er_emask.sen_dport); -+ break; -+#ifdef NAT_TRAVERSAL -+ case K_SADB_X_EXT_NAT_T_OA: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "found NAT-OA address.\n"); -+ sap = (unsigned char **)&(extr->ips->ips_natt_oa); -+ extr->ips->ips_natt_oa_size = saddr_len; -+ break; -+#endif -+ default: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "unrecognised ext_type=%d.\n", -+ pfkey_address->sadb_address_exttype); -+ SENDERR(EINVAL); -+ } -+ -+ switch(pfkey_address->sadb_address_exttype) { -+ case K_SADB_EXT_ADDRESS_SRC: -+ case K_SADB_EXT_ADDRESS_DST: -+ case K_SADB_EXT_ADDRESS_PROXY: -+ case K_SADB_X_EXT_ADDRESS_DST2: -+#ifdef NAT_TRAVERSAL -+ case K_SADB_X_EXT_NAT_T_OA: -+#endif -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "allocating %d bytes for saddr.\n", -+ saddr_len); -+ if(!(*sap = kmalloc(saddr_len, GFP_KERNEL))) { -+ SENDERR(ENOMEM); -+ } -+ memcpy(*sap, s, saddr_len); -+ break; -+ default: -+ if(s->sa_family != AF_INET) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "s->sa_family=%d not supported.\n", -+ s->sa_family); -+ SENDERR(EPFNOSUPPORT); -+ } -+ { -+ *(struct in_addr *)sap = ((struct sockaddr_in *)s)->sin_addr; -+ } -+ -+ if (portp != 0) -+ *portp = ((struct sockaddr_in*)s)->sin_port; -+#ifdef CONFIG_KLIPS_DEBUG -+ if(extr->eroute) { -+ char buf1[64], buf2[64]; -+ if (debug_pfkey) { -+ subnettoa(extr->eroute->er_eaddr.sen_ip_src, -+ extr->eroute->er_emask.sen_ip_src, 0, buf1, sizeof(buf1)); -+ subnettoa(extr->eroute->er_eaddr.sen_ip_dst, -+ extr->eroute->er_emask.sen_ip_dst, 0, buf2, sizeof(buf2)); -+ KLIPS_PRINT(debug_pfkey, -+ 
"klips_debug:pfkey_address_parse: " -+ "extr->eroute set to %s:%d->%s:%d\n", -+ buf1, -+ ntohs(extr->eroute->er_eaddr.sen_sport), -+ buf2, -+ ntohs(extr->eroute->er_eaddr.sen_dport)); -+ } -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ } -+ -+ ipsp = extr->ips; -+ switch(pfkey_address->sadb_address_exttype) { -+ case K_SADB_X_EXT_ADDRESS_DST2: -+ ipsp = extr->ips2; -+ case K_SADB_EXT_ADDRESS_DST: -+ if(s->sa_family == AF_INET) { -+ ipsp->ips_said.dst.u.v4.sin_addr.s_addr = ((struct sockaddr_in*)(ipsp->ips_addr_d))->sin_addr.s_addr; -+ ipsp->ips_said.dst.u.v4.sin_family = AF_INET; -+ if (debug_pfkey) -+ addrtoa(((struct sockaddr_in*)(ipsp->ips_addr_d))->sin_addr, -+ 0, -+ ipaddr_txt, -+ sizeof(ipaddr_txt)); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "ips_said.dst set to %s.\n", -+ ipaddr_txt); -+ } else { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: " -+ "uh, ips_said.dst doesn't do address family=%d yet, said will be invalid.\n", -+ s->sa_family); -+ } -+ default: -+ break; -+ } -+ -+ /* XXX check if port!=0 */ -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_address_process: successful.\n"); -+ errlab: -+ return error; -+} -+ -+int -+pfkey_key_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ struct sadb_key *pfkey_key = (struct sadb_key *)pfkey_ext; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_key_process: .\n"); -+ -+ if(!extr || !extr->ips) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_key_process: " -+ "extr or extr->ips is NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ switch(pfkey_key->sadb_key_exttype) { -+ case K_SADB_EXT_KEY_AUTH: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_key_process: " -+ "allocating %d bytes for authkey.\n", -+ DIVUP(pfkey_key->sadb_key_bits, 8)); -+ if(!(extr->ips->ips_key_a = kmalloc(DIVUP(pfkey_key->sadb_key_bits, 8), GFP_KERNEL))) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_key_process: " -+ "memory 
allocation error.\n"); -+ SENDERR(ENOMEM); -+ } -+ extr->ips->ips_key_bits_a = pfkey_key->sadb_key_bits; -+ extr->ips->ips_key_a_size = DIVUP(pfkey_key->sadb_key_bits, 8); -+ memcpy(extr->ips->ips_key_a, -+ (char*)pfkey_key + sizeof(struct sadb_key), -+ extr->ips->ips_key_a_size); -+ break; -+ case K_SADB_EXT_KEY_ENCRYPT: /* Key(s) */ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_key_process: " -+ "allocating %d bytes for enckey.\n", -+ DIVUP(pfkey_key->sadb_key_bits, 8)); -+ if(!(extr->ips->ips_key_e = kmalloc(DIVUP(pfkey_key->sadb_key_bits, 8), GFP_KERNEL))) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_key_process: " -+ "memory allocation error.\n"); -+ SENDERR(ENOMEM); -+ } -+ extr->ips->ips_key_bits_e = pfkey_key->sadb_key_bits; -+ extr->ips->ips_key_e_size = DIVUP(pfkey_key->sadb_key_bits, 8); -+ memcpy(extr->ips->ips_key_e, -+ (char*)pfkey_key + sizeof(struct sadb_key), -+ extr->ips->ips_key_e_size); -+ break; -+ default: -+ SENDERR(EINVAL); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_key_process: " -+ "success.\n"); -+errlab: -+ return error; -+} -+ -+int -+pfkey_ident_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ struct sadb_ident *pfkey_ident = (struct sadb_ident *)pfkey_ext; -+ int data_len; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_ident_process: .\n"); -+ -+ if(!extr || !extr->ips) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_ident_process: " -+ "extr or extr->ips is NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ switch(pfkey_ident->sadb_ident_exttype) { -+ case K_SADB_EXT_IDENTITY_SRC: -+ data_len = pfkey_ident->sadb_ident_len * IPSEC_PFKEYv2_ALIGN - sizeof(struct sadb_ident); -+ -+ extr->ips->ips_ident_s.type = pfkey_ident->sadb_ident_type; -+ extr->ips->ips_ident_s.id = pfkey_ident->sadb_ident_id; -+ extr->ips->ips_ident_s.len = pfkey_ident->sadb_ident_len; -+ if(data_len) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_ident_process: " -+ "allocating %d 
bytes for ident_s.\n", -+ data_len); -+ if(!(extr->ips->ips_ident_s.data -+ = kmalloc(data_len, GFP_KERNEL))) { -+ SENDERR(ENOMEM); -+ } -+ memcpy(extr->ips->ips_ident_s.data, -+ (char*)pfkey_ident + sizeof(struct sadb_ident), -+ data_len); -+ } else { -+ extr->ips->ips_ident_s.data = NULL; -+ } -+ break; -+ case K_SADB_EXT_IDENTITY_DST: /* Identity(ies) */ -+ data_len = pfkey_ident->sadb_ident_len * IPSEC_PFKEYv2_ALIGN - sizeof(struct sadb_ident); -+ -+ extr->ips->ips_ident_d.type = pfkey_ident->sadb_ident_type; -+ extr->ips->ips_ident_d.id = pfkey_ident->sadb_ident_id; -+ extr->ips->ips_ident_d.len = pfkey_ident->sadb_ident_len; -+ if(data_len) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_ident_process: " -+ "allocating %d bytes for ident_d.\n", -+ data_len); -+ if(!(extr->ips->ips_ident_d.data -+ = kmalloc(data_len, GFP_KERNEL))) { -+ SENDERR(ENOMEM); -+ } -+ memcpy(extr->ips->ips_ident_d.data, -+ (char*)pfkey_ident + sizeof(struct sadb_ident), -+ data_len); -+ } else { -+ extr->ips->ips_ident_d.data = NULL; -+ } -+ break; -+ default: -+ SENDERR(EINVAL); -+ } -+errlab: -+ return error; -+} -+ -+int -+pfkey_sens_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_sens_process: " -+ "Sorry, I can't process exttype=%d yet.\n", -+ pfkey_ext->sadb_ext_type); -+ SENDERR(EINVAL); /* don't process these yet */ -+ errlab: -+ return error; -+} -+ -+int -+pfkey_prop_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_prop_process: " -+ "Sorry, I can't process exttype=%d yet.\n", -+ pfkey_ext->sadb_ext_type); -+ SENDERR(EINVAL); /* don't process these yet */ -+ -+ errlab: -+ return error; -+} -+ -+int -+pfkey_supported_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_supported_process: " 
-+ "Sorry, I can't process exttype=%d yet.\n", -+ pfkey_ext->sadb_ext_type); -+ SENDERR(EINVAL); /* don't process these yet */ -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_spirange_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_spirange_process: .\n"); -+/* errlab: */ -+ return error; -+} -+ -+int -+pfkey_x_kmprivate_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_kmprivate_process: " -+ "Sorry, I can't process exttype=%d yet.\n", -+ pfkey_ext->sadb_ext_type); -+ SENDERR(EINVAL); /* don't process these yet */ -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_x_satype_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ struct sadb_x_satype *pfkey_x_satype = (struct sadb_x_satype *)pfkey_ext; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "pfkey_x_satype_process: .\n"); -+ -+ if(!extr || !extr->ips) { -+ KLIPS_PRINT(debug_pfkey, -+ "pfkey_x_satype_process: " -+ "extr or extr->ips is NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(extr->ips2 == NULL) { -+ extr->ips2 = ipsec_sa_alloc(&error); /* pass error var by pointer */ -+ } -+ if(extr->ips2 == NULL) { -+ SENDERR(-error); -+ } -+ if(!(extr->ips2->ips_said.proto = satype2proto(pfkey_x_satype->sadb_x_satype_satype))) { -+ KLIPS_ERROR(debug_pfkey, -+ "pfkey_x_satype_process: " -+ "proto lookup from satype=%d failed.\n", -+ pfkey_x_satype->sadb_x_satype_satype); -+ SENDERR(EINVAL); -+ } -+ KLIPS_PRINT(debug_pfkey, -+ "pfkey_x_satype_process: " -+ "protocol==%d decoded from satype==%d(%s).\n", -+ extr->ips2->ips_said.proto, -+ pfkey_x_satype->sadb_x_satype_satype, -+ satype2name(pfkey_x_satype->sadb_x_satype_satype)); -+ -+errlab: -+ return error; -+} -+ -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+int -+pfkey_x_nat_t_type_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* 
extr) -+{ -+ int error = 0; -+ struct sadb_x_nat_t_type *pfkey_x_nat_t_type = (struct sadb_x_nat_t_type *)pfkey_ext; -+ -+ if(!pfkey_x_nat_t_type) { -+ printk("klips_debug:pfkey_x_nat_t_type_process: " -+ "null pointer passed in\n"); -+ SENDERR(EINVAL); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_nat_t_type_process: %d.\n", -+ pfkey_x_nat_t_type->sadb_x_nat_t_type_type); -+ -+ if(!extr || !extr->ips) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_nat_t_type_process: " -+ "extr or extr->ips is NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ switch(pfkey_x_nat_t_type->sadb_x_nat_t_type_type) { -+ case ESPINUDP_WITH_NON_IKE: /* with Non-IKE (older version) */ -+ case ESPINUDP_WITH_NON_ESP: /* with Non-ESP */ -+ -+ extr->ips->ips_natt_type = pfkey_x_nat_t_type->sadb_x_nat_t_type_type; -+ break; -+ default: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_nat_t_type_process: " -+ "unknown type %d.\n", -+ pfkey_x_nat_t_type->sadb_x_nat_t_type_type); -+ SENDERR(EINVAL); -+ break; -+ } -+ -+errlab: -+ return error; -+} -+ -+int -+pfkey_x_nat_t_port_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ struct sadb_x_nat_t_port *pfkey_x_nat_t_port = (struct sadb_x_nat_t_port *)pfkey_ext; -+ -+ if(!pfkey_x_nat_t_port) { -+ printk("klips_debug:pfkey_x_nat_t_port_process: " -+ "null pointer passed in\n"); -+ SENDERR(EINVAL); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_nat_t_port_process: %d/%d.\n", -+ pfkey_x_nat_t_port->sadb_x_nat_t_port_exttype, -+ pfkey_x_nat_t_port->sadb_x_nat_t_port_port); -+ -+ if(!extr || !extr->ips) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_nat_t_type_process: " -+ "extr or extr->ips is NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ switch(pfkey_x_nat_t_port->sadb_x_nat_t_port_exttype) { -+ case K_SADB_X_EXT_NAT_T_SPORT: -+ extr->ips->ips_natt_sport = pfkey_x_nat_t_port->sadb_x_nat_t_port_port; -+ break; -+ case K_SADB_X_EXT_NAT_T_DPORT: -+ 
extr->ips->ips_natt_dport = pfkey_x_nat_t_port->sadb_x_nat_t_port_port; -+ break; -+ default: -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_nat_t_port_process: " -+ "unknown exttype %d.\n", -+ pfkey_x_nat_t_port->sadb_x_nat_t_port_exttype); -+ SENDERR(EINVAL); -+ break; -+ } -+ -+errlab: -+ return error; -+} -+#endif -+ -+int -+pfkey_x_debug_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ struct sadb_x_debug *pfkey_x_debug = (struct sadb_x_debug *)pfkey_ext; -+ -+ if(!pfkey_x_debug) { -+ printk("klips_debug:pfkey_x_debug_process: " -+ "null pointer passed in\n"); -+ SENDERR(EINVAL); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_debug_process: .\n"); -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if(pfkey_x_debug->sadb_x_debug_netlink >> -+ (sizeof(pfkey_x_debug->sadb_x_debug_netlink) * 8 - 1)) { -+ pfkey_x_debug->sadb_x_debug_netlink &= -+ ~(1 << (sizeof(pfkey_x_debug->sadb_x_debug_netlink) * 8 -1)); -+ debug_tunnel |= pfkey_x_debug->sadb_x_debug_tunnel; -+ debug_netlink |= pfkey_x_debug->sadb_x_debug_netlink; -+ debug_xform |= pfkey_x_debug->sadb_x_debug_xform; -+ debug_eroute |= pfkey_x_debug->sadb_x_debug_eroute; -+ debug_spi |= pfkey_x_debug->sadb_x_debug_spi; -+ debug_radij |= pfkey_x_debug->sadb_x_debug_radij; -+ debug_esp |= pfkey_x_debug->sadb_x_debug_esp; -+ debug_ah |= pfkey_x_debug->sadb_x_debug_ah; -+ debug_rcv |= pfkey_x_debug->sadb_x_debug_rcv; -+ debug_pfkey |= pfkey_x_debug->sadb_x_debug_pfkey; -+#ifdef CONFIG_KLIPS_IPCOMP -+ sysctl_ipsec_debug_ipcomp |= pfkey_x_debug->sadb_x_debug_ipcomp; -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ sysctl_ipsec_debug_verbose |= pfkey_x_debug->sadb_x_debug_verbose; -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_debug_process: " -+ "set\n"); -+ } else { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_debug_process: " -+ "unset\n"); -+ debug_tunnel &= pfkey_x_debug->sadb_x_debug_tunnel; -+ debug_netlink &= pfkey_x_debug->sadb_x_debug_netlink; -+ debug_xform &= 
pfkey_x_debug->sadb_x_debug_xform; -+ debug_eroute &= pfkey_x_debug->sadb_x_debug_eroute; -+ debug_spi &= pfkey_x_debug->sadb_x_debug_spi; -+ debug_radij &= pfkey_x_debug->sadb_x_debug_radij; -+ debug_esp &= pfkey_x_debug->sadb_x_debug_esp; -+ debug_ah &= pfkey_x_debug->sadb_x_debug_ah; -+ debug_rcv &= pfkey_x_debug->sadb_x_debug_rcv; -+ debug_pfkey &= pfkey_x_debug->sadb_x_debug_pfkey; -+#ifdef CONFIG_KLIPS_IPCOMP -+ sysctl_ipsec_debug_ipcomp &= pfkey_x_debug->sadb_x_debug_ipcomp; -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ sysctl_ipsec_debug_verbose &= pfkey_x_debug->sadb_x_debug_verbose; -+ } -+#else /* CONFIG_KLIPS_DEBUG */ -+ printk("klips_debug:pfkey_x_debug_process: " -+ "debugging not enabled\n"); -+ SENDERR(EINVAL); -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+errlab: -+ return error; -+} -+ -+/* -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/pfkey_v2_parse.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,1597 @@ -+/* -+ * RFC2367 PF_KEYv2 Key management API message parser -+ * Copyright (C) 1999, 2000, 2001 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * RCSID $Id: pfkey_v2_parse.c,v 1.65 2005/04/06 17:46:05 mcr Exp $ -+ */ -+ -+/* -+ * Template from klips/net/ipsec/ipsec/ipsec_parser.c. 
-+ */ -+ -+char pfkey_v2_parse_c_version[] = "$Id: pfkey_v2_parse.c,v 1.65 2005/04/06 17:46:05 mcr Exp $"; -+ -+/* -+ * Some ugly stuff to allow consistent debugging code for use in the -+ * kernel and in user space -+*/ -+ -+#ifdef __KERNEL__ -+ -+# include /* for printk */ -+ -+#include "openswan/ipsec_kversion.h" /* for malloc switch */ -+ -+# ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+# else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+# endif /* MALLOC_SLAB */ -+# include /* error codes */ -+# include /* size_t */ -+# include /* mark_bh */ -+ -+# include /* struct device, and other headers */ -+# include /* eth_type_trans */ -+# include /* struct iphdr */ -+# if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -+# include /* struct ipv6hdr */ -+# endif /* if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ -+extern int debug_pfkey; -+ -+# include -+ -+#include "openswan/ipsec_encap.h" -+ -+#else /* __KERNEL__ */ -+ -+# include -+# include -+# include -+ -+# include -+# include "constants.h" -+ -+#endif /* __KERNEL__ */ -+ -+ -+#include -+#include -+ -+#include "openswan/ipsec_sa.h" /* IPSEC_SAREF_NULL, IPSEC_SA_REF_TABLE_IDX_WIDTH */ -+ -+/* -+ * how to handle debugging for pfkey. -+ */ -+#include -+ -+unsigned int pfkey_lib_debug = PF_KEY_DEBUG_PARSE_NONE; -+int (*pfkey_debug_func)(const char *message, ...) PRINTF_LIKE(1); -+int (*pfkey_error_func)(const char *message, ...) 
PRINTF_LIKE(1); -+ -+ -+#define SENDERR(_x) do { error = -(_x); goto errlab; } while (0) -+ -+struct satype_tbl { -+ uint8_t proto; -+ uint8_t satype; -+ char* name; -+} static satype_tbl[] = { -+#ifdef __KERNEL__ -+ { IPPROTO_ESP, K_SADB_SATYPE_ESP, "ESP" }, -+ { IPPROTO_AH, K_SADB_SATYPE_AH, "AH" }, -+ { IPPROTO_IPIP, K_SADB_X_SATYPE_IPIP, "IPIP" }, -+#ifdef CONFIG_KLIPS_IPCOMP -+ { IPPROTO_COMP, K_SADB_X_SATYPE_COMP, "COMP" }, -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ { IPPROTO_INT, K_SADB_X_SATYPE_INT, "INT" }, -+#else /* __KERNEL__ */ -+ { SA_ESP, K_SADB_SATYPE_ESP, "ESP" }, -+ { SA_AH, K_SADB_SATYPE_AH, "AH" }, -+ { SA_IPIP, K_SADB_X_SATYPE_IPIP, "IPIP" }, -+ { SA_COMP, K_SADB_X_SATYPE_COMP, "COMP" }, -+ { SA_INT, K_SADB_X_SATYPE_INT, "INT" }, -+#endif /* __KERNEL__ */ -+ { 0, 0, "UNKNOWN" } -+}; -+ -+uint8_t -+satype2proto(uint8_t satype) -+{ -+ int i =0; -+ -+ while(satype_tbl[i].satype != satype && satype_tbl[i].satype != 0) { -+ i++; -+ } -+ return satype_tbl[i].proto; -+} -+ -+uint8_t -+proto2satype(uint8_t proto) -+{ -+ int i = 0; -+ -+ while(satype_tbl[i].proto != proto && satype_tbl[i].proto != 0) { -+ i++; -+ } -+ return satype_tbl[i].satype; -+} -+ -+char* -+satype2name(uint8_t satype) -+{ -+ int i = 0; -+ -+ while(satype_tbl[i].satype != satype && satype_tbl[i].satype != 0) { -+ i++; -+ } -+ return satype_tbl[i].name; -+} -+ -+char* -+proto2name(uint8_t proto) -+{ -+ int i = 0; -+ -+ while(satype_tbl[i].proto != proto && satype_tbl[i].proto != 0) { -+ i++; -+ } -+ return satype_tbl[i].name; -+} -+ -+/* Default extension parsers taken from the KLIPS code */ -+ -+DEBUG_NO_STATIC int -+pfkey_sa_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ struct k_sadb_sa *pfkey_sa = (struct k_sadb_sa *)pfkey_ext; -+ -+ /* sanity checks... 
*/ -+ if(!pfkey_sa) { -+ ERROR("pfkey_sa_parse: " -+ "NULL pointer passed in.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ -+ -+ if(pfkey_sa->sadb_sa_len !=sizeof(struct k_sadb_sa)/IPSEC_PFKEYv2_ALIGN -+ && pfkey_sa->sadb_sa_len!=sizeof(struct sadb_sa)/IPSEC_PFKEYv2_ALIGN) { -+ ERROR( -+ "pfkey_sa_parse: " -+ "length wrong pfkey_sa->sadb_sa_len=%d sizeof(struct sadb_sa)=%d.\n", -+ pfkey_sa->sadb_sa_len, -+ (int)sizeof(struct k_sadb_sa)); -+ SENDERR(EINVAL); -+ } -+ -+#if K_SADB_EALG_MAX < 255 -+ if(pfkey_sa->sadb_sa_encrypt > K_SADB_EALG_MAX) { -+ ERROR( -+ "pfkey_sa_parse: " -+ "pfkey_sa->sadb_sa_encrypt=%d > K_SADB_EALG_MAX=%d.\n", -+ pfkey_sa->sadb_sa_encrypt, -+ K_SADB_EALG_MAX); -+ SENDERR(EINVAL); -+ } -+#endif -+ -+#if K_SADB_AALG_MAX < 255 -+ if(pfkey_sa->sadb_sa_auth > K_SADB_AALG_MAX) { -+ ERROR( -+ "pfkey_sa_parse: " -+ "pfkey_sa->sadb_sa_auth=%d > K_SADB_AALG_MAX=%d.\n", -+ pfkey_sa->sadb_sa_auth, -+ K_SADB_AALG_MAX); -+ SENDERR(EINVAL); -+ } -+#endif -+ -+#if K_SADB_SASTATE_MAX < 255 -+ if(pfkey_sa->sadb_sa_state > K_SADB_SASTATE_MAX) { -+ ERROR( -+ "pfkey_sa_parse: " -+ "state=%d exceeds MAX=%d.\n", -+ pfkey_sa->sadb_sa_state, -+ K_SADB_SASTATE_MAX); -+ SENDERR(EINVAL); -+ } -+#endif -+ -+ if(pfkey_sa->sadb_sa_state == K_SADB_SASTATE_DEAD) { -+ ERROR( -+ "pfkey_sa_parse: " -+ "state=%d is DEAD=%d.\n", -+ pfkey_sa->sadb_sa_state, -+ K_SADB_SASTATE_DEAD); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_sa->sadb_sa_replay > 64) { -+ ERROR( -+ "pfkey_sa_parse: " -+ "replay window size: %d -- must be 0 <= size <= 64\n", -+ pfkey_sa->sadb_sa_replay); -+ SENDERR(EINVAL); -+ } -+ -+ if(! 
((pfkey_sa->sadb_sa_exttype == K_SADB_EXT_SA) || -+ (pfkey_sa->sadb_sa_exttype == K_SADB_X_EXT_SA2))) -+ { -+ ERROR( -+ "pfkey_sa_parse: " -+ "unknown exttype=%d, expecting K_SADB_EXT_SA=%d or K_SADB_X_EXT_SA2=%d.\n", -+ pfkey_sa->sadb_sa_exttype, -+ K_SADB_EXT_SA, -+ K_SADB_X_EXT_SA2); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_sa->sadb_sa_len > sizeof(struct sadb_sa)/IPSEC_PFKEYv2_ALIGN) { -+ if(pfkey_sa->sadb_x_sa_ref == IPSEC_SAREF_NULL || -+ pfkey_sa->sadb_x_sa_ref == ~(IPSEC_SAREF_NULL)) -+ { -+ pfkey_sa->sadb_x_sa_ref = IPSEC_SAREF_NULL; -+ } -+ } -+ -+ if((IPSEC_SAREF_NULL != pfkey_sa->sadb_x_sa_ref) -+ && (pfkey_sa->sadb_x_sa_ref >= (1 << IPSEC_SA_REF_TABLE_IDX_WIDTH))) -+ { -+ ERROR( -+ "pfkey_sa_parse: " -+ "SAref=%d must be (SAref == IPSEC_SAREF_NULL(%d) || SAref < IPSEC_SA_REF_TABLE_NUM_ENTRIES(%d)).\n", -+ pfkey_sa->sadb_x_sa_ref, -+ IPSEC_SAREF_NULL, -+ IPSEC_SA_REF_TABLE_NUM_ENTRIES); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT, -+ "pfkey_sa_parse: " -+ "successfully found len=%d exttype=%d(%s) spi=%08lx replay=%d state=%d auth=%d encrypt=%d flags=%d ref=%d.\n", -+ pfkey_sa->sadb_sa_len, -+ pfkey_sa->sadb_sa_exttype, -+ pfkey_v2_sadb_ext_string(pfkey_sa->sadb_sa_exttype), -+ (long unsigned int)ntohl(pfkey_sa->sadb_sa_spi), -+ pfkey_sa->sadb_sa_replay, -+ pfkey_sa->sadb_sa_state, -+ pfkey_sa->sadb_sa_auth, -+ pfkey_sa->sadb_sa_encrypt, -+ pfkey_sa->sadb_sa_flags, -+ pfkey_sa->sadb_x_sa_ref); -+ -+ errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_lifetime_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ struct sadb_lifetime *pfkey_lifetime = (struct sadb_lifetime *)pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW, -+ "pfkey_lifetime_parse:enter\n"); -+ /* sanity checks... 
*/ -+ if(!pfkey_lifetime) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_lifetime_parse: " -+ "NULL pointer passed in.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_lifetime->sadb_lifetime_len != -+ sizeof(struct sadb_lifetime) / IPSEC_PFKEYv2_ALIGN) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_lifetime_parse: " -+ "length wrong pfkey_lifetime->sadb_lifetime_len=%d sizeof(struct sadb_lifetime)=%d.\n", -+ pfkey_lifetime->sadb_lifetime_len, -+ (int)sizeof(struct sadb_lifetime)); -+ SENDERR(EINVAL); -+ } -+ -+ if((pfkey_lifetime->sadb_lifetime_exttype != K_SADB_EXT_LIFETIME_HARD) && -+ (pfkey_lifetime->sadb_lifetime_exttype != K_SADB_EXT_LIFETIME_SOFT) && -+ (pfkey_lifetime->sadb_lifetime_exttype != K_SADB_EXT_LIFETIME_CURRENT)) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_lifetime_parse: " -+ "unexpected ext_type=%d.\n", -+ pfkey_lifetime->sadb_lifetime_exttype); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT, -+ "pfkey_lifetime_parse: " -+ "life_type=%d(%s) alloc=%u bytes=%u add=%u use=%u.\n", -+ pfkey_lifetime->sadb_lifetime_exttype, -+ pfkey_v2_sadb_ext_string(pfkey_lifetime->sadb_lifetime_exttype), -+ pfkey_lifetime->sadb_lifetime_allocations, -+ (unsigned)pfkey_lifetime->sadb_lifetime_bytes, -+ (unsigned)pfkey_lifetime->sadb_lifetime_addtime, -+ (unsigned)pfkey_lifetime->sadb_lifetime_usetime); -+errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_address_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ int saddr_len = 0; -+ struct sadb_address *pfkey_address = (struct sadb_address *)pfkey_ext; -+ struct sockaddr* s = (struct sockaddr*)((char*)pfkey_address + sizeof(*pfkey_address)); -+ char ipaddr_txt[ADDRTOT_BUF]; -+ -+ /* sanity checks... 
*/ -+ if(!pfkey_address) { -+ ERROR( -+ "pfkey_address_parse: " -+ "NULL pointer passed in.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_address->sadb_address_len < -+ (sizeof(struct sadb_address) + sizeof(struct sockaddr))/ -+ IPSEC_PFKEYv2_ALIGN) { -+ ERROR("pfkey_address_parse: " -+ "size wrong 1 ext_len=%d, adr_ext_len=%d, saddr_len=%d.\n", -+ pfkey_address->sadb_address_len, -+ (int)sizeof(struct sadb_address), -+ (int)sizeof(struct sockaddr)); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_address->sadb_address_reserved) { -+ ERROR("pfkey_address_parse: " -+ "res=%d, must be zero.\n", -+ pfkey_address->sadb_address_reserved); -+ SENDERR(EINVAL); -+ } -+ -+ switch(pfkey_address->sadb_address_exttype) { -+ case K_SADB_EXT_ADDRESS_SRC: -+ case K_SADB_EXT_ADDRESS_DST: -+ case K_SADB_EXT_ADDRESS_PROXY: -+ case K_SADB_X_EXT_ADDRESS_DST2: -+ case K_SADB_X_EXT_ADDRESS_SRC_FLOW: -+ case K_SADB_X_EXT_ADDRESS_DST_FLOW: -+ case K_SADB_X_EXT_ADDRESS_SRC_MASK: -+ case K_SADB_X_EXT_ADDRESS_DST_MASK: -+#ifdef NAT_TRAVERSAL -+ case K_SADB_X_EXT_NAT_T_OA: -+#endif -+ break; -+ default: -+ ERROR( -+ "pfkey_address_parse: " -+ "unexpected ext_type=%d.\n", -+ pfkey_address->sadb_address_exttype); -+ SENDERR(ENODEV); -+ } -+ -+ switch(s->sa_family) { -+ case AF_INET: -+ saddr_len = sizeof(struct sockaddr_in); -+ sprintf(ipaddr_txt, "%d.%d.%d.%d" -+ , (((struct sockaddr_in*)s)->sin_addr.s_addr >> 0) & 0xFF -+ , (((struct sockaddr_in*)s)->sin_addr.s_addr >> 8) & 0xFF -+ , (((struct sockaddr_in*)s)->sin_addr.s_addr >> 16) & 0xFF -+ , (((struct sockaddr_in*)s)->sin_addr.s_addr >> 24) & 0xFF); -+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT, -+ "pfkey_address_parse: " -+ "found exttype=%u(%s) family=%d(AF_INET) address=%s proto=%u port=%u.\n", -+ pfkey_address->sadb_address_exttype, -+ pfkey_v2_sadb_ext_string(pfkey_address->sadb_address_exttype), -+ s->sa_family, -+ ipaddr_txt, -+ pfkey_address->sadb_address_proto, -+ ntohs(((struct sockaddr_in*)s)->sin_port)); -+ break; -+ case AF_INET6: -+ 
saddr_len = sizeof(struct sockaddr_in6); -+ sprintf(ipaddr_txt, "%x:%x:%x:%x:%x:%x:%x:%x" -+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[0]) -+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[1]) -+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[2]) -+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[3]) -+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[4]) -+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[5]) -+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[6]) -+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[7])); -+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT, -+ "pfkey_address_parse: " -+ "found exttype=%u(%s) family=%d(AF_INET6) address=%s proto=%u port=%u.\n", -+ pfkey_address->sadb_address_exttype, -+ pfkey_v2_sadb_ext_string(pfkey_address->sadb_address_exttype), -+ s->sa_family, -+ ipaddr_txt, -+ pfkey_address->sadb_address_proto, -+ ((struct sockaddr_in6*)s)->sin6_port); -+ break; -+ default: -+ ERROR( -+ "pfkey_address_parse: " -+ "s->sa_family=%d not supported.\n", -+ s->sa_family); -+ SENDERR(EPFNOSUPPORT); -+ } -+ -+ if(pfkey_address->sadb_address_len != -+ DIVUP(sizeof(struct sadb_address) + saddr_len, IPSEC_PFKEYv2_ALIGN)) { -+ ERROR( -+ "pfkey_address_parse: " -+ "size wrong 2 ext_len=%d, adr_ext_len=%d, saddr_len=%d.\n", -+ pfkey_address->sadb_address_len, -+ (int)sizeof(struct sadb_address), -+ saddr_len); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_address->sadb_address_prefixlen != 0) { -+ ERROR( -+ "pfkey_address_parse: " -+ "address prefixes not supported yet.\n"); -+ SENDERR(EAFNOSUPPORT); /* not supported yet */ -+ } -+ -+ /* XXX check if port!=0 */ -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW, -+ "pfkey_address_parse: successful.\n"); -+ errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_key_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ struct sadb_key *pfkey_key = (struct sadb_key *)pfkey_ext; -+ -+ /* sanity checks... 
*/ -+ -+ if(!pfkey_key) { -+ ERROR( -+ "pfkey_key_parse: " -+ "NULL pointer passed in.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_key->sadb_key_len < sizeof(struct sadb_key) / IPSEC_PFKEYv2_ALIGN) { -+ ERROR( -+ "pfkey_key_parse: " -+ "size wrong ext_len=%d, key_ext_len=%d.\n", -+ pfkey_key->sadb_key_len, -+ (int)sizeof(struct sadb_key)); -+ SENDERR(EINVAL); -+ } -+ -+ if(!pfkey_key->sadb_key_bits) { -+ ERROR( -+ "pfkey_key_parse: " -+ "key length set to zero, must be non-zero.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_key->sadb_key_len != -+ DIVUP(sizeof(struct sadb_key) * OCTETBITS + pfkey_key->sadb_key_bits, -+ PFKEYBITS)) { -+ ERROR( -+ "pfkey_key_parse: " -+ "key length=%d does not agree with extension length=%d.\n", -+ pfkey_key->sadb_key_bits, -+ pfkey_key->sadb_key_len); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_key->sadb_key_reserved) { -+ ERROR( -+ "pfkey_key_parse: " -+ "res=%d, must be zero.\n", -+ pfkey_key->sadb_key_reserved); -+ SENDERR(EINVAL); -+ } -+ -+ if(! ( (pfkey_key->sadb_key_exttype == K_SADB_EXT_KEY_AUTH) || -+ (pfkey_key->sadb_key_exttype == K_SADB_EXT_KEY_ENCRYPT))) { -+ ERROR( -+ "pfkey_key_parse: " -+ "expecting extension type AUTH or ENCRYPT, got %d.\n", -+ pfkey_key->sadb_key_exttype); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT, -+ "pfkey_key_parse: " -+ "success, found len=%d exttype=%d(%s) bits=%d reserved=%d.\n", -+ pfkey_key->sadb_key_len, -+ pfkey_key->sadb_key_exttype, -+ pfkey_v2_sadb_ext_string(pfkey_key->sadb_key_exttype), -+ pfkey_key->sadb_key_bits, -+ pfkey_key->sadb_key_reserved); -+ -+errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_ident_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ struct sadb_ident *pfkey_ident = (struct sadb_ident *)pfkey_ext; -+ -+ /* sanity checks... 
*/ -+ if(pfkey_ident->sadb_ident_len < sizeof(struct sadb_ident) / IPSEC_PFKEYv2_ALIGN) { -+ ERROR( -+ "pfkey_ident_parse: " -+ "size wrong ext_len=%d, key_ext_len=%d.\n", -+ pfkey_ident->sadb_ident_len, -+ (int)sizeof(struct sadb_ident)); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_ident->sadb_ident_type > K_SADB_IDENTTYPE_MAX) { -+ ERROR( -+ "pfkey_ident_parse: " -+ "ident_type=%d out of range, must be less than %d.\n", -+ pfkey_ident->sadb_ident_type, -+ K_SADB_IDENTTYPE_MAX); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_ident->sadb_ident_reserved) { -+ ERROR( -+ "pfkey_ident_parse: " -+ "res=%d, must be zero.\n", -+ pfkey_ident->sadb_ident_reserved); -+ SENDERR(EINVAL); -+ } -+ -+ /* string terminator/padding must be zero */ -+ if(pfkey_ident->sadb_ident_len > sizeof(struct sadb_ident) / IPSEC_PFKEYv2_ALIGN) { -+ if(*((char*)pfkey_ident + pfkey_ident->sadb_ident_len * IPSEC_PFKEYv2_ALIGN - 1)) { -+ ERROR( -+ "pfkey_ident_parse: " -+ "string padding must be zero, last is 0x%02x.\n", -+ *((char*)pfkey_ident + -+ pfkey_ident->sadb_ident_len * IPSEC_PFKEYv2_ALIGN - 1)); -+ SENDERR(EINVAL); -+ } -+ } -+ -+ if( ! ((pfkey_ident->sadb_ident_exttype == K_SADB_EXT_IDENTITY_SRC) || -+ (pfkey_ident->sadb_ident_exttype == K_SADB_EXT_IDENTITY_DST))) { -+ ERROR( -+ "pfkey_key_parse: " -+ "expecting extension type IDENTITY_SRC or IDENTITY_DST, got %d.\n", -+ pfkey_ident->sadb_ident_exttype); -+ SENDERR(EINVAL); -+ } -+ -+errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_sens_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ struct sadb_sens *pfkey_sens = (struct sadb_sens *)pfkey_ext; -+ -+ /* sanity checks... 
*/ -+ if(pfkey_sens->sadb_sens_len < sizeof(struct sadb_sens) / IPSEC_PFKEYv2_ALIGN) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_sens_parse: " -+ "size wrong ext_len=%d, key_ext_len=%d.\n", -+ pfkey_sens->sadb_sens_len, -+ (int)sizeof(struct sadb_sens)); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_sens_parse: " -+ "Sorry, I can't parse exttype=%d yet.\n", -+ pfkey_ext->sadb_ext_type); -+#if 0 -+ SENDERR(EINVAL); /* don't process these yet */ -+#endif -+ -+errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_prop_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ int i, num_comb; -+ struct sadb_prop *pfkey_prop = (struct sadb_prop *)pfkey_ext; -+ struct k_sadb_comb *k_pfkey_comb = (struct k_sadb_comb *)((char*)pfkey_ext + sizeof(struct sadb_prop)); -+ -+ /* sanity checks... */ -+ if((pfkey_prop->sadb_prop_len < sizeof(struct sadb_prop) / IPSEC_PFKEYv2_ALIGN) || -+ (((pfkey_prop->sadb_prop_len * IPSEC_PFKEYv2_ALIGN) - sizeof(struct sadb_prop)) % sizeof(struct sadb_comb))) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "size wrong ext_len=%d, prop_ext_len=%d comb_ext_len=%d.\n", -+ pfkey_prop->sadb_prop_len, -+ (int)sizeof(struct sadb_prop), -+ (int)sizeof(struct sadb_comb)); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_prop->sadb_prop_replay > 64) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "replay window size: %d -- must be 0 <= size <= 64\n", -+ pfkey_prop->sadb_prop_replay); -+ SENDERR(EINVAL); -+ } -+ -+ for(i=0; i<3; i++) { -+ if(pfkey_prop->sadb_prop_reserved[i]) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "res[%d]=%d, must be zero.\n", -+ i, pfkey_prop->sadb_prop_reserved[i]); -+ SENDERR(EINVAL); -+ } -+ } -+ -+ num_comb = ((pfkey_prop->sadb_prop_len * IPSEC_PFKEYv2_ALIGN) - sizeof(struct sadb_prop)) / sizeof(struct sadb_comb); -+ -+ for(i = 0; i < num_comb; i++) { -+ struct sadb_comb *pfkey_comb = (struct sadb_comb *)k_pfkey_comb; 
-+ if(pfkey_comb->sadb_comb_auth > K_SADB_AALG_MAX) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_auth=%d > K_SADB_AALG_MAX=%d.\n", -+ i, -+ pfkey_comb->sadb_comb_auth, -+ K_SADB_AALG_MAX); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_comb->sadb_comb_auth) { -+ if(!pfkey_comb->sadb_comb_auth_minbits) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_auth_minbits=0, fatal.\n", -+ i); -+ SENDERR(EINVAL); -+ } -+ if(!pfkey_comb->sadb_comb_auth_maxbits) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_auth_maxbits=0, fatal.\n", -+ i); -+ SENDERR(EINVAL); -+ } -+ if(pfkey_comb->sadb_comb_auth_minbits > pfkey_comb->sadb_comb_auth_maxbits) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_auth_minbits=%d > maxbits=%d, fatal.\n", -+ i, -+ pfkey_comb->sadb_comb_auth_minbits, -+ pfkey_comb->sadb_comb_auth_maxbits); -+ SENDERR(EINVAL); -+ } -+ } else { -+ if(pfkey_comb->sadb_comb_auth_minbits) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_auth_minbits=%d != 0, fatal.\n", -+ i, -+ pfkey_comb->sadb_comb_auth_minbits); -+ SENDERR(EINVAL); -+ } -+ if(pfkey_comb->sadb_comb_auth_maxbits) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_auth_maxbits=%d != 0, fatal.\n", -+ i, -+ pfkey_comb->sadb_comb_auth_maxbits); -+ SENDERR(EINVAL); -+ } -+ } -+ -+#if K_SADB_EALG_MAX < 255 -+ if(pfkey_comb->sadb_comb_encrypt > K_SADB_EALG_MAX) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_comb_parse: " -+ "pfkey_comb[%d]->sadb_comb_encrypt=%d > K_SADB_EALG_MAX=%d.\n", -+ i, -+ pfkey_comb->sadb_comb_encrypt, -+ K_SADB_EALG_MAX); -+ SENDERR(EINVAL); -+ } -+#endif -+ -+ if(pfkey_comb->sadb_comb_encrypt) { -+ if(!pfkey_comb->sadb_comb_encrypt_minbits) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ 
"pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_encrypt_minbits=0, fatal.\n", -+ i); -+ SENDERR(EINVAL); -+ } -+ if(!pfkey_comb->sadb_comb_encrypt_maxbits) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_encrypt_maxbits=0, fatal.\n", -+ i); -+ SENDERR(EINVAL); -+ } -+ if(pfkey_comb->sadb_comb_encrypt_minbits > pfkey_comb->sadb_comb_encrypt_maxbits) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_encrypt_minbits=%d > maxbits=%d, fatal.\n", -+ i, -+ pfkey_comb->sadb_comb_encrypt_minbits, -+ pfkey_comb->sadb_comb_encrypt_maxbits); -+ SENDERR(EINVAL); -+ } -+ } else { -+ if(pfkey_comb->sadb_comb_encrypt_minbits) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_encrypt_minbits=%d != 0, fatal.\n", -+ i, -+ pfkey_comb->sadb_comb_encrypt_minbits); -+ SENDERR(EINVAL); -+ } -+ if(pfkey_comb->sadb_comb_encrypt_maxbits) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_encrypt_maxbits=%d != 0, fatal.\n", -+ i, -+ pfkey_comb->sadb_comb_encrypt_maxbits); -+ SENDERR(EINVAL); -+ } -+ } -+ -+ /* XXX do sanity check on flags */ -+ -+ if(pfkey_comb->sadb_comb_hard_allocations && pfkey_comb->sadb_comb_soft_allocations > pfkey_comb->sadb_comb_hard_allocations) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_soft_allocations=%d > hard_allocations=%d, fatal.\n", -+ i, -+ pfkey_comb->sadb_comb_soft_allocations, -+ pfkey_comb->sadb_comb_hard_allocations); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_comb->sadb_comb_hard_bytes && pfkey_comb->sadb_comb_soft_bytes > pfkey_comb->sadb_comb_hard_bytes) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_soft_bytes=%Ld > hard_bytes=%Ld, fatal.\n", -+ i, -+ (unsigned long long int)pfkey_comb->sadb_comb_soft_bytes, -+ (unsigned long long 
int)pfkey_comb->sadb_comb_hard_bytes); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_comb->sadb_comb_hard_addtime && pfkey_comb->sadb_comb_soft_addtime > pfkey_comb->sadb_comb_hard_addtime) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_soft_addtime=%Ld > hard_addtime=%Ld, fatal.\n", -+ i, -+ (unsigned long long int)pfkey_comb->sadb_comb_soft_addtime, -+ (unsigned long long int)pfkey_comb->sadb_comb_hard_addtime); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_comb->sadb_comb_hard_usetime && pfkey_comb->sadb_comb_soft_usetime > pfkey_comb->sadb_comb_hard_usetime) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_comb_soft_usetime=%Ld > hard_usetime=%Ld, fatal.\n", -+ i, -+ (unsigned long long int)pfkey_comb->sadb_comb_soft_usetime, -+ (unsigned long long int)pfkey_comb->sadb_comb_hard_usetime); -+ SENDERR(EINVAL); -+ } -+ -+#ifdef COMB_PACKETS -+ if(pfkey_comb->sadb_x_comb_hard_packets && pfkey_comb->sadb_x_comb_soft_packets > pfkey_comb->sadb_x_comb_hard_packets) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_prop_parse: " -+ "pfkey_comb[%d]->sadb_x_comb_soft_packets=%d > hard_packets=%d, fatal.\n", -+ i, -+ k_pfkey_comb->sadb_x_comb_soft_packets, -+ k_pfkey_comb->sadb_x_comb_hard_packets); -+ SENDERR(EINVAL); -+ } -+#endif -+ -+ pfkey_comb++; -+ } -+ -+errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_supported_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ unsigned int i, num_alg; -+ struct sadb_supported *pfkey_supported = (struct sadb_supported *)pfkey_ext; -+ struct sadb_alg *pfkey_alg = (struct sadb_alg*)((char*)pfkey_ext + sizeof(struct sadb_supported)); -+ -+ /* sanity checks... 
*/ -+ if((pfkey_supported->sadb_supported_len < -+ sizeof(struct sadb_supported) / IPSEC_PFKEYv2_ALIGN) || -+ (((pfkey_supported->sadb_supported_len * IPSEC_PFKEYv2_ALIGN) - -+ sizeof(struct sadb_supported)) % sizeof(struct sadb_alg))) { -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_supported_parse: " -+ "size wrong ext_len=%d, supported_ext_len=%d alg_ext_len=%d.\n", -+ pfkey_supported->sadb_supported_len, -+ (int)sizeof(struct sadb_supported), -+ (int)sizeof(struct sadb_alg)); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_supported->sadb_supported_reserved) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_supported_parse: " -+ "res=%d, must be zero.\n", -+ pfkey_supported->sadb_supported_reserved); -+ SENDERR(EINVAL); -+ } -+ -+ num_alg = ((pfkey_supported->sadb_supported_len * IPSEC_PFKEYv2_ALIGN) - sizeof(struct sadb_supported)) / sizeof(struct sadb_alg); -+ -+ for(i = 0; i < num_alg; i++) { -+ /* process algo description */ -+ if(pfkey_alg->sadb_alg_reserved) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_supported_parse: " -+ "alg[%d], id=%d, ivlen=%d, minbits=%d, maxbits=%d, res=%d, must be zero.\n", -+ i, -+ pfkey_alg->sadb_alg_id, -+ pfkey_alg->sadb_alg_ivlen, -+ pfkey_alg->sadb_alg_minbits, -+ pfkey_alg->sadb_alg_maxbits, -+ pfkey_alg->sadb_alg_reserved); -+ SENDERR(EINVAL); -+ } -+ -+ /* XXX can alg_id auth/enc be determined from info given? -+ Yes, but OpenBSD's method does not iteroperate with rfc2367. 
-+ rgb, 2000-04-06 */ -+ -+ switch(pfkey_supported->sadb_supported_exttype) { -+ case K_SADB_EXT_SUPPORTED_AUTH: -+ if(pfkey_alg->sadb_alg_id > K_SADB_AALG_MAX) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_supported_parse: " -+ "alg[%d], alg_id=%d > K_SADB_AALG_MAX=%d, fatal.\n", -+ i, -+ pfkey_alg->sadb_alg_id, -+ K_SADB_AALG_MAX); -+ SENDERR(EINVAL); -+ } -+ break; -+ case SADB_EXT_SUPPORTED_ENCRYPT: -+#if K_SADB_EALG_MAX < 255 -+ if(pfkey_alg->sadb_alg_id > K_SADB_EALG_MAX) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_supported_parse: " -+ "alg[%d], alg_id=%d > K_SADB_EALG_MAX=%d, fatal.\n", -+ i, -+ pfkey_alg->sadb_alg_id, -+ K_SADB_EALG_MAX); -+ SENDERR(EINVAL); -+ } -+#endif -+ break; -+ default: -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_supported_parse: " -+ "alg[%d], alg_id=%d > K_SADB_EALG_MAX=%d, fatal.\n", -+ i, -+ pfkey_alg->sadb_alg_id, -+ K_SADB_EALG_MAX); -+ SENDERR(EINVAL); -+ } -+ pfkey_alg++; -+ } -+ -+ errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_spirange_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ struct sadb_spirange *pfkey_spirange = (struct sadb_spirange *)pfkey_ext; -+ -+ /* sanity checks... 
*/ -+ if(pfkey_spirange->sadb_spirange_len != -+ sizeof(struct sadb_spirange) / IPSEC_PFKEYv2_ALIGN) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_spirange_parse: " -+ "size wrong ext_len=%d, key_ext_len=%d.\n", -+ pfkey_spirange->sadb_spirange_len, -+ (int)sizeof(struct sadb_spirange)); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_spirange->sadb_spirange_reserved) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_spirange_parse: " -+ "reserved=%d must be set to zero.\n", -+ pfkey_spirange->sadb_spirange_reserved); -+ SENDERR(EINVAL); -+ } -+ -+ if(ntohl(pfkey_spirange->sadb_spirange_max) < ntohl(pfkey_spirange->sadb_spirange_min)) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_spirange_parse: " -+ "minspi=%08x must be < maxspi=%08x.\n", -+ ntohl(pfkey_spirange->sadb_spirange_min), -+ ntohl(pfkey_spirange->sadb_spirange_max)); -+ SENDERR(EINVAL); -+ } -+ -+ if(ntohl(pfkey_spirange->sadb_spirange_min) <= 255) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_spirange_parse: " -+ "minspi=%08x must be > 255.\n", -+ ntohl(pfkey_spirange->sadb_spirange_min)); -+ SENDERR(EEXIST); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT, -+ "pfkey_spirange_parse: " -+ "ext_len=%u ext_type=%u(%s) min=%u max=%u res=%u.\n", -+ pfkey_spirange->sadb_spirange_len, -+ pfkey_spirange->sadb_spirange_exttype, -+ pfkey_v2_sadb_ext_string(pfkey_spirange->sadb_spirange_exttype), -+ pfkey_spirange->sadb_spirange_min, -+ pfkey_spirange->sadb_spirange_max, -+ pfkey_spirange->sadb_spirange_reserved); -+ errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_kmprivate_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ struct sadb_x_kmprivate *pfkey_x_kmprivate = (struct sadb_x_kmprivate *)pfkey_ext; -+ -+ /* sanity checks... 
*/ -+ if(pfkey_x_kmprivate->sadb_x_kmprivate_len < -+ sizeof(struct sadb_x_kmprivate) / IPSEC_PFKEYv2_ALIGN) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_kmprivate_parse: " -+ "size wrong ext_len=%d, key_ext_len=%d.\n", -+ pfkey_x_kmprivate->sadb_x_kmprivate_len, -+ (int)sizeof(struct sadb_x_kmprivate)); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_x_kmprivate->sadb_x_kmprivate_reserved) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_kmprivate_parse: " -+ "reserved=%d must be set to zero.\n", -+ pfkey_x_kmprivate->sadb_x_kmprivate_reserved); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_kmprivate_parse: " -+ "Sorry, I can't parse exttype=%d yet.\n", -+ pfkey_ext->sadb_ext_type); -+ SENDERR(EINVAL); /* don't process these yet */ -+ -+errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_satype_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ int i; -+ struct sadb_x_satype *pfkey_x_satype = (struct sadb_x_satype *)pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW, -+ "pfkey_x_satype_parse: enter\n"); -+ /* sanity checks... 
*/ -+ if(pfkey_x_satype->sadb_x_satype_len != -+ sizeof(struct sadb_x_satype) / IPSEC_PFKEYv2_ALIGN) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_satype_parse: " -+ "size wrong ext_len=%d, key_ext_len=%d.\n", -+ pfkey_x_satype->sadb_x_satype_len, -+ (int)sizeof(struct sadb_x_satype)); -+ SENDERR(EINVAL); -+ } -+ -+ if(!pfkey_x_satype->sadb_x_satype_satype) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_satype_parse: " -+ "satype is zero, must be non-zero.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_x_satype->sadb_x_satype_satype > K_SADB_SATYPE_MAX) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_satype_parse: " -+ "satype %d > max %d, invalid.\n", -+ pfkey_x_satype->sadb_x_satype_satype, K_SADB_SATYPE_MAX); -+ SENDERR(EINVAL); -+ } -+ -+ if(!(satype2proto(pfkey_x_satype->sadb_x_satype_satype))) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_satype_parse: " -+ "proto lookup from satype=%d failed.\n", -+ pfkey_x_satype->sadb_x_satype_satype); -+ SENDERR(EINVAL); -+ } -+ -+ for(i = 0; i < 3; i++) { -+ if(pfkey_x_satype->sadb_x_satype_reserved[i]) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_satype_parse: " -+ "reserved[%d]=%d must be set to zero.\n", -+ i, pfkey_x_satype->sadb_x_satype_reserved[i]); -+ SENDERR(EINVAL); -+ } -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT, -+ "pfkey_x_satype_parse: " -+ "len=%u ext=%u(%s) satype=%u(%s) res=%u,%u,%u.\n", -+ pfkey_x_satype->sadb_x_satype_len, -+ pfkey_x_satype->sadb_x_satype_exttype, -+ pfkey_v2_sadb_ext_string(pfkey_x_satype->sadb_x_satype_exttype), -+ pfkey_x_satype->sadb_x_satype_satype, -+ satype2name(pfkey_x_satype->sadb_x_satype_satype), -+ pfkey_x_satype->sadb_x_satype_reserved[0], -+ pfkey_x_satype->sadb_x_satype_reserved[1], -+ pfkey_x_satype->sadb_x_satype_reserved[2]); -+errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_ext_debug_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ int i; -+ struct sadb_x_debug *pfkey_x_debug = (struct 
sadb_x_debug *)pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW, -+ "pfkey_x_debug_parse: enter\n"); -+ /* sanity checks... */ -+ if(pfkey_x_debug->sadb_x_debug_len != -+ sizeof(struct sadb_x_debug) / IPSEC_PFKEYv2_ALIGN) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_debug_parse: " -+ "size wrong ext_len=%d, key_ext_len=%d.\n", -+ pfkey_x_debug->sadb_x_debug_len, -+ (int)sizeof(struct sadb_x_debug)); -+ SENDERR(EINVAL); -+ } -+ -+ for(i = 0; i < 4; i++) { -+ if(pfkey_x_debug->sadb_x_debug_reserved[i]) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_debug_parse: " -+ "reserved[%d]=%d must be set to zero.\n", -+ i, pfkey_x_debug->sadb_x_debug_reserved[i]); -+ SENDERR(EINVAL); -+ } -+ } -+ -+errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_ext_protocol_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ struct sadb_protocol *p = (struct sadb_protocol *)pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, "pfkey_x_protocol_parse:\n"); -+ /* sanity checks... */ -+ -+ if (p->sadb_protocol_len != sizeof(*p)/IPSEC_PFKEYv2_ALIGN) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_protocol_parse: size wrong ext_len=%d, key_ext_len=%d.\n", -+ p->sadb_protocol_len, (int)sizeof(*p)); -+ SENDERR(EINVAL); -+ } -+ -+ if (p->sadb_protocol_reserved2 != 0) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_protocol_parse: res=%d, must be zero.\n", -+ p->sadb_protocol_reserved2); -+ SENDERR(EINVAL); -+ } -+ -+ errlab: -+ return error; -+} -+ -+#ifdef NAT_TRAVERSAL -+DEBUG_NO_STATIC int -+pfkey_x_ext_nat_t_type_parse(struct sadb_ext *pfkey_ext) -+{ -+ return 0; -+} -+DEBUG_NO_STATIC int -+pfkey_x_ext_nat_t_port_parse(struct sadb_ext *pfkey_ext) -+{ -+ return 0; -+} -+#endif -+ -+DEBUG_NO_STATIC int -+pfkey_x_ext_outif_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ struct sadb_x_plumbif *p = (struct sadb_x_plumbif *)pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, "pfkey_x_outif_parse:\n"); -+ /* sanity checks... 
*/ -+ -+ if (p->sadb_x_outif_len != IPSEC_PFKEYv2_WORDS(sizeof(*p))) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_outif_parse: size wrong ext_len=%d, key_ext_len=%d.\n", -+ p->sadb_x_outif_len, (int)sizeof(*p)); -+ SENDERR(EINVAL); -+ } -+ -+ errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_ext_saref_parse(struct sadb_ext *pfkey_ext) -+{ -+ int error = 0; -+ struct sadb_x_saref *p = (struct sadb_x_saref *)pfkey_ext; -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, "pfkey_x_saref_parse:\n"); -+ /* sanity checks... */ -+ -+ if (p->sadb_x_saref_len != IPSEC_PFKEYv2_WORDS(sizeof(*p))) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_x_saref_parse: size wrong ext_len=%d, key_ext_len=%d.\n", -+ p->sadb_x_saref_len, (int)sizeof(*p)); -+ SENDERR(EINVAL); -+ } -+ -+ errlab: -+ return error; -+} -+ -+ -+#define DEFINEPARSER(NAME) static struct pf_key_ext_parsers_def NAME##_def={NAME, #NAME}; -+ -+DEFINEPARSER(pfkey_sa_parse); -+DEFINEPARSER(pfkey_lifetime_parse); -+DEFINEPARSER(pfkey_address_parse); -+DEFINEPARSER(pfkey_key_parse); -+DEFINEPARSER(pfkey_ident_parse); -+DEFINEPARSER(pfkey_sens_parse); -+DEFINEPARSER(pfkey_prop_parse); -+DEFINEPARSER(pfkey_supported_parse); -+DEFINEPARSER(pfkey_spirange_parse); -+DEFINEPARSER(pfkey_x_kmprivate_parse); -+DEFINEPARSER(pfkey_x_satype_parse); -+DEFINEPARSER(pfkey_x_ext_debug_parse); -+DEFINEPARSER(pfkey_x_ext_protocol_parse); -+#ifdef NAT_TRAVERSAL -+DEFINEPARSER(pfkey_x_ext_nat_t_type_parse); -+DEFINEPARSER(pfkey_x_ext_nat_t_port_parse); -+#endif -+DEFINEPARSER(pfkey_x_ext_outif_parse); -+DEFINEPARSER(pfkey_x_ext_saref_parse); -+ -+struct pf_key_ext_parsers_def *ext_default_parsers[]= -+{ -+ NULL, /* pfkey_msg_parse, */ -+ &pfkey_sa_parse_def, -+ &pfkey_lifetime_parse_def, -+ &pfkey_lifetime_parse_def, -+ &pfkey_lifetime_parse_def, -+ &pfkey_address_parse_def, -+ &pfkey_address_parse_def, -+ &pfkey_address_parse_def, -+ &pfkey_key_parse_def, -+ &pfkey_key_parse_def, -+ &pfkey_ident_parse_def, -+ 
&pfkey_ident_parse_def, -+ &pfkey_sens_parse_def, -+ &pfkey_prop_parse_def, -+ &pfkey_supported_parse_def, -+ &pfkey_supported_parse_def, -+ &pfkey_spirange_parse_def, -+ &pfkey_x_kmprivate_parse_def, -+ &pfkey_x_satype_parse_def, -+ &pfkey_sa_parse_def, -+ &pfkey_address_parse_def, -+ &pfkey_address_parse_def, -+ &pfkey_address_parse_def, -+ &pfkey_address_parse_def, -+ &pfkey_address_parse_def, -+ &pfkey_x_ext_debug_parse_def, -+ &pfkey_x_ext_protocol_parse_def, -+#ifdef NAT_TRAVERSAL -+ &pfkey_x_ext_nat_t_type_parse_def, -+ &pfkey_x_ext_nat_t_port_parse_def, -+ &pfkey_x_ext_nat_t_port_parse_def, -+ &pfkey_address_parse_def, -+#else -+ NULL,NULL,NULL,NULL, -+#endif -+ &pfkey_x_ext_outif_parse_def, -+ &pfkey_x_ext_saref_parse_def, -+}; -+ -+int -+pfkey_msg_parse(struct sadb_msg *pfkey_msg, -+ struct pf_key_ext_parsers_def *ext_parsers[], -+ struct sadb_ext *extensions[], -+ int dir) -+{ -+ int error = 0; -+ int remain; -+ struct sadb_ext *pfkey_ext; -+ pfkey_ext_track extensions_seen = 0; -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT, -+ "pfkey_msg_parse: " -+ "parsing message ver=%d, type=%d(%s), errno=%d, satype=%d(%s), len=%d, res=%d, seq=%d, pid=%d.\n", -+ pfkey_msg->sadb_msg_version, -+ pfkey_msg->sadb_msg_type, -+ pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type), -+ pfkey_msg->sadb_msg_errno, -+ pfkey_msg->sadb_msg_satype, -+ satype2name(pfkey_msg->sadb_msg_satype), -+ pfkey_msg->sadb_msg_len, -+ pfkey_msg->sadb_msg_reserved, -+ pfkey_msg->sadb_msg_seq, -+ pfkey_msg->sadb_msg_pid); -+ -+ if(ext_parsers == NULL) ext_parsers = ext_default_parsers; -+ -+ pfkey_extensions_init(extensions); -+ -+ remain = pfkey_msg->sadb_msg_len; -+ remain -= IPSEC_PFKEYv2_WORDS(sizeof(struct sadb_msg)); -+ -+ pfkey_ext = (struct sadb_ext*)((char*)pfkey_msg + -+ sizeof(struct sadb_msg)); -+ -+ extensions[0] = (struct sadb_ext *) pfkey_msg; -+ -+ -+ if(pfkey_msg->sadb_msg_version != PF_KEY_V2) { -+ ERROR("pfkey_msg_parse: " -+ "not PF_KEY_V2 msg, found %d, should be %d.\n", -+ 
pfkey_msg->sadb_msg_version, -+ PF_KEY_V2); -+ SENDERR(EINVAL); -+ } -+ -+ if(!pfkey_msg->sadb_msg_type) { -+ ERROR("pfkey_msg_parse: " -+ "msg type not set, must be non-zero..\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(pfkey_msg->sadb_msg_type > K_SADB_MAX) { -+ ERROR("pfkey_msg_parse: " -+ "msg type=%d > max=%d.\n", -+ pfkey_msg->sadb_msg_type, -+ K_SADB_MAX); -+ SENDERR(EINVAL); -+ } -+ -+ switch(pfkey_msg->sadb_msg_type) { -+ case K_SADB_GETSPI: -+ case K_SADB_UPDATE: -+ case K_SADB_ADD: -+ case K_SADB_DELETE: -+ case K_SADB_GET: -+ case K_SADB_X_GRPSA: -+ case K_SADB_X_ADDFLOW: -+ if(!satype2proto(pfkey_msg->sadb_msg_satype)) { -+ ERROR("pfkey_msg_parse: " -+ "satype %d conversion to proto failed for msg_type %d (%s).\n", -+ pfkey_msg->sadb_msg_satype, -+ pfkey_msg->sadb_msg_type, -+ pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type)); -+ SENDERR(EINVAL); -+ } else { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_msg_parse: " -+ "satype %d(%s) conversion to proto gives %d for msg_type %d(%s).\n", -+ pfkey_msg->sadb_msg_satype, -+ satype2name(pfkey_msg->sadb_msg_satype), -+ satype2proto(pfkey_msg->sadb_msg_satype), -+ pfkey_msg->sadb_msg_type, -+ pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type)); -+ } -+ case K_SADB_ACQUIRE: -+ case K_SADB_REGISTER: -+ case K_SADB_EXPIRE: -+ if(!pfkey_msg->sadb_msg_satype) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_msg_parse: " -+ "satype is zero, must be non-zero for msg_type %d(%s).\n", -+ pfkey_msg->sadb_msg_type, -+ pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type)); -+ SENDERR(EINVAL); -+ } -+ default: -+ break; -+ } -+ -+ /* errno must not be set in downward messages */ -+ /* this is not entirely true... 
a response to an ACQUIRE could return an error */ -+ if((dir == EXT_BITS_IN) && (pfkey_msg->sadb_msg_type != K_SADB_ACQUIRE) && pfkey_msg->sadb_msg_errno) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_msg_parse: " -+ "errno set to %d.\n", -+ pfkey_msg->sadb_msg_errno); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW, -+ "pfkey_msg_parse: " -+ "remain=%d\n", -+ remain -+ ); -+ -+ extensions_seen = 1; -+ -+ while( (remain * IPSEC_PFKEYv2_ALIGN) >= sizeof(struct sadb_ext) ) { -+ /* Is there enough message left to support another extension header? */ -+ if(remain < pfkey_ext->sadb_ext_len) { -+ ERROR("pfkey_msg_parse: " -+ "remain %d less than ext len %d.\n", -+ remain, pfkey_ext->sadb_ext_len); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW, -+ "pfkey_msg_parse: " -+ "parsing ext type=%d(%s) remain=%d.\n", -+ pfkey_ext->sadb_ext_type, -+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type), -+ remain); -+ -+ /* Is the extension header type valid? */ -+ if((pfkey_ext->sadb_ext_type > K_SADB_EXT_MAX) || (!pfkey_ext->sadb_ext_type)) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_msg_parse: " -+ "ext type %d(%s) invalid, K_SADB_EXT_MAX=%d.\n", -+ pfkey_ext->sadb_ext_type, -+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type), -+ K_SADB_EXT_MAX); -+ SENDERR(EINVAL); -+ } -+ -+ /* Have we already seen this type of extension? */ -+ if(extensions[pfkey_ext->sadb_ext_type] != NULL) -+ { -+ ERROR("pfkey_msg_parse: " -+ "ext type %d(%s) already seen.\n", -+ pfkey_ext->sadb_ext_type, -+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type)); -+ SENDERR(EINVAL); -+ } -+ -+ /* Do I even know about this type of extension? */ -+ if(ext_parsers[pfkey_ext->sadb_ext_type]==NULL) { -+ ERROR("pfkey_msg_parse: " -+ "ext type %d(%s) unknown, ignoring.\n", -+ pfkey_ext->sadb_ext_type, -+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type)); -+ goto next_ext; -+ } -+ -+ /* Is this type of extension permitted for this type of message? 
*/ -+ if(!pfkey_permitted_extension(dir,pfkey_msg->sadb_msg_type,pfkey_ext->sadb_ext_type)) { -+ ERROR("ext type %d(%s) not permitted (parse)\n", -+ pfkey_ext->sadb_ext_type, -+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type)); -+ SENDERR(EINVAL); -+ } -+ -+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT, -+ "pfkey_msg_parse: " -+ "remain=%d ext_type=%d(%s) ext_len=%d parsing ext 0p%p with parser %s.\n", -+ remain, -+ pfkey_ext->sadb_ext_type, -+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type), -+ pfkey_ext->sadb_ext_len, -+ pfkey_ext, -+ ext_parsers[pfkey_ext->sadb_ext_type]->parser_name); -+ -+ /* Parse the extension */ -+ if((error = -+ (*ext_parsers[pfkey_ext->sadb_ext_type]->parser)(pfkey_ext))) { -+ ERROR("pfkey_msg_parse: " -+ "extension parsing for type %d(%s) failed with error %d.\n", -+ pfkey_ext->sadb_ext_type, -+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type), -+ error); -+ SENDERR(-error); -+ } -+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW, -+ "pfkey_msg_parse: " -+ "Extension %d(%s) parsed.\n", -+ pfkey_ext->sadb_ext_type, -+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type)); -+ -+ /* Mark that we have seen this extension and remember the header location */ -+ extensions[pfkey_ext->sadb_ext_type] = pfkey_ext; -+ pfkey_mark_extension(pfkey_ext->sadb_ext_type,&extensions_seen); -+ -+ next_ext: -+ /* Calculate how much message remains */ -+ remain -= pfkey_ext->sadb_ext_len; -+ -+ if(!remain) { -+ break; -+ } -+ /* Find the next extension header */ -+ pfkey_ext = (struct sadb_ext*)((char*)pfkey_ext + -+ pfkey_ext->sadb_ext_len * IPSEC_PFKEYv2_ALIGN); -+ } -+ -+ if(remain) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_msg_parse: " -+ "unexpected remainder of %d.\n", -+ remain); -+ /* why is there still something remaining? 
*/ -+ SENDERR(EINVAL); -+ } -+ -+ /* don't check further if it is an error return message since it -+ may not have a body */ -+ if(pfkey_msg->sadb_msg_errno) { -+ SENDERR(-error); -+ } -+ -+ if(pfkey_extensions_missing(dir,pfkey_msg->sadb_msg_type,extensions_seen)) { -+ ERROR("required extensions missing.seen=%08llx.\n",(unsigned long long)extensions_seen); -+ SENDERR(EINVAL); -+ } -+ -+ if((dir == EXT_BITS_IN) && (pfkey_msg->sadb_msg_type == K_SADB_X_DELFLOW) -+ && ((extensions_seen & K_SADB_X_EXT_ADDRESS_DELFLOW) -+ != K_SADB_X_EXT_ADDRESS_DELFLOW) -+ && (((extensions_seen & (1<sadb_sa_flags -+ & SADB_X_SAFLAGS_CLEARFLOW) -+ != SADB_X_SAFLAGS_CLEARFLOW))) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_msg_parse: " -+ "required SADB_X_DELFLOW extensions missing: either %16llx must be present or %16llx must be present with SADB_X_SAFLAGS_CLEARFLOW set.\n", -+ (unsigned long long)K_SADB_X_EXT_ADDRESS_DELFLOW -+ - (extensions_seen & K_SADB_X_EXT_ADDRESS_DELFLOW), -+ (unsigned long long)(1<sadb_msg_type) { -+ case K_SADB_ADD: -+ case K_SADB_UPDATE: -+ /* check maturity */ -+ if(((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_state != -+ K_SADB_SASTATE_MATURE) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_msg_parse: " -+ "state=%d for add or update should be MATURE=%d.\n", -+ ((struct k_sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_state, -+ K_SADB_SASTATE_MATURE); -+ SENDERR(EINVAL); -+ } -+ -+ /* check AH and ESP */ -+ switch(((struct sadb_msg*)extensions[SADB_EXT_RESERVED])->sadb_msg_satype) { -+ case SADB_SATYPE_AH: -+ if(!(((struct k_sadb_sa*)extensions[SADB_EXT_SA]) && -+ ((struct k_sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_auth != -+ SADB_AALG_NONE)) { -+ ERROR("pfkey_msg_parse: " -+ "auth alg is zero, must be non-zero for AH SAs.\n"); -+ SENDERR(EINVAL); -+ } -+ if(((struct k_sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_encrypt != -+ SADB_EALG_NONE) { -+ ERROR("pfkey_msg_parse: " -+ "AH handed encalg=%d, must be zero.\n", -+ ((struct 
k_sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_encrypt); -+ SENDERR(EINVAL); -+ } -+ break; -+ case SADB_SATYPE_ESP: -+ if(!(((struct k_sadb_sa*)extensions[SADB_EXT_SA]) && -+ ((struct k_sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_encrypt != -+ SADB_EALG_NONE)) { -+ ERROR("pfkey_msg_parse: " -+ "encrypt alg=%d is zero, must be non-zero for ESP=%d SAs.\n", -+ ((struct k_sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_encrypt, -+ ((struct sadb_msg*)extensions[SADB_EXT_RESERVED])->sadb_msg_satype); -+ SENDERR(EINVAL); -+ } -+ if((((struct k_sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_encrypt == -+ SADB_EALG_NULL) && -+ (((struct k_sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_auth == -+ SADB_AALG_NONE) ) { -+ ERROR("pfkey_msg_parse: " -+ "ESP handed encNULL+authNONE, illegal combination.\n"); -+ SENDERR(EINVAL); -+ } -+ break; -+ case K_SADB_X_SATYPE_COMP: -+ if(!(((struct k_sadb_sa*)extensions[SADB_EXT_SA]) && -+ ((struct k_sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_encrypt != -+ SADB_EALG_NONE)) { -+ ERROR("pfkey_msg_parse: " -+ "encrypt alg=%d is zero, must be non-zero for COMP=%d SAs.\n", -+ ((struct k_sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_encrypt, -+ ((struct sadb_msg*)extensions[SADB_EXT_RESERVED])->sadb_msg_satype); -+ SENDERR(EINVAL); -+ } -+ if(((struct k_sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_auth != -+ SADB_AALG_NONE) { -+ ERROR("pfkey_msg_parse: " -+ "COMP handed auth=%d, must be zero.\n", -+ ((struct k_sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_auth); -+ SENDERR(EINVAL); -+ } -+ break; -+ default: -+ break; -+ } -+ if(ntohl(((struct k_sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_spi) <= 255) { -+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM, -+ "pfkey_msg_parse: " -+ "spi=%08x must be > 255.\n", -+ ntohl(((struct k_sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_spi)); -+ SENDERR(EINVAL); -+ } -+ default: -+ break; -+ } -+ -+errlab: -+ return error; -+} -+ -+/* -+ * Local variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 
13:02:56 2003 -+++ linux/net/ipsec/pfkey_v2_parser.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,3063 @@ -+/* -+ * @(#) RFC2367 PF_KEYv2 Key management API message parser -+ * Copyright (C) 1999, 2000, 2001 Richard Guy Briggs -+ * -+ * OCF support written by David McCullough -+ * Copyright (C) 2004-2005 Intel Corporation. All Rights Reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ */ -+ -+/* -+ * Template from klips/net/ipsec/ipsec/ipsec_netlink.c. -+ */ -+ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+ -+#include -+ -+#include -+ -+#ifdef SPINLOCK -+# ifdef SPINLOCK_23 -+# include /* *lock* */ -+# else /* SPINLOCK_23 */ -+# include /* *lock* */ -+# endif /* SPINLOCK_23 */ -+#endif /* SPINLOCK */ -+#ifdef NET_21 -+# include /* inet_addr_type */ -+# include -+# define IS_MYADDR RTN_LOCAL -+#endif -+ -+#include -+#ifdef NETLINK_SOCK -+# include -+#else -+# include -+#endif -+ -+#include /* get_random_bytes() */ -+ -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_sa.h" -+ -+#include "openswan/ipsec_radij.h" -+#include 
"openswan/ipsec_xform.h" -+#include "openswan/ipsec_ah.h" -+#include "openswan/ipsec_esp.h" -+#include "openswan/ipsec_tunnel.h" -+#include "openswan/ipsec_mast.h" -+#include "openswan/ipsec_rcv.h" -+#include "openswan/ipcomp.h" -+ -+#include -+#include -+ -+#include "openswan/ipsec_proto.h" -+#include "openswan/ipsec_alg.h" -+ -+#include "openswan/ipsec_kern24.h" -+ -+#include "ipsec_ocf.h" -+ -+#define SENDERR(_x) do { error = -(_x); goto errlab; } while (0) -+ -+struct sklist_t { -+ struct socket *sk; -+ struct sklist_t* next; -+} pfkey_sklist_head, *pfkey_sklist, *pfkey_sklist_prev; -+ -+__u32 pfkey_msg_seq = 0; -+ -+ -+#if 0 -+#define DUMP_SAID dump_said(&extr->ips->ips_said, __LINE__) -+#define DUMP_SAID2 dump_said(&extr.ips->ips_said, __LINE__) -+static void dump_said(ip_said *s, int line) -+{ -+ char msa[SATOT_BUF]; -+ size_t msa_len; -+ -+ msa_len = satot(s, 0, msa, sizeof(msa)); -+ -+ printk("line: %d msa: %s\n", line, msa); -+} -+#endif -+ -+ -+int -+pfkey_alloc_eroute(struct eroute** eroute) -+{ -+ int error = 0; -+ if(*eroute) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_alloc_eroute: " -+ "eroute struct already allocated\n"); -+ SENDERR(EEXIST); -+ } -+ -+ if((*eroute = kmalloc(sizeof(**eroute), GFP_ATOMIC) ) == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_alloc_eroute: " -+ "memory allocation error\n"); -+ SENDERR(ENOMEM); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_alloc_eroute: " -+ "allocating %lu bytes for an eroute at 0p%p\n", -+ (unsigned long) sizeof(**eroute), *eroute); -+ -+ memset((caddr_t)*eroute, 0, sizeof(**eroute)); -+ (*eroute)->er_eaddr.sen_len = -+ (*eroute)->er_emask.sen_len = sizeof(struct sockaddr_encap); -+ (*eroute)->er_eaddr.sen_family = -+ (*eroute)->er_emask.sen_family = AF_ENCAP; -+ (*eroute)->er_eaddr.sen_type = SENT_IP4; -+ (*eroute)->er_emask.sen_type = 255; -+ (*eroute)->er_pid = 0; -+ (*eroute)->er_count = 0; -+ (*eroute)->er_lasttime = jiffies/HZ; -+ -+ errlab: -+ return(error); -+} -+ 
-+DEBUG_NO_STATIC int -+pfkey_x_protocol_process(struct sadb_ext *pfkey_ext, -+ struct pfkey_extracted_data *extr) -+{ -+ int error = 0; -+ struct sadb_protocol * p = (struct sadb_protocol *)pfkey_ext; -+ -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_x_protocol_process: %p\n", extr); -+ -+ if (extr == 0) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_protocol_process:" -+ "extr is NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ if (extr->eroute == 0) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_protocol_process:" -+ "extr->eroute is NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ extr->eroute->er_eaddr.sen_proto = p->sadb_protocol_proto; -+ extr->eroute->er_emask.sen_proto = p->sadb_protocol_proto ? ~0:0; -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_protocol_process: protocol = %d.\n", -+ p->sadb_protocol_proto); -+ errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_ipsec_sa_init(struct ipsec_sa *ipsp) -+{ -+ int rc; -+ KLIPS_PRINT(debug_pfkey, "Calling SA_INIT\n"); -+ rc = ipsec_sa_init(ipsp); -+ return rc; -+} -+ -+int -+pfkey_safe_build(int error, struct sadb_ext *extensions[K_SADB_MAX+1]) -+{ -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_safe_build: " -+ "error=%d\n", -+ error); -+ if (!error) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_safe_build:" -+ "success.\n"); -+ return 1; -+ } else { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_safe_build:" -+ "caught error %d\n", -+ error); -+ pfkey_extensions_free(extensions); -+ return 0; -+ } -+} -+ -+ -+DEBUG_NO_STATIC int -+pfkey_getspi_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ ipsec_spi_t minspi = htonl(256), maxspi = htonl(-1L); -+ int found_avail = 0; -+ struct ipsec_sa *ipsq; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ struct sadb_ext *extensions_reply[K_SADB_EXT_MAX+1]; -+ struct sadb_msg *pfkey_reply = NULL; -+ struct socket_list *pfkey_socketsp; -+ uint8_t satype = ((struct 
sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_getspi_parse: .\n"); -+ -+ pfkey_extensions_init(extensions_reply); -+ -+ if(extr == NULL || extr->ips == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_getspi_parse: " -+ "error, extr or extr->ipsec_sa pointer NULL\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(extensions[K_SADB_EXT_SPIRANGE]) { -+ minspi = ((struct sadb_spirange *)extensions[K_SADB_EXT_SPIRANGE])->sadb_spirange_min; -+ maxspi = ((struct sadb_spirange *)extensions[K_SADB_EXT_SPIRANGE])->sadb_spirange_max; -+ } -+ -+ if(maxspi == minspi) { -+ extr->ips->ips_said.spi = maxspi; -+ ipsq = ipsec_sa_getbyid(&(extr->ips->ips_said)); -+ if(ipsq != NULL) { -+ sa_len = KLIPS_SATOT(debug_pfkey, &extr->ips->ips_said, 0, sa, sizeof(sa)); -+ ipsec_sa_put(ipsq); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_getspi_parse: " -+ "EMT_GETSPI found an old ipsec_sa for SA: %s, delete it first.\n", -+ sa_len ? sa : " (error)"); -+ SENDERR(EEXIST); -+ } else { -+ found_avail = 1; -+ } -+ } else { -+ int i = 0; -+ __u32 rand_val; -+ __u32 spi_diff; -+ while( ( i < (spi_diff = (ntohl(maxspi) - ntohl(minspi)))) && !found_avail ) { -+ prng_bytes(&ipsec_prng, (char *) &(rand_val), -+ ( (spi_diff < (2^8)) ? 1 : -+ ( (spi_diff < (2^16)) ? 2 : -+ ( (spi_diff < (2^24)) ? 3 : -+ 4 ) ) ) ); -+ extr->ips->ips_said.spi = htonl(ntohl(minspi) + -+ (rand_val % -+ (spi_diff + 1))); -+ i++; -+ ipsq = ipsec_sa_getbyid(&(extr->ips->ips_said)); -+ if(ipsq == NULL) { -+ found_avail = 1; -+ } else { -+ ipsec_sa_put(ipsq); -+ } -+ } -+ } -+ -+ sa_len = KLIPS_SATOT(debug_pfkey, &extr->ips->ips_said, 0, sa, sizeof(sa)); -+ -+ if (!found_avail) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_getspi_parse: " -+ "found an old ipsec_sa for SA: %s, delete it first.\n", -+ sa_len ? 
sa : " (error)"); -+ SENDERR(EEXIST); -+ } -+ -+ if(ip_chk_addr((unsigned long)extr->ips->ips_said.dst.u.v4.sin_addr.s_addr) == IS_MYADDR) { -+ extr->ips->ips_flags |= EMT_INBOUND; -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_getspi_parse: " -+ "existing ipsec_sa not found (this is good) for SA: %s, %s-bound, allocating.\n", -+ sa_len ? sa : " (error)", -+ extr->ips->ips_flags & EMT_INBOUND ? "in" : "out"); -+ -+ /* XXX extr->ips->ips_rcvif = &(enc_softc[em->em_if].enc_if);*/ -+ extr->ips->ips_rcvif = NULL; -+ extr->ips->ips_life.ipl_addtime.ipl_count = jiffies/HZ; -+ -+ extr->ips->ips_state = K_SADB_SASTATE_LARVAL; -+ -+ if(!extr->ips->ips_life.ipl_allocations.ipl_count) { -+ extr->ips->ips_life.ipl_allocations.ipl_count += 1; -+ } -+ -+ if(!(pfkey_safe_build(error = pfkey_msg_hdr_build(&extensions_reply[0], -+ K_SADB_GETSPI, -+ satype, -+ 0, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_seq, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_pid), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_sa_build(&extensions_reply[K_SADB_EXT_SA], -+ K_SADB_EXT_SA, -+ extr->ips->ips_said.spi, -+ 0, -+ K_SADB_SASTATE_LARVAL, -+ 0, -+ 0, -+ 0), -+ extensions_reply) -+ -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_SRC], -+ K_SADB_EXT_ADDRESS_SRC, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_s), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_DST], -+ K_SADB_EXT_ADDRESS_DST, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_d), -+ extensions_reply) )) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_getspi_parse: " -+ "failed to build the getspi reply message extensions\n"); -+ goto errlab; -+ } -+ -+ if((error = pfkey_msg_build(&pfkey_reply, extensions_reply, EXT_BITS_OUT))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_getspi_parse: " -+ "failed to build the getspi reply 
message\n"); -+ SENDERR(-error); -+ } -+ for(pfkey_socketsp = pfkey_open_sockets; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, pfkey_reply))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_getspi_parse: " -+ "sending up getspi reply message for satype=%d(%s) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_getspi_parse: " -+ "sending up getspi reply message for satype=%d(%s) to socket=0p%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp); -+ } -+ -+ if((error = ipsec_sa_add(extr->ips))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_getspi_parse: " -+ "failed to add the larval SA=%s with error=%d.\n", -+ sa_len ? sa : " (error)", -+ error); -+ SENDERR(-error); -+ } -+ extr->ips = NULL; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_getspi_parse: " -+ "successful for SA: %s\n", -+ sa_len ? 
sa : " (error)"); -+ -+ errlab: -+ if (pfkey_reply) { -+ pfkey_msg_free(&pfkey_reply); -+ } -+ pfkey_extensions_free(extensions_reply); -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_update_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ struct ipsec_sa* ipsq; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ struct sadb_ext *extensions_reply[K_SADB_EXT_MAX+1]; -+ struct sadb_msg *pfkey_reply = NULL; -+ struct socket_list *pfkey_socketsp; -+ uint8_t satype = ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype; -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ struct ipsec_sa *nat_t_ips_saved = NULL; -+#endif -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_update_parse: .\n"); -+ -+ pfkey_extensions_init(extensions_reply); -+ -+ if(((struct sadb_sa*)extensions[K_SADB_EXT_SA])->sadb_sa_state != K_SADB_SASTATE_MATURE) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_update_parse: " -+ "error, sa_state=%d must be MATURE=%d\n", -+ ((struct sadb_sa*)extensions[K_SADB_EXT_SA])->sadb_sa_state, -+ K_SADB_SASTATE_MATURE); -+ SENDERR(EINVAL); -+ } -+ -+ if(extr == NULL || extr->ips == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_update_parse: " -+ "error, extr or extr->ips pointer NULL\n"); -+ SENDERR(EINVAL); -+ } -+ -+ sa_len = KLIPS_SATOT(debug_pfkey, &extr->ips->ips_said, 0, sa, sizeof(sa)); -+ -+ spin_lock_bh(&tdb_lock); -+ -+ ipsq = ipsec_sa_getbyid(&(extr->ips->ips_said)); -+ if (ipsq == NULL) { -+ spin_unlock_bh(&tdb_lock); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_update_parse: " -+ "reserved ipsec_sa for SA: %s not found. Call K_SADB_GETSPI first or call K_SADB_ADD instead.\n", -+ sa_len ? 
sa : " (error)"); -+ SENDERR(ENOENT); -+ } -+ -+ if(ip_chk_addr((unsigned long)extr->ips->ips_said.dst.u.v4.sin_addr.s_addr) == IS_MYADDR) { -+ extr->ips->ips_flags |= EMT_INBOUND; -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_update_parse: " -+ "existing ipsec_sa found (this is good) for SA: %s, %s-bound, updating.\n", -+ sa_len ? sa : " (error)", -+ extr->ips->ips_flags & EMT_INBOUND ? "in" : "out"); -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ if (extr->ips->ips_natt_sport || extr->ips->ips_natt_dport) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_update_parse: only updating NAT-T ports " -+ "(%u:%u -> %u:%u)\n", -+ ipsq->ips_natt_sport, ipsq->ips_natt_dport, -+ extr->ips->ips_natt_sport, extr->ips->ips_natt_dport); -+ -+ if (extr->ips->ips_natt_sport) { -+ ipsq->ips_natt_sport = extr->ips->ips_natt_sport; -+ if (ipsq->ips_addr_s->sa_family == AF_INET) { -+ ((struct sockaddr_in *)(ipsq->ips_addr_s))->sin_port = htons(extr->ips->ips_natt_sport); -+ } -+ } -+ -+ if (extr->ips->ips_natt_dport) { -+ ipsq->ips_natt_dport = extr->ips->ips_natt_dport; -+ if (ipsq->ips_addr_d->sa_family == AF_INET) { -+ ((struct sockaddr_in *)(ipsq->ips_addr_d))->sin_port = htons(extr->ips->ips_natt_dport); -+ } -+ } -+ -+ nat_t_ips_saved = extr->ips; -+ extr->ips = ipsq; -+ } -+ else -+#endif -+ { -+ /* XXX extr->ips->ips_rcvif = &(enc_softc[em->em_if].enc_if);*/ -+ extr->ips->ips_rcvif = NULL; -+ if ((error = pfkey_ipsec_sa_init(extr->ips))) { -+ ipsec_sa_put(ipsq); -+ spin_unlock_bh(&tdb_lock); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_update_parse: " -+ "not successful for SA: %s, deleting.\n", -+ sa_len ? 
sa : " (error)"); -+ SENDERR(-error); -+ } -+ -+ extr->ips->ips_life.ipl_addtime.ipl_count = ipsq->ips_life.ipl_addtime.ipl_count; -+ -+ /* this will call delchain-equivalent if refcount=>0 */ -+ ipsec_sa_put(ipsq); -+ } -+ -+ spin_unlock_bh(&tdb_lock); -+ -+ if(!(pfkey_safe_build(error = pfkey_msg_hdr_build(&extensions_reply[0], -+ K_SADB_UPDATE, -+ satype, -+ 0, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_seq, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_pid), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_sa_build(&extensions_reply[K_SADB_EXT_SA], -+ K_SADB_EXT_SA, -+ extr->ips->ips_said.spi, -+ extr->ips->ips_replaywin, -+ extr->ips->ips_state, -+ extr->ips->ips_authalg, -+ extr->ips->ips_encalg, -+ extr->ips->ips_flags), -+ extensions_reply) -+ /* The 3 lifetime extentions should only be sent if non-zero. */ -+ && (extensions[K_SADB_EXT_LIFETIME_HARD] -+ ? pfkey_safe_build(error = pfkey_lifetime_build(&extensions_reply[K_SADB_EXT_LIFETIME_HARD], -+ K_SADB_EXT_LIFETIME_HARD, -+ extr->ips->ips_life.ipl_allocations.ipl_hard, -+ extr->ips->ips_life.ipl_bytes.ipl_hard, -+ extr->ips->ips_life.ipl_addtime.ipl_hard, -+ extr->ips->ips_life.ipl_usetime.ipl_hard, -+ extr->ips->ips_life.ipl_packets.ipl_hard), -+ extensions_reply) : 1) -+ && (extensions[K_SADB_EXT_LIFETIME_SOFT] -+ ? pfkey_safe_build(error = pfkey_lifetime_build(&extensions_reply[K_SADB_EXT_LIFETIME_SOFT], -+ K_SADB_EXT_LIFETIME_SOFT, -+ extr->ips->ips_life.ipl_allocations.ipl_count, -+ extr->ips->ips_life.ipl_bytes.ipl_count, -+ extr->ips->ips_life.ipl_addtime.ipl_count, -+ extr->ips->ips_life.ipl_usetime.ipl_count, -+ extr->ips->ips_life.ipl_packets.ipl_count), -+ extensions_reply) : 1) -+ && (extr->ips->ips_life.ipl_allocations.ipl_count -+ || extr->ips->ips_life.ipl_bytes.ipl_count -+ || extr->ips->ips_life.ipl_addtime.ipl_count -+ || extr->ips->ips_life.ipl_usetime.ipl_count -+ || extr->ips->ips_life.ipl_packets.ipl_count -+ -+ ? 
pfkey_safe_build(error = pfkey_lifetime_build(&extensions_reply[K_SADB_EXT_LIFETIME_CURRENT], -+ K_SADB_EXT_LIFETIME_CURRENT, -+ extr->ips->ips_life.ipl_allocations.ipl_count, -+ extr->ips->ips_life.ipl_bytes.ipl_count, -+ extr->ips->ips_life.ipl_addtime.ipl_count, -+ extr->ips->ips_life.ipl_usetime.ipl_count, -+ extr->ips->ips_life.ipl_packets.ipl_count), -+ extensions_reply) : 1) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_SRC], -+ K_SADB_EXT_ADDRESS_SRC, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_s), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_DST], -+ K_SADB_EXT_ADDRESS_DST, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_d), -+ extensions_reply) -+ && (extr->ips->ips_ident_s.data -+ ? pfkey_safe_build(error = pfkey_ident_build(&extensions_reply[K_SADB_EXT_IDENTITY_SRC], -+ K_SADB_EXT_IDENTITY_SRC, -+ extr->ips->ips_ident_s.type, -+ extr->ips->ips_ident_s.id, -+ extr->ips->ips_ident_s.len, -+ extr->ips->ips_ident_s.data), -+ extensions_reply) : 1) -+ && (extr->ips->ips_ident_d.data -+ ? pfkey_safe_build(error = pfkey_ident_build(&extensions_reply[K_SADB_EXT_IDENTITY_DST], -+ K_SADB_EXT_IDENTITY_DST, -+ extr->ips->ips_ident_d.type, -+ extr->ips->ips_ident_d.id, -+ extr->ips->ips_ident_d.len, -+ extr->ips->ips_ident_d.data), -+ extensions_reply) : 1) -+#if 0 -+ /* FIXME: This won't work yet because I have not finished -+ it. */ -+ && (extr->ips->ips_sens_ -+ ? 
pfkey_safe_build(error = pfkey_sens_build(&extensions_reply[K_SADB_EXT_SENSITIVITY], -+ extr->ips->ips_sens_dpd, -+ extr->ips->ips_sens_sens_level, -+ extr->ips->ips_sens_sens_len, -+ extr->ips->ips_sens_sens_bitmap, -+ extr->ips->ips_sens_integ_level, -+ extr->ips->ips_sens_integ_len, -+ extr->ips->ips_sens_integ_bitmap), -+ extensions_reply) : 1) -+#endif -+ )) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_update_parse: " -+ "failed to build the update reply message extensions\n"); -+ SENDERR(-error); -+ } -+ -+ if((error = pfkey_msg_build(&pfkey_reply, extensions_reply, EXT_BITS_OUT))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_update_parse: " -+ "failed to build the update reply message\n"); -+ SENDERR(-error); -+ } -+ for(pfkey_socketsp = pfkey_open_sockets; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, pfkey_reply))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_update_parse: " -+ "sending up update reply message for satype=%d(%s) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_update_parse: " -+ "sending up update reply message for satype=%d(%s) to socket=0p%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp); -+ } -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ if (nat_t_ips_saved) { -+ /** -+ * As we _really_ update existing SA, we keep tdbq and need to delete -+ * parsed ips (nat_t_ips_saved, was extr->ips). -+ * -+ * goto errlab with extr->ips = nat_t_ips_saved will free it. -+ */ -+ -+ extr->ips = nat_t_ips_saved; -+ -+ error = 0; -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_update_parse (NAT-T ports): " -+ "successful for SA: %s\n", -+ sa_len ? 
sa : " (error)"); -+ -+ goto errlab; -+ } -+#endif -+ -+ if((error = ipsec_sa_add(extr->ips))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_update_parse: " -+ "failed to update the mature SA=%s with error=%d.\n", -+ sa_len ? sa : " (error)", -+ error); -+ SENDERR(-error); -+ } -+ extr->ips = NULL; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_update_parse: " -+ "successful for SA: %s\n", -+ sa_len ? sa : " (error)"); -+ -+ errlab: -+ if (pfkey_reply) { -+ pfkey_msg_free(&pfkey_reply); -+ } -+ pfkey_extensions_free(extensions_reply); -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_add_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ struct ipsec_sa* ipsq; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ struct sadb_ext *extensions_reply[K_SADB_EXT_MAX+1]; -+ struct sadb_msg *pfkey_reply = NULL; -+ struct socket_list *pfkey_socketsp; -+ uint8_t satype = ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_add_parse: .\n"); -+ -+ pfkey_extensions_init(extensions_reply); -+ -+ if(((struct sadb_sa*)extensions[K_SADB_EXT_SA])->sadb_sa_state != K_SADB_SASTATE_MATURE) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_add_parse: " -+ "error, sa_state=%d must be MATURE=%d\n", -+ ((struct sadb_sa*)extensions[K_SADB_EXT_SA])->sadb_sa_state, -+ K_SADB_SASTATE_MATURE); -+ SENDERR(EINVAL); -+ } -+ -+ if(!extr || !extr->ips) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_add_parse: " -+ "extr or extr->ips pointer NULL\n"); -+ SENDERR(EINVAL); -+ } -+ -+ sa_len = KLIPS_SATOT(debug_pfkey, &extr->ips->ips_said, 0, sa, sizeof(sa)); -+ -+ ipsq = ipsec_sa_getbyid(&(extr->ips->ips_said)); -+ if(ipsq != NULL) { -+ ipsec_sa_put(ipsq); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_add_parse: " -+ "found an old ipsec_sa for SA%s, delete it first.\n", -+ sa_len ? 
sa : " (error)"); -+ SENDERR(EEXIST); -+ } -+ -+ if(ip_chk_addr((unsigned long)extr->ips->ips_said.dst.u.v4.sin_addr.s_addr) == IS_MYADDR) { -+ extr->ips->ips_flags |= EMT_INBOUND; -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_add_parse: " -+ "existing ipsec_sa not found (this is good) for SA%s, %s-bound, allocating.\n", -+ sa_len ? sa : " (error)", -+ extr->ips->ips_flags & EMT_INBOUND ? "in" : "out"); -+ -+ /* XXX extr->ips->ips_rcvif = &(enc_softc[em->em_if].enc_if);*/ -+ extr->ips->ips_rcvif = NULL; -+ -+ if ((error = ipsec_sa_init(extr->ips))) { -+ KLIPS_ERROR(debug_pfkey, -+ "pfkey_add_parse: " -+ "not successful for SA: %s, deleting.\n", -+ sa_len ? sa : " (error)"); -+ SENDERR(-error); -+ } -+ -+ if(extr->sarefme!=IPSEC_SAREF_NULL -+ && extr->ips->ips_ref==IPSEC_SAREF_NULL) { -+ extr->ips->ips_ref=extr->sarefme; -+ } -+ -+ if(extr->sarefhim!=IPSEC_SAREF_NULL -+ && extr->ips->ips_refhim==IPSEC_SAREF_NULL) { -+ extr->ips->ips_refhim=extr->sarefhim; -+ } -+ -+ /* attach it to the SAref table */ -+ if((error = ipsec_sa_intern(extr->ips)) != 0) { -+ KLIPS_ERROR(debug_pfkey, -+ "pfkey_add_parse: " -+ "failed to intern SA as SAref#%lu\n" -+ , (unsigned long)extr->ips->ips_ref); -+ SENDERR(-error); -+ } -+ -+ extr->ips->ips_life.ipl_addtime.ipl_count = jiffies / HZ; -+ if(!extr->ips->ips_life.ipl_allocations.ipl_count) { -+ extr->ips->ips_life.ipl_allocations.ipl_count += 1; -+ } -+ -+ if(!(pfkey_safe_build(error = pfkey_msg_hdr_build(&extensions_reply[0], -+ K_SADB_ADD, -+ satype, -+ 0, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_seq, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_pid), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_sa_build(&extensions_reply[K_SADB_EXT_SA], -+ K_SADB_EXT_SA, -+ extr->ips->ips_said.spi, -+ extr->ips->ips_replaywin, -+ extr->ips->ips_state, -+ extr->ips->ips_authalg, -+ extr->ips->ips_encalg, -+ extr->ips->ips_flags), -+ extensions_reply) -+ && pfkey_safe_build(error = 
pfkey_saref_build(&extensions_reply[K_SADB_X_EXT_SAREF], -+ extr->ips->ips_ref, -+ extr->ips->ips_refhim), -+ extensions_reply) -+ /* The 3 lifetime extentions should only be sent if non-zero. */ -+ && (extensions[K_SADB_EXT_LIFETIME_HARD] -+ ? pfkey_safe_build(error = pfkey_lifetime_build(&extensions_reply[K_SADB_EXT_LIFETIME_HARD], -+ K_SADB_EXT_LIFETIME_HARD, -+ extr->ips->ips_life.ipl_allocations.ipl_hard, -+ extr->ips->ips_life.ipl_bytes.ipl_hard, -+ extr->ips->ips_life.ipl_addtime.ipl_hard, -+ extr->ips->ips_life.ipl_usetime.ipl_hard, -+ extr->ips->ips_life.ipl_packets.ipl_hard), -+ extensions_reply) : 1) -+ && (extensions[K_SADB_EXT_LIFETIME_SOFT] -+ ? pfkey_safe_build(error = pfkey_lifetime_build(&extensions_reply[K_SADB_EXT_LIFETIME_SOFT], -+ K_SADB_EXT_LIFETIME_SOFT, -+ extr->ips->ips_life.ipl_allocations.ipl_soft, -+ extr->ips->ips_life.ipl_bytes.ipl_soft, -+ extr->ips->ips_life.ipl_addtime.ipl_soft, -+ extr->ips->ips_life.ipl_usetime.ipl_soft, -+ extr->ips->ips_life.ipl_packets.ipl_soft), -+ extensions_reply) : 1) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_SRC], -+ K_SADB_EXT_ADDRESS_SRC, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_s), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_DST], -+ K_SADB_EXT_ADDRESS_DST, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_d), -+ extensions_reply) -+ && (extr->ips->ips_ident_s.data -+ ? pfkey_safe_build(error = pfkey_ident_build(&extensions_reply[K_SADB_EXT_IDENTITY_SRC], -+ K_SADB_EXT_IDENTITY_SRC, -+ extr->ips->ips_ident_s.type, -+ extr->ips->ips_ident_s.id, -+ extr->ips->ips_ident_s.len, -+ extr->ips->ips_ident_s.data), -+ extensions_reply) : 1) -+ && (extr->ips->ips_ident_d.data -+ ? 
pfkey_safe_build(error = pfkey_ident_build(&extensions_reply[K_SADB_EXT_IDENTITY_DST], -+ K_SADB_EXT_IDENTITY_DST, -+ extr->ips->ips_ident_d.type, -+ extr->ips->ips_ident_d.id, -+ extr->ips->ips_ident_d.len, -+ extr->ips->ips_ident_d.data), -+ extensions_reply) : 1) -+#if 0 -+ /* FIXME: This won't work yet because I have not finished -+ it. */ -+ && (extr->ips->ips_sens_ -+ ? pfkey_safe_build(error = pfkey_sens_build(&extensions_reply[K_SADB_EXT_SENSITIVITY], -+ extr->ips->ips_sens_dpd, -+ extr->ips->ips_sens_sens_level, -+ extr->ips->ips_sens_sens_len, -+ extr->ips->ips_sens_sens_bitmap, -+ extr->ips->ips_sens_integ_level, -+ extr->ips->ips_sens_integ_len, -+ extr->ips->ips_sens_integ_bitmap), -+ extensions_reply) : 1) -+#endif -+ )) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_add_parse: " -+ "failed to build the add reply message extensions\n"); -+ SENDERR(-error); -+ } -+ -+ if((error = pfkey_msg_build(&pfkey_reply, extensions_reply, EXT_BITS_OUT))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_add_parse: " -+ "failed to build the add reply message\n"); -+ SENDERR(-error); -+ } -+ for(pfkey_socketsp = pfkey_open_sockets; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, pfkey_reply))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_add_parse: " -+ "sending up add reply message for satype=%d(%s) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_add_parse: " -+ "sending up add reply message for satype=%d(%s) to socket=0p%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp); -+ } -+ -+ if(extr->outif != 0 && extr->outif != -1) { -+ extr->ips->ips_out = ipsec_mast_get_device(extr->outif); -+ extr->ips->ips_transport_direct = ipsec_mast_is_transport(extr->outif); -+ } -+ -+ if((error = ipsec_sa_add(extr->ips))) { -+ 
KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_add_parse: " -+ "failed to add the mature SA=%s with error=%d.\n", -+ sa_len ? sa : " (error)", -+ error); -+ SENDERR(-error); -+ } -+ ipsec_sa_put(extr->ips); -+ extr->ips = NULL; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_add_parse: " -+ "successful for SA: %s\n", -+ sa_len ? sa : " (error)"); -+ -+ errlab: -+ if (pfkey_reply) { -+ pfkey_msg_free(&pfkey_reply); -+ } -+ pfkey_extensions_free(extensions_reply); -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_delete_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ struct ipsec_sa *ipsp; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ int error = 0; -+ struct sadb_ext *extensions_reply[K_SADB_EXT_MAX+1]; -+ struct sadb_msg *pfkey_reply = NULL; -+ struct socket_list *pfkey_socketsp; -+ uint8_t satype = ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype; -+ IPsecSAref_t ref; -+ struct sadb_builds sab; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_delete_parse: .\n"); -+ -+ pfkey_extensions_init(extensions_reply); -+ -+ if(!extr || !extr->ips) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_delete_parse: " -+ "extr or extr->ips pointer NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ sa_len = KLIPS_SATOT(debug_pfkey, &extr->ips->ips_said, 0, sa, sizeof(sa)); -+ -+ spin_lock_bh(&tdb_lock); -+ -+ ipsp = ipsec_sa_getbyid(&(extr->ips->ips_said)); -+ if (ipsp == NULL) { -+ spin_unlock_bh(&tdb_lock); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_delete_parse: " -+ "ipsec_sa not found for SA:%s, could not delete.\n", -+ sa_len ? 
sa : " (error)"); -+ SENDERR(ESRCH); -+ } -+ -+ /* remove it from SAref tables */ -+ ref = ipsp->ips_ref; -+ ipsec_sa_untern(ipsp); -+ ipsec_sa_rm(ipsp); -+ -+ /* this will call delchain-equivalent if refcount -> 0 -+ * noting that get() above, added to ref count */ -+ ipsec_sa_put(ipsp); -+ spin_unlock_bh(&tdb_lock); -+ -+ memset(&sab, 0, sizeof(sab)); -+ sab.sa_base.sadb_sa_exttype = K_SADB_EXT_SA; -+ sab.sa_base.sadb_sa_spi = extr->ips->ips_said.spi; -+ sab.sa_base.sadb_sa_replay = 0; -+ sab.sa_base.sadb_sa_state = 0; -+ sab.sa_base.sadb_sa_auth = 0; -+ sab.sa_base.sadb_sa_encrypt = 0; -+ sab.sa_base.sadb_sa_flags = 0; -+ sab.sa_base.sadb_x_sa_ref = ref; -+ -+ if(!(pfkey_safe_build(error = pfkey_msg_hdr_build(&extensions_reply[0], -+ K_SADB_DELETE, -+ satype, -+ 0, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_seq, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_pid), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_sa_builds(&extensions_reply[K_SADB_EXT_SA], sab), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_SRC], -+ K_SADB_EXT_ADDRESS_SRC, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_s), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_DST], -+ K_SADB_EXT_ADDRESS_DST, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_d), -+ extensions_reply) -+ )) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_delete_parse: " -+ "failed to build the delete reply message extensions\n"); -+ SENDERR(-error); -+ } -+ -+ if((error = pfkey_msg_build(&pfkey_reply, extensions_reply, EXT_BITS_OUT))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_delete_parse: " -+ "failed to build the delete reply message\n"); -+ SENDERR(-error); -+ } -+ for(pfkey_socketsp = pfkey_open_sockets; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = 
pfkey_upmsg(pfkey_socketsp->socketp, pfkey_reply))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_delete_parse: " -+ "sending up delete reply message for satype=%d(%s) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_delete_parse: " -+ "sending up delete reply message for satype=%d(%s) to socket=0p%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp); -+ } -+ -+ errlab: -+ if (pfkey_reply) { -+ pfkey_msg_free(&pfkey_reply); -+ } -+ pfkey_extensions_free(extensions_reply); -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_get_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ struct ipsec_sa *ipsp; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ struct sadb_ext *extensions_reply[K_SADB_EXT_MAX+1]; -+ struct sadb_msg *pfkey_reply = NULL; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_get_parse: .\n"); -+ -+ pfkey_extensions_init(extensions_reply); -+ -+ if(!extr || !extr->ips) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_get_parse: " -+ "extr or extr->ips pointer NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ sa_len = KLIPS_SATOT(debug_pfkey, &extr->ips->ips_said, 0, sa, sizeof(sa)); -+ -+ spin_lock_bh(&tdb_lock); -+ -+ ipsp = ipsec_sa_getbyid(&(extr->ips->ips_said)); -+ if (ipsp == NULL) { -+ spin_unlock_bh(&tdb_lock); -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_get_parse: " -+ "ipsec_sa not found for SA=%s, could not get.\n", -+ sa_len ? 
sa : " (error)"); -+ SENDERR(ESRCH); -+ } -+ -+ if(!(pfkey_safe_build(error = pfkey_msg_hdr_build(&extensions_reply[0], -+ K_SADB_GET, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype, -+ 0, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_seq, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_pid), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_sa_build(&extensions_reply[K_SADB_EXT_SA], -+ K_SADB_EXT_SA, -+ extr->ips->ips_said.spi, -+ extr->ips->ips_replaywin, -+ extr->ips->ips_state, -+ extr->ips->ips_authalg, -+ extr->ips->ips_encalg, -+ extr->ips->ips_flags), -+ extensions_reply) -+ /* The 3 lifetime extentions should only be sent if non-zero. */ -+ && (ipsp->ips_life.ipl_allocations.ipl_count -+ || ipsp->ips_life.ipl_bytes.ipl_count -+ || ipsp->ips_life.ipl_addtime.ipl_count -+ || ipsp->ips_life.ipl_usetime.ipl_count -+ || ipsp->ips_life.ipl_packets.ipl_count -+ ? pfkey_safe_build(error = pfkey_lifetime_build(&extensions_reply[K_SADB_EXT_LIFETIME_CURRENT], -+ K_SADB_EXT_LIFETIME_CURRENT, -+ ipsp->ips_life.ipl_allocations.ipl_count, -+ ipsp->ips_life.ipl_bytes.ipl_count, -+ ipsp->ips_life.ipl_addtime.ipl_count, -+ ipsp->ips_life.ipl_usetime.ipl_count, -+ ipsp->ips_life.ipl_packets.ipl_count), -+ extensions_reply) : 1) -+ && (ipsp->ips_life.ipl_allocations.ipl_hard -+ || ipsp->ips_life.ipl_bytes.ipl_hard -+ || ipsp->ips_life.ipl_addtime.ipl_hard -+ || ipsp->ips_life.ipl_usetime.ipl_hard -+ || ipsp->ips_life.ipl_packets.ipl_hard -+ ? 
pfkey_safe_build(error = pfkey_lifetime_build(&extensions_reply[K_SADB_EXT_LIFETIME_HARD], -+ K_SADB_EXT_LIFETIME_HARD, -+ ipsp->ips_life.ipl_allocations.ipl_hard, -+ ipsp->ips_life.ipl_bytes.ipl_hard, -+ ipsp->ips_life.ipl_addtime.ipl_hard, -+ ipsp->ips_life.ipl_usetime.ipl_hard, -+ ipsp->ips_life.ipl_packets.ipl_hard), -+ extensions_reply) : 1) -+ && (ipsp->ips_life.ipl_allocations.ipl_soft -+ || ipsp->ips_life.ipl_bytes.ipl_soft -+ || ipsp->ips_life.ipl_addtime.ipl_soft -+ || ipsp->ips_life.ipl_usetime.ipl_soft -+ || ipsp->ips_life.ipl_packets.ipl_soft -+ ? pfkey_safe_build(error = pfkey_lifetime_build(&extensions_reply[K_SADB_EXT_LIFETIME_SOFT], -+ K_SADB_EXT_LIFETIME_SOFT, -+ ipsp->ips_life.ipl_allocations.ipl_soft, -+ ipsp->ips_life.ipl_bytes.ipl_soft, -+ ipsp->ips_life.ipl_addtime.ipl_soft, -+ ipsp->ips_life.ipl_usetime.ipl_soft, -+ ipsp->ips_life.ipl_packets.ipl_soft), -+ extensions_reply) : 1) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_SRC], -+ K_SADB_EXT_ADDRESS_SRC, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_s), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_DST], -+ K_SADB_EXT_ADDRESS_DST, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_d), -+ extensions_reply) -+ && (extr->ips->ips_addr_p -+ ? pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_PROXY], -+ K_SADB_EXT_ADDRESS_PROXY, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_p), -+ extensions_reply) : 1) -+#if 0 -+ /* FIXME: This won't work yet because the keys are not -+ stored directly in the ipsec_sa. They are stored as -+ contexts. */ -+ && (extr->ips->ips_key_a_size -+ ? 
pfkey_safe_build(error = pfkey_key_build(&extensions_reply[K_SADB_EXT_KEY_AUTH], -+ K_SADB_EXT_KEY_AUTH, -+ extr->ips->ips_key_a_size * 8, -+ extr->ips->ips_key_a), -+ extensions_reply) : 1) -+ /* FIXME: This won't work yet because the keys are not -+ stored directly in the ipsec_sa. They are stored as -+ key schedules. */ -+ && (extr->ips->ips_key_e_size -+ ? pfkey_safe_build(error = pfkey_key_build(&extensions_reply[K_SADB_EXT_KEY_ENCRYPT], -+ K_SADB_EXT_KEY_ENCRYPT, -+ extr->ips->ips_key_e_size * 8, -+ extr->ips->ips_key_e), -+ extensions_reply) : 1) -+#endif -+ && (extr->ips->ips_ident_s.data -+ ? pfkey_safe_build(error = pfkey_ident_build(&extensions_reply[K_SADB_EXT_IDENTITY_SRC], -+ K_SADB_EXT_IDENTITY_SRC, -+ extr->ips->ips_ident_s.type, -+ extr->ips->ips_ident_s.id, -+ extr->ips->ips_ident_s.len, -+ extr->ips->ips_ident_s.data), -+ extensions_reply) : 1) -+ && (extr->ips->ips_ident_d.data -+ ? pfkey_safe_build(error = pfkey_ident_build(&extensions_reply[K_SADB_EXT_IDENTITY_DST], -+ K_SADB_EXT_IDENTITY_DST, -+ extr->ips->ips_ident_d.type, -+ extr->ips->ips_ident_d.id, -+ extr->ips->ips_ident_d.len, -+ extr->ips->ips_ident_d.data), -+ extensions_reply) : 1) -+#if 0 -+ /* FIXME: This won't work yet because I have not finished -+ it. */ -+ && (extr->ips->ips_sens_ -+ ? 
pfkey_safe_build(error = pfkey_sens_build(&extensions_reply[K_SADB_EXT_SENSITIVITY], -+ extr->ips->ips_sens_dpd, -+ extr->ips->ips_sens_sens_level, -+ extr->ips->ips_sens_sens_len, -+ extr->ips->ips_sens_sens_bitmap, -+ extr->ips->ips_sens_integ_level, -+ extr->ips->ips_sens_integ_len, -+ extr->ips->ips_sens_integ_bitmap), -+ extensions_reply) : 1) -+#endif -+ )) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_get_parse: " -+ "failed to build the get reply message extensions\n"); -+ ipsec_sa_put(ipsp); -+ spin_unlock_bh(&tdb_lock); -+ SENDERR(-error); -+ } -+ -+ ipsec_sa_put(ipsp); -+ spin_unlock_bh(&tdb_lock); -+ -+ if((error = pfkey_msg_build(&pfkey_reply, extensions_reply, EXT_BITS_OUT))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_get_parse: " -+ "failed to build the get reply message\n"); -+ SENDERR(-error); -+ } -+ -+ if((error = pfkey_upmsg(sk->sk_socket, pfkey_reply))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_get_parse: " -+ "failed to send the get reply message\n"); -+ SENDERR(-error); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_get_parse: " -+ "succeeded in sending get reply message.\n"); -+ -+ errlab: -+ if (pfkey_reply) { -+ pfkey_msg_free(&pfkey_reply); -+ } -+ pfkey_extensions_free(extensions_reply); -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_acquire_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ struct socket_list *pfkey_socketsp; -+ uint8_t satype = ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_acquire_parse: .\n"); -+ -+ /* XXX I don't know if we want an upper bound, since userspace may -+ want to register itself for an satype > K_SADB_SATYPE_MAX. 
*/ -+ if((satype == 0) || (satype > K_SADB_SATYPE_MAX)) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_acquire_parse: " -+ "SATYPE=%d invalid.\n", -+ satype); -+ SENDERR(EINVAL); -+ } -+ -+ if(!(pfkey_registered_sockets[satype])) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_acquire_parse: " -+ "no sockets registered for SAtype=%d(%s).\n", -+ satype, -+ satype2name(satype)); -+ SENDERR(EPROTONOSUPPORT); -+ } -+ -+ for(pfkey_socketsp = pfkey_registered_sockets[satype]; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_acquire_parse: " -+ "sending up acquire reply message for satype=%d(%s) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_acquire_parse: " -+ "sending up acquire reply message for satype=%d(%s) to socket=0p%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp); -+ } -+ -+ errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_register_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ uint8_t satype = ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_register_parse: .\n"); -+ -+ /* XXX I don't know if we want an upper bound, since userspace may -+ want to register itself for an satype > K_SADB_SATYPE_MAX. 
*/ -+ if((satype == 0) || (satype > K_SADB_SATYPE_MAX)) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_register_parse: " -+ "SATYPE=%d invalid.\n", -+ satype); -+ SENDERR(EINVAL); -+ } -+ -+ if(!pfkey_list_insert_socket(sk->sk_socket, -+ &(pfkey_registered_sockets[satype]))) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_register_parse: " -+ "SATYPE=%02d(%s) successfully registered by KMd (pid=%d).\n", -+ satype, -+ satype2name(satype), -+ key_pid(sk)); -+ }; -+ -+ /* send up register msg with supported SATYPE algos */ -+ -+ error=pfkey_register_reply(satype, (struct sadb_msg*)extensions[K_SADB_EXT_RESERVED]); -+ errlab: -+ return error; -+} -+ -+int -+pfkey_register_reply(int satype, struct sadb_msg *sadb_msg) -+{ -+ struct sadb_ext *extensions_reply[K_SADB_EXT_MAX+1]; -+ struct sadb_msg *pfkey_reply = NULL; -+ struct socket_list *pfkey_socketsp; -+ struct supported_list *pfkey_supported_listp; -+ unsigned int alg_num_a = 0, alg_num_e = 0; -+ struct sadb_alg *alg_a = NULL, *alg_e = NULL, *alg_ap = NULL, *alg_ep = NULL; -+ int error = 0; -+ -+ pfkey_extensions_init(extensions_reply); -+ -+ if((satype == 0) || (satype > K_SADB_SATYPE_MAX)) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_register_reply: " -+ "SAtype=%d unspecified or unknown.\n", -+ satype); -+ SENDERR(EINVAL); -+ } -+ if(!(pfkey_registered_sockets[satype])) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_register_reply: " -+ "no sockets registered for SAtype=%d(%s).\n", -+ satype, -+ satype2name(satype)); -+ SENDERR(EPROTONOSUPPORT); -+ } -+ /* send up register msg with supported SATYPE algos */ -+ pfkey_supported_listp = pfkey_supported_list[satype]; -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_register_reply: " -+ "pfkey_supported_list[%d]=0p%p\n", -+ satype, -+ pfkey_supported_list[satype]); -+ while(pfkey_supported_listp) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_register_reply: " -+ "checking supported=0p%p\n", -+ pfkey_supported_listp); -+ 
if(pfkey_supported_listp->supportedp->ias_exttype == K_SADB_EXT_SUPPORTED_AUTH) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_register_reply: " -+ "adding auth alg.\n"); -+ alg_num_a++; -+ } -+ if(pfkey_supported_listp->supportedp->ias_exttype == K_SADB_EXT_SUPPORTED_ENCRYPT) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_register_reply: " -+ "adding encrypt alg.\n"); -+ alg_num_e++; -+ } -+ pfkey_supported_listp = pfkey_supported_listp->next; -+ } -+ -+ if(alg_num_a) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_register_reply: " -+ "allocating %lu bytes for auth algs.\n", -+ (unsigned long) (alg_num_a * sizeof(struct sadb_alg))); -+ if((alg_a = kmalloc(alg_num_a * sizeof(struct sadb_alg), GFP_ATOMIC) ) == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_register_reply: " -+ "auth alg memory allocation error\n"); -+ SENDERR(ENOMEM); -+ } -+ alg_ap = alg_a; -+ } -+ -+ if(alg_num_e) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_register_reply: " -+ "allocating %lu bytes for enc algs.\n", -+ (unsigned long) (alg_num_e * sizeof(struct sadb_alg))); -+ if((alg_e = kmalloc(alg_num_e * sizeof(struct sadb_alg), GFP_ATOMIC) ) == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_register_reply: " -+ "enc alg memory allocation error\n"); -+ SENDERR(ENOMEM); -+ } -+ alg_ep = alg_e; -+ } -+ -+ pfkey_supported_listp = pfkey_supported_list[satype]; -+ while(pfkey_supported_listp) { -+ if(alg_num_a) { -+ if(pfkey_supported_listp->supportedp->ias_exttype == K_SADB_EXT_SUPPORTED_AUTH) { -+ alg_ap->sadb_alg_id = pfkey_supported_listp->supportedp->ias_id; -+ alg_ap->sadb_alg_ivlen = pfkey_supported_listp->supportedp->ias_ivlen; -+ alg_ap->sadb_alg_minbits = pfkey_supported_listp->supportedp->ias_keyminbits; -+ alg_ap->sadb_alg_maxbits = pfkey_supported_listp->supportedp->ias_keymaxbits; -+ alg_ap->sadb_alg_reserved = 0; -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "klips_debug:pfkey_register_reply: " -+ "adding auth=0p%p\n", -+ 
alg_ap); -+ alg_ap++; -+ } -+ } -+ if(alg_num_e) { -+ if(pfkey_supported_listp->supportedp->ias_exttype == K_SADB_EXT_SUPPORTED_ENCRYPT) { -+ alg_ep->sadb_alg_id = pfkey_supported_listp->supportedp->ias_id; -+ alg_ep->sadb_alg_ivlen = pfkey_supported_listp->supportedp->ias_ivlen; -+ alg_ep->sadb_alg_minbits = pfkey_supported_listp->supportedp->ias_keyminbits; -+ alg_ep->sadb_alg_maxbits = pfkey_supported_listp->supportedp->ias_keymaxbits; -+ alg_ep->sadb_alg_reserved = 0; -+ KLIPS_PRINT(debug_pfkey && sysctl_ipsec_debug_verbose, -+ "klips_debug:pfkey_register_reply: " -+ "adding encrypt=0p%p\n", -+ alg_ep); -+ alg_ep++; -+ } -+ } -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_register_reply: " -+ "found satype=%d(%s) exttype=%d id=%d ivlen=%d minbits=%d maxbits=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_supported_listp->supportedp->ias_exttype, -+ pfkey_supported_listp->supportedp->ias_id, -+ pfkey_supported_listp->supportedp->ias_ivlen, -+ pfkey_supported_listp->supportedp->ias_keyminbits, -+ pfkey_supported_listp->supportedp->ias_keymaxbits); -+ pfkey_supported_listp = pfkey_supported_listp->next; -+ } -+ -+ if(!(pfkey_safe_build(error = pfkey_msg_hdr_build(&extensions_reply[0], -+ K_SADB_REGISTER, -+ satype, -+ 0, -+ sadb_msg? sadb_msg->sadb_msg_seq : ++pfkey_msg_seq, -+ sadb_msg? sadb_msg->sadb_msg_pid: current->pid), -+ extensions_reply) && -+ (alg_num_a ? pfkey_safe_build(error = pfkey_supported_build(&extensions_reply[K_SADB_EXT_SUPPORTED_AUTH], -+ K_SADB_EXT_SUPPORTED_AUTH, -+ alg_num_a, -+ alg_a), -+ extensions_reply) : 1) && -+ (alg_num_e ? 
pfkey_safe_build(error = pfkey_supported_build(&extensions_reply[K_SADB_EXT_SUPPORTED_ENCRYPT], -+ K_SADB_EXT_SUPPORTED_ENCRYPT, -+ alg_num_e, -+ alg_e), -+ extensions_reply) : 1))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_register_reply: " -+ "failed to build the register message extensions_reply\n"); -+ SENDERR(-error); -+ } -+ -+ if((error = pfkey_msg_build(&pfkey_reply, extensions_reply, EXT_BITS_OUT))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_register_reply: " -+ "failed to build the register message\n"); -+ SENDERR(-error); -+ } -+ /* this should go to all registered sockets for that satype only */ -+ for(pfkey_socketsp = pfkey_registered_sockets[satype]; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, pfkey_reply))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_register_reply: " -+ "sending up acquire message for satype=%d(%s) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_register_reply: " -+ "sending up register message for satype=%d(%s) to socket=0p%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp); -+ } -+ -+ errlab: -+ if(alg_a) { -+ kfree(alg_a); -+ } -+ if(alg_e) { -+ kfree(alg_e); -+ } -+ -+ if (pfkey_reply) { -+ pfkey_msg_free(&pfkey_reply); -+ } -+ pfkey_extensions_free(extensions_reply); -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_expire_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ struct socket_list *pfkey_socketsp; -+#ifdef CONFIG_KLIPS_DEBUG -+ uint8_t satype = ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype; -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_expire_parse: .\n"); -+ -+ if(pfkey_open_sockets) { -+ for(pfkey_socketsp = pfkey_open_sockets; -+ 
pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_expire_parse: " -+ "sending up expire reply message for satype=%d(%s) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_expire_parse: " -+ "sending up expire reply message for satype=%d(%s) to socket=0p%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp); -+ } -+ } -+ -+ errlab: -+ return error; -+} -+ -+ -+/* -+ * -+ * flush all SAs from the table -+ */ -+DEBUG_NO_STATIC int -+pfkey_flush_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ struct socket_list *pfkey_socketsp; -+ uint8_t satype = ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype; -+ uint8_t proto = 0; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_flush_parse: " -+ "flushing type %d SAs\n", -+ satype); -+ -+ if(satype && !(proto = satype2proto(satype))) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_flush_parse: " -+ "satype %d lookup failed.\n", -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype); -+ SENDERR(EINVAL); -+ } -+ -+ if ((error = ipsec_sadb_cleanup(proto))) { -+ SENDERR(-error); -+ } -+ -+ if(pfkey_open_sockets) { -+ for(pfkey_socketsp = pfkey_open_sockets; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_flush_parse: " -+ "sending up flush reply message for satype=%d(%s) (proto=%d) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ proto, -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } 
-+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_flush_parse: " -+ "sending up flush reply message for satype=%d(%s) to socket=0p%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp); -+ } -+ } -+ -+ errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_dump_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_dump_parse: .\n"); -+ -+ SENDERR(ENOSYS); -+ errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_promisc_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_promisc_parse: .\n"); -+ -+ SENDERR(ENOSYS); -+ errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_pchange_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_pchange_parse: .\n"); -+ -+ SENDERR(ENOSYS); -+ errlab: -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_grpsa_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ struct ipsec_sa *ips1p, *ips2p, *ipsp; -+ struct sadb_ext *extensions_reply[K_SADB_EXT_MAX+1]; -+ struct sadb_msg *pfkey_reply = NULL; -+ struct socket_list *pfkey_socketsp; -+ uint8_t satype = ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype; -+ char sa1[SATOT_BUF], sa2[SATOT_BUF]; -+ size_t sa_len1, sa_len2 = 0; -+ int error = 0; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_grpsa_parse: .\n"); -+ -+ pfkey_extensions_init(extensions_reply); -+ -+ if(extr == NULL || extr->ips == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_grpsa_parse: " -+ "extr or extr->ips is NULL, fatal.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ sa_len1 = KLIPS_SATOT(debug_pfkey, &extr->ips->ips_said, 0, sa1, sizeof(sa1)); -+ if(extr->ips2 != 
NULL) { -+ sa_len2 = KLIPS_SATOT(debug_pfkey, &extr->ips2->ips_said, 0, sa2, sizeof(sa2)); -+ } -+ -+ spin_lock_bh(&tdb_lock); -+ -+ ips1p = ipsec_sa_getbyid(&(extr->ips->ips_said)); -+ if(ips1p == NULL) { -+ spin_unlock_bh(&tdb_lock); -+ KLIPS_ERROR(debug_pfkey, -+ "klips_debug:pfkey_x_grpsa_parse: " -+ "reserved ipsec_sa for SA1: %s not found. Call K_SADB_ADD/UPDATE first.\n", -+ sa_len1 ? sa1 : " (error)"); -+ SENDERR(ENOENT); -+ } -+ -+ if(extr->ips2) { /* GRPSA */ -+ -+ /* group ips2p to be after ips1p */ -+ -+ ips2p = ipsec_sa_getbyid(&(extr->ips2->ips_said)); -+ if(ips2p == NULL) { -+ ipsec_sa_put(ips1p); -+ spin_unlock_bh(&tdb_lock); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_grpsa_parse: " -+ "reserved ipsec_sa for SA2: %s not found. Call K_SADB_ADD/UPDATE first.\n", -+ sa_len2 ? sa2 : " (error)"); -+ SENDERR(ENOENT); -+ } -+ -+ /* userspace puts things in inner to outer order */ -+ if(ips2p->ips_flags & EMT_INBOUND) { -+ struct ipsec_sa *t; -+ -+ /* exchange ips and ips2 */ -+ t = ips1p; -+ ips1p = ips2p; -+ ips2p = t; -+ } -+ -+ /* Is ips1p already linked? */ -+ if(ips1p->ips_next) { -+ ipsec_sa_put(ips1p); -+ ipsec_sa_put(ips2p); -+ spin_unlock_bh(&tdb_lock); -+ KLIPS_ERROR(debug_pfkey, -+ "klips_debug:pfkey_x_grpsa_parse: " -+ "ipsec_sa for SA: %s is already linked.\n", -+ sa_len1 ? sa1 : " (error)"); -+ SENDERR(EEXIST); -+ } -+ -+ /* Is extr->ips already linked to extr->ips2? */ -+ ipsp = ips2p; -+ while(ipsp) { -+ if(ipsp == ips1p) { -+ ipsec_sa_put(ips1p); -+ ipsec_sa_put(ips2p); -+ spin_unlock_bh(&tdb_lock); -+ KLIPS_ERROR(debug_pfkey, -+ "klips_debug:pfkey_x_grpsa_parse: " -+ "ipsec_sa for SA: %s is already linked to %s.\n", -+ sa_len1 ? sa1 : " (error)", -+ sa_len2 ? sa2 : " (error)"); -+ SENDERR(EEXIST); -+ } -+ ipsp = ipsp->ips_next; -+ } -+ -+ /* link 'em */ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_grpsa_parse: " -+ "linking ipsec_sa SA: %s with %s.\n", -+ sa_len1 ? sa1 : " (error)", -+ sa_len2 ? 
sa2 : " (error)"); -+ ips1p->ips_next = ips2p; -+ } else { /* UNGRPSA */ -+ while(ips1p) { -+ struct ipsec_sa *ipsn; -+ -+ /* take the reference to next */ -+ ipsn = ips1p->ips_next; -+ ips1p->ips_next = NULL; -+ -+ /* drop reference to current */ -+ ipsec_sa_put(ips1p); -+ -+ ips1p = ipsn; -+ } -+ -+ /* note: we have dropped reference to ips1p, and -+ * it is now NULL -+ */ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_grpsa_parse: " -+ "unlinking ipsec_sa SA: %s.\n", -+ sa_len1 ? sa1 : " (error)"); -+ } -+ -+ spin_unlock_bh(&tdb_lock); -+ -+ /* MCR: not only is this ugly to read, and impossible -+ * to debug through, but it's also really inefficient. -+ * XXX simplify me. -+ */ -+ if(!(pfkey_safe_build(error = pfkey_msg_hdr_build(&extensions_reply[0], -+ K_SADB_X_GRPSA, -+ satype, -+ 0, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_seq, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_pid), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_sa_build(&extensions_reply[K_SADB_EXT_SA], -+ K_SADB_EXT_SA, -+ extr->ips->ips_said.spi, -+ extr->ips->ips_replaywin, -+ extr->ips->ips_state, -+ extr->ips->ips_authalg, -+ extr->ips->ips_encalg, -+ extr->ips->ips_flags), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_DST], -+ K_SADB_EXT_ADDRESS_DST, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_d), -+ extensions_reply) -+ && (extr->ips2 -+ ? 
(pfkey_safe_build(error = pfkey_x_satype_build(&extensions_reply[K_SADB_X_EXT_SATYPE2], -+ ((struct sadb_x_satype*)extensions[K_SADB_X_EXT_SATYPE2])->sadb_x_satype_satype -+ /* proto2satype(extr->ips2->ips_said.proto) */), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_sa_build(&extensions_reply[K_SADB_X_EXT_SA2], -+ K_SADB_X_EXT_SA2, -+ extr->ips2->ips_said.spi, -+ extr->ips2->ips_replaywin, -+ extr->ips2->ips_state, -+ extr->ips2->ips_authalg, -+ extr->ips2->ips_encalg, -+ extr->ips2->ips_flags), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_X_EXT_ADDRESS_DST2], -+ K_SADB_X_EXT_ADDRESS_DST2, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips2->ips_addr_d), -+ extensions_reply) ) : 1 ) -+ )) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_x_grpsa_parse: " -+ "failed to build the x_grpsa reply message extensions\n"); -+ SENDERR(-error); -+ } -+ -+ if((error = pfkey_msg_build(&pfkey_reply, extensions_reply, EXT_BITS_OUT))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_x_grpsa_parse: " -+ "failed to build the x_grpsa reply message\n"); -+ SENDERR(-error); -+ } -+ -+ for(pfkey_socketsp = pfkey_open_sockets; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, pfkey_reply))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_x_grpsa_parse: " -+ "sending up x_grpsa reply message for satype=%d(%s) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_x_grpsa_parse: " -+ "sending up x_grpsa reply message for satype=%d(%s) to socket=0p%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_x_grpsa_parse: " -+ "succeeded in sending x_grpsa reply message.\n"); -+ -+ errlab: -+ if (pfkey_reply) { -+ pfkey_msg_free(&pfkey_reply); -+ } -+ 
pfkey_extensions_free(extensions_reply); -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_addflow_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+#ifdef CONFIG_KLIPS_DEBUG -+ char buf1[64], buf2[64]; -+#endif /* CONFIG_KLIPS_DEBUG */ -+ struct sadb_ext *extensions_reply[K_SADB_EXT_MAX+1]; -+ struct sadb_msg *pfkey_reply = NULL; -+ struct socket_list *pfkey_socketsp; -+ uint8_t satype = ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype; -+ ip_address srcflow, dstflow, srcmask, dstmask; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_addflow_parse: .\n"); -+ -+ pfkey_extensions_init(extensions_reply); -+ -+ memset((caddr_t)&srcflow, 0, sizeof(srcflow)); -+ memset((caddr_t)&dstflow, 0, sizeof(dstflow)); -+ memset((caddr_t)&srcmask, 0, sizeof(srcmask)); -+ memset((caddr_t)&dstmask, 0, sizeof(dstmask)); -+ -+ if(!extr || !(extr->ips) || !(extr->eroute)) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "missing extr, ipsec_sa or eroute data.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ srcflow.u.v4.sin_family = AF_INET; -+ dstflow.u.v4.sin_family = AF_INET; -+ srcmask.u.v4.sin_family = AF_INET; -+ dstmask.u.v4.sin_family = AF_INET; -+ srcflow.u.v4.sin_addr = extr->eroute->er_eaddr.sen_ip_src; -+ dstflow.u.v4.sin_addr = extr->eroute->er_eaddr.sen_ip_dst; -+ srcmask.u.v4.sin_addr = extr->eroute->er_emask.sen_ip_src; -+ dstmask.u.v4.sin_addr = extr->eroute->er_emask.sen_ip_dst; -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if (debug_pfkey) { -+ subnettoa(extr->eroute->er_eaddr.sen_ip_src, -+ extr->eroute->er_emask.sen_ip_src, 0, buf1, sizeof(buf1)); -+ subnettoa(extr->eroute->er_eaddr.sen_ip_dst, -+ extr->eroute->er_emask.sen_ip_dst, 0, buf2, sizeof(buf2)); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "calling breakeroute and/or makeroute for %s->%s\n", -+ buf1, buf2); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ if(extr->ips->ips_flags & 
SADB_X_SAFLAGS_INFLOW) { -+/* if(ip_chk_addr((unsigned long)extr->ips->ips_said.dst.u.v4.sin_addr.s_addr) == IS_MYADDR) */ -+ struct ipsec_sa *ipsp, *ipsq; -+ char sa[SATOT_BUF]; -+ size_t sa_len; -+ -+ ipsq = ipsec_sa_getbyid(&(extr->ips->ips_said)); -+ if(ipsq == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "ipsec_sa not found, cannot set incoming policy.\n"); -+ SENDERR(ENOENT); -+ } -+ -+ ipsp = ipsq; -+ while(ipsp && ipsp->ips_said.proto != IPPROTO_IPIP) { -+ ipsp = ipsp->ips_next; -+ } -+ -+ if(ipsp == NULL) { -+ ipsec_sa_put(ipsq); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "SA chain does not have an IPIP SA, cannot set incoming policy.\n"); -+ SENDERR(ENOENT); -+ } -+ -+ sa_len = KLIPS_SATOT(debug_pfkey, &extr->ips->ips_said, 0, sa, sizeof(sa)); -+ -+ ipsp->ips_flags |= SADB_X_SAFLAGS_INFLOW; -+ ipsp->ips_flow_s = srcflow; -+ ipsp->ips_flow_d = dstflow; -+ ipsp->ips_mask_s = srcmask; -+ ipsp->ips_mask_d = dstmask; -+ -+ ipsec_sa_put(ipsq); -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "inbound eroute, setting incoming policy information in IPIP ipsec_sa for SA: %s.\n", -+ sa_len ? sa : " (error)"); -+ } else { -+ struct sk_buff *first = NULL, *last = NULL; -+ -+ if(extr->ips->ips_flags & SADB_X_SAFLAGS_REPLACEFLOW) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "REPLACEFLOW flag set, calling breakeroute.\n"); -+ if ((error = ipsec_breakroute(&(extr->eroute->er_eaddr), -+ &(extr->eroute->er_emask), -+ &first, &last))) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "breakeroute returned %d. 
first=0p%p, last=0p%p\n", -+ error, -+ first, -+ last); -+ if(first != NULL) { -+ ipsec_kfree_skb(first); -+ } -+ if(last != NULL) { -+ ipsec_kfree_skb(last); -+ } -+ SENDERR(-error); -+ } -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "calling makeroute.\n"); -+ -+ if ((error = ipsec_makeroute(&(extr->eroute->er_eaddr), -+ &(extr->eroute->er_emask), -+ extr->ips->ips_said, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_pid, -+ NULL, -+ &(extr->ips->ips_ident_s), -+ &(extr->ips->ips_ident_d)))) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "makeroute returned %d.\n", error); -+ SENDERR(-error); -+ } -+ if(first != NULL) { -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "first=0p%p HOLD packet re-injected.\n", -+ first); -+ dst_output(first); -+ } -+ if(last != NULL) { -+ KLIPS_PRINT(debug_eroute, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "last=0p%p HOLD packet re-injected.\n", -+ last); -+ dst_output(last); -+ } -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "makeroute call successful.\n"); -+ -+ if(!(pfkey_safe_build(error = pfkey_msg_hdr_build(&extensions_reply[0], -+ K_SADB_X_ADDFLOW, -+ satype, -+ 0, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_seq, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_pid), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_sa_build(&extensions_reply[K_SADB_EXT_SA], -+ K_SADB_EXT_SA, -+ extr->ips->ips_said.spi, -+ extr->ips->ips_replaywin, -+ extr->ips->ips_state, -+ extr->ips->ips_authalg, -+ extr->ips->ips_encalg, -+ extr->ips->ips_flags), -+ extensions_reply) -+ && (extensions[K_SADB_EXT_ADDRESS_SRC] -+ ? 
pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_SRC], -+ K_SADB_EXT_ADDRESS_SRC, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_s), -+ extensions_reply) : 1) -+ && (extensions[K_SADB_EXT_ADDRESS_DST] -+ ? pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_EXT_ADDRESS_DST], -+ K_SADB_EXT_ADDRESS_DST, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ extr->ips->ips_addr_d), -+ extensions_reply) : 1) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_X_EXT_ADDRESS_SRC_FLOW], -+ K_SADB_X_EXT_ADDRESS_SRC_FLOW, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ (struct sockaddr*)&srcflow), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_X_EXT_ADDRESS_DST_FLOW], -+ K_SADB_X_EXT_ADDRESS_DST_FLOW, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ (struct sockaddr*)&dstflow), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_X_EXT_ADDRESS_SRC_MASK], -+ K_SADB_X_EXT_ADDRESS_SRC_MASK, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ (struct sockaddr*)&srcmask), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_X_EXT_ADDRESS_DST_MASK], -+ K_SADB_X_EXT_ADDRESS_DST_MASK, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ (struct sockaddr*)&dstmask), -+ extensions_reply) -+ )) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_x_addflow_parse: " -+ "failed to build the x_addflow reply message extensions\n"); -+ SENDERR(-error); -+ } -+ -+ if((error = pfkey_msg_build(&pfkey_reply, extensions_reply, EXT_BITS_OUT))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_x_addflow_parse: " -+ "failed to build the x_addflow reply message\n"); -+ SENDERR(-error); -+ } -+ -+ for(pfkey_socketsp = pfkey_open_sockets; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, pfkey_reply))) { -+ KLIPS_PRINT(debug_pfkey, 
"klips_debug:pfkey_x_addflow_parse: " -+ "sending up x_addflow reply message for satype=%d(%s) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_x_addflow_parse: " -+ "sending up x_addflow reply message for satype=%d(%s) (proto=%d) to socket=0p%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ extr->ips->ips_said.proto, -+ pfkey_socketsp->socketp); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_addflow_parse: " -+ "extr->ips cleaned up and freed.\n"); -+ -+ errlab: -+ if (pfkey_reply) { -+ pfkey_msg_free(&pfkey_reply); -+ } -+ pfkey_extensions_free(extensions_reply); -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_delflow_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+#ifdef CONFIG_KLIPS_DEBUG -+ char buf1[64], buf2[64]; -+#endif /* CONFIG_KLIPS_DEBUG */ -+ struct sadb_ext *extensions_reply[K_SADB_EXT_MAX+1]; -+ struct sadb_msg *pfkey_reply = NULL; -+ struct socket_list *pfkey_socketsp; -+ uint8_t satype = ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_satype; -+ ip_address srcflow, dstflow, srcmask, dstmask; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_delflow_parse: .\n"); -+ -+ pfkey_extensions_init(extensions_reply); -+ -+ memset((caddr_t)&srcflow, 0, sizeof(srcflow)); -+ memset((caddr_t)&dstflow, 0, sizeof(dstflow)); -+ memset((caddr_t)&srcmask, 0, sizeof(srcmask)); -+ memset((caddr_t)&dstmask, 0, sizeof(dstmask)); -+ -+ if(!extr || !(extr->ips)) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_delflow_parse: " -+ "extr, or extr->ips is NULL, fatal\n"); -+ SENDERR(EINVAL); -+ } -+ -+ if(extr->ips->ips_flags & SADB_X_SAFLAGS_CLEARFLOW) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_delflow_parse: " -+ "CLEARFLOW flag set, calling cleareroutes.\n"); -+ if ((error = ipsec_cleareroutes())) { -+ 
KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_delflow_parse: " -+ "cleareroutes returned %d.\n", error); -+ SENDERR(-error); -+ } -+ } else { -+ struct sk_buff *first = NULL, *last = NULL; -+ -+ if(!(extr->eroute)) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_delflow_parse: " -+ "extr->eroute is NULL, fatal.\n"); -+ SENDERR(EINVAL); -+ } -+ -+ srcflow.u.v4.sin_family = AF_INET; -+ dstflow.u.v4.sin_family = AF_INET; -+ srcmask.u.v4.sin_family = AF_INET; -+ dstmask.u.v4.sin_family = AF_INET; -+ srcflow.u.v4.sin_addr = extr->eroute->er_eaddr.sen_ip_src; -+ dstflow.u.v4.sin_addr = extr->eroute->er_eaddr.sen_ip_dst; -+ srcmask.u.v4.sin_addr = extr->eroute->er_emask.sen_ip_src; -+ dstmask.u.v4.sin_addr = extr->eroute->er_emask.sen_ip_dst; -+ -+#ifdef CONFIG_KLIPS_DEBUG -+ if (debug_pfkey) { -+ subnettoa(extr->eroute->er_eaddr.sen_ip_src, -+ extr->eroute->er_emask.sen_ip_src, 0, buf1, sizeof(buf1)); -+ subnettoa(extr->eroute->er_eaddr.sen_ip_dst, -+ extr->eroute->er_emask.sen_ip_dst, 0, buf2, sizeof(buf2)); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_delflow_parse: " -+ "calling breakeroute for %s->%s\n", -+ buf1, buf2); -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ error = ipsec_breakroute(&(extr->eroute->er_eaddr), -+ &(extr->eroute->er_emask), -+ &first, &last); -+ if(error) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_delflow_parse: " -+ "breakeroute returned %d. 
first=0p%p, last=0p%p\n", -+ error, -+ first, -+ last); -+ } -+ if(first != NULL) { -+ ipsec_kfree_skb(first); -+ } -+ if(last != NULL) { -+ ipsec_kfree_skb(last); -+ } -+ if(error) { -+ SENDERR(-error); -+ } -+ } -+ -+ if(!(pfkey_safe_build(error = pfkey_msg_hdr_build(&extensions_reply[0], -+ K_SADB_X_DELFLOW, -+ satype, -+ 0, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_seq, -+ ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED])->sadb_msg_pid), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_sa_build(&extensions_reply[K_SADB_EXT_SA], -+ K_SADB_EXT_SA, -+ extr->ips->ips_said.spi, -+ extr->ips->ips_replaywin, -+ extr->ips->ips_state, -+ extr->ips->ips_authalg, -+ extr->ips->ips_encalg, -+ extr->ips->ips_flags), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_X_EXT_ADDRESS_SRC_FLOW], -+ K_SADB_X_EXT_ADDRESS_SRC_FLOW, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ (struct sockaddr*)&srcflow), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_X_EXT_ADDRESS_DST_FLOW], -+ K_SADB_X_EXT_ADDRESS_DST_FLOW, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ (struct sockaddr*)&dstflow), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_X_EXT_ADDRESS_SRC_MASK], -+ K_SADB_X_EXT_ADDRESS_SRC_MASK, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ (struct sockaddr*)&srcmask), -+ extensions_reply) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions_reply[K_SADB_X_EXT_ADDRESS_DST_MASK], -+ K_SADB_X_EXT_ADDRESS_DST_MASK, -+ 0, /*extr->ips->ips_said.proto,*/ -+ 0, -+ (struct sockaddr*)&dstmask), -+ extensions_reply) -+ )) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_x_delflow_parse: " -+ "failed to build the x_delflow reply message extensions\n"); -+ SENDERR(-error); -+ } -+ -+ if((error = pfkey_msg_build(&pfkey_reply, extensions_reply, EXT_BITS_OUT))) { -+ KLIPS_PRINT(debug_pfkey, 
"klips_debug:pfkey_x_delflow_parse: " -+ "failed to build the x_delflow reply message\n"); -+ SENDERR(-error); -+ } -+ -+ for(pfkey_socketsp = pfkey_open_sockets; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, pfkey_reply))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_x_delflow_parse: " -+ "sending up x_delflow reply message for satype=%d(%s) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_x_delflow_parse: " -+ "sending up x_delflow reply message for satype=%d(%s) to socket=0p%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_delflow_parse: " -+ "extr->ips cleaned up and freed.\n"); -+ -+ errlab: -+ if (pfkey_reply) { -+ pfkey_msg_free(&pfkey_reply); -+ } -+ pfkey_extensions_free(extensions_reply); -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_msg_debug_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ int error = 0; -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_x_msg_debug_parse: .\n"); -+ -+/* errlab:*/ -+ return error; -+} -+ -+/* pfkey_expire expects the ipsec_sa table to be locked before being called. 
*/ -+int -+pfkey_expire(struct ipsec_sa *ipsp, int hard) -+{ -+ struct sadb_ext *extensions[K_SADB_EXT_MAX+1]; -+ struct sadb_msg *pfkey_msg = NULL; -+ struct socket_list *pfkey_socketsp; -+ int error = 0; -+ uint8_t satype; -+ -+ pfkey_extensions_init(extensions); -+ -+ if(!(satype = proto2satype(ipsp->ips_said.proto))) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_expire: " -+ "satype lookup for protocol %d lookup failed.\n", -+ ipsp->ips_said.proto); -+ SENDERR(EINVAL); -+ } -+ -+ if(!pfkey_open_sockets) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_expire: " -+ "no sockets listening.\n"); -+ SENDERR(EPROTONOSUPPORT); -+ } -+ -+ if (!(pfkey_safe_build(error = pfkey_msg_hdr_build(&extensions[0], -+ K_SADB_EXPIRE, -+ satype, -+ 0, -+ ++pfkey_msg_seq, -+ 0), -+ extensions) -+ && pfkey_safe_build(error = pfkey_sa_build(&extensions[K_SADB_EXT_SA], -+ K_SADB_EXT_SA, -+ ipsp->ips_said.spi, -+ ipsp->ips_replaywin, -+ ipsp->ips_state, -+ ipsp->ips_authalg, -+ ipsp->ips_encalg, -+ ipsp->ips_flags), -+ extensions) -+ && pfkey_safe_build(error = pfkey_lifetime_build(&extensions[K_SADB_EXT_LIFETIME_CURRENT], -+ K_SADB_EXT_LIFETIME_CURRENT, -+ ipsp->ips_life.ipl_allocations.ipl_count, -+ ipsp->ips_life.ipl_bytes.ipl_count, -+ ipsp->ips_life.ipl_addtime.ipl_count, -+ ipsp->ips_life.ipl_usetime.ipl_count, -+ ipsp->ips_life.ipl_packets.ipl_count), -+ extensions) -+ && (hard ? 
-+ pfkey_safe_build(error = pfkey_lifetime_build(&extensions[K_SADB_EXT_LIFETIME_HARD], -+ K_SADB_EXT_LIFETIME_HARD, -+ ipsp->ips_life.ipl_allocations.ipl_hard, -+ ipsp->ips_life.ipl_bytes.ipl_hard, -+ ipsp->ips_life.ipl_addtime.ipl_hard, -+ ipsp->ips_life.ipl_usetime.ipl_hard, -+ ipsp->ips_life.ipl_packets.ipl_hard), -+ extensions) -+ : pfkey_safe_build(error = pfkey_lifetime_build(&extensions[K_SADB_EXT_LIFETIME_SOFT], -+ K_SADB_EXT_LIFETIME_SOFT, -+ ipsp->ips_life.ipl_allocations.ipl_soft, -+ ipsp->ips_life.ipl_bytes.ipl_soft, -+ ipsp->ips_life.ipl_addtime.ipl_soft, -+ ipsp->ips_life.ipl_usetime.ipl_soft, -+ ipsp->ips_life.ipl_packets.ipl_soft), -+ extensions)) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions[K_SADB_EXT_ADDRESS_SRC], -+ K_SADB_EXT_ADDRESS_SRC, -+ 0, /* ipsp->ips_said.proto, */ -+ 0, -+ ipsp->ips_addr_s), -+ extensions) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions[K_SADB_EXT_ADDRESS_DST], -+ K_SADB_EXT_ADDRESS_DST, -+ 0, /* ipsp->ips_said.proto, */ -+ 0, -+ ipsp->ips_addr_d), -+ extensions))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_expire: " -+ "failed to build the expire message extensions\n"); -+ spin_unlock_bh(&tdb_lock); -+ goto errlab; -+ } -+ -+ if ((error = pfkey_msg_build(&pfkey_msg, extensions, EXT_BITS_OUT))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_expire: " -+ "failed to build the expire message\n"); -+ SENDERR(-error); -+ } -+ -+ for(pfkey_socketsp = pfkey_open_sockets; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, pfkey_msg))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_expire: " -+ "sending up expire message for satype=%d(%s) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_expire: " -+ "sending up expire message for satype=%d(%s) (proto=%d) to socket=0p%p succeeded.\n", -+ 
satype, -+ satype2name(satype), -+ ipsp->ips_said.proto, -+ pfkey_socketsp->socketp); -+ } -+ -+ errlab: -+ if (pfkey_msg) { -+ pfkey_msg_free(&pfkey_msg); -+ } -+ pfkey_extensions_free(extensions); -+ return error; -+} -+ -+int -+pfkey_acquire(struct ipsec_sa *ipsp) -+{ -+ struct sadb_ext *extensions[K_SADB_EXT_MAX+1]; -+ struct sadb_msg *pfkey_msg = NULL; -+ struct socket_list *pfkey_socketsp; -+ int error = 0; -+ struct sadb_comb comb[] = { -+ /* auth; encrypt; flags; */ -+ /* auth_minbits; auth_maxbits; encrypt_minbits; encrypt_maxbits; */ -+ /* reserved; soft_allocations; hard_allocations; soft_bytes; hard_bytes; */ -+ /* soft_addtime; hard_addtime; soft_usetime; hard_usetime; */ -+ /* soft_packets; hard_packets; */ -+ { K_SADB_AALG_MD5HMAC, K_SADB_EALG_3DESCBC, SADB_SAFLAGS_PFS, -+ 128, 128, 168, 168, -+ 0, 0, 0, 0, 0, -+ 57600, 86400, 57600, 86400}, -+ { K_SADB_AALG_SHA1HMAC, K_SADB_EALG_3DESCBC, SADB_SAFLAGS_PFS, -+ 160, 160, 168, 168, -+ 0, 0, 0, 0, 0, -+ 57600, 86400, 57600, 86400}, -+ }; -+ -+ /* XXX This should not be hard-coded. 
It should be taken from the spdb */ -+ uint8_t satype = K_SADB_SATYPE_ESP; -+ -+ pfkey_extensions_init(extensions); -+ -+ if((satype == 0) || (satype > K_SADB_SATYPE_MAX)) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_acquire: " -+ "SAtype=%d unspecified or unknown.\n", -+ satype); -+ SENDERR(EINVAL); -+ } -+ -+ if(!(pfkey_registered_sockets[satype])) { -+ KLIPS_PRINT(1|debug_pfkey, "klips_debug:pfkey_acquire: " -+ "no sockets registered for SAtype=%d(%s).\n", -+ satype, -+ satype2name(satype)); -+ SENDERR(EPROTONOSUPPORT); -+ } -+ -+ if (!(pfkey_safe_build(error = pfkey_msg_hdr_build(&extensions[0], -+ K_SADB_ACQUIRE, -+ satype, -+ 0, -+ ++pfkey_msg_seq, -+ 0), -+ extensions) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions[K_SADB_EXT_ADDRESS_SRC], -+ K_SADB_EXT_ADDRESS_SRC, -+ ipsp->ips_transport_protocol, -+ 0, -+ ipsp->ips_addr_s), -+ extensions) -+ && pfkey_safe_build(error = pfkey_address_build(&extensions[K_SADB_EXT_ADDRESS_DST], -+ K_SADB_EXT_ADDRESS_DST, -+ ipsp->ips_transport_protocol, -+ 0, -+ ipsp->ips_addr_d), -+ extensions) -+#if 0 -+ && (ipsp->ips_addr_p -+ ? pfkey_safe_build(error = pfkey_address_build(&extensions[K_SADB_EXT_ADDRESS_PROXY], -+ K_SADB_EXT_ADDRESS_PROXY, -+ ipsp->ips_transport_protocol, -+ 0, -+ ipsp->ips_addr_p), -+ extensions) : 1) -+#endif -+ && (ipsp->ips_ident_s.type != SADB_IDENTTYPE_RESERVED -+ ? pfkey_safe_build(error = pfkey_ident_build(&extensions[SADB_EXT_IDENTITY_SRC], -+ K_SADB_EXT_IDENTITY_SRC, -+ ipsp->ips_ident_s.type, -+ ipsp->ips_ident_s.id, -+ ipsp->ips_ident_s.len, -+ ipsp->ips_ident_s.data), -+ extensions) : 1) -+ -+ && (ipsp->ips_ident_d.type != SADB_IDENTTYPE_RESERVED -+ ? pfkey_safe_build(error = pfkey_ident_build(&extensions[K_SADB_EXT_IDENTITY_DST], -+ K_SADB_EXT_IDENTITY_DST, -+ ipsp->ips_ident_d.type, -+ ipsp->ips_ident_d.id, -+ ipsp->ips_ident_d.len, -+ ipsp->ips_ident_d.data), -+ extensions) : 1) -+#if 0 -+ /* FIXME: This won't work yet because I have not finished -+ it. 
*/ -+ && (ipsp->ips_sens_ -+ ? pfkey_safe_build(error = pfkey_sens_build(&extensions[K_SADB_EXT_SENSITIVITY], -+ ipsp->ips_sens_dpd, -+ ipsp->ips_sens_sens_level, -+ ipsp->ips_sens_sens_len, -+ ipsp->ips_sens_sens_bitmap, -+ ipsp->ips_sens_integ_level, -+ ipsp->ips_sens_integ_len, -+ ipsp->ips_sens_integ_bitmap), -+ extensions) : 1) -+#endif -+ && pfkey_safe_build(error = pfkey_prop_build(&extensions[K_SADB_EXT_PROPOSAL], -+ 64, /* replay */ -+ sizeof(comb)/sizeof(struct sadb_comb), -+ &(comb[0])), -+ extensions) -+ )) { -+ KLIPS_PRINT(1|debug_pfkey, "klips_debug:pfkey_acquire: " -+ "failed to build the acquire message extensions\n"); -+ SENDERR(-error); -+ } -+ -+ if ((error = pfkey_msg_build(&pfkey_msg, extensions, EXT_BITS_OUT))) { -+ KLIPS_PRINT(1|debug_pfkey, "klips_debug:pfkey_acquire: " -+ "failed to build the acquire message\n"); -+ SENDERR(-error); -+ } -+ -+#ifdef KLIPS_PFKEY_ACQUIRE_LOSSAGE -+# if KLIPS_PFKEY_ACQUIRE_LOSSAGE > 0 -+ if(sysctl_ipsec_regress_pfkey_lossage) { -+ return(0); -+ } -+# endif -+#endif -+ -+ /* this should go to all registered sockets for that satype only */ -+ for(pfkey_socketsp = pfkey_registered_sockets[satype]; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, pfkey_msg))) { -+ KLIPS_PRINT(1|debug_pfkey, "klips_debug:pfkey_acquire: " -+ "sending up acquire message for satype=%d(%s) to socket=0p%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_acquire: " -+ "sending up acquire message for satype=%d(%s) to socket=0p%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp); -+ } -+ -+ errlab: -+ if (pfkey_msg) { -+ pfkey_msg_free(&pfkey_msg); -+ } -+ pfkey_extensions_free(extensions); -+ return error; -+} -+ -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+int -+pfkey_nat_t_new_mapping(struct ipsec_sa *ipsp, struct sockaddr 
*ipaddr, -+ __u16 sport) -+{ -+ struct sadb_ext *extensions[K_SADB_EXT_MAX+1]; -+ struct sadb_msg *pfkey_msg = NULL; -+ struct socket_list *pfkey_socketsp; -+ int error = 0; -+ uint8_t satype = (ipsp->ips_said.proto==IPPROTO_ESP) ? K_SADB_SATYPE_ESP : 0; -+ -+ /* Construct K_SADB_X_NAT_T_NEW_MAPPING message */ -+ -+ pfkey_extensions_init(extensions); -+ -+ if((satype == 0) || (satype > K_SADB_SATYPE_MAX)) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_nat_t_new_mapping: " -+ "SAtype=%d unspecified or unknown.\n", -+ satype); -+ SENDERR(EINVAL); -+ } -+ -+ if(!(pfkey_registered_sockets[satype])) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_nat_t_new_mapping: " -+ "no sockets registered for SAtype=%d(%s).\n", -+ satype, -+ satype2name(satype)); -+ SENDERR(EPROTONOSUPPORT); -+ } -+ -+ if (!(pfkey_safe_build -+ (error = pfkey_msg_hdr_build(&extensions[0], K_SADB_X_NAT_T_NEW_MAPPING, -+ satype, 0, ++pfkey_msg_seq, 0), extensions) -+ /* SA */ -+ && pfkey_safe_build -+ (error = pfkey_sa_build(&extensions[K_SADB_EXT_SA], -+ K_SADB_EXT_SA, ipsp->ips_said.spi, 0, 0, 0, 0, 0), extensions) -+ /* ADDRESS_SRC = old addr */ -+ && pfkey_safe_build -+ (error = pfkey_address_build(&extensions[K_SADB_EXT_ADDRESS_SRC], -+ K_SADB_EXT_ADDRESS_SRC, ipsp->ips_said.proto, 0, ipsp->ips_addr_s), -+ extensions) -+ /* NAT_T_SPORT = old port */ -+ && pfkey_safe_build -+ (error = pfkey_x_nat_t_port_build(&extensions[K_SADB_X_EXT_NAT_T_SPORT], -+ K_SADB_X_EXT_NAT_T_SPORT, ipsp->ips_natt_sport), extensions) -+ /* ADDRESS_DST = new addr */ -+ && pfkey_safe_build -+ (error = pfkey_address_build(&extensions[K_SADB_EXT_ADDRESS_DST], -+ K_SADB_EXT_ADDRESS_DST, ipsp->ips_said.proto, 0, ipaddr), extensions) -+ /* NAT_T_DPORT = new port */ -+ && pfkey_safe_build -+ (error = pfkey_x_nat_t_port_build(&extensions[K_SADB_X_EXT_NAT_T_DPORT], -+ K_SADB_X_EXT_NAT_T_DPORT, sport), extensions) -+ )) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_nat_t_new_mapping: " -+ "failed to build the 
nat_t_new_mapping message extensions\n"); -+ SENDERR(-error); -+ } -+ -+ if ((error = pfkey_msg_build(&pfkey_msg, extensions, EXT_BITS_OUT))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_nat_t_new_mapping: " -+ "failed to build the nat_t_new_mapping message\n"); -+ SENDERR(-error); -+ } -+ -+ /* this should go to all registered sockets for that satype only */ -+ for(pfkey_socketsp = pfkey_registered_sockets[satype]; -+ pfkey_socketsp; -+ pfkey_socketsp = pfkey_socketsp->next) { -+ if((error = pfkey_upmsg(pfkey_socketsp->socketp, pfkey_msg))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_nat_t_new_mapping: " -+ "sending up nat_t_new_mapping message for satype=%d(%s) to socket=%p failed with error=%d.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp, -+ error); -+ SENDERR(-error); -+ } -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_nat_t_new_mapping: " -+ "sending up nat_t_new_mapping message for satype=%d(%s) to socket=%p succeeded.\n", -+ satype, -+ satype2name(satype), -+ pfkey_socketsp->socketp); -+ } -+ -+ errlab: -+ if (pfkey_msg) { -+ pfkey_msg_free(&pfkey_msg); -+ } -+ pfkey_extensions_free(extensions); -+ return error; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_nat_t_new_mapping_parse(struct sock *sk, struct sadb_ext **extensions, struct pfkey_extracted_data* extr) -+{ -+ /* K_SADB_X_NAT_T_NEW_MAPPING not used in kernel */ -+ return -EINVAL; -+} -+#endif -+ -+/******************************* -+ * EXTENSION PARSERS FOR KLIPS -+ ********************************/ -+ -+DEBUG_NO_STATIC int -+pfkey_x_outif_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ struct sadb_x_plumbif *oif; -+ -+ oif = (struct sadb_x_plumbif *)pfkey_ext; -+ -+ extr->outif = oif->sadb_x_outif_ifnum; -+ -+ return 0; -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_saref_process(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) -+{ -+ struct sadb_x_saref *saf; -+ -+ saf = (struct sadb_x_saref *)pfkey_ext; -+ -+ extr->sarefme = 
saf->sadb_x_saref_me; -+ extr->sarefhim = saf->sadb_x_saref_him; -+ -+ return 0; -+} -+ -+DEBUG_NO_STATIC int (*ext_processors[K_SADB_EXT_MAX+1])(struct sadb_ext *pfkey_ext, struct pfkey_extracted_data* extr) = -+{ -+ NULL, /* pfkey_msg_process, */ -+ pfkey_sa_process, -+ pfkey_lifetime_process, -+ pfkey_lifetime_process, -+ pfkey_lifetime_process, -+ pfkey_address_process, -+ pfkey_address_process, -+ pfkey_address_process, -+ pfkey_key_process, -+ pfkey_key_process, -+ pfkey_ident_process, -+ pfkey_ident_process, -+ pfkey_sens_process, -+ pfkey_prop_process, -+ pfkey_supported_process, -+ pfkey_supported_process, -+ pfkey_spirange_process, -+ pfkey_x_kmprivate_process, -+ pfkey_x_satype_process, -+ pfkey_sa_process, -+ pfkey_address_process, -+ pfkey_address_process, -+ pfkey_address_process, -+ pfkey_address_process, -+ pfkey_address_process, -+ pfkey_x_debug_process, -+ pfkey_x_protocol_process, -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ pfkey_x_nat_t_type_process, -+ pfkey_x_nat_t_port_process, -+ pfkey_x_nat_t_port_process, -+ pfkey_address_process, -+#else -+ NULL, NULL, NULL, NULL, -+#endif -+ pfkey_x_outif_process, -+ pfkey_x_saref_process, -+}; -+ -+ -+/******************************* -+ * MESSAGE PARSERS FOR KLIPS -+ ********************************/ -+ -+DEBUG_NO_STATIC int -+pfkey_x_simple_reply(struct sock *sk , struct sadb_ext *extensions[], int err) -+{ -+ struct sadb_msg *pfkey_reply = NULL; -+ int error = 0; -+ struct sadb_msg *m = ((struct sadb_msg*)extensions[K_SADB_EXT_RESERVED]); -+ -+ m->sadb_msg_errno = err; -+ -+ if ((error = pfkey_msg_build(&pfkey_reply, extensions, EXT_BITS_OUT))) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_expire: " -+ "failed to build the expire message\n"); -+ SENDERR(-error); -+ } -+ -+ error = pfkey_upmsgsk(sk, pfkey_reply); -+ -+ if(error) { -+ KLIPS_ERROR(debug_pfkey, "pfkey_simple reply:" -+ "sending up simple reply to pid=%d error=%d.\n", -+ m->sadb_msg_pid, err); -+ } -+ -+errlab: -+ if (pfkey_reply) { -+ 
pfkey_msg_free(&pfkey_reply); -+ } -+ -+ return error; -+} -+ -+/* -+ * this is a request to create a new device. Figure out which kind, and call appropriate -+ * routine in mast or tunnel code. -+ */ -+DEBUG_NO_STATIC int -+pfkey_x_plumb_parse(struct sock *sk, struct sadb_ext *extensions[], struct pfkey_extracted_data* extr) -+{ -+ unsigned int vifnum; -+ int err; -+ -+ vifnum = extr->outif; -+ if(vifnum > IPSECDEV_OFFSET) { -+ err = ipsec_tunnel_createnum(vifnum-IPSECDEV_OFFSET); -+ } else { -+ err = ipsec_mast_createnum(vifnum); -+ } -+ -+ return pfkey_x_simple_reply(sk, extensions, err); -+} -+ -+DEBUG_NO_STATIC int -+pfkey_x_unplumb_parse(struct sock *sk, struct sadb_ext *extensions[], struct pfkey_extracted_data* extr) -+{ -+ unsigned int vifnum; -+ int err; -+ -+ vifnum = extr->outif; -+ if(vifnum > IPSECDEV_OFFSET) { -+ err = ipsec_tunnel_deletenum(vifnum-IPSECDEV_OFFSET); -+ } else { -+ err = ipsec_mast_deletenum(vifnum); -+ } -+ -+ return pfkey_x_simple_reply(sk, extensions, err); -+} -+ -+ -+DEBUG_NO_STATIC int (*msg_parsers[K_SADB_MAX +1])(struct sock *sk, struct sadb_ext *extensions[], struct pfkey_extracted_data* extr) -+ = -+{ -+ NULL, /* RESERVED */ -+ pfkey_getspi_parse, -+ pfkey_update_parse, -+ pfkey_add_parse, -+ pfkey_delete_parse, -+ pfkey_get_parse, -+ pfkey_acquire_parse, -+ pfkey_register_parse, -+ pfkey_expire_parse, -+ pfkey_flush_parse, -+ pfkey_dump_parse, -+ pfkey_x_promisc_parse, -+ pfkey_x_pchange_parse, -+ pfkey_x_grpsa_parse, -+ pfkey_x_addflow_parse, -+ pfkey_x_delflow_parse, -+ pfkey_x_msg_debug_parse, -+#ifdef CONFIG_IPSEC_NAT_TRAVERSAL -+ pfkey_x_nat_t_new_mapping_parse, -+#else -+ NULL, -+#endif -+ pfkey_x_plumb_parse, -+ pfkey_x_unplumb_parse, -+}; -+ -+int -+pfkey_build_reply(struct sadb_msg *pfkey_msg, -+ struct pfkey_extracted_data *extr, -+ struct sadb_msg **pfkey_reply) -+{ -+ struct sadb_ext *extensions[K_SADB_EXT_MAX+1]; -+ int error = 0; -+ int msg_type = pfkey_msg->sadb_msg_type; -+ int seq = pfkey_msg->sadb_msg_seq; 
-+ -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_build_reply: " -+ "building reply with type: %d\n", -+ msg_type); -+ pfkey_extensions_init(extensions); -+ if (!extr || !extr->ips) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_build_reply: " -+ "bad ipsec_sa passed\n"); -+ return EINVAL; // TODO: should this not be negative? -+ } -+ error = pfkey_safe_build(pfkey_msg_hdr_build(&extensions[0], -+ msg_type, -+ proto2satype(extr->ips->ips_said.proto), -+ 0, -+ seq, -+ pfkey_msg->sadb_msg_pid), -+ extensions); -+ -+ if(!error -+ && pfkey_required_extension(EXT_BITS_OUT, msg_type, K_SADB_EXT_SA)) { -+ -+ error = pfkey_sa_build(&extensions[K_SADB_EXT_SA], -+ K_SADB_EXT_SA, -+ extr->ips->ips_said.spi, -+ extr->ips->ips_replaywin, -+ extr->ips->ips_state, -+ extr->ips->ips_authalg, -+ extr->ips->ips_encalg, -+ extr->ips->ips_flags); -+ pfkey_safe_build(error, extensions); -+ } -+ -+ if(!error -+ && pfkey_required_extension(EXT_BITS_OUT, msg_type, K_SADB_X_EXT_SAREF)) { -+ error = pfkey_saref_build(&extensions[K_SADB_X_EXT_SAREF], -+ extr->ips->ips_ref, -+ extr->ips->ips_refhim); -+ pfkey_safe_build(error, extensions); -+ } -+ -+ if(!error -+ && pfkey_required_extension(EXT_BITS_OUT,msg_type,K_SADB_EXT_LIFETIME_CURRENT)) { -+ error = pfkey_lifetime_build(&extensions -+ [K_SADB_EXT_LIFETIME_CURRENT], -+ K_SADB_EXT_LIFETIME_CURRENT, -+ extr->ips->ips_life.ipl_allocations.ipl_count, -+ extr->ips->ips_life.ipl_bytes.ipl_count, -+ extr->ips->ips_life.ipl_addtime.ipl_count, -+ extr->ips->ips_life.ipl_usetime.ipl_count, -+ extr->ips->ips_life.ipl_packets.ipl_count); -+ pfkey_safe_build(error, extensions); -+ } -+ -+ if(!error -+ && pfkey_required_extension(EXT_BITS_OUT,msg_type,K_SADB_EXT_ADDRESS_SRC)) { -+ error = pfkey_address_build(&extensions[K_SADB_EXT_ADDRESS_SRC], -+ K_SADB_EXT_ADDRESS_SRC, -+ extr->ips->ips_said.proto, -+ 0, -+ extr->ips->ips_addr_s); -+ pfkey_safe_build(error, extensions); -+ } -+ -+ if(!error -+ && 
pfkey_required_extension(EXT_BITS_OUT,msg_type,K_SADB_EXT_ADDRESS_DST)) { -+ error = pfkey_address_build(&extensions[K_SADB_EXT_ADDRESS_DST], -+ K_SADB_EXT_ADDRESS_DST, -+ extr->ips->ips_said.proto, -+ 0, -+ extr->ips->ips_addr_d); -+ pfkey_safe_build(error, extensions); -+ } -+ -+ if (error == 0) { -+ KLIPS_PRINT(debug_pfkey, "klips_debug:pfkey_build_reply: " -+ "building extensions failed\n"); -+ return EINVAL; -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_build_reply: " -+ "built extensions, proceed to build the message\n"); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_build_reply: " -+ "extensions[1]=0p%p\n", -+ extensions[1]); -+ error = pfkey_msg_build(pfkey_reply, extensions, EXT_BITS_OUT); -+ pfkey_extensions_free(extensions); -+ -+ return error; -+} -+ -+/* -+ * interpret a pfkey message for klips usage. -+ * it used to be that we provided a reply in a seperate buffer, -+ * but now we overwrite the request buffer and return it. -+ */ -+int -+pfkey_msg_interp(struct sock *sk, struct sadb_msg *pfkey_msg) -+{ -+ int error = 0; -+ int i; -+ struct sadb_ext *extensions[K_SADB_EXT_MAX+1]; /* should be kalloc */ -+ struct pfkey_extracted_data extr; -+ -+ memset(&extr, 0, sizeof(extr)); -+ -+ pfkey_extensions_init(extensions); -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_msg_interp: " -+ "parsing message ver=%d, type=%d, errno=%d, satype=%d(%s), len=%d, res=%d, seq=%d, pid=%d.\n", -+ pfkey_msg->sadb_msg_version, -+ pfkey_msg->sadb_msg_type, -+ pfkey_msg->sadb_msg_errno, -+ pfkey_msg->sadb_msg_satype, -+ satype2name(pfkey_msg->sadb_msg_satype), -+ pfkey_msg->sadb_msg_len, -+ pfkey_msg->sadb_msg_reserved, -+ pfkey_msg->sadb_msg_seq, -+ pfkey_msg->sadb_msg_pid); -+ -+ extr.ips = ipsec_sa_alloc(&error); /* pass in error var by pointer */ -+ if(extr.ips == NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_msg_interp: " -+ "memory allocation error.\n"); -+ SENDERR(-error); -+ } -+ -+ KLIPS_PRINT(debug_pfkey, -+ 
"klips_debug:pfkey_msg_interp: " -+ "allocated extr->ips=0p%p.\n", -+ extr.ips); -+ -+ if(pfkey_msg->sadb_msg_satype > K_SADB_SATYPE_MAX) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_msg_interp: " -+ "satype %d > max %d\n", -+ pfkey_msg->sadb_msg_satype, -+ K_SADB_SATYPE_MAX); -+ SENDERR(EINVAL); -+ } -+ -+ switch(pfkey_msg->sadb_msg_type) { -+ case K_SADB_GETSPI: -+ case K_SADB_UPDATE: -+ case K_SADB_ADD: -+ case K_SADB_DELETE: -+ case K_SADB_X_GRPSA: -+ case K_SADB_X_ADDFLOW: -+ if(!(extr.ips->ips_said.proto = satype2proto(pfkey_msg->sadb_msg_satype))) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_msg_interp: " -+ "satype %d lookup failed.\n", -+ pfkey_msg->sadb_msg_satype); -+ SENDERR(EINVAL); -+ } else { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_msg_interp: " -+ "satype %d lookups to proto=%d.\n", -+ pfkey_msg->sadb_msg_satype, -+ extr.ips->ips_said.proto); -+ } -+ break; -+ default: -+ break; -+ } -+ -+ /* The NULL below causes the default extension parsers to be used */ -+ /* Parse the extensions */ -+ if((error = pfkey_msg_parse(pfkey_msg, NULL, extensions, EXT_BITS_IN))) -+ { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_msg_interp: " -+ "message parsing failed with error %d.\n", -+ error); -+ SENDERR(-error); -+ } -+ -+ /* Process the extensions */ -+ for(i=1; i <= K_SADB_EXT_MAX;i++) { -+ if(extensions[i] != NULL && ext_processors[i]!=NULL) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_msg_interp: " -+ "processing ext %d 0p%p with processor 0p%p.\n", -+ i, extensions[i], ext_processors[i]); -+ if((error = ext_processors[i](extensions[i], &extr))) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_msg_interp: " -+ "extension processing for type %d failed with error %d.\n", -+ i, -+ error); -+ SENDERR(-error); -+ } -+ -+ } -+ -+ } -+ -+ /* Parse the message types */ -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_msg_interp: " -+ "parsing message type %d(%s) with msg_parser 0p%p.\n", -+ pfkey_msg->sadb_msg_type, -+ 
pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type), -+ msg_parsers[pfkey_msg->sadb_msg_type]); -+ if((error = msg_parsers[pfkey_msg->sadb_msg_type](sk, extensions, &extr))) { -+ KLIPS_PRINT(debug_pfkey, -+ "klips_debug:pfkey_msg_interp: " -+ "message parsing failed with error %d.\n", -+ error); -+ SENDERR(-error); -+ } -+ -+ errlab: -+ if(extr.ips != NULL) { -+ ipsec_sa_put(extr.ips); -+ } -+ if(extr.ips2 != NULL) { -+ ipsec_sa_put(extr.ips2); -+ } -+ if (extr.eroute != NULL) { -+ kfree(extr.eroute); -+ } -+ return(error); -+} -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/prng.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,202 @@ -+/* -+ * crypto-class pseudorandom number generator -+ * currently uses same algorithm as RC4(TM), from Schneier 2nd ed p397 -+ * Copyright (C) 2002 Henry Spencer. -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. 
-+ * -+ * RCSID $Id: prng.c,v 1.8 2005/08/25 01:20:21 paul Exp $ -+ */ -+#include "openswan.h" -+ -+/* -+ - prng_init - initialize PRNG from a key -+ */ -+void -+prng_init(prng, key, keylen) -+struct prng *prng; -+const unsigned char *key; -+size_t keylen; -+{ -+ unsigned char k[256]; -+ int i, j; -+ unsigned const char *p; -+ unsigned const char *keyend = key + keylen; -+ unsigned char t; -+ -+ for (i = 0; i <= 255; i++) -+ prng->sbox[i] = i; -+ p = key; -+ for (i = 0; i <= 255; i++) { -+ k[i] = *p++; -+ if (p >= keyend) -+ p = key; -+ } -+ j = 0; -+ for (i = 0; i <= 255; i++) { -+ j = (j + prng->sbox[i] + k[i]) & 0xff; -+ t = prng->sbox[i]; -+ prng->sbox[i] = prng->sbox[j]; -+ prng->sbox[j] = t; -+ k[i] = 0; /* clear out key memory */ -+ } -+ prng->i = 0; -+ prng->j = 0; -+ prng->count = 0; -+} -+ -+/* -+ - prng_bytes - get some pseudorandom bytes from PRNG -+ */ -+void -+prng_bytes(prng, dst, dstlen) -+struct prng *prng; -+unsigned char *dst; -+size_t dstlen; -+{ -+ int i, j, t; -+ unsigned char *p = dst; -+ size_t remain = dstlen; -+# define MAXCOUNT 4000000000ul -+ -+ while (remain > 0) { -+ i = (prng->i + 1) & 0xff; -+ prng->i = i; -+ j = (prng->j + prng->sbox[i]) & 0xff; -+ prng->j = j; -+ t = prng->sbox[i]; -+ prng->sbox[i] = prng->sbox[j]; -+ prng->sbox[j] = t; -+ t = (t + prng->sbox[i]) & 0xff; -+ *p++ = prng->sbox[t]; -+ remain--; -+ } -+ if (prng->count < MAXCOUNT - dstlen) -+ prng->count += dstlen; -+ else -+ prng->count = MAXCOUNT; -+} -+ -+/* -+ - prnt_count - how many bytes have been extracted from PRNG so far? 
-+ */ -+unsigned long -+prng_count(prng) -+struct prng *prng; -+{ -+ return prng->count; -+} -+ -+/* -+ - prng_final - clear out PRNG to ensure nothing left in memory -+ */ -+void -+prng_final(prng) -+struct prng *prng; -+{ -+ int i; -+ -+ for (i = 0; i <= 255; i++) -+ prng->sbox[i] = 0; -+ prng->i = 0; -+ prng->j = 0; -+ prng->count = 0; /* just for good measure */ -+} -+ -+ -+ -+#ifdef PRNG_MAIN -+ -+#include -+#include -+ -+void regress(); -+ -+int -+main(argc, argv) -+int argc; -+char *argv[]; -+{ -+ struct prng pr; -+ unsigned char buf[100]; -+ unsigned char *p; -+ size_t n; -+ -+ if (argc < 2) { -+ fprintf(stderr, "Usage: %s {key|-r}\n", argv[0]); -+ exit(2); -+ } -+ -+ if (strcmp(argv[1], "-r") == 0) { -+ regress(); -+ fprintf(stderr, "regress() returned?!?\n"); -+ exit(1); -+ } -+ -+ prng_init(&pr, argv[1], strlen(argv[1])); -+ prng_bytes(&pr, buf, 32); -+ printf("0x"); -+ for (p = buf, n = 32; n > 0; p++, n--) -+ printf("%02x", *p); -+ printf("\n%lu bytes\n", prng_count(&pr)); -+ prng_final(&pr); -+ exit(0); -+} -+ -+void -+regress() -+{ -+ struct prng pr; -+ unsigned char buf[100]; -+ unsigned char *p; -+ size_t n; -+ /* somewhat non-random sample key */ -+ unsigned char key[] = "here we go gathering nuts in May"; -+ /* first thirty bytes of output from that key */ -+ unsigned char good[] = "\x3f\x02\x8e\x4a\x2a\xea\x23\x18\x92\x7c" -+ "\x09\x52\x83\x61\xaa\x26\xce\xbb\x9d\x71" -+ "\x71\xe5\x10\x22\xaf\x60\x54\x8d\x5b\x28"; -+ int nzero, none; -+ int show = 0; -+ -+ prng_init(&pr, key, strlen(key)); -+ prng_bytes(&pr, buf, sizeof(buf)); -+ for (p = buf, n = sizeof(buf); n > 0; p++, n--) { -+ if (*p == 0) -+ nzero++; -+ if (*p == 255) -+ none++; -+ } -+ if (nzero > 3 || none > 3) { -+ fprintf(stderr, "suspiciously non-random output!\n"); -+ show = 1; -+ } -+ if (memcmp(buf, good, strlen(good)) != 0) { -+ fprintf(stderr, "incorrect output!\n"); -+ show = 1; -+ } -+ if (show) { -+ fprintf(stderr, "0x"); -+ for (p = buf, n = sizeof(buf); n > 0; p++, n--) -+ 
fprintf(stderr, "%02x", *p); -+ fprintf(stderr, "\n"); -+ exit(1); -+ } -+ if (prng_count(&pr) != sizeof(buf)) { -+ fprintf(stderr, "got %u bytes, but count is %lu\n", -+ sizeof(buf), prng_count(&pr)); -+ exit(1); -+ } -+ prng_final(&pr); -+ exit(0); -+} -+ -+#endif /* PRNG_MAIN */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/radij.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,1013 @@ -+char radij_c_version[] = "RCSID $Id: radij.c,v 1.48 2005/04/29 05:10:22 mcr Exp $"; -+ -+/* -+ * This file is defived from ${SRC}/sys/net/radix.c of BSD 4.4lite -+ * -+ * Variable and procedure names have been modified so that they don't -+ * conflict with the original BSD code, as a small number of modifications -+ * have been introduced and we may want to reuse this code in BSD. -+ * -+ * The `j' in `radij' is pronounced as a voiceless guttural (like a Greek -+ * chi or a German ch sound (as `doch', not as in `milch'), or even a -+ * spanish j as in Juan. It is not as far back in the throat like -+ * the corresponding Hebrew sound, nor is it a soft breath like the English h. -+ * It has nothing to do with the Dutch ij sound. -+ * -+ * Here is the appropriate copyright notice: -+ */ -+ -+/* -+ * Copyright (c) 1988, 1989, 1993 -+ * The Regents of the University of California. All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. 
All advertising materials mentioning features or use of this software -+ * must display the following acknowledgement: -+ * This product includes software developed by the University of -+ * California, Berkeley and its contributors. -+ * 4. Neither the name of the University nor the names of its contributors -+ * may be used to endorse or promote products derived from this software -+ * without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * @(#)radix.c 8.2 (Berkeley) 1/4/94 -+ */ -+ -+/* -+ * Routines to build and maintain radix trees for routing lookups. 
-+ */ -+ -+#ifndef AUTOCONF_INCLUDED -+#include -+#endif -+#include -+#include /* printk() */ -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef MALLOC_SLAB -+# include /* kmalloc() */ -+#else /* MALLOC_SLAB */ -+# include /* kmalloc() */ -+#endif /* MALLOC_SLAB */ -+#include /* error codes */ -+#include /* size_t */ -+#include /* mark_bh */ -+ -+#include /* struct device, and other headers */ -+#include /* eth_type_trans */ -+#include /* struct iphdr */ -+#include -+#ifdef NET_21 -+# include -+#endif /* NET_21 */ -+ -+#include -+ -+#include -+ -+#include "openswan/radij.h" -+#include "openswan/ipsec_encap.h" -+#include "openswan/ipsec_radij.h" -+ -+int maj_keylen; -+struct radij_mask *rj_mkfreelist; -+struct radij_node_head *mask_rjhead; -+static int gotOddMasks; -+static char *maskedKey; -+static char *rj_zeroes, *rj_ones; -+ -+#define rj_masktop (mask_rjhead->rnh_treetop) -+#ifdef Bcmp -+# undef Bcmp -+#endif /* Bcmp */ -+#define Bcmp(a, b, l) (l == 0 ? 0 : memcmp((caddr_t)(b), (caddr_t)(a), (size_t)l)) -+/* -+ * The data structure for the keys is a radix tree with one way -+ * branching removed. The index rj_b at an internal node n represents a bit -+ * position to be tested. The tree is arranged so that all descendants -+ * of a node n have keys whose bits all agree up to position rj_b - 1. -+ * (We say the index of n is rj_b.) -+ * -+ * There is at least one descendant which has a one bit at position rj_b, -+ * and at least one with a zero there. -+ * -+ * A route is determined by a pair of key and mask. We require that the -+ * bit-wise logical and of the key and mask to be the key. -+ * We define the index of a route to associated with the mask to be -+ * the first bit number in the mask where 0 occurs (with bit number 0 -+ * representing the highest order bit). -+ * -+ * We say a mask is normal if every bit is 0, past the index of the mask. 
-+ * If a node n has a descendant (k, m) with index(m) == index(n) == rj_b, -+ * and m is a normal mask, then the route applies to every descendant of n. -+ * If the index(m) < rj_b, this implies the trailing last few bits of k -+ * before bit b are all 0, (and hence consequently true of every descendant -+ * of n), so the route applies to all descendants of the node as well. -+ * -+ * The present version of the code makes no use of normal routes, -+ * but similar logic shows that a non-normal mask m such that -+ * index(m) <= index(n) could potentially apply to many children of n. -+ * Thus, for each non-host route, we attach its mask to a list at an internal -+ * node as high in the tree as we can go. -+ */ -+ -+struct radij_node * -+rj_search(v_arg, head) -+ void *v_arg; -+ struct radij_node *head; -+{ -+ register struct radij_node *x; -+ register caddr_t v; -+ -+ for (x = head, v = v_arg; x->rj_b >= 0;) { -+ if (x->rj_bmask & v[x->rj_off]) -+ x = x->rj_r; -+ else -+ x = x->rj_l; -+ } -+ return (x); -+}; -+ -+struct radij_node * -+rj_search_m(v_arg, head, m_arg) -+ struct radij_node *head; -+ void *v_arg, *m_arg; -+{ -+ register struct radij_node *x; -+ register caddr_t v = v_arg, m = m_arg; -+ -+ for (x = head; x->rj_b >= 0;) { -+ if ((x->rj_bmask & m[x->rj_off]) && -+ (x->rj_bmask & v[x->rj_off])) -+ x = x->rj_r; -+ else -+ x = x->rj_l; -+ } -+ return x; -+}; -+ -+int -+rj_refines(m_arg, n_arg) -+ void *m_arg, *n_arg; -+{ -+ register caddr_t m = m_arg, n = n_arg; -+ register caddr_t lim, lim2 = lim = n + *(u_char *)n; -+ int longer = (*(u_char *)n++) - (int)(*(u_char *)m++); -+ int masks_are_equal = 1; -+ -+ if (longer > 0) -+ lim -= longer; -+ while (n < lim) { -+ if (*n & ~(*m)) -+ return 0; -+ if (*n++ != *m++) -+ masks_are_equal = 0; -+ -+ } -+ while (n < lim2) -+ if (*n++) -+ return 0; -+ if (masks_are_equal && (longer < 0)) -+ for (lim2 = m - longer; m < lim2; ) -+ if (*m++) -+ return 1; -+ return (!masks_are_equal); -+} -+ -+ -+struct radij_node * 
-+rj_match(v_arg, head) -+ void *v_arg; -+ struct radij_node_head *head; -+{ -+ caddr_t v = v_arg; -+ register struct radij_node *t = head->rnh_treetop, *x; -+ register caddr_t cp = v, cp2, cp3; -+ caddr_t cplim, mstart; -+ struct radij_node *saved_t, *top = t; -+ int off = t->rj_off, vlen = *(u_char *)cp, matched_off; -+ -+ /* -+ * Open code rj_search(v, top) to avoid overhead of extra -+ * subroutine call. -+ */ -+ for (; t->rj_b >= 0; ) { -+ if (t->rj_bmask & cp[t->rj_off]) -+ t = t->rj_r; -+ else -+ t = t->rj_l; -+ } -+ /* -+ * See if we match exactly as a host destination -+ */ -+ KLIPS_PRINT(debug_radij, -+ "klips_debug:rj_match: " -+ "* See if we match exactly as a host destination\n"); -+ -+ cp += off; cp2 = t->rj_key + off; cplim = v + vlen; -+ for (; cp < cplim; cp++, cp2++) -+ if (*cp != *cp2) -+ goto on1; -+ /* -+ * This extra grot is in case we are explicitly asked -+ * to look up the default. Ugh! -+ */ -+ if ((t->rj_flags & RJF_ROOT) && t->rj_dupedkey) -+ t = t->rj_dupedkey; -+ return t; -+on1: -+ matched_off = cp - v; -+ saved_t = t; -+ KLIPS_PRINT(debug_radij, -+ "klips_debug:rj_match: " -+ "** try to match a leaf, t=0p%p\n", t); -+ do { -+ if (t->rj_mask) { -+ /* -+ * Even if we don't match exactly as a hosts; -+ * we may match if the leaf we wound up at is -+ * a route to a net. 
-+ */ -+ cp3 = matched_off + t->rj_mask; -+ cp2 = matched_off + t->rj_key; -+ for (; cp < cplim; cp++) -+ if ((*cp2++ ^ *cp) & *cp3++) -+ break; -+ if (cp == cplim) -+ return t; -+ cp = matched_off + v; -+ } -+ } while ((t = t->rj_dupedkey)); -+ t = saved_t; -+ /* start searching up the tree */ -+ KLIPS_PRINT(debug_radij, -+ "klips_debug:rj_match: " -+ "*** start searching up the tree, t=0p%p\n", -+ t); -+ do { -+ register struct radij_mask *m; -+ -+ t = t->rj_p; -+ KLIPS_PRINT(debug_radij, -+ "klips_debug:rj_match: " -+ "**** t=0p%p\n", -+ t); -+ if ((m = t->rj_mklist)) { -+ /* -+ * After doing measurements here, it may -+ * turn out to be faster to open code -+ * rj_search_m here instead of always -+ * copying and masking. -+ */ -+ /* off = min(t->rj_off, matched_off); */ -+ off = t->rj_off; -+ if (matched_off < off) -+ off = matched_off; -+ mstart = maskedKey + off; -+ do { -+ cp2 = mstart; -+ cp3 = m->rm_mask + off; -+ KLIPS_PRINT(debug_radij, -+ "klips_debug:rj_match: " -+ "***** cp2=0p%p cp3=0p%p\n", -+ cp2, cp3); -+ for (cp = v + off; cp < cplim;) -+ *cp2++ = *cp++ & *cp3++; -+ x = rj_search(maskedKey, t); -+ while (x && x->rj_mask != m->rm_mask) -+ x = x->rj_dupedkey; -+ if (x && -+ (Bcmp(mstart, x->rj_key + off, -+ vlen - off) == 0)) -+ return x; -+ } while ((m = m->rm_mklist)); -+ } -+ } while (t != top); -+ KLIPS_PRINT(debug_radij, -+ "klips_debug:rj_match: " -+ "***** not found.\n"); -+ return 0; -+}; -+ -+#ifdef RJ_DEBUG -+int rj_nodenum; -+struct radij_node *rj_clist; -+int rj_saveinfo; -+DEBUG_NO_STATIC void traverse(struct radij_node *); -+#ifdef RJ_DEBUG2 -+int rj_debug = 1; -+#else -+int rj_debug = 0; -+#endif /* RJ_DEBUG2 */ -+#endif /* RJ_DEBUG */ -+ -+struct radij_node * -+rj_newpair(v, b, nodes) -+ void *v; -+ int b; -+ struct radij_node nodes[2]; -+{ -+ register struct radij_node *tt = nodes, *t = tt + 1; -+ t->rj_b = b; t->rj_bmask = 0x80 >> (b & 7); -+ t->rj_l = tt; t->rj_off = b >> 3; -+ tt->rj_b = -1; tt->rj_key = (caddr_t)v; tt->rj_p = 
t; -+ tt->rj_flags = t->rj_flags = RJF_ACTIVE; -+#ifdef RJ_DEBUG -+ tt->rj_info = rj_nodenum++; t->rj_info = rj_nodenum++; -+ tt->rj_twin = t; tt->rj_ybro = rj_clist; rj_clist = tt; -+#endif /* RJ_DEBUG */ -+ return t; -+} -+ -+struct radij_node * -+rj_insert(v_arg, head, dupentry, nodes) -+ void *v_arg; -+ struct radij_node_head *head; -+ int *dupentry; -+ struct radij_node nodes[2]; -+{ -+ caddr_t v = v_arg; -+ struct radij_node *top = head->rnh_treetop; -+ int head_off = top->rj_off, vlen = (int)*((u_char *)v); -+ register struct radij_node *t = rj_search(v_arg, top); -+ register caddr_t cp = v + head_off; -+ register int b; -+ struct radij_node *tt; -+ /* -+ *find first bit at which v and t->rj_key differ -+ */ -+ { -+ register caddr_t cp2 = t->rj_key + head_off; -+ register int cmp_res; -+ caddr_t cplim = v + vlen; -+ -+ while (cp < cplim) -+ if (*cp2++ != *cp++) -+ goto on1; -+ *dupentry = 1; -+ return t; -+on1: -+ *dupentry = 0; -+ cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; -+ for (b = (cp - v) << 3; cmp_res; b--) -+ cmp_res >>= 1; -+ } -+ { -+ register struct radij_node *p, *x = top; -+ cp = v; -+ do { -+ p = x; -+ if (cp[x->rj_off] & x->rj_bmask) -+ x = x->rj_r; -+ else x = x->rj_l; -+ } while (b > (unsigned) x->rj_b); /* x->rj_b < b && x->rj_b >= 0 */ -+#ifdef RJ_DEBUG -+ if (rj_debug) -+ printk("klips_debug:rj_insert: Going In:\n"), traverse(p); -+#endif /* RJ_DEBUG */ -+ t = rj_newpair(v_arg, b, nodes); tt = t->rj_l; -+ if ((cp[p->rj_off] & p->rj_bmask) == 0) -+ p->rj_l = t; -+ else -+ p->rj_r = t; -+ x->rj_p = t; t->rj_p = p; /* frees x, p as temp vars below */ -+ if ((cp[t->rj_off] & t->rj_bmask) == 0) { -+ t->rj_r = x; -+ } else { -+ t->rj_r = tt; t->rj_l = x; -+ } -+#ifdef RJ_DEBUG -+ if (rj_debug) -+ printk("klips_debug:rj_insert: Coming out:\n"), traverse(p); -+#endif /* RJ_DEBUG */ -+ } -+ return (tt); -+} -+ -+struct radij_node * -+rj_addmask(n_arg, search, skip) -+ int search, skip; -+ void *n_arg; -+{ -+ caddr_t netmask = (caddr_t)n_arg; -+ register 
struct radij_node *x; -+ register caddr_t cp, cplim; -+ register int b, mlen, j; -+ int maskduplicated; -+ -+ mlen = *(u_char *)netmask; -+ if (search) { -+ x = rj_search(netmask, rj_masktop); -+ mlen = *(u_char *)netmask; -+ if (Bcmp(netmask, x->rj_key, mlen) == 0) -+ return (x); -+ } -+ R_Malloc(x, struct radij_node *, maj_keylen + 2 * sizeof (*x)); -+ if (x == 0) -+ return (0); -+ Bzero(x, maj_keylen + 2 * sizeof (*x)); -+ cp = (caddr_t)(x + 2); -+ Bcopy(netmask, cp, mlen); -+ netmask = cp; -+ x = rj_insert(netmask, mask_rjhead, &maskduplicated, x); -+ /* -+ * Calculate index of mask. -+ */ -+ cplim = netmask + mlen; -+ for (cp = netmask + skip; cp < cplim; cp++) -+ if (*(u_char *)cp != 0xff) -+ break; -+ b = (cp - netmask) << 3; -+ if (cp != cplim) { -+ if (*cp != 0) { -+ gotOddMasks = 1; -+ for (j = 0x80; j; b++, j >>= 1) -+ if ((j & *cp) == 0) -+ break; -+ } -+ } -+ x->rj_b = -1 - b; -+ return (x); -+} -+ -+#if 0 -+struct radij_node * -+#endif -+int -+rj_addroute(v_arg, n_arg, head, treenodes) -+ void *v_arg, *n_arg; -+ struct radij_node_head *head; -+ struct radij_node treenodes[2]; -+{ -+ caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg; -+ register struct radij_node *t, *x=NULL, *tt; -+ struct radij_node *saved_tt, *top = head->rnh_treetop; -+ short b = 0, b_leaf; -+ int mlen, keyduplicated; -+ caddr_t cplim; -+ struct radij_mask *m, **mp; -+ -+ /* -+ * In dealing with non-contiguous masks, there may be -+ * many different routes which have the same mask. -+ * We will find it useful to have a unique pointer to -+ * the mask to speed avoiding duplicate references at -+ * nodes and possibly save time in calculating indices. 
-+ */ -+ if (netmask) { -+ x = rj_search(netmask, rj_masktop); -+ mlen = *(u_char *)netmask; -+ if (Bcmp(netmask, x->rj_key, mlen) != 0) { -+ x = rj_addmask(netmask, 0, top->rj_off); -+ if (x == 0) -+ return -ENOMEM; /* (0) rgb */ -+ } -+ netmask = x->rj_key; -+ b = -1 - x->rj_b; -+ } -+ /* -+ * Deal with duplicated keys: attach node to previous instance -+ */ -+ saved_tt = tt = rj_insert(v, head, &keyduplicated, treenodes); -+#ifdef RJ_DEBUG -+ printk("addkey: duplicated: %d\n", keyduplicated); -+#endif -+ if (keyduplicated) { -+ do { -+ if (tt->rj_mask == netmask) -+ return -EEXIST; /* -ENXIO; (0) rgb */ -+ t = tt; -+ if (netmask == 0 || -+ (tt->rj_mask && rj_refines(netmask, tt->rj_mask))) -+ break; -+ } while ((tt = tt->rj_dupedkey)); -+ /* -+ * If the mask is not duplicated, we wouldn't -+ * find it among possible duplicate key entries -+ * anyway, so the above test doesn't hurt. -+ * -+ * We sort the masks for a duplicated key the same way as -+ * in a masklist -- most specific to least specific. -+ * This may require the unfortunate nuisance of relocating -+ * the head of the list. -+ */ -+ if (tt && t == saved_tt) { -+ struct radij_node *xx = x; -+ /* link in at head of list */ -+ (tt = treenodes)->rj_dupedkey = t; -+ tt->rj_flags = t->rj_flags; -+ tt->rj_p = x = t->rj_p; -+ if (x->rj_l == t) x->rj_l = tt; else x->rj_r = tt; -+ saved_tt = tt; x = xx; -+ } else { -+ (tt = treenodes)->rj_dupedkey = t->rj_dupedkey; -+ t->rj_dupedkey = tt; -+ } -+#ifdef RJ_DEBUG -+ t=tt+1; tt->rj_info = rj_nodenum++; t->rj_info = rj_nodenum++; -+ tt->rj_twin = t; tt->rj_ybro = rj_clist; rj_clist = tt; -+#endif /* RJ_DEBUG */ -+ t = saved_tt; -+ tt->rj_key = (caddr_t) v; -+ tt->rj_b = -1; -+ tt->rj_flags = t->rj_flags & ~RJF_ROOT; -+ } -+ /* -+ * Put mask in tree. 
-+ */ -+ if (netmask) { -+ tt->rj_mask = netmask; -+ tt->rj_b = x->rj_b; -+ } -+ t = saved_tt->rj_p; -+ b_leaf = -1 - t->rj_b; -+ if (t->rj_r == saved_tt) x = t->rj_l; else x = t->rj_r; -+ /* Promote general routes from below */ -+ if (x->rj_b < 0) { -+ if (x->rj_mask && (x->rj_b >= b_leaf) && x->rj_mklist == 0) { -+ MKGet(m); -+ if (m) { -+ Bzero(m, sizeof *m); -+ m->rm_b = x->rj_b; -+ m->rm_mask = x->rj_mask; -+ x->rj_mklist = t->rj_mklist = m; -+ } -+ } -+ } else if (x->rj_mklist) { -+ /* -+ * Skip over masks whose index is > that of new node -+ */ -+ for (mp = &x->rj_mklist; (m = *mp); mp = &m->rm_mklist) -+ if (m->rm_b >= b_leaf) -+ break; -+ t->rj_mklist = m; *mp = 0; -+ } -+ /* Add new route to highest possible ancestor's list */ -+ if ((netmask == 0) || (b > t->rj_b )) { -+#ifdef RJ_DEBUG -+ printk("klips:radij.c: netmask = %p or b(%d)>t->rjb(%d)\n", netmask, b, t->rj_b); -+#endif -+ return 0; /* tt rgb */ /* can't lift at all */ -+ } -+ b_leaf = tt->rj_b; -+ do { -+ x = t; -+ t = t->rj_p; -+ } while (b <= t->rj_b && x != top); -+ /* -+ * Search through routes associated with node to -+ * insert new route according to index. -+ * For nodes of equal index, place more specific -+ * masks first. 
-+ */ -+ cplim = netmask + mlen; -+ for (mp = &x->rj_mklist; (m = *mp); mp = &m->rm_mklist) { -+ if (m->rm_b < b_leaf) -+ continue; -+ if (m->rm_b > b_leaf) -+ break; -+ if (m->rm_mask == netmask) { -+ m->rm_refs++; -+ tt->rj_mklist = m; -+#ifdef RJ_DEBUG -+ printk("klips:radij.c: m->rm_mask %p == netmask\n", netmask); -+#endif -+ return 0; /* tt rgb */ -+ } -+ if (rj_refines(netmask, m->rm_mask)) -+ break; -+ } -+ MKGet(m); -+ if (m == 0) { -+ printk("klips_debug:rj_addroute: " -+ "Mask for route not entered\n"); -+ return 0; /* (tt) rgb */ -+ } -+ Bzero(m, sizeof *m); -+ m->rm_b = b_leaf; -+ m->rm_mask = netmask; -+ m->rm_mklist = *mp; -+ *mp = m; -+ tt->rj_mklist = m; -+#ifdef RJ_DEBUG -+ printk("klips:radij.c: addroute done\n"); -+#endif -+ return 0; /* tt rgb */ -+} -+ -+int -+rj_delete(v_arg, netmask_arg, head, node) -+ void *v_arg, *netmask_arg; -+ struct radij_node_head *head; -+ struct radij_node **node; -+{ -+ register struct radij_node *t, *p, *x, *tt; -+ struct radij_mask *m, *saved_m, **mp; -+ struct radij_node *dupedkey, *saved_tt, *top; -+ caddr_t v, netmask; -+ int b, head_off, vlen; -+ -+ v = v_arg; -+ netmask = netmask_arg; -+ x = head->rnh_treetop; -+ tt = rj_search(v, x); -+ head_off = x->rj_off; -+ vlen = *(u_char *)v; -+ saved_tt = tt; -+ top = x; -+ if (tt == 0 || -+ Bcmp(v + head_off, tt->rj_key + head_off, vlen - head_off)) -+ return -EFAULT; /* (0) rgb */ -+ /* -+ * Delete our route from mask lists. 
-+ */ -+ if ((dupedkey = tt->rj_dupedkey)) { -+ if (netmask) -+ netmask = rj_search(netmask, rj_masktop)->rj_key; -+ while (tt->rj_mask != netmask) -+ if ((tt = tt->rj_dupedkey) == 0) -+ return -ENOENT; /* -ENXIO; (0) rgb */ -+ } -+ if (tt->rj_mask == 0 || (saved_m = m = tt->rj_mklist) == 0) -+ goto on1; -+ if (m->rm_mask != tt->rj_mask) { -+ printk("klips_debug:rj_delete: " -+ "inconsistent annotation\n"); -+ goto on1; -+ } -+ if (--m->rm_refs >= 0) -+ goto on1; -+ b = -1 - tt->rj_b; -+ t = saved_tt->rj_p; -+ if (b > t->rj_b) -+ goto on1; /* Wasn't lifted at all */ -+ do { -+ x = t; -+ t = t->rj_p; -+ } while (b <= t->rj_b && x != top); -+ for (mp = &x->rj_mklist; (m = *mp); mp = &m->rm_mklist) -+ if (m == saved_m) { -+ *mp = m->rm_mklist; -+ MKFree(m); -+ break; -+ } -+ if (m == 0) -+ printk("klips_debug:rj_delete: " -+ "couldn't find our annotation\n"); -+on1: -+ /* -+ * Eliminate us from tree -+ */ -+ if (tt->rj_flags & RJF_ROOT) -+ return -EFAULT; /* (0) rgb */ -+#ifdef RJ_DEBUG -+ /* Get us out of the creation list */ -+ for (t = rj_clist; t && t->rj_ybro != tt; t = t->rj_ybro) {} -+ if (t) t->rj_ybro = tt->rj_ybro; -+#endif /* RJ_DEBUG */ -+ t = tt->rj_p; -+ if (dupedkey) { -+ if (tt == saved_tt) { -+ x = dupedkey; x->rj_p = t; -+ if (t->rj_l == tt) t->rj_l = x; else t->rj_r = x; -+ } else { -+ for (x = p = saved_tt; p && p->rj_dupedkey != tt;) -+ p = p->rj_dupedkey; -+ if (p) p->rj_dupedkey = tt->rj_dupedkey; -+ else printk("klips_debug:rj_delete: " -+ "couldn't find node that we started with\n"); -+ } -+ t = tt + 1; -+ if (t->rj_flags & RJF_ACTIVE) { -+#ifndef RJ_DEBUG -+ *++x = *t; p = t->rj_p; -+#else -+ b = t->rj_info; *++x = *t; t->rj_info = b; p = t->rj_p; -+#endif /* RJ_DEBUG */ -+ if (p->rj_l == t) p->rj_l = x; else p->rj_r = x; -+ x->rj_l->rj_p = x; x->rj_r->rj_p = x; -+ } -+ goto out; -+ } -+ if (t->rj_l == tt) x = t->rj_r; else x = t->rj_l; -+ p = t->rj_p; -+ if (p->rj_r == t) p->rj_r = x; else p->rj_l = x; -+ x->rj_p = p; -+ /* -+ * Demote 
routes attached to us. -+ */ -+ if (t->rj_mklist) { -+ if (x->rj_b >= 0) { -+ for (mp = &x->rj_mklist; (m = *mp);) -+ mp = &m->rm_mklist; -+ *mp = t->rj_mklist; -+ } else { -+ for (m = t->rj_mklist; m;) { -+ struct radij_mask *mm = m->rm_mklist; -+ if (m == x->rj_mklist && (--(m->rm_refs) < 0)) { -+ x->rj_mklist = 0; -+ MKFree(m); -+ } else -+ printk("klips_debug:rj_delete: " -+ "Orphaned Mask 0p%p at 0p%p\n", m, x); -+ m = mm; -+ } -+ } -+ } -+ /* -+ * We may be holding an active internal node in the tree. -+ */ -+ x = tt + 1; -+ if (t != x) { -+#ifndef RJ_DEBUG -+ *t = *x; -+#else -+ b = t->rj_info; *t = *x; t->rj_info = b; -+#endif /* RJ_DEBUG */ -+ t->rj_l->rj_p = t; t->rj_r->rj_p = t; -+ p = x->rj_p; -+ if (p->rj_l == x) p->rj_l = t; else p->rj_r = t; -+ } -+out: -+ tt->rj_flags &= ~RJF_ACTIVE; -+ tt[1].rj_flags &= ~RJF_ACTIVE; -+ *node = tt; -+ return 0; /* (tt) rgb */ -+} -+ -+int -+rj_walktree(h, f, w) -+ struct radij_node_head *h; -+ register int (*f)(struct radij_node *,void *); -+ void *w; -+{ -+ int error; -+ struct radij_node *base, *next; -+ register struct radij_node *rn; -+ -+ if(!h || !f /* || !w */) { -+ return -ENODATA; -+ } -+ -+ rn = h->rnh_treetop; -+ /* -+ * This gets complicated because we may delete the node -+ * while applying the function f to it, so we need to calculate -+ * the successor node in advance. -+ */ -+ /* First time through node, go left */ -+ while (rn->rj_b >= 0) -+ rn = rn->rj_l; -+ for (;;) { -+#ifdef CONFIG_KLIPS_DEBUG -+ if(debug_radij) { -+ printk("klips_debug:rj_walktree: " -+ "for: rn=0p%p rj_b=%d rj_flags=%x", -+ rn, -+ rn->rj_b, -+ rn->rj_flags); -+ rn->rj_b >= 0 ? 
-+ printk(" node off=%x\n", -+ rn->rj_off) : -+ printk(" leaf key = %08x->%08x\n", -+ (u_int)ntohl(((struct sockaddr_encap *)rn->rj_key)->sen_ip_src.s_addr), -+ (u_int)ntohl(((struct sockaddr_encap *)rn->rj_key)->sen_ip_dst.s_addr)) -+ ; -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ base = rn; -+ /* If at right child go back up, otherwise, go right */ -+ while (rn->rj_p->rj_r == rn && (rn->rj_flags & RJF_ROOT) == 0) -+ rn = rn->rj_p; -+ /* Find the next *leaf* since next node might vanish, too */ -+ for (rn = rn->rj_p->rj_r; rn->rj_b >= 0;) -+ rn = rn->rj_l; -+ next = rn; -+#ifdef CONFIG_KLIPS_DEBUG -+ if(debug_radij) { -+ printk("klips_debug:rj_walktree: " -+ "processing leaves, rn=0p%p rj_b=%d rj_flags=%x", -+ rn, -+ rn->rj_b, -+ rn->rj_flags); -+ rn->rj_b >= 0 ? -+ printk(" node off=%x\n", -+ rn->rj_off) : -+ printk(" leaf key = %08x->%08x\n", -+ (u_int)ntohl(((struct sockaddr_encap *)rn->rj_key)->sen_ip_src.s_addr), -+ (u_int)ntohl(((struct sockaddr_encap *)rn->rj_key)->sen_ip_dst.s_addr)) -+ ; -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ /* Process leaves */ -+ while ((rn = base)) { -+ base = rn->rj_dupedkey; -+#ifdef CONFIG_KLIPS_DEBUG -+ if(debug_radij) { -+ printk("klips_debug:rj_walktree: " -+ "while: base=0p%p rn=0p%p rj_b=%d rj_flags=%x", -+ base, -+ rn, -+ rn->rj_b, -+ rn->rj_flags); -+ rn->rj_b >= 0 ? 
-+ printk(" node off=%x\n", -+ rn->rj_off) : -+ printk(" leaf key = %08x->%08x\n", -+ (u_int)ntohl(((struct sockaddr_encap *)rn->rj_key)->sen_ip_src.s_addr), -+ (u_int)ntohl(((struct sockaddr_encap *)rn->rj_key)->sen_ip_dst.s_addr)) -+ ; -+ } -+#endif /* CONFIG_KLIPS_DEBUG */ -+ if (!(rn->rj_flags & RJF_ROOT) && (error = (*f)(rn, w))) -+ return (-error); -+ } -+ rn = next; -+ if (rn->rj_flags & RJF_ROOT) -+ return (0); -+ } -+ /* NOTREACHED */ -+} -+ -+int -+rj_inithead(head, off) -+ void **head; -+ int off; -+{ -+ register struct radij_node_head *rnh; -+ register struct radij_node *t, *tt, *ttt; -+ if (*head) -+ return (1); -+ R_Malloc(rnh, struct radij_node_head *, sizeof (*rnh)); -+ if (rnh == NULL) -+ return (0); -+ Bzero(rnh, sizeof (*rnh)); -+ *head = rnh; -+ t = rj_newpair(rj_zeroes, off, rnh->rnh_nodes); -+ ttt = rnh->rnh_nodes + 2; -+ t->rj_r = ttt; -+ t->rj_p = t; -+ tt = t->rj_l; -+ tt->rj_flags = t->rj_flags = RJF_ROOT | RJF_ACTIVE; -+ tt->rj_b = -1 - off; -+ *ttt = *tt; -+ ttt->rj_key = rj_ones; -+ rnh->rnh_addaddr = rj_addroute; -+ rnh->rnh_deladdr = rj_delete; -+ rnh->rnh_matchaddr = rj_match; -+ rnh->rnh_walktree = rj_walktree; -+ rnh->rnh_treetop = t; -+ return (1); -+} -+ -+void -+rj_init() -+{ -+ char *cp, *cplim; -+ -+ if (maj_keylen == 0) { -+ printk("klips_debug:rj_init: " -+ "radij functions require maj_keylen be set\n"); -+ return; -+ } -+ R_Malloc(rj_zeroes, char *, 3 * maj_keylen); -+ if (rj_zeroes == NULL) -+ panic("rj_init"); -+ Bzero(rj_zeroes, 3 * maj_keylen); -+ rj_ones = cp = rj_zeroes + maj_keylen; -+ maskedKey = cplim = rj_ones + maj_keylen; -+ while (cp < cplim) -+ *cp++ = -1; -+ if (rj_inithead((void **)&mask_rjhead, 0) == 0) -+ panic("rj_init 2"); -+} -+ -+void -+rj_preorder(struct radij_node *rn, int l) -+{ -+ int i; -+ -+ if (rn == NULL){ -+ printk("klips_debug:rj_preorder: " -+ "NULL pointer\n"); -+ return; -+ } -+ -+ if (rn->rj_b >= 0){ -+ rj_preorder(rn->rj_l, l+1); -+ rj_preorder(rn->rj_r, l+1); -+ printk("klips_debug:"); 
-+ for (i=0; irj_off); -+ } else { -+ printk("klips_debug:"); -+ for (i=0; irj_flags); -+ if (rn->rj_flags & RJF_ACTIVE) { -+ printk(" @key=0p%p", -+ rn->rj_key); -+ printk(" key = %08x->%08x", -+ (u_int)ntohl(((struct sockaddr_encap *)rn->rj_key)->sen_ip_src.s_addr), -+ (u_int)ntohl(((struct sockaddr_encap *)rn->rj_key)->sen_ip_dst.s_addr)); -+ printk(" @mask=0p%p", -+ rn->rj_mask); -+ if (rn->rj_mask) -+ printk(" mask = %08x->%08x", -+ (u_int)ntohl(((struct sockaddr_encap *)rn->rj_mask)->sen_ip_src.s_addr), -+ (u_int)ntohl(((struct sockaddr_encap *)rn->rj_mask)->sen_ip_dst.s_addr)); -+ if (rn->rj_dupedkey) -+ printk(" dupedkey = 0p%p", -+ rn->rj_dupedkey); -+ } -+ printk("\n"); -+ } -+} -+ -+#ifdef RJ_DEBUG -+DEBUG_NO_STATIC void traverse(struct radij_node *p) -+{ -+ rj_preorder(p, 0); -+} -+#endif /* RJ_DEBUG */ -+ -+void -+rj_dumptrees(void) -+{ -+ rj_preorder(rnh->rnh_treetop, 0); -+} -+ -+void -+rj_free_mkfreelist(void) -+{ -+ struct radij_mask *mknp, *mknp2; -+ -+ mknp = rj_mkfreelist; -+ while(mknp) -+ { -+ mknp2 = mknp; -+ mknp = mknp->rm_mklist; -+ kfree(mknp2); -+ } -+} -+ -+int -+radijcleartree(void) -+{ -+ return rj_walktree(rnh, ipsec_rj_walker_delete, NULL); -+} -+ -+int -+radijcleanup(void) -+{ -+ int error = 0; -+ -+ error = radijcleartree(); -+ -+ rj_free_mkfreelist(); -+ -+/* rj_walktree(mask_rjhead, ipsec_rj_walker_delete, NULL); */ -+ if(mask_rjhead) { -+ kfree(mask_rjhead); -+ } -+ -+ if(rj_zeroes) { -+ kfree(rj_zeroes); -+ } -+ -+ if(rnh) { -+ kfree(rnh); -+ } -+ -+ return error; -+} -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/rangetoa.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,60 @@ -+/* -+ * convert binary form of address range to ASCII -+ * Copyright (C) 1998, 1999 Henry Spencer. 
-+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: rangetoa.c,v 1.9 2004/07/10 07:48:37 mcr Exp $ -+ */ -+#include "openswan.h" -+ -+/* -+ - rangetoa - convert address range to ASCII -+ */ -+size_t /* space needed for full conversion */ -+rangetoa(addrs, format, dst, dstlen) -+struct in_addr addrs[2]; -+int format; /* character */ -+char *dst; /* need not be valid if dstlen is 0 */ -+size_t dstlen; -+{ -+ size_t len; -+ size_t rest; -+ int n; -+ char *p; -+ -+ switch (format) { -+ case 0: -+ break; -+ default: -+ return 0; -+ break; -+ } -+ -+ len = addrtoa(addrs[0], 0, dst, dstlen); -+ if (len < dstlen) -+ for (p = dst + len - 1, n = 3; len < dstlen && n > 0; -+ p++, len++, n--) -+ *p = '.'; -+ else -+ p = NULL; -+ if (len < dstlen) -+ rest = dstlen - len; -+ else { -+ if (dstlen > 0) -+ *(dst + dstlen - 1) = '\0'; -+ rest = 0; -+ } -+ -+ len += addrtoa(addrs[1], 0, p, rest); -+ -+ return len; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/satot.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,134 @@ -+/* -+ * convert from binary form of SA ID to text -+ * Copyright (C) 2000, 2001 Henry Spencer. -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . 
-+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: satot.c,v 1.13 2004/07/10 07:48:37 mcr Exp $ -+ */ -+#include "openswan.h" -+ -+static struct typename { -+ char type; -+ char *name; -+} typenames[] = { -+ { SA_AH, "ah" }, -+ { SA_ESP, "esp" }, -+ { SA_IPIP, "tun" }, -+ { SA_COMP, "comp" }, -+ { SA_INT, "int" }, -+ { 0, NULL } -+}; -+ -+/* -+ - satot - convert SA to text "ah507@1.2.3.4" -+ */ -+size_t /* space needed for full conversion */ -+satot(sa, format, dst, dstlen) -+const ip_said *sa; -+int format; /* character */ -+char *dst; /* need not be valid if dstlen is 0 */ -+size_t dstlen; -+{ -+ size_t len = 0; /* 0 means "not recognized yet" */ -+ int base; -+ int showversion; /* use delimiter to show IP version? */ -+ struct typename *tn; -+ char *p; -+ char *pre; -+ char buf[10+1+ULTOT_BUF+ADDRTOT_BUF]; -+ char unk[10]; -+ -+ switch (format) { -+ case 0: -+ base = 16; -+ showversion = 1; -+ break; -+ case 'f': -+ base = 17; -+ showversion = 1; -+ break; -+ case 'x': -+ base = 'x'; -+ showversion = 0; -+ break; -+ case 'd': -+ base = 10; -+ showversion = 0; -+ break; -+ default: -+ return 0; -+ break; -+ } -+ -+ memset(buf, 0, sizeof(buf)); -+ -+ pre = NULL; -+ for (tn = typenames; tn->name != NULL; tn++) -+ if (sa->proto == tn->type) { -+ pre = tn->name; -+ break; /* NOTE BREAK OUT */ -+ } -+ if (pre == NULL) { /* unknown protocol */ -+ strcpy(unk, "unk"); -+ (void) ultot((unsigned char)sa->proto, 10, unk+strlen(unk), -+ sizeof(unk)-strlen(unk)); -+ pre = unk; -+ } -+ -+ if (strcmp(pre, PASSTHROUGHTYPE) == 0 && -+ sa->spi == PASSTHROUGHSPI && -+ isunspecaddr(&sa->dst)) { -+ strcpy(buf, (addrtypeof(&sa->dst) == AF_INET) ? 
-+ PASSTHROUGH4NAME : -+ PASSTHROUGH6NAME); -+ len = strlen(buf); -+ } -+ -+ if (sa->proto == SA_INT) { -+ char intunk[10]; -+ switch (ntohl(sa->spi)) { -+ case SPI_PASS: p = "%pass"; break; -+ case SPI_DROP: p = "%drop"; break; -+ case SPI_REJECT: p = "%reject"; break; -+ case SPI_HOLD: p = "%hold"; break; -+ case SPI_TRAP: p = "%trap"; break; -+ case SPI_TRAPSUBNET: p = "%trapsubnet"; break; -+ default: snprintf(intunk, 10, "%%unk-%d", ntohl(sa->spi)); p = intunk; break; -+ } -+ if (p != NULL) { -+ strcpy(buf, p); -+ len = strlen(buf); -+ } -+ } -+ -+ if (len == 0) { /* general case needed */ -+ strcpy(buf, pre); -+ len = strlen(buf); -+ if (showversion) { -+ *(buf+len) = (addrtypeof(&sa->dst) == AF_INET) ? '.' : -+ ':'; -+ len++; -+ *(buf+len) = '\0'; -+ } -+ len += ultot(ntohl(sa->spi), base, buf+len, sizeof(buf)-len); -+ *(buf+len-1) = '@'; -+ len += addrtot(&sa->dst, 0, buf+len, sizeof(buf)-len); -+ *(buf+len) = '\0'; -+ } -+ -+ if (dst != NULL) { -+ if (len > dstlen) -+ *(buf+dstlen-1) = '\0'; -+ strcpy(dst, buf); -+ } -+ return len; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/subnetof.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,59 @@ -+/* -+ * minor network-address manipulation utilities -+ * Copyright (C) 1998, 1999 Henry Spencer. -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. 
-+ * -+ * RCSID $Id: subnetof.c,v 1.8 2004/07/10 07:48:37 mcr Exp $ -+ */ -+#include "openswan.h" -+ -+/* -+ - subnetof - given address and mask, return subnet part -+ */ -+struct in_addr -+subnetof(addr, mask) -+struct in_addr addr; -+struct in_addr mask; -+{ -+ struct in_addr result; -+ -+ result.s_addr = addr.s_addr & mask.s_addr; -+ return result; -+} -+ -+/* -+ - hostof - given address and mask, return host part -+ */ -+struct in_addr -+hostof(addr, mask) -+struct in_addr addr; -+struct in_addr mask; -+{ -+ struct in_addr result; -+ -+ result.s_addr = addr.s_addr & ~mask.s_addr; -+ return result; -+} -+ -+/* -+ - broadcastof - given (network) address and mask, return broadcast address -+ */ -+struct in_addr -+broadcastof(addr, mask) -+struct in_addr addr; -+struct in_addr mask; -+{ -+ struct in_addr result; -+ -+ result.s_addr = addr.s_addr | ~mask.s_addr; -+ return result; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/subnettoa.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,61 @@ -+/* -+ * convert binary form of subnet description to ASCII -+ * Copyright (C) 1998, 1999 Henry Spencer. -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: subnettoa.c,v 1.11 2004/07/10 07:48:37 mcr Exp $ -+ */ -+#include "openswan.h" -+ -+/* -+ - subnettoa - convert address and mask to ASCII "addr/mask" -+ * Output expresses the mask as a bit count if possible, else dotted decimal. 
-+ */ -+size_t /* space needed for full conversion */ -+subnettoa(addr, mask, format, dst, dstlen) -+struct in_addr addr; -+struct in_addr mask; -+int format; /* character */ -+char *dst; /* need not be valid if dstlen is 0 */ -+size_t dstlen; -+{ -+ size_t len; -+ size_t rest; -+ int n; -+ char *p; -+ -+ switch (format) { -+ case 0: -+ break; -+ default: -+ return 0; -+ break; -+ } -+ -+ len = addrtoa(addr, 0, dst, dstlen); -+ if (len < dstlen) { -+ dst[len - 1] = '/'; -+ p = dst + len; -+ rest = dstlen - len; -+ } else { -+ p = NULL; -+ rest = 0; -+ } -+ -+ n = masktobits(mask); -+ if (n >= 0) -+ len += ultoa((unsigned long)n, 10, p, rest); -+ else -+ len += addrtoa(mask, 0, p, rest); -+ -+ return len; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/sysctl_net_ipsec.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,387 @@ -+/* -+ * sysctl interface to net IPSEC subsystem. -+ * Copyright (C) 1998, 1999, 2000, 2001 Richard Guy Briggs. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. 
-+ * -+ * RCSID $Id: sysctl_net_ipsec.c,v 1.17.10.2 2007/10/30 21:42:25 paul Exp $ -+ */ -+ -+/* -*- linux-c -*- -+ * -+ * Initiated April 3, 1998, Richard Guy Briggs -+ */ -+ -+#include -+#include -+#include -+ -+#include "openswan/ipsec_param.h" -+ -+#ifdef CONFIG_SYSCTL -+ -+#define NET_IPSEC 2112 /* Random number */ -+#ifdef CONFIG_KLIPS_DEBUG -+extern int debug_ah; -+extern int debug_esp; -+extern int debug_mast; -+extern int debug_tunnel; -+extern int debug_xmit; -+extern int debug_eroute; -+extern int debug_spi; -+extern int debug_radij; -+extern int debug_netlink; -+extern int debug_xform; -+extern int debug_rcv; -+extern int debug_pfkey; -+extern int sysctl_ipsec_debug_verbose; -+#ifdef CONFIG_KLIPS_IPCOMP -+extern int sysctl_ipsec_debug_ipcomp; -+#endif /* CONFIG_KLIPS_IPCOMP */ -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+extern int sysctl_ipsec_icmp; -+extern int sysctl_ipsec_inbound_policy_check; -+extern int sysctl_ipsec_tos; -+int sysctl_ipsec_regress_pfkey_lossage; -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) -+#ifdef CONFIG_KLIPS_DEBUG -+ #define NET_IPSEC_DEBUG_AH CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_ESP CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_TUNNEL CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_EROUTE CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_SPI CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_RADIJ CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_NETLINK CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_XFORM CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_RCV CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_PFKEY CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_VERBOSE CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_IPCOMP CTL_UNNUMBERED -+#endif /* CONFIG_KLIPS_DEBUG */ -+ #define NET_IPSEC_ICMP CTL_UNNUMBERED -+ #define NET_IPSEC_INBOUND_POLICY_CHECK CTL_UNNUMBERED -+ #define NET_IPSEC_TOS CTL_UNNUMBERED -+ #define NET_IPSEC_REGRESS_PFKEY_LOSSAGE CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_MAST CTL_UNNUMBERED -+ #define NET_IPSEC_DEBUG_XMIT CTL_UNNUMBERED -+#else -+enum { -+#ifdef 
CONFIG_KLIPS_DEBUG -+ NET_IPSEC_DEBUG_AH=1, -+ NET_IPSEC_DEBUG_ESP=2, -+ NET_IPSEC_DEBUG_TUNNEL=3, -+ NET_IPSEC_DEBUG_EROUTE=4, -+ NET_IPSEC_DEBUG_SPI=5, -+ NET_IPSEC_DEBUG_RADIJ=6, -+ NET_IPSEC_DEBUG_NETLINK=7, -+ NET_IPSEC_DEBUG_XFORM=8, -+ NET_IPSEC_DEBUG_RCV=9, -+ NET_IPSEC_DEBUG_PFKEY=10, -+ NET_IPSEC_DEBUG_VERBOSE=11, -+ NET_IPSEC_DEBUG_IPCOMP=12, -+#endif /* CONFIG_KLIPS_DEBUG */ -+ NET_IPSEC_ICMP=13, -+ NET_IPSEC_INBOUND_POLICY_CHECK=14, -+ NET_IPSEC_TOS=15, -+ NET_IPSEC_REGRESS_PFKEY_LOSSAGE=16, -+ NET_IPSEC_DEBUG_MAST=17, -+ NET_IPSEC_DEBUG_XMIT=18, -+}; -+#endif -+ -+static ctl_table ipsec_table[] = { -+ -+#ifdef CONFIG_KLIPS_DEBUG -+#ifdef CTL_TABLE_PARENT -+ { .ctl_name = NET_IPSEC_DEBUG_AH, -+ .procname = "debug_ah", -+ .data = &debug_ah, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_DEBUG_ESP, -+ .procname = "debug_esp", -+ .data = &debug_esp, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_DEBUG_MAST, -+ .procname = "debug_mast", -+ .data = &debug_mast, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_DEBUG_TUNNEL, -+ .procname = "debug_tunnel", -+ .data = &debug_tunnel, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_DEBUG_XMIT, -+ .procname = "debug_xmit", -+ .data = &debug_xmit, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_DEBUG_EROUTE, -+ .procname = "debug_eroute", -+ .data = &debug_eroute, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_DEBUG_SPI, -+ .procname = "debug_spi", -+ .data = &debug_spi, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ 
.child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_DEBUG_RADIJ, -+ .procname = "debug_radij", -+ .data = &debug_radij, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_DEBUG_NETLINK, -+ .procname = "debug_netlink", -+ .data = &debug_netlink, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_DEBUG_XFORM, -+ .procname = "debug_xform", -+ .data = &debug_xform, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_DEBUG_RCV, -+ .procname = "debug_rcv", -+ .data = &debug_rcv, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_DEBUG_PFKEY, -+ .procname = "debug_pfkey", -+ .data = &debug_pfkey, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_DEBUG_VERBOSE, -+ .procname = "debug_verbose", -+ .data = &sysctl_ipsec_debug_verbose, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+#else -+ { NET_IPSEC_DEBUG_AH, "debug_ah", &debug_ah, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_DEBUG_ESP, "debug_esp", &debug_esp, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_DEBUG_MAST, "debug_mast", &debug_mast, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_DEBUG_TUNNEL, "debug_tunnel", &debug_tunnel, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_DEBUG_TUNNEL, "debug_xmit", &debug_xmit, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_DEBUG_EROUTE, "debug_eroute", &debug_eroute, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { 
NET_IPSEC_DEBUG_SPI, "debug_spi", &debug_spi, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_DEBUG_RADIJ, "debug_radij", &debug_radij, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_DEBUG_NETLINK, "debug_netlink", &debug_netlink, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_DEBUG_XFORM, "debug_xform", &debug_xform, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_DEBUG_RCV, "debug_rcv", &debug_rcv, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_DEBUG_PFKEY, "debug_pfkey", &debug_pfkey, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_DEBUG_VERBOSE, "debug_verbose",&sysctl_ipsec_debug_verbose, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+#endif /* CTL_TABLE_PARENT */ -+#endif /* CONFIG_KLIPS_DEBUG */ -+ -+#ifdef CONFIG_KLIPS_IPCOMP -+#ifdef CTL_TABLE_PARENT -+ { .ctl_name = NET_IPSEC_DEBUG_IPCOMP, -+ .procname = "debug_ipcomp", -+ .data = &sysctl_ipsec_debug_ipcomp, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+#else -+ { NET_IPSEC_DEBUG_IPCOMP, "debug_ipcomp", &sysctl_ipsec_debug_ipcomp, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+#endif -+#endif /* CONFIG_KLIPS_IPCOMP */ -+ -+#ifdef CONFIG_KLIPS_REGRESS -+#ifdef CTL_TABLE_PARENT -+ { -+ .ctl_name = NET_IPSEC_REGRESS_PFKEY_LOSSAGE, -+ .procname = "pfkey_lossage", -+ .data = &sysctl_ipsec_regress_pfkey_lossage, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+#else -+ { NET_IPSEC_REGRESS_PFKEY_LOSSAGE, "pfkey_lossage", -+ &sysctl_ipsec_regress_pfkey_lossage, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+#endif /* CTL_TABLE_PARENT */ -+#endif /* CONFIG_KLIPS_REGRESS */ -+ -+#ifdef CTL_TABLE_PARENT -+ { .ctl_name = NET_IPSEC_ICMP, -+ .procname = "icmp", -+ .data = &sysctl_ipsec_icmp, 
-+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_INBOUND_POLICY_CHECK, -+ .procname = "inbound_policy_check", -+ .data = &sysctl_ipsec_inbound_policy_check, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ -+ { .ctl_name = NET_IPSEC_TOS, -+ .procname = "tos", -+ .data = &sysctl_ipsec_tos, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .child = NULL, -+ .proc_handler = &proc_dointvec, -+ }, -+ {0} -+#else -+ { NET_IPSEC_ICMP, "icmp", &sysctl_ipsec_icmp, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_INBOUND_POLICY_CHECK, "inbound_policy_check", &sysctl_ipsec_inbound_policy_check, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ { NET_IPSEC_TOS, "tos", &sysctl_ipsec_tos, -+ sizeof(int), 0644, NULL, .proc_handler = &proc_dointvec}, -+ {0} -+#endif -+}; -+ -+static ctl_table ipsec_net_table[] = { -+#ifdef CTL_TABLE_PARENT -+ { .ctl_name = NET_IPSEC, -+ .procname = "ipsec", -+ .data = NULL, -+ .maxlen = 0, -+ .mode = 0555, -+ .child = ipsec_table, -+ .proc_handler = NULL, -+ }, -+ { 0 } -+#else -+ { NET_IPSEC, "ipsec", NULL, 0, 0555, ipsec_table }, -+ { 0 } -+#endif -+}; -+ -+static ctl_table ipsec_root_table[] = { -+#ifdef CTL_TABLE_PARENT -+ { .ctl_name = CTL_NET, -+ .procname = "net", -+ .data = NULL, -+ .maxlen = 0, -+ .mode = 0555, -+ .child = ipsec_net_table, -+ .proc_handler = NULL, -+ }, -+ { 0 } -+#else -+ { CTL_NET, "net", NULL, 0, 0555, ipsec_net_table }, -+ { 0 } -+#endif -+}; -+ -+static struct ctl_table_header *ipsec_table_header; -+ -+int ipsec_sysctl_register(void) -+{ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,21) -+ ipsec_table_header = register_sysctl_table(ipsec_root_table); -+#else -+ ipsec_table_header = register_sysctl_table(ipsec_root_table, 0); -+#endif -+ if (!ipsec_table_header) { -+ return -ENOMEM; -+ } -+ return 0; -+} -+ -+void ipsec_sysctl_unregister(void) 
-+{ -+ unregister_sysctl_table(ipsec_table_header); -+} -+ -+#endif /* CONFIG_SYSCTL */ -+ -+/* -+ * -+ * Local Variables: -+ * c-file-style: "linux" -+ * End: -+ * -+ */ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/trees.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,1214 @@ -+/* trees.c -- output deflated data using Huffman coding -+ * Copyright (C) 1995-2002 Jean-loup Gailly -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* -+ * ALGORITHM -+ * -+ * The "deflation" process uses several Huffman trees. The more -+ * common source values are represented by shorter bit sequences. -+ * -+ * Each code tree is stored in a compressed form which is itself -+ * a Huffman encoding of the lengths of all the code strings (in -+ * ascending order by source values). The actual code strings are -+ * reconstructed from the lengths in the inflate process, as described -+ * in the deflate specification. -+ * -+ * REFERENCES -+ * -+ * Deutsch, L.P.,"'Deflate' Compressed Data Format Specification". -+ * Available in ftp.uu.net:/pub/archiving/zip/doc/deflate-1.1.doc -+ * -+ * Storer, James A. -+ * Data Compression: Methods and Theory, pp. 49-50. -+ * Computer Science Press, 1988. ISBN 0-7167-8156-5. -+ * -+ * Sedgewick, R. -+ * Algorithms, p290. -+ * Addison-Wesley, 1983. ISBN 0-201-06672-6. 
-+ */ -+ -+/* @(#) $Id: trees.c,v 1.4 2004/07/10 07:48:39 mcr Exp $ */ -+ -+/* #define GEN_TREES_H */ -+ -+#include "deflate.h" -+ -+#ifdef DEBUG -+# include -+#endif -+ -+/* =========================================================================== -+ * Constants -+ */ -+ -+#define MAX_BL_BITS 7 -+/* Bit length codes must not exceed MAX_BL_BITS bits */ -+ -+#define END_BLOCK 256 -+/* end of block literal code */ -+ -+#define REP_3_6 16 -+/* repeat previous bit length 3-6 times (2 bits of repeat count) */ -+ -+#define REPZ_3_10 17 -+/* repeat a zero length 3-10 times (3 bits of repeat count) */ -+ -+#define REPZ_11_138 18 -+/* repeat a zero length 11-138 times (7 bits of repeat count) */ -+ -+local const int extra_lbits[LENGTH_CODES] /* extra bits for each length code */ -+ = {0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0}; -+ -+local const int extra_dbits[D_CODES] /* extra bits for each distance code */ -+ = {0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; -+ -+local const int extra_blbits[BL_CODES]/* extra bits for each bit length code */ -+ = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,7}; -+ -+local const uch bl_order[BL_CODES] -+ = {16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15}; -+/* The lengths of the bit length codes are sent in order of decreasing -+ * probability, to avoid transmitting the lengths for unused bit length codes. -+ */ -+ -+#define Buf_size (8 * 2*sizeof(char)) -+/* Number of bits used within bi_buf. (bi_buf might be implemented on -+ * more than 16 bits on some systems.) -+ */ -+ -+/* =========================================================================== -+ * Local data. These are initialized only once. -+ */ -+ -+#define DIST_CODE_LEN 512 /* see definition of array dist_code below */ -+ -+#if defined(GEN_TREES_H) || !defined(STDC) -+/* non ANSI compilers may not accept trees.h */ -+ -+local ct_data static_ltree[L_CODES+2]; -+/* The static literal tree. 
Since the bit lengths are imposed, there is no -+ * need for the L_CODES extra codes used during heap construction. However -+ * The codes 286 and 287 are needed to build a canonical tree (see _tr_init -+ * below). -+ */ -+ -+local ct_data static_dtree[D_CODES]; -+/* The static distance tree. (Actually a trivial tree since all codes use -+ * 5 bits.) -+ */ -+ -+uch _dist_code[DIST_CODE_LEN]; -+/* Distance codes. The first 256 values correspond to the distances -+ * 3 .. 258, the last 256 values correspond to the top 8 bits of -+ * the 15 bit distances. -+ */ -+ -+uch _length_code[MAX_MATCH-MIN_MATCH+1]; -+/* length code for each normalized match length (0 == MIN_MATCH) */ -+ -+local int base_length[LENGTH_CODES]; -+/* First normalized length for each code (0 = MIN_MATCH) */ -+ -+local int base_dist[D_CODES]; -+/* First normalized distance for each code (0 = distance of 1) */ -+ -+#else -+# include "trees.h" -+#endif /* GEN_TREES_H */ -+ -+struct static_tree_desc_s { -+ const ct_data *static_tree; /* static tree or NULL */ -+ const intf *extra_bits; /* extra bits for each code or NULL */ -+ int extra_base; /* base index for extra_bits */ -+ int elems; /* max number of elements in the tree */ -+ int max_length; /* max bit length for the codes */ -+}; -+ -+local static_tree_desc static_l_desc = -+{static_ltree, extra_lbits, LITERALS+1, L_CODES, MAX_BITS}; -+ -+local static_tree_desc static_d_desc = -+{static_dtree, extra_dbits, 0, D_CODES, MAX_BITS}; -+ -+local static_tree_desc static_bl_desc = -+{(const ct_data *)0, extra_blbits, 0, BL_CODES, MAX_BL_BITS}; -+ -+/* =========================================================================== -+ * Local (static) routines in this file. 
-+ */ -+ -+local void tr_static_init OF((void)); -+local void init_block OF((deflate_state *s)); -+local void pqdownheap OF((deflate_state *s, ct_data *tree, int k)); -+local void gen_bitlen OF((deflate_state *s, tree_desc *desc)); -+local void gen_codes OF((ct_data *tree, int max_code, ushf *bl_count)); -+local void build_tree OF((deflate_state *s, tree_desc *desc)); -+local void scan_tree OF((deflate_state *s, ct_data *tree, int max_code)); -+local void send_tree OF((deflate_state *s, ct_data *tree, int max_code)); -+local int build_bl_tree OF((deflate_state *s)); -+local void send_all_trees OF((deflate_state *s, int lcodes, int dcodes, -+ int blcodes)); -+local void compress_block OF((deflate_state *s, const ct_data *ltree, -+ const ct_data *dtree)); -+local void set_data_type OF((deflate_state *s)); -+local unsigned bi_reverse OF((unsigned value, int length)); -+local void bi_windup OF((deflate_state *s)); -+local void bi_flush OF((deflate_state *s)); -+local void copy_block OF((deflate_state *s, charf *buf, unsigned len, -+ int header)); -+ -+#ifdef GEN_TREES_H -+local void gen_trees_header OF((void)); -+#endif -+ -+#ifndef DEBUG -+# define send_code(s, c, tree) send_bits(s, tree[c].Code, tree[c].Len) -+ /* Send a code of the given tree. c and tree must not have side effects */ -+ -+#else /* DEBUG */ -+# define send_code(s, c, tree) \ -+ { if (z_verbose>2) fprintf(stderr,"\ncd %3d ",(c)); \ -+ send_bits(s, tree[c].Code, tree[c].Len); } -+#endif -+ -+/* =========================================================================== -+ * Output a short LSB first on the stream. -+ * IN assertion: there is enough room in pendingBuf. -+ */ -+#define put_short(s, w) { \ -+ put_byte(s, (uch)((w) & 0xff)); \ -+ put_byte(s, (uch)((ush)(w) >> 8)); \ -+} -+ -+/* =========================================================================== -+ * Send a value on a given number of bits. -+ * IN assertion: length <= 16 and value fits in length bits. 
-+ */ -+#ifdef DEBUG -+local void send_bits OF((deflate_state *s, int value, int length)); -+ -+local void send_bits(s, value, length) -+ deflate_state *s; -+ int value; /* value to send */ -+ int length; /* number of bits */ -+{ -+ Tracevv((stderr," l %2d v %4x ", length, value)); -+ Assert(length > 0 && length <= 15, "invalid length"); -+ s->bits_sent += (ulg)length; -+ -+ /* If not enough room in bi_buf, use (valid) bits from bi_buf and -+ * (16 - bi_valid) bits from value, leaving (width - (16-bi_valid)) -+ * unused bits in value. -+ */ -+ if (s->bi_valid > (int)Buf_size - length) { -+ s->bi_buf |= (value << s->bi_valid); -+ put_short(s, s->bi_buf); -+ s->bi_buf = (ush)value >> (Buf_size - s->bi_valid); -+ s->bi_valid += length - Buf_size; -+ } else { -+ s->bi_buf |= value << s->bi_valid; -+ s->bi_valid += length; -+ } -+} -+#else /* !DEBUG */ -+ -+#define send_bits(s, value, length) \ -+{ int len = length;\ -+ if (s->bi_valid > (int)Buf_size - len) {\ -+ int val = value;\ -+ s->bi_buf |= (val << s->bi_valid);\ -+ put_short(s, s->bi_buf);\ -+ s->bi_buf = (ush)val >> (Buf_size - s->bi_valid);\ -+ s->bi_valid += len - Buf_size;\ -+ } else {\ -+ s->bi_buf |= (value) << s->bi_valid;\ -+ s->bi_valid += len;\ -+ }\ -+} -+#endif /* DEBUG */ -+ -+ -+#define MAX(a,b) (a >= b ? a : b) -+/* the arguments must not have side effects */ -+ -+/* =========================================================================== -+ * Initialize the various 'constant' tables. 
-+ */ -+local void tr_static_init() -+{ -+#if defined(GEN_TREES_H) || !defined(STDC) -+ static int static_init_done = 0; -+ int n; /* iterates over tree elements */ -+ int bits; /* bit counter */ -+ int length; /* length value */ -+ int code; /* code value */ -+ int dist; /* distance index */ -+ ush bl_count[MAX_BITS+1]; -+ /* number of codes at each bit length for an optimal tree */ -+ -+ if (static_init_done) return; -+ -+ /* For some embedded targets, global variables are not initialized: */ -+ static_l_desc.static_tree = static_ltree; -+ static_l_desc.extra_bits = extra_lbits; -+ static_d_desc.static_tree = static_dtree; -+ static_d_desc.extra_bits = extra_dbits; -+ static_bl_desc.extra_bits = extra_blbits; -+ -+ /* Initialize the mapping length (0..255) -> length code (0..28) */ -+ length = 0; -+ for (code = 0; code < LENGTH_CODES-1; code++) { -+ base_length[code] = length; -+ for (n = 0; n < (1< dist code (0..29) */ -+ dist = 0; -+ for (code = 0 ; code < 16; code++) { -+ base_dist[code] = dist; -+ for (n = 0; n < (1<>= 7; /* from now on, all distances are divided by 128 */ -+ for ( ; code < D_CODES; code++) { -+ base_dist[code] = dist << 7; -+ for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) { -+ _dist_code[256 + dist++] = (uch)code; -+ } -+ } -+ Assert (dist == 256, "tr_static_init: 256+dist != 512"); -+ -+ /* Construct the codes of the static literal tree */ -+ for (bits = 0; bits <= MAX_BITS; bits++) bl_count[bits] = 0; -+ n = 0; -+ while (n <= 143) static_ltree[n++].Len = 8, bl_count[8]++; -+ while (n <= 255) static_ltree[n++].Len = 9, bl_count[9]++; -+ while (n <= 279) static_ltree[n++].Len = 7, bl_count[7]++; -+ while (n <= 287) static_ltree[n++].Len = 8, bl_count[8]++; -+ /* Codes 286 and 287 do not exist, but we must include them in the -+ * tree construction to get a canonical Huffman tree (longest code -+ * all ones) -+ */ -+ gen_codes((ct_data *)static_ltree, L_CODES+1, bl_count); -+ -+ /* The static distance tree is trivial: */ -+ for (n = 0; n < 
D_CODES; n++) { -+ static_dtree[n].Len = 5; -+ static_dtree[n].Code = bi_reverse((unsigned)n, 5); -+ } -+ static_init_done = 1; -+ -+# ifdef GEN_TREES_H -+ gen_trees_header(); -+# endif -+#endif /* defined(GEN_TREES_H) || !defined(STDC) */ -+} -+ -+/* =========================================================================== -+ * Genererate the file trees.h describing the static trees. -+ */ -+#ifdef GEN_TREES_H -+# ifndef DEBUG -+# include -+# endif -+ -+# define SEPARATOR(i, last, width) \ -+ ((i) == (last)? "\n};\n\n" : \ -+ ((i) % (width) == (width)-1 ? ",\n" : ", ")) -+ -+void gen_trees_header() -+{ -+ FILE *header = fopen("trees.h", "w"); -+ int i; -+ -+ Assert (header != NULL, "Can't open trees.h"); -+ fprintf(header, -+ "/* header created automatically with -DGEN_TREES_H */\n\n"); -+ -+ fprintf(header, "local const ct_data static_ltree[L_CODES+2] = {\n"); -+ for (i = 0; i < L_CODES+2; i++) { -+ fprintf(header, "{{%3u},{%3u}}%s", static_ltree[i].Code, -+ static_ltree[i].Len, SEPARATOR(i, L_CODES+1, 5)); -+ } -+ -+ fprintf(header, "local const ct_data static_dtree[D_CODES] = {\n"); -+ for (i = 0; i < D_CODES; i++) { -+ fprintf(header, "{{%2u},{%2u}}%s", static_dtree[i].Code, -+ static_dtree[i].Len, SEPARATOR(i, D_CODES-1, 5)); -+ } -+ -+ fprintf(header, "const uch _dist_code[DIST_CODE_LEN] = {\n"); -+ for (i = 0; i < DIST_CODE_LEN; i++) { -+ fprintf(header, "%2u%s", _dist_code[i], -+ SEPARATOR(i, DIST_CODE_LEN-1, 20)); -+ } -+ -+ fprintf(header, "const uch _length_code[MAX_MATCH-MIN_MATCH+1]= {\n"); -+ for (i = 0; i < MAX_MATCH-MIN_MATCH+1; i++) { -+ fprintf(header, "%2u%s", _length_code[i], -+ SEPARATOR(i, MAX_MATCH-MIN_MATCH, 20)); -+ } -+ -+ fprintf(header, "local const int base_length[LENGTH_CODES] = {\n"); -+ for (i = 0; i < LENGTH_CODES; i++) { -+ fprintf(header, "%1u%s", base_length[i], -+ SEPARATOR(i, LENGTH_CODES-1, 20)); -+ } -+ -+ fprintf(header, "local const int base_dist[D_CODES] = {\n"); -+ for (i = 0; i < D_CODES; i++) { -+ fprintf(header, 
"%5u%s", base_dist[i], -+ SEPARATOR(i, D_CODES-1, 10)); -+ } -+ -+ fclose(header); -+} -+#endif /* GEN_TREES_H */ -+ -+/* =========================================================================== -+ * Initialize the tree data structures for a new zlib stream. -+ */ -+void _tr_init(s) -+ deflate_state *s; -+{ -+ tr_static_init(); -+ -+ s->l_desc.dyn_tree = s->dyn_ltree; -+ s->l_desc.stat_desc = &static_l_desc; -+ -+ s->d_desc.dyn_tree = s->dyn_dtree; -+ s->d_desc.stat_desc = &static_d_desc; -+ -+ s->bl_desc.dyn_tree = s->bl_tree; -+ s->bl_desc.stat_desc = &static_bl_desc; -+ -+ s->bi_buf = 0; -+ s->bi_valid = 0; -+ s->last_eob_len = 8; /* enough lookahead for inflate */ -+#ifdef DEBUG -+ s->compressed_len = 0L; -+ s->bits_sent = 0L; -+#endif -+ -+ /* Initialize the first block of the first file: */ -+ init_block(s); -+} -+ -+/* =========================================================================== -+ * Initialize a new block. -+ */ -+local void init_block(s) -+ deflate_state *s; -+{ -+ int n; /* iterates over tree elements */ -+ -+ /* Initialize the trees. */ -+ for (n = 0; n < L_CODES; n++) s->dyn_ltree[n].Freq = 0; -+ for (n = 0; n < D_CODES; n++) s->dyn_dtree[n].Freq = 0; -+ for (n = 0; n < BL_CODES; n++) s->bl_tree[n].Freq = 0; -+ -+ s->dyn_ltree[END_BLOCK].Freq = 1; -+ s->opt_len = s->static_len = 0L; -+ s->last_lit = s->matches = 0; -+} -+ -+#define SMALLEST 1 -+/* Index within the heap array of least frequent node in the Huffman tree */ -+ -+ -+/* =========================================================================== -+ * Remove the smallest element from the heap and recreate the heap with -+ * one less element. Updates heap and heap_len. 
-+ */ -+#define pqremove(s, tree, top) \ -+{\ -+ top = s->heap[SMALLEST]; \ -+ s->heap[SMALLEST] = s->heap[s->heap_len--]; \ -+ pqdownheap(s, tree, SMALLEST); \ -+} -+ -+/* =========================================================================== -+ * Compares to subtrees, using the tree depth as tie breaker when -+ * the subtrees have equal frequency. This minimizes the worst case length. -+ */ -+#define smaller(tree, n, m, depth) \ -+ (tree[n].Freq < tree[m].Freq || \ -+ (tree[n].Freq == tree[m].Freq && depth[n] <= depth[m])) -+ -+/* =========================================================================== -+ * Restore the heap property by moving down the tree starting at node k, -+ * exchanging a node with the smallest of its two sons if necessary, stopping -+ * when the heap property is re-established (each father smaller than its -+ * two sons). -+ */ -+local void pqdownheap(s, tree, k) -+ deflate_state *s; -+ ct_data *tree; /* the tree to restore */ -+ int k; /* node to move down */ -+{ -+ int v = s->heap[k]; -+ int j = k << 1; /* left son of k */ -+ while (j <= s->heap_len) { -+ /* Set j to the smallest of the two sons: */ -+ if (j < s->heap_len && -+ smaller(tree, s->heap[j+1], s->heap[j], s->depth)) { -+ j++; -+ } -+ /* Exit if v is smaller than both sons */ -+ if (smaller(tree, v, s->heap[j], s->depth)) break; -+ -+ /* Exchange v with the smallest son */ -+ s->heap[k] = s->heap[j]; k = j; -+ -+ /* And continue down the tree, setting j to the left son of k */ -+ j <<= 1; -+ } -+ s->heap[k] = v; -+} -+ -+/* =========================================================================== -+ * Compute the optimal bit lengths for a tree and update the total bit length -+ * for the current block. -+ * IN assertion: the fields freq and dad are set, heap[heap_max] and -+ * above are the tree nodes sorted by increasing frequency. 
-+ * OUT assertions: the field len is set to the optimal bit length, the -+ * array bl_count contains the frequencies for each bit length. -+ * The length opt_len is updated; static_len is also updated if stree is -+ * not null. -+ */ -+local void gen_bitlen(s, desc) -+ deflate_state *s; -+ tree_desc *desc; /* the tree descriptor */ -+{ -+ ct_data *tree = desc->dyn_tree; -+ int max_code = desc->max_code; -+ const ct_data *stree = desc->stat_desc->static_tree; -+ const intf *extra = desc->stat_desc->extra_bits; -+ int base = desc->stat_desc->extra_base; -+ int max_length = desc->stat_desc->max_length; -+ int h; /* heap index */ -+ int n, m; /* iterate over the tree elements */ -+ int bits; /* bit length */ -+ int xbits; /* extra bits */ -+ ush f; /* frequency */ -+ int overflow = 0; /* number of elements with bit length too large */ -+ -+ for (bits = 0; bits <= MAX_BITS; bits++) s->bl_count[bits] = 0; -+ -+ /* In a first pass, compute the optimal bit lengths (which may -+ * overflow in the case of the bit length tree). 
-+ */ -+ tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */ -+ -+ for (h = s->heap_max+1; h < HEAP_SIZE; h++) { -+ n = s->heap[h]; -+ bits = tree[tree[n].Dad].Len + 1; -+ if (bits > max_length) bits = max_length, overflow++; -+ tree[n].Len = (ush)bits; -+ /* We overwrite tree[n].Dad which is no longer needed */ -+ -+ if (n > max_code) continue; /* not a leaf node */ -+ -+ s->bl_count[bits]++; -+ xbits = 0; -+ if (n >= base) xbits = extra[n-base]; -+ f = tree[n].Freq; -+ s->opt_len += (ulg)f * (bits + xbits); -+ if (stree) s->static_len += (ulg)f * (stree[n].Len + xbits); -+ } -+ if (overflow == 0) return; -+ -+ Trace((stderr,"\nbit length overflow\n")); -+ /* This happens for example on obj2 and pic of the Calgary corpus */ -+ -+ /* Find the first bit length which could increase: */ -+ do { -+ bits = max_length-1; -+ while (s->bl_count[bits] == 0) bits--; -+ s->bl_count[bits]--; /* move one leaf down the tree */ -+ s->bl_count[bits+1] += 2; /* move one overflow item as its brother */ -+ s->bl_count[max_length]--; -+ /* The brother of the overflow item also moves one step up, -+ * but this does not affect bl_count[max_length] -+ */ -+ overflow -= 2; -+ } while (overflow > 0); -+ -+ /* Now recompute all bit lengths, scanning in increasing frequency. -+ * h is still equal to HEAP_SIZE. (It is simpler to reconstruct all -+ * lengths instead of fixing only the wrong ones. This idea is taken -+ * from 'ar' written by Haruhiko Okumura.) 
-+ */ -+ for (bits = max_length; bits != 0; bits--) { -+ n = s->bl_count[bits]; -+ while (n != 0) { -+ m = s->heap[--h]; -+ if (m > max_code) continue; -+ if (tree[m].Len != (unsigned) bits) { -+ Trace((stderr,"code %d bits %d->%d\n", m, tree[m].Len, bits)); -+ s->opt_len += ((long)bits - (long)tree[m].Len) -+ *(long)tree[m].Freq; -+ tree[m].Len = (ush)bits; -+ } -+ n--; -+ } -+ } -+} -+ -+/* =========================================================================== -+ * Generate the codes for a given tree and bit counts (which need not be -+ * optimal). -+ * IN assertion: the array bl_count contains the bit length statistics for -+ * the given tree and the field len is set for all tree elements. -+ * OUT assertion: the field code is set for all tree elements of non -+ * zero code length. -+ */ -+local void gen_codes (tree, max_code, bl_count) -+ ct_data *tree; /* the tree to decorate */ -+ int max_code; /* largest code with non zero frequency */ -+ ushf *bl_count; /* number of codes at each bit length */ -+{ -+ ush next_code[MAX_BITS+1]; /* next code value for each bit length */ -+ ush code = 0; /* running code value */ -+ int bits; /* bit index */ -+ int n; /* code index */ -+ -+ /* The distribution counts are first used to generate the code values -+ * without bit reversal. -+ */ -+ for (bits = 1; bits <= MAX_BITS; bits++) { -+ next_code[bits] = code = (code + bl_count[bits-1]) << 1; -+ } -+ /* Check that the bit counts in bl_count are consistent. The last code -+ * must be all ones. -+ */ -+ Assert (code + bl_count[MAX_BITS]-1 == (1<dyn_tree; -+ const ct_data *stree = desc->stat_desc->static_tree; -+ int elems = desc->stat_desc->elems; -+ int n, m; /* iterate over heap elements */ -+ int max_code = -1; /* largest code with non zero frequency */ -+ int node; /* new node being created */ -+ -+ /* Construct the initial heap, with least frequent element in -+ * heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1]. -+ * heap[0] is not used. 
-+ */ -+ s->heap_len = 0, s->heap_max = HEAP_SIZE; -+ -+ for (n = 0; n < elems; n++) { -+ if (tree[n].Freq != 0) { -+ s->heap[++(s->heap_len)] = max_code = n; -+ s->depth[n] = 0; -+ } else { -+ tree[n].Len = 0; -+ } -+ } -+ -+ /* The pkzip format requires that at least one distance code exists, -+ * and that at least one bit should be sent even if there is only one -+ * possible code. So to avoid special checks later on we force at least -+ * two codes of non zero frequency. -+ */ -+ while (s->heap_len < 2) { -+ node = s->heap[++(s->heap_len)] = (max_code < 2 ? ++max_code : 0); -+ tree[node].Freq = 1; -+ s->depth[node] = 0; -+ s->opt_len--; if (stree) s->static_len -= stree[node].Len; -+ /* node is 0 or 1 so it does not have extra bits */ -+ } -+ desc->max_code = max_code; -+ -+ /* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree, -+ * establish sub-heaps of increasing lengths: -+ */ -+ for (n = s->heap_len/2; n >= 1; n--) pqdownheap(s, tree, n); -+ -+ /* Construct the Huffman tree by repeatedly combining the least two -+ * frequent nodes. 
-+ */ -+ node = elems; /* next internal node of the tree */ -+ do { -+ pqremove(s, tree, n); /* n = node of least frequency */ -+ m = s->heap[SMALLEST]; /* m = node of next least frequency */ -+ -+ s->heap[--(s->heap_max)] = n; /* keep the nodes sorted by frequency */ -+ s->heap[--(s->heap_max)] = m; -+ -+ /* Create a new node father of n and m */ -+ tree[node].Freq = tree[n].Freq + tree[m].Freq; -+ s->depth[node] = (uch) (MAX(s->depth[n], s->depth[m]) + 1); -+ tree[n].Dad = tree[m].Dad = (ush)node; -+#ifdef DUMP_BL_TREE -+ if (tree == s->bl_tree) { -+ fprintf(stderr,"\nnode %d(%d), sons %d(%d) %d(%d)", -+ node, tree[node].Freq, n, tree[n].Freq, m, tree[m].Freq); -+ } -+#endif -+ /* and insert the new node in the heap */ -+ s->heap[SMALLEST] = node++; -+ pqdownheap(s, tree, SMALLEST); -+ -+ } while (s->heap_len >= 2); -+ -+ s->heap[--(s->heap_max)] = s->heap[SMALLEST]; -+ -+ /* At this point, the fields freq and dad are set. We can now -+ * generate the bit lengths. -+ */ -+ gen_bitlen(s, (tree_desc *)desc); -+ -+ /* The field len is now set, we can generate the bit codes */ -+ gen_codes ((ct_data *)tree, max_code, s->bl_count); -+} -+ -+/* =========================================================================== -+ * Scan a literal or distance tree to determine the frequencies of the codes -+ * in the bit length tree. 
-+ */ -+local void scan_tree (s, tree, max_code) -+ deflate_state *s; -+ ct_data *tree; /* the tree to be scanned */ -+ int max_code; /* and its largest code of non zero frequency */ -+{ -+ int n; /* iterates over all tree elements */ -+ int prevlen = -1; /* last emitted length */ -+ int curlen; /* length of current code */ -+ int nextlen = tree[0].Len; /* length of next code */ -+ int count = 0; /* repeat count of the current code */ -+ int max_count = 7; /* max repeat count */ -+ int min_count = 4; /* min repeat count */ -+ -+ if (nextlen == 0) max_count = 138, min_count = 3; -+ tree[max_code+1].Len = (ush)0xffff; /* guard */ -+ -+ for (n = 0; n <= max_code; n++) { -+ curlen = nextlen; nextlen = tree[n+1].Len; -+ if (++count < max_count && curlen == nextlen) { -+ continue; -+ } else if (count < min_count) { -+ s->bl_tree[curlen].Freq += count; -+ } else if (curlen != 0) { -+ if (curlen != prevlen) s->bl_tree[curlen].Freq++; -+ s->bl_tree[REP_3_6].Freq++; -+ } else if (count <= 10) { -+ s->bl_tree[REPZ_3_10].Freq++; -+ } else { -+ s->bl_tree[REPZ_11_138].Freq++; -+ } -+ count = 0; prevlen = curlen; -+ if (nextlen == 0) { -+ max_count = 138, min_count = 3; -+ } else if (curlen == nextlen) { -+ max_count = 6, min_count = 3; -+ } else { -+ max_count = 7, min_count = 4; -+ } -+ } -+} -+ -+/* =========================================================================== -+ * Send a literal or distance tree in compressed form, using the codes in -+ * bl_tree. 
-+ */ -+local void send_tree (s, tree, max_code) -+ deflate_state *s; -+ ct_data *tree; /* the tree to be scanned */ -+ int max_code; /* and its largest code of non zero frequency */ -+{ -+ int n; /* iterates over all tree elements */ -+ int prevlen = -1; /* last emitted length */ -+ int curlen; /* length of current code */ -+ int nextlen = tree[0].Len; /* length of next code */ -+ int count = 0; /* repeat count of the current code */ -+ int max_count = 7; /* max repeat count */ -+ int min_count = 4; /* min repeat count */ -+ -+ /* tree[max_code+1].Len = -1; */ /* guard already set */ -+ if (nextlen == 0) max_count = 138, min_count = 3; -+ -+ for (n = 0; n <= max_code; n++) { -+ curlen = nextlen; nextlen = tree[n+1].Len; -+ if (++count < max_count && curlen == nextlen) { -+ continue; -+ } else if (count < min_count) { -+ do { send_code(s, curlen, s->bl_tree); } while (--count != 0); -+ -+ } else if (curlen != 0) { -+ if (curlen != prevlen) { -+ send_code(s, curlen, s->bl_tree); count--; -+ } -+ Assert(count >= 3 && count <= 6, " 3_6?"); -+ send_code(s, REP_3_6, s->bl_tree); send_bits(s, count-3, 2); -+ -+ } else if (count <= 10) { -+ send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count-3, 3); -+ -+ } else { -+ send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count-11, 7); -+ } -+ count = 0; prevlen = curlen; -+ if (nextlen == 0) { -+ max_count = 138, min_count = 3; -+ } else if (curlen == nextlen) { -+ max_count = 6, min_count = 3; -+ } else { -+ max_count = 7, min_count = 4; -+ } -+ } -+} -+ -+/* =========================================================================== -+ * Construct the Huffman tree for the bit lengths and return the index in -+ * bl_order of the last bit length code to send. 
-+ */ -+local int build_bl_tree(s) -+ deflate_state *s; -+{ -+ int max_blindex; /* index of last bit length code of non zero freq */ -+ -+ /* Determine the bit length frequencies for literal and distance trees */ -+ scan_tree(s, (ct_data *)s->dyn_ltree, s->l_desc.max_code); -+ scan_tree(s, (ct_data *)s->dyn_dtree, s->d_desc.max_code); -+ -+ /* Build the bit length tree: */ -+ build_tree(s, (tree_desc *)(&(s->bl_desc))); -+ /* opt_len now includes the length of the tree representations, except -+ * the lengths of the bit lengths codes and the 5+5+4 bits for the counts. -+ */ -+ -+ /* Determine the number of bit length codes to send. The pkzip format -+ * requires that at least 4 bit length codes be sent. (appnote.txt says -+ * 3 but the actual value used is 4.) -+ */ -+ for (max_blindex = BL_CODES-1; max_blindex >= 3; max_blindex--) { -+ if (s->bl_tree[bl_order[max_blindex]].Len != 0) break; -+ } -+ /* Update opt_len to include the bit length tree and counts */ -+ s->opt_len += 3*(max_blindex+1) + 5+5+4; -+ Tracev((stderr, "\ndyn trees: dyn %ld, stat %ld", -+ s->opt_len, s->static_len)); -+ -+ return max_blindex; -+} -+ -+/* =========================================================================== -+ * Send the header for a block using dynamic Huffman trees: the counts, the -+ * lengths of the bit length codes, the literal tree and the distance tree. -+ * IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4. 
-+ */ -+local void send_all_trees(s, lcodes, dcodes, blcodes) -+ deflate_state *s; -+ int lcodes, dcodes, blcodes; /* number of codes for each tree */ -+{ -+ int rank; /* index in bl_order */ -+ -+ Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes"); -+ Assert (lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES, -+ "too many codes"); -+ Tracev((stderr, "\nbl counts: ")); -+ send_bits(s, lcodes-257, 5); /* not +255 as stated in appnote.txt */ -+ send_bits(s, dcodes-1, 5); -+ send_bits(s, blcodes-4, 4); /* not -3 as stated in appnote.txt */ -+ for (rank = 0; rank < blcodes; rank++) { -+ Tracev((stderr, "\nbl code %2d ", bl_order[rank])); -+ send_bits(s, s->bl_tree[bl_order[rank]].Len, 3); -+ } -+ Tracev((stderr, "\nbl tree: sent %ld", s->bits_sent)); -+ -+ send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */ -+ Tracev((stderr, "\nlit tree: sent %ld", s->bits_sent)); -+ -+ send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */ -+ Tracev((stderr, "\ndist tree: sent %ld", s->bits_sent)); -+} -+ -+/* =========================================================================== -+ * Send a stored block -+ */ -+void _tr_stored_block(s, buf, stored_len, eof) -+ deflate_state *s; -+ charf *buf; /* input block */ -+ ulg stored_len; /* length of input block */ -+ int eof; /* true if this is the last block for a file */ -+{ -+ send_bits(s, (STORED_BLOCK<<1)+eof, 3); /* send block type */ -+#ifdef DEBUG -+ s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L; -+ s->compressed_len += (stored_len + 4) << 3; -+#endif -+ copy_block(s, buf, (unsigned)stored_len, 1); /* with header */ -+} -+ -+/* =========================================================================== -+ * Send one empty static block to give enough lookahead for inflate. -+ * This takes 10 bits, of which 7 may remain in the bit buffer. -+ * The current inflate code requires 9 bits of lookahead. 
If the -+ * last two codes for the previous block (real code plus EOB) were coded -+ * on 5 bits or less, inflate may have only 5+3 bits of lookahead to decode -+ * the last real code. In this case we send two empty static blocks instead -+ * of one. (There are no problems if the previous block is stored or fixed.) -+ * To simplify the code, we assume the worst case of last real code encoded -+ * on one bit only. -+ */ -+void _tr_align(s) -+ deflate_state *s; -+{ -+ send_bits(s, STATIC_TREES<<1, 3); -+ send_code(s, END_BLOCK, static_ltree); -+#ifdef DEBUG -+ s->compressed_len += 10L; /* 3 for block type, 7 for EOB */ -+#endif -+ bi_flush(s); -+ /* Of the 10 bits for the empty block, we have already sent -+ * (10 - bi_valid) bits. The lookahead for the last real code (before -+ * the EOB of the previous block) was thus at least one plus the length -+ * of the EOB plus what we have just sent of the empty static block. -+ */ -+ if (1 + s->last_eob_len + 10 - s->bi_valid < 9) { -+ send_bits(s, STATIC_TREES<<1, 3); -+ send_code(s, END_BLOCK, static_ltree); -+#ifdef DEBUG -+ s->compressed_len += 10L; -+#endif -+ bi_flush(s); -+ } -+ s->last_eob_len = 7; -+} -+ -+/* =========================================================================== -+ * Determine the best encoding for the current block: dynamic trees, static -+ * trees or store, and output the encoded block to the zip file. 
-+ */ -+void _tr_flush_block(s, buf, stored_len, eof) -+ deflate_state *s; -+ charf *buf; /* input block, or NULL if too old */ -+ ulg stored_len; /* length of input block */ -+ int eof; /* true if this is the last block for a file */ -+{ -+ ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */ -+ int max_blindex = 0; /* index of last bit length code of non zero freq */ -+ -+ /* Build the Huffman trees unless a stored block is forced */ -+ if (s->level > 0) { -+ -+ /* Check if the file is ascii or binary */ -+ if (s->data_type == Z_UNKNOWN) set_data_type(s); -+ -+ /* Construct the literal and distance trees */ -+ build_tree(s, (tree_desc *)(&(s->l_desc))); -+ Tracev((stderr, "\nlit data: dyn %ld, stat %ld", s->opt_len, -+ s->static_len)); -+ -+ build_tree(s, (tree_desc *)(&(s->d_desc))); -+ Tracev((stderr, "\ndist data: dyn %ld, stat %ld", s->opt_len, -+ s->static_len)); -+ /* At this point, opt_len and static_len are the total bit lengths of -+ * the compressed block data, excluding the tree representations. -+ */ -+ -+ /* Build the bit length tree for the above two trees, and get the index -+ * in bl_order of the last bit length code to send. -+ */ -+ max_blindex = build_bl_tree(s); -+ -+ /* Determine the best encoding. Compute first the block length in bytes*/ -+ opt_lenb = (s->opt_len+3+7)>>3; -+ static_lenb = (s->static_len+3+7)>>3; -+ -+ Tracev((stderr, "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ", -+ opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len, -+ s->last_lit)); -+ -+ if (static_lenb <= opt_lenb) opt_lenb = static_lenb; -+ -+ } else { -+ Assert(buf != (char*)0, "lost buf"); -+ opt_lenb = static_lenb = stored_len + 5; /* force a stored block */ -+ } -+ -+#ifdef FORCE_STORED -+ if (buf != (char*)0) { /* force stored block */ -+#else -+ if (stored_len+4 <= opt_lenb && buf != (char*)0) { -+ /* 4: two words for the lengths */ -+#endif -+ /* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE. 
-+ * Otherwise we can't have processed more than WSIZE input bytes since -+ * the last block flush, because compression would have been -+ * successful. If LIT_BUFSIZE <= WSIZE, it is never too late to -+ * transform a block into a stored block. -+ */ -+ _tr_stored_block(s, buf, stored_len, eof); -+ -+#ifdef FORCE_STATIC -+ } else if (static_lenb >= 0) { /* force static trees */ -+#else -+ } else if (static_lenb == opt_lenb) { -+#endif -+ send_bits(s, (STATIC_TREES<<1)+eof, 3); -+ compress_block(s, static_ltree, static_dtree); -+#ifdef DEBUG -+ s->compressed_len += 3 + s->static_len; -+#endif -+ } else { -+ send_bits(s, (DYN_TREES<<1)+eof, 3); -+ send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1, -+ max_blindex+1); -+ compress_block(s, s->dyn_ltree, s->dyn_dtree); -+#ifdef DEBUG -+ s->compressed_len += 3 + s->opt_len; -+#endif -+ } -+ Assert (s->compressed_len == s->bits_sent, "bad compressed size"); -+ /* The above check is made mod 2^32, for files larger than 512 MB -+ * and uLong implemented on 32 bits. -+ */ -+ init_block(s); -+ -+ if (eof) { -+ bi_windup(s); -+#ifdef DEBUG -+ s->compressed_len += 7; /* align on byte boundary */ -+#endif -+ } -+ Tracev((stderr,"\ncomprlen %lu(%lu) ", s->compressed_len>>3, -+ s->compressed_len-7*eof)); -+} -+ -+/* =========================================================================== -+ * Save the match info and tally the frequency counts. Return true if -+ * the current block must be flushed. 
-+ */ -+int _tr_tally (s, dist, lc) -+ deflate_state *s; -+ unsigned dist; /* distance of matched string */ -+ unsigned lc; /* match length-MIN_MATCH or unmatched char (if dist==0) */ -+{ -+ s->d_buf[s->last_lit] = (ush)dist; -+ s->l_buf[s->last_lit++] = (uch)lc; -+ if (dist == 0) { -+ /* lc is the unmatched char */ -+ s->dyn_ltree[lc].Freq++; -+ } else { -+ s->matches++; -+ /* Here, lc is the match length - MIN_MATCH */ -+ dist--; /* dist = match distance - 1 */ -+ Assert((ush)dist < (ush)MAX_DIST(s) && -+ (ush)lc <= (ush)(MAX_MATCH-MIN_MATCH) && -+ (ush)d_code(dist) < (ush)D_CODES, "_tr_tally: bad match"); -+ -+ s->dyn_ltree[_length_code[lc]+LITERALS+1].Freq++; -+ s->dyn_dtree[d_code(dist)].Freq++; -+ } -+ -+#ifdef TRUNCATE_BLOCK -+ /* Try to guess if it is profitable to stop the current block here */ -+ if ((s->last_lit & 0x1fff) == 0 && s->level > 2) { -+ /* Compute an upper bound for the compressed length */ -+ ulg out_length = (ulg)s->last_lit*8L; -+ ulg in_length = (ulg)((long)s->strstart - s->block_start); -+ int dcode; -+ for (dcode = 0; dcode < D_CODES; dcode++) { -+ out_length += (ulg)s->dyn_dtree[dcode].Freq * -+ (5L+extra_dbits[dcode]); -+ } -+ out_length >>= 3; -+ Tracev((stderr,"\nlast_lit %u, in %ld, out ~%ld(%ld%%) ", -+ s->last_lit, in_length, out_length, -+ 100L - out_length*100L/in_length)); -+ if (s->matches < s->last_lit/2 && out_length < in_length/2) return 1; -+ } -+#endif -+ return (s->last_lit == s->lit_bufsize-1); -+ /* We avoid equality with lit_bufsize because of wraparound at 64K -+ * on 16 bit machines and because stored blocks are restricted to -+ * 64K-1 bytes. 
-+ */ -+} -+ -+/* =========================================================================== -+ * Send the block data compressed using the given Huffman trees -+ */ -+local void compress_block(s, ltree, dtree) -+ deflate_state *s; -+ const ct_data *ltree; /* literal tree */ -+ const ct_data *dtree; /* distance tree */ -+{ -+ unsigned dist; /* distance of matched string */ -+ int lc; /* match length or unmatched char (if dist == 0) */ -+ unsigned lx = 0; /* running index in l_buf */ -+ unsigned code; /* the code to send */ -+ int extra; /* number of extra bits to send */ -+ -+ if (s->last_lit != 0) do { -+ dist = s->d_buf[lx]; -+ lc = s->l_buf[lx++]; -+ if (dist == 0) { -+ send_code(s, lc, ltree); /* send a literal byte */ -+ Tracecv(isgraph(lc), (stderr," '%c' ", lc)); -+ } else { -+ /* Here, lc is the match length - MIN_MATCH */ -+ code = _length_code[lc]; -+ send_code(s, code+LITERALS+1, ltree); /* send the length code */ -+ extra = extra_lbits[code]; -+ if (extra != 0) { -+ lc -= base_length[code]; -+ send_bits(s, lc, extra); /* send the extra length bits */ -+ } -+ dist--; /* dist is now the match distance - 1 */ -+ code = d_code(dist); -+ Assert (code < D_CODES, "bad d_code"); -+ -+ send_code(s, code, dtree); /* send the distance code */ -+ extra = extra_dbits[code]; -+ if (extra != 0) { -+ dist -= base_dist[code]; -+ send_bits(s, dist, extra); /* send the extra distance bits */ -+ } -+ } /* literal or match pair ? */ -+ -+ /* Check that the overlay between pending_buf and d_buf+l_buf is ok: */ -+ Assert(s->pending < s->lit_bufsize + 2*lx, "pendingBuf overflow"); -+ -+ } while (lx < s->last_lit); -+ -+ send_code(s, END_BLOCK, ltree); -+ s->last_eob_len = ltree[END_BLOCK].Len; -+} -+ -+/* =========================================================================== -+ * Set the data type to ASCII or BINARY, using a crude approximation: -+ * binary if more than 20% of the bytes are <= 6 or >= 128, ascii otherwise. 
-+ * IN assertion: the fields freq of dyn_ltree are set and the total of all -+ * frequencies does not exceed 64K (to fit in an int on 16 bit machines). -+ */ -+local void set_data_type(s) -+ deflate_state *s; -+{ -+ int n = 0; -+ unsigned ascii_freq = 0; -+ unsigned bin_freq = 0; -+ while (n < 7) bin_freq += s->dyn_ltree[n++].Freq; -+ while (n < 128) ascii_freq += s->dyn_ltree[n++].Freq; -+ while (n < LITERALS) bin_freq += s->dyn_ltree[n++].Freq; -+ s->data_type = (Byte)(bin_freq > (ascii_freq >> 2) ? Z_BINARY : Z_ASCII); -+} -+ -+/* =========================================================================== -+ * Reverse the first len bits of a code, using straightforward code (a faster -+ * method would use a table) -+ * IN assertion: 1 <= len <= 15 -+ */ -+local unsigned bi_reverse(code, len) -+ unsigned code; /* the value to invert */ -+ int len; /* its bit length */ -+{ -+ register unsigned res = 0; -+ do { -+ res |= code & 1; -+ code >>= 1, res <<= 1; -+ } while (--len > 0); -+ return res >> 1; -+} -+ -+/* =========================================================================== -+ * Flush the bit buffer, keeping at most 7 bits in it. 
-+ */ -+local void bi_flush(s) -+ deflate_state *s; -+{ -+ if (s->bi_valid == 16) { -+ put_short(s, s->bi_buf); -+ s->bi_buf = 0; -+ s->bi_valid = 0; -+ } else if (s->bi_valid >= 8) { -+ put_byte(s, (Byte)s->bi_buf); -+ s->bi_buf >>= 8; -+ s->bi_valid -= 8; -+ } -+} -+ -+/* =========================================================================== -+ * Flush the bit buffer and align the output on a byte boundary -+ */ -+local void bi_windup(s) -+ deflate_state *s; -+{ -+ if (s->bi_valid > 8) { -+ put_short(s, s->bi_buf); -+ } else if (s->bi_valid > 0) { -+ put_byte(s, (Byte)s->bi_buf); -+ } -+ s->bi_buf = 0; -+ s->bi_valid = 0; -+#ifdef DEBUG -+ s->bits_sent = (s->bits_sent+7) & ~7; -+#endif -+} -+ -+/* =========================================================================== -+ * Copy a stored block, storing first the length and its -+ * one's complement if requested. -+ */ -+local void copy_block(s, buf, len, header) -+ deflate_state *s; -+ charf *buf; /* the input data */ -+ unsigned len; /* its length */ -+ int header; /* true if block header must be written */ -+{ -+ bi_windup(s); /* align on byte boundary */ -+ s->last_eob_len = 8; /* enough lookahead for inflate */ -+ -+ if (header) { -+ put_short(s, (ush)len); -+ put_short(s, (ush)~len); -+#ifdef DEBUG -+ s->bits_sent += 2*16; -+#endif -+ } -+#ifdef DEBUG -+ s->bits_sent += (ulg)len<<3; -+#endif -+ while (len--) { -+ put_byte(s, *buf++); -+ } -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/trees.h Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,128 @@ -+/* header created automatically with -DGEN_TREES_H */ -+ -+local const ct_data static_ltree[L_CODES+2] = { -+{{ 12},{ 8}}, {{140},{ 8}}, {{ 76},{ 8}}, {{204},{ 8}}, {{ 44},{ 8}}, -+{{172},{ 8}}, {{108},{ 8}}, {{236},{ 8}}, {{ 28},{ 8}}, {{156},{ 8}}, -+{{ 92},{ 8}}, {{220},{ 8}}, {{ 60},{ 8}}, {{188},{ 8}}, {{124},{ 8}}, -+{{252},{ 8}}, {{ 2},{ 8}}, {{130},{ 8}}, {{ 66},{ 8}}, {{194},{ 8}}, -+{{ 34},{ 8}}, {{162},{ 8}}, {{ 98},{ 8}}, {{226},{ 8}}, {{ 
18},{ 8}}, -+{{146},{ 8}}, {{ 82},{ 8}}, {{210},{ 8}}, {{ 50},{ 8}}, {{178},{ 8}}, -+{{114},{ 8}}, {{242},{ 8}}, {{ 10},{ 8}}, {{138},{ 8}}, {{ 74},{ 8}}, -+{{202},{ 8}}, {{ 42},{ 8}}, {{170},{ 8}}, {{106},{ 8}}, {{234},{ 8}}, -+{{ 26},{ 8}}, {{154},{ 8}}, {{ 90},{ 8}}, {{218},{ 8}}, {{ 58},{ 8}}, -+{{186},{ 8}}, {{122},{ 8}}, {{250},{ 8}}, {{ 6},{ 8}}, {{134},{ 8}}, -+{{ 70},{ 8}}, {{198},{ 8}}, {{ 38},{ 8}}, {{166},{ 8}}, {{102},{ 8}}, -+{{230},{ 8}}, {{ 22},{ 8}}, {{150},{ 8}}, {{ 86},{ 8}}, {{214},{ 8}}, -+{{ 54},{ 8}}, {{182},{ 8}}, {{118},{ 8}}, {{246},{ 8}}, {{ 14},{ 8}}, -+{{142},{ 8}}, {{ 78},{ 8}}, {{206},{ 8}}, {{ 46},{ 8}}, {{174},{ 8}}, -+{{110},{ 8}}, {{238},{ 8}}, {{ 30},{ 8}}, {{158},{ 8}}, {{ 94},{ 8}}, -+{{222},{ 8}}, {{ 62},{ 8}}, {{190},{ 8}}, {{126},{ 8}}, {{254},{ 8}}, -+{{ 1},{ 8}}, {{129},{ 8}}, {{ 65},{ 8}}, {{193},{ 8}}, {{ 33},{ 8}}, -+{{161},{ 8}}, {{ 97},{ 8}}, {{225},{ 8}}, {{ 17},{ 8}}, {{145},{ 8}}, -+{{ 81},{ 8}}, {{209},{ 8}}, {{ 49},{ 8}}, {{177},{ 8}}, {{113},{ 8}}, -+{{241},{ 8}}, {{ 9},{ 8}}, {{137},{ 8}}, {{ 73},{ 8}}, {{201},{ 8}}, -+{{ 41},{ 8}}, {{169},{ 8}}, {{105},{ 8}}, {{233},{ 8}}, {{ 25},{ 8}}, -+{{153},{ 8}}, {{ 89},{ 8}}, {{217},{ 8}}, {{ 57},{ 8}}, {{185},{ 8}}, -+{{121},{ 8}}, {{249},{ 8}}, {{ 5},{ 8}}, {{133},{ 8}}, {{ 69},{ 8}}, -+{{197},{ 8}}, {{ 37},{ 8}}, {{165},{ 8}}, {{101},{ 8}}, {{229},{ 8}}, -+{{ 21},{ 8}}, {{149},{ 8}}, {{ 85},{ 8}}, {{213},{ 8}}, {{ 53},{ 8}}, -+{{181},{ 8}}, {{117},{ 8}}, {{245},{ 8}}, {{ 13},{ 8}}, {{141},{ 8}}, -+{{ 77},{ 8}}, {{205},{ 8}}, {{ 45},{ 8}}, {{173},{ 8}}, {{109},{ 8}}, -+{{237},{ 8}}, {{ 29},{ 8}}, {{157},{ 8}}, {{ 93},{ 8}}, {{221},{ 8}}, -+{{ 61},{ 8}}, {{189},{ 8}}, {{125},{ 8}}, {{253},{ 8}}, {{ 19},{ 9}}, -+{{275},{ 9}}, {{147},{ 9}}, {{403},{ 9}}, {{ 83},{ 9}}, {{339},{ 9}}, -+{{211},{ 9}}, {{467},{ 9}}, {{ 51},{ 9}}, {{307},{ 9}}, {{179},{ 9}}, -+{{435},{ 9}}, {{115},{ 9}}, {{371},{ 9}}, {{243},{ 9}}, {{499},{ 9}}, -+{{ 11},{ 9}}, {{267},{ 9}}, {{139},{ 9}}, 
{{395},{ 9}}, {{ 75},{ 9}}, -+{{331},{ 9}}, {{203},{ 9}}, {{459},{ 9}}, {{ 43},{ 9}}, {{299},{ 9}}, -+{{171},{ 9}}, {{427},{ 9}}, {{107},{ 9}}, {{363},{ 9}}, {{235},{ 9}}, -+{{491},{ 9}}, {{ 27},{ 9}}, {{283},{ 9}}, {{155},{ 9}}, {{411},{ 9}}, -+{{ 91},{ 9}}, {{347},{ 9}}, {{219},{ 9}}, {{475},{ 9}}, {{ 59},{ 9}}, -+{{315},{ 9}}, {{187},{ 9}}, {{443},{ 9}}, {{123},{ 9}}, {{379},{ 9}}, -+{{251},{ 9}}, {{507},{ 9}}, {{ 7},{ 9}}, {{263},{ 9}}, {{135},{ 9}}, -+{{391},{ 9}}, {{ 71},{ 9}}, {{327},{ 9}}, {{199},{ 9}}, {{455},{ 9}}, -+{{ 39},{ 9}}, {{295},{ 9}}, {{167},{ 9}}, {{423},{ 9}}, {{103},{ 9}}, -+{{359},{ 9}}, {{231},{ 9}}, {{487},{ 9}}, {{ 23},{ 9}}, {{279},{ 9}}, -+{{151},{ 9}}, {{407},{ 9}}, {{ 87},{ 9}}, {{343},{ 9}}, {{215},{ 9}}, -+{{471},{ 9}}, {{ 55},{ 9}}, {{311},{ 9}}, {{183},{ 9}}, {{439},{ 9}}, -+{{119},{ 9}}, {{375},{ 9}}, {{247},{ 9}}, {{503},{ 9}}, {{ 15},{ 9}}, -+{{271},{ 9}}, {{143},{ 9}}, {{399},{ 9}}, {{ 79},{ 9}}, {{335},{ 9}}, -+{{207},{ 9}}, {{463},{ 9}}, {{ 47},{ 9}}, {{303},{ 9}}, {{175},{ 9}}, -+{{431},{ 9}}, {{111},{ 9}}, {{367},{ 9}}, {{239},{ 9}}, {{495},{ 9}}, -+{{ 31},{ 9}}, {{287},{ 9}}, {{159},{ 9}}, {{415},{ 9}}, {{ 95},{ 9}}, -+{{351},{ 9}}, {{223},{ 9}}, {{479},{ 9}}, {{ 63},{ 9}}, {{319},{ 9}}, -+{{191},{ 9}}, {{447},{ 9}}, {{127},{ 9}}, {{383},{ 9}}, {{255},{ 9}}, -+{{511},{ 9}}, {{ 0},{ 7}}, {{ 64},{ 7}}, {{ 32},{ 7}}, {{ 96},{ 7}}, -+{{ 16},{ 7}}, {{ 80},{ 7}}, {{ 48},{ 7}}, {{112},{ 7}}, {{ 8},{ 7}}, -+{{ 72},{ 7}}, {{ 40},{ 7}}, {{104},{ 7}}, {{ 24},{ 7}}, {{ 88},{ 7}}, -+{{ 56},{ 7}}, {{120},{ 7}}, {{ 4},{ 7}}, {{ 68},{ 7}}, {{ 36},{ 7}}, -+{{100},{ 7}}, {{ 20},{ 7}}, {{ 84},{ 7}}, {{ 52},{ 7}}, {{116},{ 7}}, -+{{ 3},{ 8}}, {{131},{ 8}}, {{ 67},{ 8}}, {{195},{ 8}}, {{ 35},{ 8}}, -+{{163},{ 8}}, {{ 99},{ 8}}, {{227},{ 8}} -+}; -+ -+local const ct_data static_dtree[D_CODES] = { -+{{ 0},{ 5}}, {{16},{ 5}}, {{ 8},{ 5}}, {{24},{ 5}}, {{ 4},{ 5}}, -+{{20},{ 5}}, {{12},{ 5}}, {{28},{ 5}}, {{ 2},{ 5}}, {{18},{ 5}}, -+{{10},{ 5}}, 
{{26},{ 5}}, {{ 6},{ 5}}, {{22},{ 5}}, {{14},{ 5}}, -+{{30},{ 5}}, {{ 1},{ 5}}, {{17},{ 5}}, {{ 9},{ 5}}, {{25},{ 5}}, -+{{ 5},{ 5}}, {{21},{ 5}}, {{13},{ 5}}, {{29},{ 5}}, {{ 3},{ 5}}, -+{{19},{ 5}}, {{11},{ 5}}, {{27},{ 5}}, {{ 7},{ 5}}, {{23},{ 5}} -+}; -+ -+const uch _dist_code[DIST_CODE_LEN] = { -+ 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, -+ 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, -+10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, -+11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -+12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, -+13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, -+13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, -+14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, -+14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, -+14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, -+15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, -+15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, -+15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 0, 0, 16, 17, -+18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, -+23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, -+24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, -+26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, -+26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, -+27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, -+27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, -+28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, -+28, 
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, -+28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, -+29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, -+29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, -+29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29 -+}; -+ -+const uch _length_code[MAX_MATCH-MIN_MATCH+1]= { -+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 12, -+13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, -+17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, -+19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, -+21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, -+22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, -+23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, -+24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, -+25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, -+25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, -+26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, -+26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, -+27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28 -+}; -+ -+local const int base_length[LENGTH_CODES] = { -+0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, -+64, 80, 96, 112, 128, 160, 192, 224, 0 -+}; -+ -+local const int base_dist[D_CODES] = { -+ 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, -+ 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, -+ 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576 -+}; -+ ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ultoa.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,66 @@ -+/* -+ * convert unsigned long to ASCII -+ 
* Copyright (C) 1998, 1999 Henry Spencer. -+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: ultoa.c,v 1.10 2004/07/10 07:48:37 mcr Exp $ -+ */ -+#include "openswan.h" -+ -+/* -+ - ultoa - convert unsigned long to decimal ASCII -+ */ -+size_t /* length required for full conversion */ -+ultoa(n, base, dst, dstlen) -+unsigned long n; -+int base; -+char *dst; /* need not be valid if dstlen is 0 */ -+size_t dstlen; -+{ -+ char buf[3*sizeof(unsigned long) + 1]; -+ char *bufend = buf + sizeof(buf); -+ size_t len; -+ char *p; -+ static char hex[] = "0123456789abcdef"; -+ -+ p = bufend; -+ *--p = '\0'; -+ if (base == 10) { -+ do { -+ *--p = n%10 + '0'; -+ n /= 10; -+ } while (n != 0); -+ } else if (base == 16) { -+ do { -+ *--p = hex[n&0xf]; -+ n >>= 4; -+ } while (n != 0); -+ *--p = 'x'; -+ *--p = '0'; -+ } else if (base == 8) { -+ do { -+ *--p = (n&07) + '0'; -+ n >>= 3; -+ } while (n != 0); -+ *--p = '0'; -+ } else -+ *--p = '?'; -+ -+ len = bufend - p; -+ -+ if (dstlen > 0) { -+ if (len > dstlen) -+ *(p + dstlen - 1) = '\0'; -+ strcpy(dst, p); -+ } -+ return len; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/ultot.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,82 @@ -+/* -+ * convert unsigned long to text -+ * Copyright (C) 2000 Henry Spencer. 
-+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: ultot.c,v 1.5 2004/07/10 07:48:37 mcr Exp $ -+ */ -+#include "openswan.h" -+ -+/* -+ - ultot - convert unsigned long to text -+ */ -+size_t /* length required for full conversion */ -+ultot(n, base, dst, dstlen) -+unsigned long n; -+int base; -+char *dst; /* need not be valid if dstlen is 0 */ -+size_t dstlen; -+{ -+ char buf[3*sizeof(unsigned long) + 1]; -+ char *bufend = buf + sizeof(buf); -+ size_t len; -+ char *p; -+ static char hex[] = "0123456789abcdef"; -+# define HEX32 (32/4) -+ -+ p = bufend; -+ *--p = '\0'; -+ switch (base) { -+ case 10: -+ case 'd': -+ do { -+ *--p = n%10 + '0'; -+ n /= 10; -+ } while (n != 0); -+ break; -+ case 16: -+ case 17: -+ case 'x': -+ do { -+ *--p = hex[n&0xf]; -+ n >>= 4; -+ } while (n != 0); -+ if (base == 17) -+ while (bufend - p < HEX32 + 1) -+ *--p = '0'; -+ if (base == 'x') { -+ *--p = 'x'; -+ *--p = '0'; -+ } -+ break; -+ case 8: -+ case 'o': -+ do { -+ *--p = (n&07) + '0'; -+ n >>= 3; -+ } while (n != 0); -+ if (base == 'o') -+ *--p = '0'; -+ break; -+ default: -+ return 0; -+ break; -+ } -+ -+ len = bufend - p; -+ if (dstlen > 0) { -+ if (len > dstlen) -+ *(p + dstlen - 1) = '\0'; -+ strcpy(dst, p); -+ } -+ return len; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/version.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,44 @@ -+/* -+ * return IPsec version information -+ * Copyright (C) 2001 Henry Spencer. 
-+ * -+ * This library is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU Library General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. See . -+ * -+ * This library is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public -+ * License for more details. -+ * -+ * RCSID $Id: version.in.c,v 1.2 2004/04/14 05:09:46 ken Exp $ -+ */ -+ -+#ifdef __KERNEL__ -+#include -+#endif -+ -+#include "openswan.h" -+ -+#define V "2.6.16dr2" /* substituted in by Makefile */ -+static const char openswan_number[] = V; -+static const char openswan_string[] = "Openswan " V; -+ -+/* -+ - ipsec_version_code - return IPsec version number/code, as string -+ */ -+const char * -+ipsec_version_code() -+{ -+ return openswan_number; -+} -+ -+/* -+ - ipsec_version_string - return full version string -+ */ -+const char * -+ipsec_version_string() -+{ -+ return openswan_string; -+} ---- /dev/null Tue Mar 11 13:02:56 2003 -+++ linux/net/ipsec/zutil.c Mon Feb 9 13:51:03 2004 -@@ -0,0 +1,227 @@ -+/* zutil.c -- target dependent utility functions for the compression library -+ * Copyright (C) 1995-2002 Jean-loup Gailly. 
-+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* @(#) $Id: zutil.c,v 1.5 2004/07/10 07:48:40 mcr Exp $ */ -+ -+#include -+ -+#define MY_ZCALLOC -+ -+struct internal_state {int dummy;}; /* for buggy compilers */ -+ -+#ifndef STDC -+extern void exit OF((int)); -+#endif -+ -+const char *z_errmsg[10] = { -+"need dictionary", /* Z_NEED_DICT 2 */ -+"stream end", /* Z_STREAM_END 1 */ -+"", /* Z_OK 0 */ -+"file error", /* Z_ERRNO (-1) */ -+"stream error", /* Z_STREAM_ERROR (-2) */ -+"data error", /* Z_DATA_ERROR (-3) */ -+"insufficient memory", /* Z_MEM_ERROR (-4) */ -+"buffer error", /* Z_BUF_ERROR (-5) */ -+"incompatible version",/* Z_VERSION_ERROR (-6) */ -+""}; -+ -+ -+const char * ZEXPORT zlibVersion() -+{ -+ return ZLIB_VERSION; -+} -+ -+#ifdef DEBUG -+ -+# ifndef verbose -+# define verbose 0 -+# endif -+int z_verbose = verbose; -+ -+void z_error (m) -+ char *m; -+{ -+ fprintf(stderr, "%s\n", m); -+ exit(1); -+} -+#endif -+ -+/* exported to allow conversion of error code to string for compress() and -+ * uncompress() -+ */ -+const char * ZEXPORT zError(err) -+ int err; -+{ -+ return ERR_MSG(err); -+} -+ -+ -+#ifndef HAVE_MEMCPY -+ -+void zmemcpy(dest, source, len) -+ Bytef* dest; -+ const Bytef* source; -+ uInt len; -+{ -+ if (len == 0) return; -+ do { -+ *dest++ = *source++; /* ??? to be unrolled */ -+ } while (--len != 0); -+} -+ -+int zmemcmp(s1, s2, len) -+ const Bytef* s1; -+ const Bytef* s2; -+ uInt len; -+{ -+ uInt j; -+ -+ for (j = 0; j < len; j++) { -+ if (s1[j] != s2[j]) return 2*(s1[j] > s2[j])-1; -+ } -+ return 0; -+} -+ -+void zmemzero(dest, len) -+ Bytef* dest; -+ uInt len; -+{ -+ if (len == 0) return; -+ do { -+ *dest++ = 0; /* ??? 
to be unrolled */ -+ } while (--len != 0); -+} -+#endif -+ -+#ifdef __TURBOC__ -+#if (defined( __BORLANDC__) || !defined(SMALL_MEDIUM)) && !defined(__32BIT__) -+/* Small and medium model in Turbo C are for now limited to near allocation -+ * with reduced MAX_WBITS and MAX_MEM_LEVEL -+ */ -+# define MY_ZCALLOC -+ -+/* Turbo C malloc() does not allow dynamic allocation of 64K bytes -+ * and farmalloc(64K) returns a pointer with an offset of 8, so we -+ * must fix the pointer. Warning: the pointer must be put back to its -+ * original form in order to free it, use zcfree(). -+ */ -+ -+#define MAX_PTR 10 -+/* 10*64K = 640K */ -+ -+local int next_ptr = 0; -+ -+typedef struct ptr_table_s { -+ voidpf org_ptr; -+ voidpf new_ptr; -+} ptr_table; -+ -+local ptr_table table[MAX_PTR]; -+/* This table is used to remember the original form of pointers -+ * to large buffers (64K). Such pointers are normalized with a zero offset. -+ * Since MSDOS is not a preemptive multitasking OS, this table is not -+ * protected from concurrent access. This hack doesn't work anyway on -+ * a protected system like OS/2. Use Microsoft C instead. -+ */ -+ -+voidpf zcalloc (voidpf opaque, unsigned items, unsigned size) -+{ -+ voidpf buf = opaque; /* just to make some compilers happy */ -+ ulg bsize = (ulg)items*size; -+ -+ /* If we allocate less than 65520 bytes, we assume that farmalloc -+ * will return a usable pointer which doesn't have to be normalized. 
-+ */ -+ if (bsize < 65520L) { -+ buf = farmalloc(bsize); -+ if (*(ush*)&buf != 0) return buf; -+ } else { -+ buf = farmalloc(bsize + 16L); -+ } -+ if (buf == NULL || next_ptr >= MAX_PTR) return NULL; -+ table[next_ptr].org_ptr = buf; -+ -+ /* Normalize the pointer to seg:0 */ -+ *((ush*)&buf+1) += ((ush)((uch*)buf-0) + 15) >> 4; -+ *(ush*)&buf = 0; -+ table[next_ptr++].new_ptr = buf; -+ return buf; -+} -+ -+void zcfree (voidpf opaque, voidpf ptr) -+{ -+ int n; -+ if (*(ush*)&ptr != 0) { /* object < 64K */ -+ farfree(ptr); -+ return; -+ } -+ /* Find the original pointer */ -+ for (n = 0; n < next_ptr; n++) { -+ if (ptr != table[n].new_ptr) continue; -+ -+ farfree(table[n].org_ptr); -+ while (++n < next_ptr) { -+ table[n-1] = table[n]; -+ } -+ next_ptr--; -+ return; -+ } -+ ptr = opaque; /* just to make some compilers happy */ -+ Assert(0, "zcfree: ptr not found"); -+} -+#endif -+#endif /* __TURBOC__ */ -+ -+ -+#if defined(M_I86) && !defined(__32BIT__) -+/* Microsoft C in 16-bit mode */ -+ -+# define MY_ZCALLOC -+ -+#if (!defined(_MSC_VER) || (_MSC_VER <= 600)) -+# define _halloc halloc -+# define _hfree hfree -+#endif -+ -+voidpf zcalloc (voidpf opaque, unsigned items, unsigned size) -+{ -+ if (opaque) opaque = 0; /* to make compiler happy */ -+ return _halloc((long)items, size); -+} -+ -+void zcfree (voidpf opaque, voidpf ptr) -+{ -+ if (opaque) opaque = 0; /* to make compiler happy */ -+ _hfree(ptr); -+} -+ -+#endif /* MSC */ -+ -+ -+#ifndef MY_ZCALLOC /* Any system without a special alloc function */ -+ -+#ifndef STDC -+extern voidp calloc OF((uInt items, uInt size)); -+extern void free OF((voidpf ptr)); -+#endif -+ -+voidpf zcalloc (opaque, items, size) -+ voidpf opaque; -+ unsigned items; -+ unsigned size; -+{ -+ if (opaque) items += size - size; /* make compiler happy */ -+ return (voidpf)calloc(items, size); -+} -+ -+void zcfree (opaque, ptr) -+ voidpf opaque; -+ voidpf ptr; -+{ -+ free(ptr); -+ if (opaque) return; /* make compiler happy */ -+} -+ -+#endif 
/* MY_ZCALLOC */ ---- swan26/net/ipv4/af_inet.c.orig Wed Jun 16 01:18:58 2004 -+++ swan26/net/ipv4/af_inet.c Fri Aug 13 23:09:27 2004 -@@ -1169,6 +1169,18 @@ - #if defined(CONFIG_IP_MROUTE) - ip_mr_init(); - #endif -+ -+#if defined(CONFIG_KLIPS) -+ { -+ extern int ipsec_klips_init(void); -+ /* -+ * Initialise AF_INET ESP and AH protocol support including -+ * e-routing and SA tables -+ */ -+ ipsec_klips_init(); -+ } -+#endif /* CONFIG_IPSEC */ -+ - /* - * Initialise per-cpu ipv4 mibs - */ ---- /dev/null Fri May 10 13:59:54 2002 -+++ linux/net/ipsec/Makefile.ver Sun Jul 28 22:10:40 2002 -@@ -0,0 +1 @@ -+IPSECVERSION='2.6.16dr2' diff --git a/src/patches/openswan-2.6.16dr2-2.6.24-natt.patch b/src/patches/openswan-2.6.16dr2-2.6.24-natt.patch deleted file mode 100644 index e0c1fc95b9..0000000000 --- a/src/patches/openswan-2.6.16dr2-2.6.24-natt.patch +++ /dev/null @@ -1,204 +0,0 @@ -Index: linux-2.6.x/net/ipv4/Kconfig -=================================================================== -RCS file: /cvs/sw/linux-2.6.x/net/ipv4/Kconfig,v -retrieving revision 1.1.1.28 -retrieving revision 1.10 -diff -u -r1.1.1.28 -r1.10 ---- linux-2.6.x/net/ipv4/Kconfig 10 Oct 2007 00:54:30 -0000 1.1.1.28 -+++ linux-2.6.x/net/ipv4/Kconfig 10 Oct 2007 04:53:57 -0000 1.10 -@@ -367,6 +367,12 @@ - tristate - default n - -+config IPSEC_NAT_TRAVERSAL -+ bool "IPSEC NAT-Traversal (KLIPS compatible)" -+ depends on INET -+ ---help--- -+ Includes support for RFC3947/RFC3948 NAT-Traversal of ESP over UDP. 
-+ - config INET_XFRM_MODE_TRANSPORT - tristate "IP: IPsec transport mode" - default y -Index: linux-2.6.x/net/ipv4/udp.c -=================================================================== -RCS file: /cvs/sw/linux-2.6.x/net/ipv4/udp.c,v -retrieving revision 1.1.1.46 -diff -u -r1.1.1.46 udp.c ---- linux-2.6.x/net/ipv4/udp.c 10 Oct 2007 00:54:30 -0000 1.1.1.46 -+++ linux-2.6.x/net/ipv4/udp.c 9 Nov 2007 00:11:33 -0000 -@@ -102,6 +102,7 @@ - #include - #include - #include -+#include - #include "udp_impl.h" - - /* -@@ -920,6 +921,128 @@ - return 0; - } - -+#if defined(CONFIG_XFRM) || defined(CONFIG_IPSEC_NAT_TRAVERSAL) -+ -+static xfrm4_rcv_encap_t xfrm4_rcv_encap_func = NULL; -+ -+/* -+ * de-encapsulate and pass to the registered xfrm4_rcv_encap_func function. -+ * Most of this code stolen from net/ipv4/xfrm4_input.c -+ * which is attributed to YOSHIFUJI Hideaki @USAGI, and -+ * Derek Atkins -+ */ -+ -+static int xfrm4_udp_encap_rcv_wrapper(struct sock *sk, struct sk_buff *skb) -+{ -+ struct udp_sock *up = udp_sk(sk); -+ struct udphdr *uh; -+ struct iphdr *iph; -+ int iphlen, len; -+ int ret; -+ -+ __u8 *udpdata; -+ __be32 *udpdata32; -+ __u16 encap_type = up->encap_type; -+ -+ /* if this is not encapsulated socket, then just return now */ -+ if (!encap_type && !xfrm4_rcv_encap_func) -+ return 1; -+ -+ /* If this is a paged skb, make sure we pull up -+ * whatever data we need to look at. */ -+ len = skb->len - sizeof(struct udphdr); -+ if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) -+ return 1; -+ -+ /* Now we can get the pointers */ -+ uh = udp_hdr(skb); -+ udpdata = (__u8 *)uh + sizeof(struct udphdr); -+ udpdata32 = (__be32 *)udpdata; -+ -+ switch (encap_type) { -+ default: -+ case UDP_ENCAP_ESPINUDP: -+ /* Check if this is a keepalive packet. If so, eat it. 
*/ -+ if (len == 1 && udpdata[0] == 0xff) { -+ goto drop; -+ } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { -+ /* ESP Packet without Non-ESP header */ -+ len = sizeof(struct udphdr); -+ } else -+ /* Must be an IKE packet.. pass it through */ -+ return 1; -+ break; -+ case UDP_ENCAP_ESPINUDP_NON_IKE: -+ /* Check if this is a keepalive packet. If so, eat it. */ -+ if (len == 1 && udpdata[0] == 0xff) { -+ goto drop; -+ } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && -+ udpdata32[0] == 0 && udpdata32[1] == 0) { -+ -+ /* ESP Packet with Non-IKE marker */ -+ len = sizeof(struct udphdr) + 2 * sizeof(u32); -+ } else -+ /* Must be an IKE packet.. pass it through */ -+ return 1; -+ break; -+ } -+ -+ /* At this point we are sure that this is an ESPinUDP packet, -+ * so we need to remove 'len' bytes from the packet (the UDP -+ * header and optional ESP marker bytes) and then modify the -+ * protocol to ESP, and then call into the transform receiver. -+ */ -+ if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) -+ goto drop; -+ -+ /* Now we can update and verify the packet length... */ -+ iph = ip_hdr(skb); -+ iphlen = iph->ihl << 2; -+ iph->tot_len = htons(ntohs(iph->tot_len) - len); -+ if (skb->len < iphlen + len) { -+ /* packet is too small!?! */ -+ goto drop; -+ } -+ -+ /* pull the data buffer up to the ESP header and set the -+ * transport header to point to ESP. Keep UDP on the stack -+ * for later. -+ */ -+ __skb_pull(skb, len); -+ skb_reset_transport_header(skb); -+ -+ /* modify the protocol (it's ESP!) 
*/ -+ iph->protocol = IPPROTO_ESP; -+ -+ /* process ESP */ -+ ret = (*xfrm4_rcv_encap_func)(skb, encap_type); -+ return ret; -+ -+drop: -+ kfree_skb(skb); -+ return 0; -+} -+ -+int udp4_register_esp_rcvencap(xfrm4_rcv_encap_t func, -+ xfrm4_rcv_encap_t *oldfunc) -+{ -+ if (oldfunc != NULL) -+ *oldfunc = xfrm4_rcv_encap_func; -+ xfrm4_rcv_encap_func = func; -+ return 0; -+} -+ -+int udp4_unregister_esp_rcvencap(xfrm4_rcv_encap_t func) -+{ -+ if (xfrm4_rcv_encap_func != func) -+ return -1; -+ -+ xfrm4_rcv_encap_func = NULL; -+ return 0; -+} -+ -+#endif /* CONFIG_XFRM_MODULE || CONFIG_IPSEC_NAT_TRAVERSAL */ -+ - /* returns: - * -1: error - * 0: success -@@ -1252,6 +1375,11 @@ - case 0: - case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: -+#if defined(CONFIG_XFRM) || defined(CONFIG_IPSEC_NAT_TRAVERSAL) -+ if (xfrm4_rcv_encap_func) -+ up->encap_rcv = xfrm4_udp_encap_rcv_wrapper; -+ else -+#endif - up->encap_rcv = xfrm4_udp_encap_rcv; - /* FALLTHROUGH */ - case UDP_ENCAP_L2TPINUDP: -@@ -1648,3 +1776,9 @@ - EXPORT_SYMBOL(udp_proc_register); - EXPORT_SYMBOL(udp_proc_unregister); - #endif -+ -+#if defined(CONFIG_IPSEC_NAT_TRAVERSAL) -+EXPORT_SYMBOL(udp4_register_esp_rcvencap); -+EXPORT_SYMBOL(udp4_unregister_esp_rcvencap); -+#endif -+ -Index: linux-2.6.x/include/net/xfrmudp.h -=================================================================== -RCS file: linux-2.6.x/include/net/xfrmudp.h -diff -N linux-2.6.x/include/net/xfrmudp.h ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux-2.6.x/include/net/xfrmudp.h 3 Nov 2005 01:55:55 -0000 1.1 -@@ -0,0 +1,10 @@ -+/* -+ * pointer to function for type that xfrm4_input wants, to permit -+ * decoupling of XFRM from udp.c -+ */ -+#define HAVE_XFRM4_UDP_REGISTER -+ -+typedef int (*xfrm4_rcv_encap_t)(struct sk_buff *skb, __u16 encap_type); -+extern int udp4_register_esp_rcvencap(xfrm4_rcv_encap_t func -+ , xfrm4_rcv_encap_t *oldfunc); -+extern int udp4_unregister_esp_rcvencap(xfrm4_rcv_encap_t func); diff --git 
a/src/patches/reiser4-for-2.6.20.patch b/src/patches/reiser4-for-2.6.20.patch deleted file mode 100644 index 77d3c8164b..0000000000 --- a/src/patches/reiser4-for-2.6.20.patch +++ /dev/null @@ -1,80685 +0,0 @@ -diff -urN linux-2.6.20.orig/arch/i386/lib/usercopy.c linux-2.6.20/arch/i386/lib/usercopy.c ---- linux-2.6.20.orig/arch/i386/lib/usercopy.c 2006-11-30 00:57:37.000000000 +0300 -+++ linux-2.6.20/arch/i386/lib/usercopy.c 2007-05-06 14:50:43.658963226 +0400 -@@ -812,6 +812,7 @@ - #endif - return n; - } -+EXPORT_SYMBOL(__copy_from_user_ll_nocache); - - unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, - unsigned long n) -@@ -827,6 +828,7 @@ - #endif - return n; - } -+EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); - - /** - * copy_to_user: - Copy a block of data into user space. -diff -urN linux-2.6.20.orig/Documentation/Changes linux-2.6.20/Documentation/Changes ---- linux-2.6.20.orig/Documentation/Changes 2007-05-06 15:04:34.226399593 +0400 -+++ linux-2.6.20/Documentation/Changes 2007-05-06 14:50:43.658963226 +0400 -@@ -36,6 +36,7 @@ - o e2fsprogs 1.29 # tune2fs - o jfsutils 1.1.3 # fsck.jfs -V - o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs -+o reiser4progs 1.0.0 # fsck.reiser4 -V - o xfsprogs 2.6.0 # xfs_db -V - o pcmciautils 004 # pccardctl -V - o quota-tools 3.09 # quota -V -@@ -144,6 +145,13 @@ - versions of mkreiserfs, resize_reiserfs, debugreiserfs and - reiserfsck. These utils work on both i386 and alpha platforms. - -+Reiser4progs -+------------ -+ -+The reiser4progs package contains utilities for the reiser4 file system. -+Detailed instructions are provided in the README file located at: -+. 
-+ - Xfsprogs - -------- - -@@ -322,6 +330,10 @@ - ------------- - o - -+Reiser4progs -+------------ -+o -+ - Xfsprogs - -------- - o -diff -urN linux-2.6.20.orig/Documentation/filesystems/reiser4.txt linux-2.6.20/Documentation/filesystems/reiser4.txt ---- linux-2.6.20.orig/Documentation/filesystems/reiser4.txt 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/Documentation/filesystems/reiser4.txt 2007-05-06 14:50:43.658963226 +0400 -@@ -0,0 +1,75 @@ -+Reiser4 filesystem -+================== -+Reiser4 is a file system based on dancing tree algorithms, and is -+described at http://www.namesys.com -+ -+ -+References -+========== -+web page http://namesys.com/v4/v4.html -+source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/ -+userland tools ftp://ftp.namesys.com/pub/reiser4progs/ -+install page http://www.namesys.com/install_v4.html -+ -+Compile options -+=============== -+Enable reiser4 debug mode -+ This checks everything imaginable while reiser4 -+ runs -+ -+Mount options -+============= -+tmgr.atom_max_size=N -+ Atoms containing more than N blocks will be forced to commit. -+ N is decimal. -+ Default is nr_free_pagecache_pages() / 2 at mount time. -+ -+tmgr.atom_max_age=N -+ Atoms older than N seconds will be forced to commit. N is decimal. -+ Default is 600. -+ -+tmgr.atom_max_flushers=N -+ Limit of concurrent flushers for one atom. 0 means no limit. -+ Default is 0. -+ -+tree.cbk_cache.nr_slots=N -+ Number of slots in the cbk cache. -+ -+flush.relocate_threshold=N -+ If flush finds more than N adjacent dirty leaf-level blocks it -+ will force them to be relocated. -+ Default is 64. -+ -+flush.relocate_distance=N -+ If flush finds can find a block allocation closer than at most -+ N from the preceder it will relocate to that position. -+ Default is 64. -+ -+flush.scan_maxnodes=N -+ The maximum number of nodes to scan left on a level during -+ flush. -+ Default is 10000. -+ -+optimal_io_size=N -+ Preferred IO size. 
This value is used to set st_blksize of -+ struct stat. -+ Default is 65536. -+ -+bsdgroups -+ Turn on BSD-style gid assignment. -+ -+32bittimes -+ By default file in reiser4 have 64 bit timestamps. Files -+ created when filesystem is mounted with 32bittimes mount -+ option will get 32 bit timestamps. -+ -+mtflush -+ Turn off concurrent flushing. -+ -+nopseudo -+ Disable pseudo files support. See -+ http://namesys.com/v4/pseudo.html for more about pseudo files. -+ -+dont_load_bitmap -+ Don't load all bitmap blocks at mount time, it is useful for -+ machines with tiny RAM and large disks. -diff -urN linux-2.6.20.orig/fs/fs-writeback.c linux-2.6.20/fs/fs-writeback.c ---- linux-2.6.20.orig/fs/fs-writeback.c 2007-05-06 15:04:39.848155607 +0400 -+++ linux-2.6.20/fs/fs-writeback.c 2007-05-06 14:50:43.662964476 +0400 -@@ -296,8 +296,6 @@ - * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so - * that it can be located for waiting on in __writeback_single_inode(). - * -- * Called under inode_lock. -- * - * If `bdi' is non-zero then we're being asked to writeback a specific queue. - * This function assumes that the blockdev superblock's inodes are backed by - * a variety of queues, so all inodes are searched. For other superblocks, -@@ -313,11 +311,13 @@ - * on the writer throttling path, and we get decent balancing between many - * throttled threads: we don't want them all piling up on __wait_on_inode. 
- */ --static void --sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) -+void -+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) - { - const unsigned long start = jiffies; /* livelock avoidance */ - -+ spin_lock(&inode_lock); -+ - if (!wbc->for_kupdate || list_empty(&sb->s_io)) - list_splice_init(&sb->s_dirty, &sb->s_io); - -@@ -397,8 +397,19 @@ - if (wbc->nr_to_write <= 0) - break; - } -+ spin_unlock(&inode_lock); - return; /* Leave any unwritten inodes on s_io */ - } -+EXPORT_SYMBOL(generic_sync_sb_inodes); -+ -+static void -+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) -+{ -+ if (sb->s_op->sync_inodes) -+ sb->s_op->sync_inodes(sb, wbc); -+ else -+ generic_sync_sb_inodes(sb, wbc); -+} - - /* - * Start writeback of dirty pagecache data against all unlocked inodes. -@@ -439,11 +450,8 @@ - * be unmounted by the time it is released. - */ - if (down_read_trylock(&sb->s_umount)) { -- if (sb->s_root) { -- spin_lock(&inode_lock); -+ if (sb->s_root) - sync_sb_inodes(sb, wbc); -- spin_unlock(&inode_lock); -- } - up_read(&sb->s_umount); - } - spin_lock(&sb_lock); -@@ -481,9 +489,7 @@ - (inodes_stat.nr_inodes - inodes_stat.nr_unused) + - nr_dirty + nr_unstable; - wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ -- spin_lock(&inode_lock); - sync_sb_inodes(sb, &wbc); -- spin_unlock(&inode_lock); - } - - /* -diff -urN linux-2.6.20.orig/fs/Kconfig linux-2.6.20/fs/Kconfig ---- linux-2.6.20.orig/fs/Kconfig 2007-05-06 15:04:39.668099364 +0400 -+++ linux-2.6.20/fs/Kconfig 2007-05-06 14:50:43.662964476 +0400 -@@ -272,6 +272,8 @@ - default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y - default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m - -+source "fs/reiser4/Kconfig" -+ - config REISERFS_FS - tristate "Reiserfs support" - help -diff -urN linux-2.6.20.orig/fs/Makefile linux-2.6.20/fs/Makefile ---- linux-2.6.20.orig/fs/Makefile 2007-05-06 15:04:39.668099364 +0400 -+++ linux-2.6.20/fs/Makefile 
2007-05-06 14:50:43.666965726 +0400 -@@ -62,6 +62,7 @@ - - # Do not add any filesystems before this line - obj-$(CONFIG_REISERFS_FS) += reiserfs/ -+obj-$(CONFIG_REISER4_FS) += reiser4/ - obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 - obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev - obj-$(CONFIG_JBD) += jbd/ -diff -urN linux-2.6.20.orig/fs/reiser4/as_ops.c linux-2.6.20/fs/reiser4/as_ops.c ---- linux-2.6.20.orig/fs/reiser4/as_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/as_ops.c 2007-05-06 14:50:43.666965726 +0400 -@@ -0,0 +1,337 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Interface to VFS. Reiser4 address_space_operations are defined here. */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/file/file.h" -+#include "plugin/security/perm.h" -+#include "plugin/disk_format/disk_format.h" -+#include "plugin/plugin.h" -+#include "plugin/plugin_set.h" -+#include "plugin/object.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "page_cache.h" -+#include "ktxnmgrd.h" -+#include "super.h" -+#include "reiser4.h" -+#include "entd.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* address space operations */ -+ -+/** -+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting -+ * @page: page to be dirtied -+ * -+ * Operation of struct address_space_operations. This implementation is used by -+ * unix and cryptcompress file plugins. -+ * -+ * This is called when reiser4 page gets dirtied outside of reiser4, for -+ * example, when dirty bit is moved from pte to physical page. 
-+ * -+ * Tags page in the mapping's page tree with special tag so that it is possible -+ * to do all the reiser4 specific work wrt dirty pages (jnode creation, -+ * capturing by an atom) later because it can not be done in the contexts where -+ * set_page_dirty is called. -+ */ -+int reiser4_set_page_dirty(struct page *page) -+{ -+ /* this page can be unformatted only */ -+ assert("vs-1734", (page->mapping && -+ page->mapping->host && -+ reiser4_get_super_fake(page->mapping->host->i_sb) != -+ page->mapping->host -+ && reiser4_get_cc_fake(page->mapping->host->i_sb) != -+ page->mapping->host -+ && reiser4_get_bitmap_fake(page->mapping->host->i_sb) != -+ page->mapping->host)); -+ -+ if (!TestSetPageDirty(page)) { -+ struct address_space *mapping = page->mapping; -+ -+ if (mapping) { -+ write_lock_irq(&mapping->tree_lock); -+ -+ /* check for race with truncate */ -+ if (page->mapping) { -+ assert("vs-1652", page->mapping == mapping); -+ if (mapping_cap_account_dirty(mapping)) -+ inc_zone_page_state(page, -+ NR_FILE_DIRTY); -+ radix_tree_tag_set(&mapping->page_tree, -+ page->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ } -+ write_unlock_irq(&mapping->tree_lock); -+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); -+ } -+ } -+ return 0; -+} -+ -+/* ->invalidatepage method for reiser4 */ -+ -+/* -+ * this is called for each truncated page from -+ * truncate_inode_pages()->truncate_{complete,partial}_page(). -+ * -+ * At the moment of call, page is under lock, and outstanding io (if any) has -+ * completed. -+ */ -+ -+/** -+ * reiser4_invalidatepage -+ * @page: page to invalidate -+ * @offset: starting offset for partial invalidation -+ * -+ */ -+void reiser4_invalidatepage(struct page *page, unsigned long offset) -+{ -+ int ret = 0; -+ reiser4_context *ctx; -+ struct inode *inode; -+ jnode *node; -+ -+ /* -+ * This is called to truncate file's page. 
-+ * -+ * Originally, reiser4 implemented truncate in a standard way -+ * (vmtruncate() calls ->invalidatepage() on all truncated pages -+ * first, then file system ->truncate() call-back is invoked). -+ * -+ * This lead to the problem when ->invalidatepage() was called on a -+ * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT -+ * process. That is, truncate was bypassing transactions. To avoid -+ * this, try_capture_page_to_invalidate() call was added here. -+ * -+ * After many troubles with vmtruncate() based truncate (including -+ * races with flush, tail conversion, etc.) it was re-written in the -+ * top-to-bottom style: items are killed in reiser4_cut_tree_object() -+ * and pages belonging to extent are invalidated in kill_hook_extent(). -+ * So probably now additional call to capture is not needed here. -+ */ -+ -+ assert("nikita-3137", PageLocked(page)); -+ assert("nikita-3138", !PageWriteback(page)); -+ inode = page->mapping->host; -+ -+ /* -+ * ->invalidatepage() should only be called for the unformatted -+ * jnodes. Destruction of all other types of jnodes is performed -+ * separately. But, during some corner cases (like handling errors -+ * during mount) it is simpler to let ->invalidatepage to be called on -+ * them. Check for this, and do nothing. 
-+ */ -+ if (reiser4_get_super_fake(inode->i_sb) == inode) -+ return; -+ if (reiser4_get_cc_fake(inode->i_sb) == inode) -+ return; -+ if (reiser4_get_bitmap_fake(inode->i_sb) == inode) -+ return; -+ assert("vs-1426", PagePrivate(page)); -+ assert("vs-1427", -+ page->mapping == jnode_get_mapping(jnode_by_page(page))); -+ assert("", jprivate(page) != NULL); -+ assert("", ergo(inode_file_plugin(inode) != -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID), -+ offset == 0)); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return; -+ -+ node = jprivate(page); -+ spin_lock_jnode(node); -+ if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) | -+ (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) { -+ /* there is not need to capture */ -+ jref(node); -+ JF_SET(node, JNODE_HEARD_BANSHEE); -+ page_clear_jnode(page, node); -+ reiser4_uncapture_jnode(node); -+ unhash_unformatted_jnode(node); -+ jput(node); -+ reiser4_exit_context(ctx); -+ return; -+ } -+ spin_unlock_jnode(node); -+ -+ /* capture page being truncated. */ -+ ret = try_capture_page_to_invalidate(page); -+ if (ret != 0) -+ warning("nikita-3141", "Cannot capture: %i", ret); -+ -+ if (offset == 0) { -+ /* remove jnode from transaction and detach it from page. */ -+ jref(node); -+ JF_SET(node, JNODE_HEARD_BANSHEE); -+ /* page cannot be detached from jnode concurrently, because it -+ * is locked */ -+ reiser4_uncapture_page(page); -+ -+ /* this detaches page from jnode, so that jdelete will not try -+ * to lock page which is already locked */ -+ spin_lock_jnode(node); -+ page_clear_jnode(page, node); -+ spin_unlock_jnode(node); -+ unhash_unformatted_jnode(node); -+ -+ jput(node); -+ } -+ -+ reiser4_exit_context(ctx); -+} -+ -+/* help function called from reiser4_releasepage(). It returns true if jnode -+ * can be detached from its page and page released. 
*/ -+int jnode_is_releasable(jnode * node /* node to check */ ) -+{ -+ assert("nikita-2781", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ assert_spin_locked(&(node->load)); -+ -+ /* is some thread is currently using jnode page, later cannot be -+ * detached */ -+ if (atomic_read(&node->d_count) != 0) { -+ return 0; -+ } -+ -+ assert("vs-1214", !jnode_is_loaded(node)); -+ -+ /* -+ * can only release page if real block number is assigned to it. Simple -+ * check for ->atom wouldn't do, because it is possible for node to be -+ * clean, not it atom yet, and still having fake block number. For -+ * example, node just created in jinit_new(). -+ */ -+ if (reiser4_blocknr_is_fake(jnode_get_block(node))) -+ return 0; -+ -+ /* -+ * pages prepared for write can not be released anyway, so avoid -+ * detaching jnode from the page -+ */ -+ if (JF_ISSET(node, JNODE_WRITE_PREPARED)) -+ return 0; -+ -+ /* -+ * dirty jnode cannot be released. It can however be submitted to disk -+ * as part of early flushing, but only after getting flush-prepped. -+ */ -+ if (JF_ISSET(node, JNODE_DIRTY)) -+ return 0; -+ -+ /* overwrite set is only written by log writer. */ -+ if (JF_ISSET(node, JNODE_OVRWR)) -+ return 0; -+ -+ /* jnode is already under writeback */ -+ if (JF_ISSET(node, JNODE_WRITEBACK)) -+ return 0; -+ -+ /* don't flush bitmaps or journal records */ -+ if (!jnode_is_znode(node) && !jnode_is_unformatted(node)) -+ return 0; -+ -+ return 1; -+} -+ -+/* -+ * ->releasepage method for reiser4 -+ * -+ * This is called by VM scanner when it comes across clean page. What we have -+ * to do here is to check whether page can really be released (freed that is) -+ * and if so, detach jnode from it and remove page from the page cache. -+ * -+ * Check for releasability is done by releasable() function. 
-+ */ -+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG) -+{ -+ jnode *node; -+ -+ assert("nikita-2257", PagePrivate(page)); -+ assert("nikita-2259", PageLocked(page)); -+ assert("nikita-2892", !PageWriteback(page)); -+ assert("nikita-3019", reiser4_schedulable()); -+ -+ /* NOTE-NIKITA: this can be called in the context of reiser4 call. It -+ is not clear what to do in this case. A lot of deadlocks seems be -+ possible. */ -+ -+ node = jnode_by_page(page); -+ assert("nikita-2258", node != NULL); -+ assert("reiser4-4", page->mapping != NULL); -+ assert("reiser4-5", page->mapping->host != NULL); -+ -+ if (PageDirty(page)) -+ return 0; -+ -+ /* extra page reference is used by reiser4 to protect -+ * jnode<->page link from this ->releasepage(). */ -+ if (page_count(page) > 3) -+ return 0; -+ -+ /* releasable() needs jnode lock, because it looks at the jnode fields -+ * and we need jload_lock here to avoid races with jload(). */ -+ spin_lock_jnode(node); -+ spin_lock(&(node->load)); -+ if (jnode_is_releasable(node)) { -+ struct address_space *mapping; -+ -+ mapping = page->mapping; -+ jref(node); -+ /* there is no need to synchronize against -+ * jnode_extent_write() here, because pages seen by -+ * jnode_extent_write() are !releasable(). */ -+ page_clear_jnode(page, node); -+ spin_unlock(&(node->load)); -+ spin_unlock_jnode(node); -+ -+ /* we are under memory pressure so release jnode also. */ -+ jput(node); -+ -+ return 1; -+ } else { -+ spin_unlock(&(node->load)); -+ spin_unlock_jnode(node); -+ assert("nikita-3020", reiser4_schedulable()); -+ return 0; -+ } -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/block_alloc.c linux-2.6.20/fs/reiser4/block_alloc.c ---- linux-2.6.20.orig/fs/reiser4/block_alloc.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/block_alloc.c 2007-05-06 14:50:43.682970725 +0400 -@@ -0,0 +1,1137 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "plugin/plugin.h" -+#include "txnmgr.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "super.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block */ -+#include -+ -+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */ -+ -+/* We need to be able to reserve enough disk space to ensure that an atomic -+ operation will have enough disk space to flush (see flush.c and -+ http://namesys.com/v4/v4.html) and commit it once it is started. -+ -+ In our design a call for reserving disk space may fail but not an actual -+ block allocation. -+ -+ All free blocks, already allocated blocks, and all kinds of reserved blocks -+ are counted in different per-fs block counters. -+ -+ A reiser4 super block's set of block counters currently is: -+ -+ free -- free blocks, -+ used -- already allocated blocks, -+ -+ grabbed -- initially reserved for performing an fs operation, those blocks -+ are taken from free blocks, then grabbed disk space leaks from grabbed -+ blocks counter to other counters like "fake allocated", "flush -+ reserved", "used", the rest of not used grabbed space is returned to -+ free space at the end of fs operation; -+ -+ fake allocated -- counts all nodes without real disk block numbers assigned, -+ we have separate accounting for formatted and unformatted -+ nodes (for easier debugging); -+ -+ flush reserved -- disk space needed for flushing and committing an atom. 
-+ Each dirty already allocated block could be written as a -+ part of atom's overwrite set or as a part of atom's -+ relocate set. In both case one additional block is needed, -+ it is used as a wandered block if we do overwrite or as a -+ new location for a relocated block. -+ -+ In addition, blocks in some states are counted on per-thread and per-atom -+ basis. A reiser4 context has a counter of blocks grabbed by this transaction -+ and the sb's grabbed blocks counter is a sum of grabbed blocks counter values -+ of each reiser4 context. Each reiser4 atom has a counter of "flush reserved" -+ blocks, which are reserved for flush processing and atom commit. */ -+ -+/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate -+ number of blocks to grab for most expensive case of balancing when the leaf -+ node we insert new item to gets split and new leaf node is allocated. -+ -+ So, we need to grab blocks for -+ -+ 1) one block for possible dirtying the node we insert an item to. That block -+ would be used for node relocation at flush time or for allocating of a -+ wandered one, it depends what will be a result (what set, relocate or -+ overwrite the node gets assigned to) of the node processing by the flush -+ algorithm. -+ -+ 2) one block for either allocating a new node, or dirtying of right or left -+ clean neighbor, only one case may happen. -+ -+ VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current -+ node, and creation of new node. have I forgotten something? email me. -+ -+ These grabbed blocks are counted in both reiser4 context "grabbed blocks" -+ counter and in the fs-wide one (both ctx->grabbed_blocks and -+ sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is -+ decremented by 2. 
-+ -+ Suppose both two blocks were spent for dirtying of an already allocated clean -+ node (one block went from "grabbed" to "flush reserved") and for new block -+ allocating (one block went from "grabbed" to "fake allocated formatted"). -+ -+ Inserting of a child pointer to the parent node caused parent node to be -+ split, the balancing code takes care about this grabbing necessary space -+ immediately by calling reiser4_grab with BA_RESERVED flag set which means -+ "can use the 5% reserved disk space". -+ -+ At this moment insertion completes and grabbed blocks (if they were not used) -+ should be returned to the free space counter. -+ -+ However the atom life-cycle is not completed. The atom had one "flush -+ reserved" block added by our insertion and the new fake allocated node is -+ counted as a "fake allocated formatted" one. The atom has to be fully -+ processed by flush before commit. Suppose that the flush moved the first, -+ already allocated node to the atom's overwrite list, the new fake allocated -+ node, obviously, went into the atom relocate set. The reiser4 flush -+ allocates the new node using one unit from "fake allocated formatted" -+ counter, the log writer uses one from "flush reserved" for wandered block -+ allocation. -+ -+ And, it is not the end. When the wandered block is deallocated after the -+ atom gets fully played (see wander.c for term description), the disk space -+ occupied for it is returned to free blocks. */ -+ -+/* BLOCK NUMBERS */ -+ -+/* Any reiser4 node has a block number assigned to it. We use these numbers for -+ indexing in hash tables, so if a block has not yet been assigned a location -+ on disk we need to give it a temporary fake block number. -+ -+ Current implementation of reiser4 uses 64-bit integers for block numbers. We -+ use highest bit in 64-bit block number to distinguish fake and real block -+ numbers. So, only 63 bits may be used to addressing of real device -+ blocks. 
That "fake" block numbers space is divided into subspaces of fake -+ block numbers for data blocks and for shadow (working) bitmap blocks. -+ -+ Fake block numbers for data blocks are generated by a cyclic counter, which -+ gets incremented after each real block allocation. We assume that it is -+ impossible to overload this counter during one transaction life. */ -+ -+/* Initialize a blocknr hint. */ -+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint) -+{ -+ memset(hint, 0, sizeof(reiser4_blocknr_hint)); -+} -+ -+/* Release any resources of a blocknr hint. */ -+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG) -+{ -+ /* No resources should be freed in current blocknr_hint implementation. */ -+} -+ -+/* see above for explanation of fake block number. */ -+/* Audited by: green(2002.06.11) */ -+int reiser4_blocknr_is_fake(const reiser4_block_nr * da) -+{ -+ /* The reason for not simply returning result of '&' operation is that -+ while return value is (possibly 32bit) int, the reiser4_block_nr is -+ at least 64 bits long, and high bit (which is the only possible -+ non zero bit after the masking) would be stripped off */ -+ return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0; -+} -+ -+/* Static functions for / block counters -+ arithmetic. Mostly, they are isolated to not to code same assertions in -+ several places. */ -+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count) -+{ -+ BUG_ON(ctx->grabbed_blocks < count); -+ assert("zam-527", ctx->grabbed_blocks >= count); -+ ctx->grabbed_blocks -= count; -+} -+ -+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count) -+{ -+ ctx->grabbed_blocks += count; -+} -+ -+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ assert("zam-525", sbinfo->blocks_grabbed >= count); -+ sbinfo->blocks_grabbed -= count; -+} -+ -+/* Decrease the counter of block reserved for flush in super block. 
*/ -+static void -+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ assert("vpf-291", sbinfo->blocks_flush_reserved >= count); -+ sbinfo->blocks_flush_reserved -= count; -+} -+ -+static void -+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count, -+ reiser4_ba_flags_t flags) -+{ -+ if (flags & BA_FORMATTED) { -+ assert("zam-806", sbinfo->blocks_fake_allocated >= count); -+ sbinfo->blocks_fake_allocated -= count; -+ } else { -+ assert("zam-528", -+ sbinfo->blocks_fake_allocated_unformatted >= count); -+ sbinfo->blocks_fake_allocated_unformatted -= count; -+ } -+} -+ -+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ assert("zam-530", -+ sbinfo->blocks_used >= count + sbinfo->min_blocks_used); -+ sbinfo->blocks_used -= count; -+} -+ -+static void -+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ assert("edward-501", sbinfo->blocks_clustered >= count); -+ sbinfo->blocks_clustered -= count; -+} -+ -+/* Increase the counter of block reserved for flush in atom. */ -+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count) -+{ -+ assert("zam-772", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ atom->flush_reserved += count; -+} -+ -+/* Decrease the counter of block reserved for flush in atom. */ -+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count) -+{ -+ assert("zam-774", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ assert("nikita-2790", atom->flush_reserved >= count); -+ atom->flush_reserved -= count; -+} -+ -+/* super block has 6 counters: free, used, grabbed, fake allocated -+ (formatted and unformatted) and flush reserved. Their sum must be -+ number of blocks on a device. 
This function checks this */ -+int reiser4_check_block_counters(const struct super_block *super) -+{ -+ __u64 sum; -+ -+ sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) + -+ reiser4_data_blocks(super) + reiser4_fake_allocated(super) + -+ reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) + -+ reiser4_clustered_blocks(super); -+ if (reiser4_block_count(super) != sum) { -+ printk("super block counters: " -+ "used %llu, free %llu, " -+ "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), " -+ "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n", -+ (unsigned long long)reiser4_data_blocks(super), -+ (unsigned long long)reiser4_free_blocks(super), -+ (unsigned long long)reiser4_grabbed_blocks(super), -+ (unsigned long long)reiser4_fake_allocated(super), -+ (unsigned long long) -+ reiser4_fake_allocated_unformatted(super), -+ (unsigned long long)reiser4_flush_reserved(super), -+ (unsigned long long)reiser4_clustered_blocks(super), -+ (unsigned long long)sum, -+ (unsigned long long)reiser4_block_count(super)); -+ return 0; -+ } -+ return 1; -+} -+ -+/* Adjust "working" free blocks counter for number of blocks we are going to -+ allocate. Record number of grabbed blocks in fs-wide and per-thread -+ counters. This function should be called before bitmap scanning or -+ allocating fake block numbers -+ -+ @super -- pointer to reiser4 super block; -+ @count -- number of blocks we reserve; -+ -+ @return -- 0 if success, -ENOSPC, if all -+ free blocks are preserved or already allocated. -+*/ -+ -+static int -+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags) -+{ -+ __u64 free_blocks; -+ int ret = 0, use_reserved = flags & BA_RESERVED; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("vs-1276", ctx == get_current_context()); -+ -+ /* Do not grab anything on ro-mounted fs. 
*/ -+ if (rofs_super(ctx->super)) { -+ ctx->grab_enabled = 0; -+ return 0; -+ } -+ -+ sbinfo = get_super_private(ctx->super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ free_blocks = sbinfo->blocks_free; -+ -+ if ((use_reserved && free_blocks < count) || -+ (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) { -+ ret = RETERR(-ENOSPC); -+ goto unlock_and_ret; -+ } -+ -+ add_to_ctx_grabbed(ctx, count); -+ -+ sbinfo->blocks_grabbed += count; -+ sbinfo->blocks_free -= count; -+ -+#if REISER4_DEBUG -+ if (ctx->grabbed_initially == 0) -+ ctx->grabbed_initially = count; -+#endif -+ -+ assert("nikita-2986", reiser4_check_block_counters(ctx->super)); -+ -+ /* disable grab space in current context */ -+ ctx->grab_enabled = 0; -+ -+ unlock_and_ret: -+ spin_unlock_reiser4_super(sbinfo); -+ -+ return ret; -+} -+ -+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags) -+{ -+ int ret; -+ reiser4_context *ctx; -+ -+ assert("nikita-2964", ergo(flags & BA_CAN_COMMIT, -+ lock_stack_isclean(get_current_lock_stack -+ ()))); -+ ctx = get_current_context(); -+ if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) { -+ return 0; -+ } -+ -+ ret = reiser4_grab(ctx, count, flags); -+ if (ret == -ENOSPC) { -+ -+ /* Trying to commit the all transactions if BA_CAN_COMMIT flag present */ -+ if (flags & BA_CAN_COMMIT) { -+ txnmgr_force_commit_all(ctx->super, 0); -+ ctx->grab_enabled = 1; -+ ret = reiser4_grab(ctx, count, flags); -+ } -+ } -+ /* -+ * allocation from reserved pool cannot fail. This is severe error. -+ */ -+ assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0)); -+ return ret; -+} -+ -+/* -+ * SPACE RESERVED FOR UNLINK/TRUNCATE -+ * -+ * Unlink and truncate require space in transaction (to update stat data, at -+ * least). But we don't want rm(1) to fail with "No space on device" error. -+ * -+ * Solution is to reserve 5% of disk space for truncates and -+ * unlinks. Specifically, normal space grabbing requests don't grab space from -+ * reserved area. 
Only requests with BA_RESERVED bit in flags are allowed to -+ * drain it. Per super block delete mutex is used to allow only one -+ * thread at a time to grab from reserved area. -+ * -+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT -+ * flag. -+ * -+ */ -+ -+int reiser4_grab_reserved(struct super_block *super, -+ __u64 count, reiser4_ba_flags_t flags) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ -+ assert("nikita-3175", flags & BA_CAN_COMMIT); -+ -+ /* Check the delete mutex already taken by us, we assume that -+ * reading of machine word is atomic. */ -+ if (sbinfo->delete_mutex_owner == current) { -+ if (reiser4_grab_space -+ (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) { -+ warning("zam-1003", -+ "nested call of grab_reserved fails count=(%llu)", -+ (unsigned long long)count); -+ reiser4_release_reserved(super); -+ return RETERR(-ENOSPC); -+ } -+ return 0; -+ } -+ -+ if (reiser4_grab_space(count, flags)) { -+ mutex_lock(&sbinfo->delete_mutex); -+ assert("nikita-2929", sbinfo->delete_mutex_owner == NULL); -+ sbinfo->delete_mutex_owner = current; -+ -+ if (reiser4_grab_space(count, flags | BA_RESERVED)) { -+ warning("zam-833", -+ "reserved space is not enough (%llu)", -+ (unsigned long long)count); -+ reiser4_release_reserved(super); -+ return RETERR(-ENOSPC); -+ } -+ } -+ return 0; -+} -+ -+void reiser4_release_reserved(struct super_block *super) -+{ -+ reiser4_super_info_data *info; -+ -+ info = get_super_private(super); -+ if (info->delete_mutex_owner == current) { -+ info->delete_mutex_owner = NULL; -+ mutex_unlock(&info->delete_mutex); -+ } -+} -+ -+static reiser4_super_info_data *grabbed2fake_allocated_head(int count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sub_from_ctx_grabbed(ctx, count); -+ -+ sbinfo = get_super_private(ctx->super); -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_grabbed(sbinfo, count); -+ /* return sbinfo 
locked */ -+ return sbinfo; -+} -+ -+/* is called after @count fake block numbers are allocated and pointer to -+ those blocks are inserted into tree. */ -+static void grabbed2fake_allocated_formatted(void) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = grabbed2fake_allocated_head(1); -+ sbinfo->blocks_fake_allocated++; -+ -+ assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/** -+ * grabbed2fake_allocated_unformatted -+ * @count: -+ * -+ */ -+static void grabbed2fake_allocated_unformatted(int count) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = grabbed2fake_allocated_head(count); -+ sbinfo->blocks_fake_allocated_unformatted += count; -+ -+ assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void grabbed2cluster_reserved(int count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sub_from_ctx_grabbed(ctx, count); -+ -+ sbinfo = get_super_private(ctx->super); -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_grabbed(sbinfo, count); -+ sbinfo->blocks_clustered += count; -+ -+ assert("edward-504", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void cluster_reserved2grabbed(int count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ -+ sbinfo = get_super_private(ctx->super); -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_cluster_reserved(sbinfo, count); -+ sbinfo->blocks_grabbed += count; -+ -+ assert("edward-505", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+ add_to_ctx_grabbed(ctx, count); -+} -+ -+void cluster_reserved2free(int count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ 
cluster_reserved2grabbed(count); -+ grabbed2free(ctx, sbinfo, count); -+} -+ -+static DEFINE_SPINLOCK(fake_lock); -+static reiser4_block_nr fake_gen = 0; -+ -+/** -+ * assign_fake_blocknr -+ * @blocknr: -+ * @count: -+ * -+ * Obtain a fake block number for new node which will be used to refer to -+ * this newly allocated node until real allocation is done. -+ */ -+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count) -+{ -+ spin_lock(&fake_lock); -+ *blocknr = fake_gen; -+ fake_gen += count; -+ spin_unlock(&fake_lock); -+ -+ BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK); -+ /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/ -+ *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE; -+ assert("zam-394", zlook(current_tree, blocknr) == NULL); -+} -+ -+int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr) -+{ -+ assign_fake_blocknr(blocknr, 1); -+ grabbed2fake_allocated_formatted(); -+ return 0; -+} -+ -+/** -+ * fake_blocknrs_unformatted -+ * @count: number of fake numbers to get -+ * -+ * Allocates @count fake block numbers which will be assigned to jnodes -+ */ -+reiser4_block_nr fake_blocknr_unformatted(int count) -+{ -+ reiser4_block_nr blocknr; -+ -+ assign_fake_blocknr(&blocknr, count); -+ grabbed2fake_allocated_unformatted(count); -+ -+ return blocknr; -+} -+ -+/* adjust sb block counters, if real (on-disk) block allocation immediately -+ follows grabbing of free disk space. 
*/ -+static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo, -+ __u64 count) -+{ -+ sub_from_ctx_grabbed(ctx, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_grabbed(sbinfo, count); -+ sbinfo->blocks_used += count; -+ -+ assert("nikita-2679", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* adjust sb block counters when @count unallocated blocks get mapped to disk */ -+static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count, -+ reiser4_ba_flags_t flags) -+{ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_fake_allocated(sbinfo, count, flags); -+ sbinfo->blocks_used += count; -+ -+ assert("nikita-2680", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+static void flush_reserved2used(txn_atom * atom, __u64 count) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ assert("zam-787", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count); -+ -+ sbinfo = get_current_super_private(); -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_flush_reserved(sbinfo, count); -+ sbinfo->blocks_used += count; -+ -+ assert("zam-789", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* update the per fs blocknr hint default value. 
*/ -+void -+update_blocknr_hint_default(const struct super_block *s, -+ const reiser4_block_nr * block) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ -+ assert("nikita-3342", !reiser4_blocknr_is_fake(block)); -+ -+ spin_lock_reiser4_super(sbinfo); -+ if (*block < sbinfo->block_count) { -+ sbinfo->blocknr_hint_default = *block; -+ } else { -+ warning("zam-676", -+ "block number %llu is too large to be used in a blocknr hint\n", -+ (unsigned long long)*block); -+ dump_stack(); -+ DEBUGON(1); -+ } -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* get current value of the default blocknr hint. */ -+void get_blocknr_hint_default(reiser4_block_nr * result) -+{ -+ reiser4_super_info_data *sbinfo = get_current_super_private(); -+ -+ spin_lock_reiser4_super(sbinfo); -+ *result = sbinfo->blocknr_hint_default; -+ assert("zam-677", *result < sbinfo->block_count); -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* Allocate "real" disk blocks by calling a proper space allocation plugin -+ * method. Blocks are allocated in one contiguous disk region. The plugin -+ * independent part accounts blocks by subtracting allocated amount from grabbed -+ * or fake block counter and add the same amount to the counter of allocated -+ * blocks. -+ * -+ * @hint -- a reiser4 blocknr hint object which contains further block -+ * allocation hints and parameters (search start, a stage of block -+ * which will be mapped to disk, etc.), -+ * @blk -- an out parameter for the beginning of the allocated region, -+ * @len -- in/out parameter, it should contain the maximum number of allocated -+ * blocks, after block allocation completes, it contains the length of -+ * allocated disk region. -+ * @flags -- see reiser4_ba_flags_t description. -+ * -+ * @return -- 0 if success, error code otherwise. 
-+ */ -+int -+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk, -+ reiser4_block_nr * len, reiser4_ba_flags_t flags) -+{ -+ __u64 needed = *len; -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ int ret; -+ -+ assert("zam-986", hint != NULL); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ /* For write-optimized data we use default search start value, which is -+ * close to last write location. */ -+ if (flags & BA_USE_DEFAULT_SEARCH_START) { -+ get_blocknr_hint_default(&hint->blk); -+ } -+ -+ /* VITALY: allocator should grab this for internal/tx-lists/similar only. */ -+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */ -+ if (hint->block_stage == BLOCK_NOT_COUNTED) { -+ ret = reiser4_grab_space_force(*len, flags); -+ if (ret != 0) -+ return ret; -+ } -+ -+ ret = -+ sa_alloc_blocks(reiser4_get_space_allocator(ctx->super), -+ hint, (int)needed, blk, len); -+ -+ if (!ret) { -+ assert("zam-680", *blk < reiser4_block_count(ctx->super)); -+ assert("zam-681", -+ *blk + *len <= reiser4_block_count(ctx->super)); -+ -+ if (flags & BA_PERMANENT) { -+ /* we assume that current atom exists at this moment */ -+ txn_atom *atom = get_current_atom_locked(); -+ atom->nr_blocks_allocated += *len; -+ spin_unlock_atom(atom); -+ } -+ -+ switch (hint->block_stage) { -+ case BLOCK_NOT_COUNTED: -+ case BLOCK_GRABBED: -+ grabbed2used(ctx, sbinfo, *len); -+ break; -+ case BLOCK_UNALLOCATED: -+ fake_allocated2used(sbinfo, *len, flags); -+ break; -+ case BLOCK_FLUSH_RESERVED: -+ { -+ txn_atom *atom = get_current_atom_locked(); -+ flush_reserved2used(atom, *len); -+ spin_unlock_atom(atom); -+ } -+ break; -+ default: -+ impossible("zam-531", "wrong block stage"); -+ } -+ } else { -+ assert("zam-821", -+ ergo(hint->max_dist == 0 -+ && !hint->backward, ret != -ENOSPC)); -+ if (hint->block_stage == BLOCK_NOT_COUNTED) -+ grabbed2free(ctx, sbinfo, needed); -+ } -+ -+ return ret; -+} -+ -+/* used 
-> fake_allocated -> grabbed -> free */ -+ -+/* adjust sb block counters when @count unallocated blocks get unmapped from -+ disk */ -+static void -+used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count, -+ int formatted) -+{ -+ spin_lock_reiser4_super(sbinfo); -+ -+ if (formatted) -+ sbinfo->blocks_fake_allocated += count; -+ else -+ sbinfo->blocks_fake_allocated_unformatted += count; -+ -+ sub_from_sb_used(sbinfo, count); -+ -+ assert("nikita-2681", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+static void -+used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom, -+ __u64 count, reiser4_ba_flags_t flags UNUSED_ARG) -+{ -+ assert("nikita-2791", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ add_to_atom_flush_reserved_nolock(atom, (__u32) count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_flush_reserved += count; -+ /*add_to_sb_flush_reserved(sbinfo, count); */ -+ sub_from_sb_used(sbinfo, count); -+ -+ assert("nikita-2681", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* disk space, virtually used by fake block numbers is counted as "grabbed" again. 
*/ -+static void -+fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo, -+ __u64 count, reiser4_ba_flags_t flags) -+{ -+ add_to_ctx_grabbed(ctx, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ assert("nikita-2682", reiser4_check_block_counters(ctx->super)); -+ -+ sbinfo->blocks_grabbed += count; -+ sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED); -+ -+ assert("nikita-2683", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ fake_allocated2grabbed(ctx, sbinfo, count, flags); -+ grabbed2free(ctx, sbinfo, count); -+} -+ -+void grabbed2free_mark(__u64 mark) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ assert("nikita-3007", (__s64) mark >= 0); -+ assert("nikita-3006", ctx->grabbed_blocks >= mark); -+ grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark); -+} -+ -+/** -+ * grabbed2free - adjust grabbed and free block counters -+ * @ctx: context to update grabbed block counter of -+ * @sbinfo: super block to update grabbed and free block counters of -+ * @count: number of blocks to adjust counters by -+ * -+ * Decreases context's and per filesystem's counters of grabbed -+ * blocks. Increases per filesystem's counter of free blocks. 
-+ */ -+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo, -+ __u64 count) -+{ -+ sub_from_ctx_grabbed(ctx, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_grabbed(sbinfo, count); -+ sbinfo->blocks_free += count; -+ assert("nikita-2684", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("vs-1095", atom); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ sub_from_ctx_grabbed(ctx, count); -+ -+ add_to_atom_flush_reserved_nolock(atom, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_flush_reserved += count; -+ sub_from_sb_grabbed(sbinfo, count); -+ -+ assert("vpf-292", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void grabbed2flush_reserved(__u64 count) -+{ -+ txn_atom *atom = get_current_atom_locked(); -+ -+ grabbed2flush_reserved_nolock(atom, count); -+ -+ spin_unlock_atom(atom); -+} -+ -+void flush_reserved2grabbed(txn_atom * atom, __u64 count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("nikita-2788", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ add_to_ctx_grabbed(ctx, count); -+ -+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_grabbed += count; -+ sub_from_sb_flush_reserved(sbinfo, count); -+ -+ assert("vpf-292", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/** -+ * all_grabbed2free - releases all blocks grabbed in context -+ * -+ * Decreases context's and super block's grabbed block counters by number of -+ * blocks grabbed by current context and increases super block's free block -+ * counter 
correspondingly. -+ */ -+void all_grabbed2free(void) -+{ -+ reiser4_context *ctx = get_current_context(); -+ -+ grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks); -+} -+ -+/* adjust sb block counters if real (on-disk) blocks do not become unallocated -+ after freeing, @count blocks become "grabbed". */ -+static void -+used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo, -+ __u64 count) -+{ -+ add_to_ctx_grabbed(ctx, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_grabbed += count; -+ sub_from_sb_used(sbinfo, count); -+ -+ assert("nikita-2685", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* this used to be done through used2grabbed and grabbed2free*/ -+static void used2free(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_free += count; -+ sub_from_sb_used(sbinfo, count); -+ -+ assert("nikita-2685", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+#if REISER4_DEBUG -+ -+/* check "allocated" state of given block range */ -+static void -+reiser4_check_blocks(const reiser4_block_nr * start, -+ const reiser4_block_nr * len, int desired) -+{ -+ sa_check_blocks(start, len, desired); -+} -+ -+/* check "allocated" state of given block */ -+void reiser4_check_block(const reiser4_block_nr * block, int desired) -+{ -+ const reiser4_block_nr one = 1; -+ -+ reiser4_check_blocks(block, &one, desired); -+} -+ -+#endif -+ -+/* Blocks deallocation function may do an actual deallocation through space -+ plugin allocation or store deleted block numbers in atom's delete_set data -+ structure depend on @defer parameter. */ -+ -+/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which -+ will be deleted from WORKING bitmap. 
They might be just unmapped from disk, or -+ freed but disk space is still grabbed by current thread, or these blocks must -+ not be counted in any reiser4 sb block counters, see block_stage_t comment */ -+ -+/* BA_FORMATTED bit is only used when BA_DEFER in not present: it is used to -+ distinguish blocks allocated for unformatted and formatted nodes */ -+ -+int -+reiser4_dealloc_blocks(const reiser4_block_nr * start, -+ const reiser4_block_nr * len, -+ block_stage_t target_stage, reiser4_ba_flags_t flags) -+{ -+ txn_atom *atom = NULL; -+ int ret; -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ if (REISER4_DEBUG) { -+ assert("zam-431", *len != 0); -+ assert("zam-432", *start != 0); -+ assert("zam-558", !reiser4_blocknr_is_fake(start)); -+ -+ spin_lock_reiser4_super(sbinfo); -+ assert("zam-562", *start < sbinfo->block_count); -+ spin_unlock_reiser4_super(sbinfo); -+ } -+ -+ if (flags & BA_DEFER) { -+ blocknr_set_entry *bsep = NULL; -+ -+ /* storing deleted block numbers in a blocknr set -+ datastructure for further actual deletion */ -+ do { -+ atom = get_current_atom_locked(); -+ assert("zam-430", atom != NULL); -+ -+ ret = -+ blocknr_set_add_extent(atom, &atom->delete_set, -+ &bsep, start, len); -+ -+ if (ret == -ENOMEM) -+ return ret; -+ -+ /* This loop might spin at most two times */ -+ } while (ret == -E_REPEAT); -+ -+ assert("zam-477", ret == 0); -+ assert("zam-433", atom != NULL); -+ -+ spin_unlock_atom(atom); -+ -+ } else { -+ assert("zam-425", get_current_super_private() != NULL); -+ sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super), -+ *start, *len); -+ -+ if (flags & BA_PERMANENT) { -+ /* These blocks were counted as allocated, we have to revert it -+ * back if allocation is discarded. 
*/ -+ txn_atom *atom = get_current_atom_locked(); -+ atom->nr_blocks_allocated -= *len; -+ spin_unlock_atom(atom); -+ } -+ -+ switch (target_stage) { -+ case BLOCK_NOT_COUNTED: -+ assert("vs-960", flags & BA_FORMATTED); -+ /* VITALY: This is what was grabbed for internal/tx-lists/similar only */ -+ used2free(sbinfo, *len); -+ break; -+ -+ case BLOCK_GRABBED: -+ used2grabbed(ctx, sbinfo, *len); -+ break; -+ -+ case BLOCK_UNALLOCATED: -+ used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED); -+ break; -+ -+ case BLOCK_FLUSH_RESERVED:{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked(); -+ used2flush_reserved(sbinfo, atom, *len, -+ flags & BA_FORMATTED); -+ spin_unlock_atom(atom); -+ break; -+ } -+ default: -+ impossible("zam-532", "wrong block stage"); -+ } -+ } -+ -+ return 0; -+} -+ -+/* wrappers for block allocator plugin methods */ -+int reiser4_pre_commit_hook(void) -+{ -+ assert("zam-502", get_current_super_private() != NULL); -+ sa_pre_commit_hook(); -+ return 0; -+} -+ -+/* an actor which applies delete set to block allocator data */ -+static int -+apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, -+ const reiser4_block_nr * b, void *data UNUSED_ARG) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ __u64 len = 1; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT); -+ assert("zam-552", sbinfo != NULL); -+ -+ if (b != NULL) -+ len = *b; -+ -+ if (REISER4_DEBUG) { -+ spin_lock_reiser4_super(sbinfo); -+ -+ assert("zam-554", *a < reiser4_block_count(ctx->super)); -+ assert("zam-555", *a + len <= reiser4_block_count(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+ } -+ -+ sa_dealloc_blocks(&sbinfo->space_allocator, *a, len); -+ /* adjust sb block counters */ -+ used2free(sbinfo, len); -+ return 0; -+} -+ -+void reiser4_post_commit_hook(void) -+{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked(); -+ assert("zam-452", 
atom->stage == ASTAGE_POST_COMMIT); -+ spin_unlock_atom(atom); -+ -+ /* do the block deallocation which was deferred -+ until commit is done */ -+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1); -+ -+ assert("zam-504", get_current_super_private() != NULL); -+ sa_post_commit_hook(); -+} -+ -+void reiser4_post_write_back_hook(void) -+{ -+ assert("zam-504", get_current_super_private() != NULL); -+ -+ sa_post_commit_hook(); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/block_alloc.h linux-2.6.20/fs/reiser4/block_alloc.h ---- linux-2.6.20.orig/fs/reiser4/block_alloc.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/block_alloc.h 2007-05-06 14:50:43.682970725 +0400 -@@ -0,0 +1,175 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined (__FS_REISER4_BLOCK_ALLOC_H__) -+#define __FS_REISER4_BLOCK_ALLOC_H__ -+ -+#include "dformat.h" -+#include "forward.h" -+ -+#include /* for __u?? 
*/ -+#include -+ -+/* Mask when is applied to given block number shows is that block number is a fake one */ -+#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL -+/* Mask which isolates a type of object this fake block number was assigned to */ -+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL -+ -+/*result after applying the REISER4_BLOCKNR_STATUS_BIT_MASK should be compared -+ against these two values to understand is the object unallocated or bitmap -+ shadow object (WORKING BITMAP block, look at the plugin/space/bitmap.c) */ -+#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL -+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL -+ -+/* specification how block allocation was counted in sb block counters */ -+typedef enum { -+ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */ -+ BLOCK_GRABBED = 1, /* free space grabbed for further allocation -+ of this block */ -+ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */ -+ BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object -+ ( unallocated formatted or unformatted -+ node) */ -+ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block -+ number assigned */ -+} block_stage_t; -+ -+/* a hint for block allocator */ -+struct reiser4_blocknr_hint { -+ /* FIXME: I think we want to add a longterm lock on the bitmap block here. This -+ is to prevent jnode_flush() calls from interleaving allocations on the same -+ bitmap, once a hint is established. */ -+ -+ /* search start hint */ -+ reiser4_block_nr blk; -+ /* if not zero, it is a region size we search for free blocks in */ -+ reiser4_block_nr max_dist; -+ /* level for allocation, may be useful have branch-level and higher -+ write-optimized. 
*/ -+ tree_level level; -+ /* block allocator assumes that blocks, which will be mapped to disk, -+ are in this specified block_stage */ -+ block_stage_t block_stage; -+ /* If direction = 1 allocate blocks in backward direction from the end -+ * of disk to the beginning of disk. */ -+ unsigned int backward:1; -+ -+}; -+ -+/* These flags control block allocation/deallocation behavior */ -+enum reiser4_ba_flags { -+ /* do allocatations from reserved (5%) area */ -+ BA_RESERVED = (1 << 0), -+ -+ /* block allocator can do commit trying to recover free space */ -+ BA_CAN_COMMIT = (1 << 1), -+ -+ /* if operation will be applied to formatted block */ -+ BA_FORMATTED = (1 << 2), -+ -+ /* defer actual block freeing until transaction commit */ -+ BA_DEFER = (1 << 3), -+ -+ /* allocate blocks for permanent fs objects (formatted or unformatted), not -+ wandered of log blocks */ -+ BA_PERMANENT = (1 << 4), -+ -+ /* grab space even it was disabled */ -+ BA_FORCE = (1 << 5), -+ -+ /* use default start value for free blocks search. 
*/ -+ BA_USE_DEFAULT_SEARCH_START = (1 << 6) -+}; -+ -+typedef enum reiser4_ba_flags reiser4_ba_flags_t; -+ -+extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint); -+extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint); -+extern void update_blocknr_hint_default(const struct super_block *, -+ const reiser4_block_nr *); -+extern void get_blocknr_hint_default(reiser4_block_nr *); -+ -+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super); -+ -+int assign_fake_blocknr_formatted(reiser4_block_nr *); -+reiser4_block_nr fake_blocknr_unformatted(int); -+ -+/* free -> grabbed -> fake_allocated -> used */ -+ -+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags); -+void all_grabbed2free(void); -+void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count); -+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags); -+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count); -+void grabbed2flush_reserved(__u64 count); -+int reiser4_alloc_blocks(reiser4_blocknr_hint * hint, -+ reiser4_block_nr * start, -+ reiser4_block_nr * len, reiser4_ba_flags_t flags); -+int reiser4_dealloc_blocks(const reiser4_block_nr *, -+ const reiser4_block_nr *, -+ block_stage_t, reiser4_ba_flags_t flags); -+ -+static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint, -+ reiser4_block_nr * start, -+ reiser4_ba_flags_t flags) -+{ -+ reiser4_block_nr one = 1; -+ return reiser4_alloc_blocks(hint, start, &one, flags); -+} -+ -+static inline int reiser4_dealloc_block(const reiser4_block_nr * block, -+ block_stage_t stage, -+ reiser4_ba_flags_t flags) -+{ -+ const reiser4_block_nr one = 1; -+ return reiser4_dealloc_blocks(block, &one, stage, flags); -+} -+ -+#define reiser4_grab_space_force(count, flags) \ -+ reiser4_grab_space(count, flags | BA_FORCE) -+ -+extern void grabbed2free_mark(__u64 mark); -+extern int reiser4_grab_reserved(struct super_block *, -+ __u64, reiser4_ba_flags_t); -+extern void 
reiser4_release_reserved(struct super_block *super); -+ -+/* grabbed -> fake_allocated */ -+ -+/* fake_allocated -> used */ -+ -+/* used -> fake_allocated -> grabbed -> free */ -+ -+extern void flush_reserved2grabbed(txn_atom * atom, __u64 count); -+ -+extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da); -+ -+extern void grabbed2cluster_reserved(int count); -+extern void cluster_reserved2grabbed(int count); -+extern void cluster_reserved2free(int count); -+ -+extern int reiser4_check_block_counters(const struct super_block *); -+ -+#if REISER4_DEBUG -+ -+extern void reiser4_check_block(const reiser4_block_nr *, int); -+ -+#else -+ -+# define reiser4_check_block(beg, val) noop -+ -+#endif -+ -+extern int reiser4_pre_commit_hook(void); -+extern void reiser4_post_commit_hook(void); -+extern void reiser4_post_write_back_hook(void); -+ -+#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/blocknrset.c linux-2.6.20/fs/reiser4/blocknrset.c ---- linux-2.6.20.orig/fs/reiser4/blocknrset.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/blocknrset.c 2007-05-06 14:50:43.686971975 +0400 -@@ -0,0 +1,368 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* This file contains code for various block number sets used by the atom to -+ track the deleted set and wandered block mappings. */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "txnmgr.h" -+#include "context.h" -+ -+#include -+ -+/* The proposed data structure for storing unordered block number sets is a -+ list of elements, each of which contains an array of block number or/and -+ array of block number pairs. 
That element called blocknr_set_entry is used -+ to store block numbers from the beginning and for extents from the end of -+ the data field (char data[...]). The ->nr_blocks and ->nr_pairs fields -+ count numbers of blocks and extents. -+ -+ +------------------- blocknr_set_entry->data ------------------+ -+ |block1|block2| ... ... |pair3|pair2|pair1| -+ +------------------------------------------------------------+ -+ -+ When current blocknr_set_entry is full, allocate a new one. */ -+ -+/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete -+ * set (single blocks and block extents), in that case blocknr pair represent an -+ * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs -+ * there represent a (real block) -> (wandered block) mapping. */ -+ -+/* Protection: blocknr sets belong to reiser4 atom, and -+ * their modifications are performed with the atom lock held */ -+ -+typedef struct blocknr_pair blocknr_pair; -+ -+/* The total size of a blocknr_set_entry. */ -+#define BLOCKNR_SET_ENTRY_SIZE 128 -+ -+/* The number of blocks that can fit the blocknr data area. */ -+#define BLOCKNR_SET_ENTRIES_NUMBER \ -+ ((BLOCKNR_SET_ENTRY_SIZE - \ -+ 2 * sizeof (unsigned) - \ -+ sizeof(struct list_head)) / \ -+ sizeof(reiser4_block_nr)) -+ -+/* An entry of the blocknr_set */ -+struct blocknr_set_entry { -+ unsigned nr_singles; -+ unsigned nr_pairs; -+ struct list_head link; -+ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER]; -+}; -+ -+/* A pair of blocks as recorded in the blocknr_set_entry data. */ -+struct blocknr_pair { -+ reiser4_block_nr a; -+ reiser4_block_nr b; -+}; -+ -+/* Return the number of blocknr slots available in a blocknr_set_entry. 
*/ -+/* Audited by: green(2002.06.11) */ -+static unsigned bse_avail(blocknr_set_entry * bse) -+{ -+ unsigned used = bse->nr_singles + 2 * bse->nr_pairs; -+ -+ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used); -+ cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE); -+ -+ return BLOCKNR_SET_ENTRIES_NUMBER - used; -+} -+ -+/* Initialize a blocknr_set_entry. */ -+static void bse_init(blocknr_set_entry *bse) -+{ -+ bse->nr_singles = 0; -+ bse->nr_pairs = 0; -+ INIT_LIST_HEAD(&bse->link); -+} -+ -+/* Allocate and initialize a blocknr_set_entry. */ -+/* Audited by: green(2002.06.11) */ -+static blocknr_set_entry *bse_alloc(void) -+{ -+ blocknr_set_entry *e; -+ -+ if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry), -+ reiser4_ctx_gfp_mask_get())) == NULL) -+ return NULL; -+ -+ bse_init(e); -+ -+ return e; -+} -+ -+/* Free a blocknr_set_entry. */ -+/* Audited by: green(2002.06.11) */ -+static void bse_free(blocknr_set_entry * bse) -+{ -+ kfree(bse); -+} -+ -+/* Add a block number to a blocknr_set_entry */ -+/* Audited by: green(2002.06.11) */ -+static void -+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block) -+{ -+ assert("jmacd-5099", bse_avail(bse) >= 1); -+ -+ bse->entries[bse->nr_singles++] = *block; -+} -+ -+/* Get a pair of block numbers */ -+/* Audited by: green(2002.06.11) */ -+static inline blocknr_pair *bse_get_pair(blocknr_set_entry * bse, unsigned pno) -+{ -+ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1)); -+ -+ return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER - -+ 2 * (pno + 1)); -+} -+ -+/* Add a pair of block numbers to a blocknr_set_entry */ -+/* Audited by: green(2002.06.11) */ -+static void -+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a, -+ const reiser4_block_nr * b) -+{ -+ blocknr_pair *pair; -+ -+ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL); -+ -+ pair = bse_get_pair(bse, bse->nr_pairs++); -+ -+ pair->a = *a; -+ pair->b = 
*b; -+} -+ -+/* Add either a block or pair of blocks to the block number set. The first -+ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if -+ @b is non-NULL a pair is added. The block number set belongs to atom, and -+ the call is made with the atom lock held. There may not be enough space in -+ the current blocknr_set_entry. If new_bsep points to a non-NULL -+ blocknr_set_entry then it will be added to the blocknr_set and new_bsep -+ will be set to NULL. If new_bsep contains NULL then the atom lock will be -+ released and a new bse will be allocated in new_bsep. E_REPEAT will be -+ returned with the atom unlocked for the operation to be tried again. If -+ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not -+ used during the call, it will be freed automatically. */ -+static int blocknr_set_add(txn_atom *atom, struct list_head *bset, -+ blocknr_set_entry **new_bsep, const reiser4_block_nr *a, -+ const reiser4_block_nr *b) -+{ -+ blocknr_set_entry *bse; -+ unsigned entries_needed; -+ -+ assert("jmacd-5101", a != NULL); -+ -+ entries_needed = (b == NULL) ? 1 : 2; -+ if (list_empty(bset) || -+ bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) { -+ /* See if a bse was previously allocated. */ -+ if (*new_bsep == NULL) { -+ spin_unlock_atom(atom); -+ *new_bsep = bse_alloc(); -+ return (*new_bsep != NULL) ? -E_REPEAT : -+ RETERR(-ENOMEM); -+ } -+ -+ /* Put it on the head of the list. */ -+ list_add(&((*new_bsep)->link), bset); -+ -+ *new_bsep = NULL; -+ } -+ -+ /* Add the single or pair. */ -+ bse = list_entry(bset->next, blocknr_set_entry, link); -+ if (b == NULL) { -+ bse_put_single(bse, a); -+ } else { -+ bse_put_pair(bse, a, b); -+ } -+ -+ /* If new_bsep is non-NULL then there was an allocation race, free this copy. */ -+ if (*new_bsep != NULL) { -+ bse_free(*new_bsep); -+ *new_bsep = NULL; -+ } -+ -+ return 0; -+} -+ -+/* Add an extent to the block set. 
If the length is 1, it is treated as a -+ single block (e.g., reiser4_set_add_block). */ -+/* Audited by: green(2002.06.11) */ -+/* Auditor note: Entire call chain cannot hold any spinlocks, because -+ kmalloc might schedule. The only exception is atom spinlock, which is -+ properly freed. */ -+int -+blocknr_set_add_extent(txn_atom * atom, -+ struct list_head * bset, -+ blocknr_set_entry ** new_bsep, -+ const reiser4_block_nr * start, -+ const reiser4_block_nr * len) -+{ -+ assert("jmacd-5102", start != NULL && len != NULL && *len > 0); -+ return blocknr_set_add(atom, bset, new_bsep, start, -+ *len == 1 ? NULL : len); -+} -+ -+/* Add a block pair to the block set. It adds exactly a pair, which is checked -+ * by an assertion that both arguments are not null.*/ -+/* Audited by: green(2002.06.11) */ -+/* Auditor note: Entire call chain cannot hold any spinlocks, because -+ kmalloc might schedule. The only exception is atom spinlock, which is -+ properly freed. */ -+int -+blocknr_set_add_pair(txn_atom * atom, -+ struct list_head * bset, -+ blocknr_set_entry ** new_bsep, const reiser4_block_nr * a, -+ const reiser4_block_nr * b) -+{ -+ assert("jmacd-5103", a != NULL && b != NULL); -+ return blocknr_set_add(atom, bset, new_bsep, a, b); -+} -+ -+/* Initialize a blocknr_set. */ -+void blocknr_set_init(struct list_head *bset) -+{ -+ INIT_LIST_HEAD(bset); -+} -+ -+/* Release the entries of a blocknr_set. */ -+void blocknr_set_destroy(struct list_head *bset) -+{ -+ blocknr_set_entry *bse; -+ -+ while (!list_empty(bset)) { -+ bse = list_entry(bset->next, blocknr_set_entry, link); -+ list_del_init(&bse->link); -+ bse_free(bse); -+ } -+} -+ -+/* Merge blocknr_set entries out of @from into @into. */ -+/* Audited by: green(2002.06.11) */ -+/* Auditor comments: This merge does not know if merged sets contain -+ blocks pairs (As for wandered sets) or extents, so it cannot really merge -+ overlapping ranges if there is some. 
So I believe it may lead to -+ some blocks being presented several times in one blocknr_set. To help -+ debugging such problems it might help to check for duplicate entries on -+ actual processing of this set. Testing this kind of stuff right here is -+ also complicated by the fact that these sets are not sorted and going -+ through whole set on each element addition is going to be CPU-heavy task */ -+void blocknr_set_merge(struct list_head * from, struct list_head * into) -+{ -+ blocknr_set_entry *bse_into = NULL; -+ -+ /* If @from is empty, no work to perform. */ -+ if (list_empty(from)) -+ return; -+ /* If @into is not empty, try merging partial-entries. */ -+ if (!list_empty(into)) { -+ -+ /* Neither set is empty, pop the front to members and try to combine them. */ -+ blocknr_set_entry *bse_from; -+ unsigned into_avail; -+ -+ bse_into = list_entry(into->next, blocknr_set_entry, link); -+ list_del_init(&bse_into->link); -+ bse_from = list_entry(from->next, blocknr_set_entry, link); -+ list_del_init(&bse_from->link); -+ -+ /* Combine singles. */ -+ for (into_avail = bse_avail(bse_into); -+ into_avail != 0 && bse_from->nr_singles != 0; -+ into_avail -= 1) { -+ bse_put_single(bse_into, -+ &bse_from->entries[--bse_from-> -+ nr_singles]); -+ } -+ -+ /* Combine pairs. */ -+ for (; into_avail > 1 && bse_from->nr_pairs != 0; -+ into_avail -= 2) { -+ blocknr_pair *pair = -+ bse_get_pair(bse_from, --bse_from->nr_pairs); -+ bse_put_pair(bse_into, &pair->a, &pair->b); -+ } -+ -+ /* If bse_from is empty, delete it now. */ -+ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) { -+ bse_free(bse_from); -+ } else { -+ /* Otherwise, bse_into is full or nearly full (e.g., -+ it could have one slot avail and bse_from has one -+ pair left). Push it back onto the list. bse_from -+ becomes bse_into, which will be the new partial. */ -+ list_add(&bse_into->link, into); -+ bse_into = bse_from; -+ } -+ } -+ -+ /* Splice lists together. 
*/ -+ list_splice_init(from, into->prev); -+ -+ /* Add the partial entry back to the head of the list. */ -+ if (bse_into != NULL) -+ list_add(&bse_into->link, into); -+} -+ -+/* Iterate over all blocknr set elements. */ -+int blocknr_set_iterator(txn_atom *atom, struct list_head *bset, -+ blocknr_set_actor_f actor, void *data, int delete) -+{ -+ -+ blocknr_set_entry *entry; -+ -+ assert("zam-429", atom != NULL); -+ assert("zam-430", atom_is_protected(atom)); -+ assert("zam-431", bset != 0); -+ assert("zam-432", actor != NULL); -+ -+ entry = list_entry(bset->next, blocknr_set_entry, link); -+ while (bset != &entry->link) { -+ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link); -+ unsigned int i; -+ int ret; -+ -+ for (i = 0; i < entry->nr_singles; i++) { -+ ret = actor(atom, &entry->entries[i], NULL, data); -+ -+ /* We can't break a loop if delete flag is set. */ -+ if (ret != 0 && !delete) -+ return ret; -+ } -+ -+ for (i = 0; i < entry->nr_pairs; i++) { -+ struct blocknr_pair *ab; -+ -+ ab = bse_get_pair(entry, i); -+ -+ ret = actor(atom, &ab->a, &ab->b, data); -+ -+ if (ret != 0 && !delete) -+ return ret; -+ } -+ -+ if (delete) { -+ list_del(&entry->link); -+ bse_free(entry); -+ } -+ -+ entry = tmp; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/carry.c linux-2.6.20/fs/reiser4/carry.c ---- linux-2.6.20.orig/fs/reiser4/carry.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/carry.c 2007-05-06 14:50:43.686971975 +0400 -@@ -0,0 +1,1391 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* Functions to "carry" tree modification(s) upward. */ -+/* Tree is modified one level at a time. As we modify a level we accumulate a -+ set of changes that need to be propagated to the next level. 
We manage -+ node locking such that any searches that collide with carrying are -+ restarted, from the root if necessary. -+ -+ Insertion of a new item may result in items being moved among nodes and -+ this requires the delimiting key to be updated at the least common parent -+ of the nodes modified to preserve search tree invariants. Also, insertion -+ may require allocation of a new node. A pointer to the new node has to be -+ inserted into some node on the parent level, etc. -+ -+ Tree carrying is meant to be analogous to arithmetic carrying. -+ -+ A carry operation is always associated with some node (&carry_node). -+ -+ Carry process starts with some initial set of operations to be performed -+ and an initial set of already locked nodes. Operations are performed one -+ by one. Performing each single operation has following possible effects: -+ -+ - content of carry node associated with operation is modified -+ - new carry nodes are locked and involved into carry process on this level -+ - new carry operations are posted to the next level -+ -+ After all carry operations on this level are done, process is repeated for -+ the accumulated sequence on carry operations for the next level. This -+ starts by trying to lock (in left to right order) all carry nodes -+ associated with carry operations on the parent level. After this, we decide -+ whether more nodes are required on the left of already locked set. If so, -+ all locks taken on the parent level are released, new carry nodes are -+ added, and locking process repeats. -+ -+ It may happen that balancing process fails owing to unrecoverable error on -+ some of upper levels of a tree (possible causes are io error, failure to -+ allocate new node, etc.). In this case we should unmount the filesystem, -+ rebooting if it is the root, and possibly advise the use of fsck. -+ -+ USAGE: -+ -+ int some_tree_operation( znode *node, ... ) -+ { -+ // Allocate on a stack pool of carry objects: operations and nodes. 
-+ // Most carry processes will only take objects from here, without -+ // dynamic allocation. -+ -+I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans -+ -+ carry_pool pool; -+ carry_level lowest_level; -+ carry_op *op; -+ -+ init_carry_pool( &pool ); -+ init_carry_level( &lowest_level, &pool ); -+ -+ // operation may be one of: -+ // COP_INSERT --- insert new item into node -+ // COP_CUT --- remove part of or whole node -+ // COP_PASTE --- increase size of item -+ // COP_DELETE --- delete pointer from parent node -+ // COP_UPDATE --- update delimiting key in least -+ // common ancestor of two -+ -+ op = reiser4_post_carry( &lowest_level, operation, node, 0 ); -+ if( IS_ERR( op ) || ( op == NULL ) ) { -+ handle error -+ } else { -+ // fill in remaining fields in @op, according to carry.h:carry_op -+ result = carry( &lowest_level, NULL ); -+ } -+ done_carry_pool( &pool ); -+ } -+ -+ When you are implementing node plugin method that participates in carry -+ (shifting, insertion, deletion, etc.), do the following: -+ -+ int foo_node_method( znode *node, ..., carry_level *todo ) -+ { -+ carry_op *op; -+ -+ .... -+ -+ // note, that last argument to reiser4_post_carry() is non-null -+ // here, because @op is to be applied to the parent of @node, rather -+ // than to the @node itself as in the previous case. -+ -+ op = node_post_carry( todo, operation, node, 1 ); -+ // fill in remaining fields in @op, according to carry.h:carry_op -+ -+ .... -+ -+ } -+ -+ BATCHING: -+ -+ One of the main advantages of level-by-level balancing implemented here is -+ ability to batch updates on a parent level and to peform them more -+ efficiently as a result. -+ -+ Description To Be Done (TBD). -+ -+ DIFFICULTIES AND SUBTLE POINTS: -+ -+ 1. complex plumbing is required, because: -+ -+ a. effective allocation through pools is needed -+ -+ b. target of operation is not exactly known when operation is -+ posted. 
This is worked around through bitfields in &carry_node and -+ logic in lock_carry_node() -+ -+ c. of interaction with locking code: node should be added into sibling -+ list when pointer to it is inserted into its parent, which is some time -+ after node was created. Between these moments, node is somewhat in -+ suspended state and is only registered in the carry lists -+ -+ 2. whole balancing logic is implemented here, in particular, insertion -+ logic is coded in make_space(). -+ -+ 3. special cases like insertion (reiser4_add_tree_root()) or deletion -+ (reiser4_kill_tree_root()) of tree root and morphing of paste into insert -+ (insert_paste()) have to be handled. -+ -+ 4. there is non-trivial interdependency between allocation of new nodes -+ and almost everything else. This is mainly due to the (1.c) above. I shall -+ write about this later. -+ -+*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/item/extent.h" -+#include "plugin/node/node.h" -+#include "jnode.h" -+#include "znode.h" -+#include "tree_mod.h" -+#include "tree_walk.h" -+#include "block_alloc.h" -+#include "pool.h" -+#include "tree.h" -+#include "carry.h" -+#include "carry_ops.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include -+ -+/* level locking/unlocking */ -+static int lock_carry_level(carry_level * level); -+static void unlock_carry_level(carry_level * level, int failure); -+static void done_carry_level(carry_level * level); -+static void unlock_carry_node(carry_level * level, carry_node * node, int fail); -+ -+int lock_carry_node(carry_level * level, carry_node * node); -+int lock_carry_node_tail(carry_node * node); -+ -+/* carry processing proper */ -+static int carry_on_level(carry_level * doing, carry_level * todo); -+ -+static carry_op *add_op(carry_level * level, pool_ordering order, -+ carry_op * reference); -+ -+/* handlers for carry operations. 
*/ -+ -+static void fatal_carry_error(carry_level * doing, int ecode); -+static int add_new_root(carry_level * level, carry_node * node, znode * fake); -+ -+static void print_level(const char *prefix, carry_level * level); -+ -+#if REISER4_DEBUG -+typedef enum { -+ CARRY_TODO, -+ CARRY_DOING -+} carry_queue_state; -+static int carry_level_invariant(carry_level * level, carry_queue_state state); -+#endif -+ -+/* main entry point for tree balancing. -+ -+ Tree carry performs operations from @doing and while doing so accumulates -+ information about operations to be performed on the next level ("carried" -+ to the parent level). Carried operations are performed, causing possibly -+ more operations to be carried upward etc. carry() takes care about -+ locking and pinning znodes while operating on them. -+ -+ For usage, see comment at the top of fs/reiser4/carry.c -+ -+*/ -+int reiser4_carry(carry_level * doing /* set of carry operations to be -+ * performed */ , -+ carry_level * done /* set of nodes, already performed -+ * at the previous level. -+ * NULL in most cases */) -+{ -+ int result = 0; -+ /* queue of new requests */ -+ carry_level *todo; -+ ON_DEBUG(STORE_COUNTERS); -+ -+ assert("nikita-888", doing != NULL); -+ BUG_ON(done != NULL); -+ -+ todo = doing + 1; -+ init_carry_level(todo, doing->pool); -+ -+ /* queue of requests preformed on the previous level */ -+ done = todo + 1; -+ init_carry_level(done, doing->pool); -+ -+ /* iterate until there is nothing more to do */ -+ while (result == 0 && doing->ops_num > 0) { -+ carry_level *tmp; -+ -+ /* at this point @done is locked. */ -+ /* repeat lock/do/unlock while -+ -+ (1) lock_carry_level() fails due to deadlock avoidance, or -+ -+ (2) carry_on_level() decides that more nodes have to -+ be involved. -+ -+ (3) some unexpected error occurred while balancing on the -+ upper levels. In this case all changes are rolled back. 
-+ -+ */ -+ while (1) { -+ result = lock_carry_level(doing); -+ if (result == 0) { -+ /* perform operations from @doing and -+ accumulate new requests in @todo */ -+ result = carry_on_level(doing, todo); -+ if (result == 0) -+ break; -+ else if (result != -E_REPEAT || -+ !doing->restartable) { -+ warning("nikita-1043", -+ "Fatal error during carry: %i", -+ result); -+ print_level("done", done); -+ print_level("doing", doing); -+ print_level("todo", todo); -+ /* do some rough stuff like aborting -+ all pending transcrashes and thus -+ pushing tree back to the consistent -+ state. Alternatvely, just panic. -+ */ -+ fatal_carry_error(doing, result); -+ return result; -+ } -+ } else if (result != -E_REPEAT) { -+ fatal_carry_error(doing, result); -+ return result; -+ } -+ unlock_carry_level(doing, 1); -+ } -+ /* at this point @done can be safely unlocked */ -+ done_carry_level(done); -+ -+ /* cyclically shift queues */ -+ tmp = done; -+ done = doing; -+ doing = todo; -+ todo = tmp; -+ init_carry_level(todo, doing->pool); -+ -+ /* give other threads chance to run */ -+ reiser4_preempt_point(); -+ } -+ done_carry_level(done); -+ -+ /* all counters, but x_refs should remain the same. x_refs can change -+ owing to transaction manager */ -+ ON_DEBUG(CHECK_COUNTERS); -+ return result; -+} -+ -+/* perform carry operations on given level. -+ -+ Optimizations proposed by pooh: -+ -+ (1) don't lock all nodes from queue at the same time. Lock nodes lazily as -+ required; -+ -+ (2) unlock node if there are no more operations to be performed upon it and -+ node didn't add any operation to @todo. This can be implemented by -+ attaching to each node two counters: counter of operaions working on this -+ node and counter and operations carried upward from this node. 
-+ -+*/ -+static int carry_on_level(carry_level * doing /* queue of carry operations to -+ * do on this level */ , -+ carry_level * todo /* queue where new carry -+ * operations to be performed on -+ * the * parent level are -+ * accumulated during @doing -+ * processing. */ ) -+{ -+ int result; -+ int (*f) (carry_op *, carry_level *, carry_level *); -+ carry_op *op; -+ carry_op *tmp_op; -+ -+ assert("nikita-1034", doing != NULL); -+ assert("nikita-1035", todo != NULL); -+ -+ /* @doing->nodes are locked. */ -+ -+ /* This function can be split into two phases: analysis and modification. -+ -+ Analysis calculates precisely what items should be moved between -+ nodes. This information is gathered in some structures attached to -+ each carry_node in a @doing queue. Analysis also determines whether -+ new nodes are to be allocated etc. -+ -+ After analysis is completed, actual modification is performed. Here -+ we can take advantage of "batch modification": if there are several -+ operations acting on the same node, modifications can be performed -+ more efficiently when batched together. -+ -+ Above is an optimization left for the future. -+ */ -+ /* Important, but delayed optimization: it's possible to batch -+ operations together and perform them more efficiently as a -+ result. For example, deletion of several neighboring items from a -+ node can be converted to a single ->cut() operation. -+ -+ Before processing queue, it should be scanned and "mergeable" -+ operations merged. -+ */ -+ result = 0; -+ for_all_ops(doing, op, tmp_op) { -+ carry_opcode opcode; -+ -+ assert("nikita-1041", op != NULL); -+ opcode = op->op; -+ assert("nikita-1042", op->op < COP_LAST_OP); -+ f = op_dispatch_table[op->op].handler; -+ result = f(op, doing, todo); -+ /* locking can fail with -E_REPEAT. Any different error is fatal -+ and will be handled by fatal_carry_error() sledgehammer. 
-+ */ -+ if (result != 0) -+ break; -+ } -+ if (result == 0) { -+ carry_plugin_info info; -+ carry_node *scan; -+ carry_node *tmp_scan; -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ assert("nikita-3002", -+ carry_level_invariant(doing, CARRY_DOING)); -+ for_all_nodes(doing, scan, tmp_scan) { -+ znode *node; -+ -+ node = reiser4_carry_real(scan); -+ assert("nikita-2547", node != NULL); -+ if (node_is_empty(node)) { -+ result = -+ node_plugin_by_node(node)-> -+ prepare_removal(node, &info); -+ if (result != 0) -+ break; -+ } -+ } -+ } -+ return result; -+} -+ -+/* post carry operation -+ -+ This is main function used by external carry clients: node layout plugins -+ and tree operations to create new carry operation to be performed on some -+ level. -+ -+ New operation will be included in the @level queue. To actually perform it, -+ call carry( level, ... ). This function takes write lock on @node. Carry -+ manages all its locks by itself, don't worry about this. -+ -+ This function adds operation and node at the end of the queue. It is up to -+ caller to guarantee proper ordering of node queue. -+ -+*/ -+carry_op * reiser4_post_carry(carry_level * level /* queue where new operation -+ * is to be posted at */ , -+ carry_opcode op /* opcode of operation */ , -+ znode * node /* node on which this operation -+ * will operate */ , -+ int apply_to_parent_p /* whether operation will -+ * operate directly on @node -+ * or on it parent. 
*/) -+{ -+ carry_op *result; -+ carry_node *child; -+ -+ assert("nikita-1046", level != NULL); -+ assert("nikita-1788", znode_is_write_locked(node)); -+ -+ result = add_op(level, POOLO_LAST, NULL); -+ if (IS_ERR(result)) -+ return result; -+ child = reiser4_add_carry(level, POOLO_LAST, NULL); -+ if (IS_ERR(child)) { -+ reiser4_pool_free(&level->pool->op_pool, &result->header); -+ return (carry_op *) child; -+ } -+ result->node = child; -+ result->op = op; -+ child->parent = apply_to_parent_p; -+ if (ZF_ISSET(node, JNODE_ORPHAN)) -+ child->left_before = 1; -+ child->node = node; -+ return result; -+} -+ -+/* initialize carry queue */ -+void init_carry_level(carry_level * level /* level to initialize */ , -+ carry_pool * pool /* pool @level will allocate objects -+ * from */ ) -+{ -+ assert("nikita-1045", level != NULL); -+ assert("nikita-967", pool != NULL); -+ -+ memset(level, 0, sizeof *level); -+ level->pool = pool; -+ -+ INIT_LIST_HEAD(&level->nodes); -+ INIT_LIST_HEAD(&level->ops); -+} -+ -+/* allocate carry pool and initialize pools within queue */ -+carry_pool *init_carry_pool(int size) -+{ -+ carry_pool *pool; -+ -+ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level)); -+ pool = kmalloc(size, reiser4_ctx_gfp_mask_get()); -+ if (pool == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE, -+ (char *)pool->op); -+ reiser4_init_pool(&pool->node_pool, sizeof(carry_node), -+ NODES_LOCKED_POOL_SIZE, (char *)pool->node); -+ return pool; -+} -+ -+/* finish with queue pools */ -+void done_carry_pool(carry_pool * pool /* pool to destroy */ ) -+{ -+ reiser4_done_pool(&pool->op_pool); -+ reiser4_done_pool(&pool->node_pool); -+ kfree(pool); -+} -+ -+/* add new carry node to the @level. -+ -+ Returns pointer to the new carry node allocated from pool. It's up to -+ callers to maintain proper order in the @level. 
Assumption is that if carry -+ nodes on one level are already sorted and modifications are peroformed from -+ left to right, carry nodes added on the parent level will be ordered -+ automatically. To control ordering use @order and @reference parameters. -+ -+*/ -+carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add -+ * node to */ , -+ pool_ordering order /* where to insert: -+ * at the beginning of -+ * @level, -+ * before @reference, -+ * after @reference, -+ * at the end of @level -+ */ , -+ carry_node * reference/* reference node for -+ * insertion */) -+{ -+ ON_DEBUG(carry_node * orig_ref = reference); -+ -+ if (order == POOLO_BEFORE) { -+ reference = find_left_carry(reference, level); -+ if (reference == NULL) -+ reference = list_entry(level->nodes.next, carry_node, -+ header.level_linkage); -+ else -+ reference = list_entry(reference->header.level_linkage.next, -+ carry_node, header.level_linkage); -+ } else if (order == POOLO_AFTER) { -+ reference = find_right_carry(reference, level); -+ if (reference == NULL) -+ reference = list_entry(level->nodes.prev, carry_node, -+ header.level_linkage); -+ else -+ reference = list_entry(reference->header.level_linkage.prev, -+ carry_node, header.level_linkage); -+ } -+ assert("nikita-2209", -+ ergo(orig_ref != NULL, -+ reiser4_carry_real(reference) == -+ reiser4_carry_real(orig_ref))); -+ return reiser4_add_carry(level, order, reference); -+} -+ -+carry_node *reiser4_add_carry(carry_level * level /* &carry_level to add node -+ * to */ , -+ pool_ordering order /* where to insert: at the -+ * beginning of @level, before -+ * @reference, after @reference, -+ * at the end of @level */ , -+ carry_node * reference /* reference node for -+ * insertion */ ) -+{ -+ carry_node *result; -+ -+ result = -+ (carry_node *) reiser4_add_obj(&level->pool->node_pool, -+ &level->nodes, -+ order, &reference->header); -+ if (!IS_ERR(result) && (result != NULL)) -+ ++level->nodes_num; -+ return result; -+} -+ -+/* 
add new carry operation to the @level. -+ -+ Returns pointer to the new carry operations allocated from pool. It's up to -+ callers to maintain proper order in the @level. To control ordering use -+ @order and @reference parameters. -+ -+*/ -+static carry_op *add_op(carry_level * level /* &carry_level to add node to */ , -+ pool_ordering order /* where to insert: at the beginning of -+ * @level, before @reference, after -+ * @reference, at the end of @level */ , -+ carry_op * -+ reference /* reference node for insertion */ ) -+{ -+ carry_op *result; -+ -+ result = -+ (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops, -+ order, &reference->header); -+ if (!IS_ERR(result) && (result != NULL)) -+ ++level->ops_num; -+ return result; -+} -+ -+/* Return node on the right of which @node was created. -+ -+ Each node is created on the right of some existing node (or it is new root, -+ which is special case not handled here). -+ -+ @node is new node created on some level, but not yet inserted into its -+ parent, it has corresponding bit (JNODE_ORPHAN) set in zstate. 
-+ -+*/ -+static carry_node *find_begetting_brother(carry_node * node /* node to start search -+ * from */ , -+ carry_level * kin UNUSED_ARG /* level to -+ * scan */ ) -+{ -+ carry_node *scan; -+ -+ assert("nikita-1614", node != NULL); -+ assert("nikita-1615", kin != NULL); -+ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree)); -+ assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL, -+ ZF_ISSET(reiser4_carry_real(node), -+ JNODE_ORPHAN))); -+ for (scan = node;; -+ scan = list_entry(scan->header.level_linkage.prev, carry_node, -+ header.level_linkage)) { -+ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage); -+ if ((scan->node != node->node) && -+ !ZF_ISSET(scan->node, JNODE_ORPHAN)) { -+ assert("nikita-1618", reiser4_carry_real(scan) != NULL); -+ break; -+ } -+ } -+ return scan; -+} -+ -+static cmp_t -+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2) -+{ -+ assert("nikita-2199", n1 != NULL); -+ assert("nikita-2200", n2 != NULL); -+ -+ if (n1 == n2) -+ return EQUAL_TO; -+ while (1) { -+ n1 = carry_node_next(n1); -+ if (carry_node_end(level, n1)) -+ return GREATER_THAN; -+ if (n1 == n2) -+ return LESS_THAN; -+ } -+ impossible("nikita-2201", "End of level reached"); -+} -+ -+carry_node *find_carry_node(carry_level * level, const znode * node) -+{ -+ carry_node *scan; -+ carry_node *tmp_scan; -+ -+ assert("nikita-2202", level != NULL); -+ assert("nikita-2203", node != NULL); -+ -+ for_all_nodes(level, scan, tmp_scan) { -+ if (reiser4_carry_real(scan) == node) -+ return scan; -+ } -+ return NULL; -+} -+ -+znode *reiser4_carry_real(const carry_node * node) -+{ -+ assert("nikita-3061", node != NULL); -+ -+ return node->lock_handle.node; -+} -+ -+carry_node *insert_carry_node(carry_level * doing, carry_level * todo, -+ const znode * node) -+{ -+ carry_node *base; -+ carry_node *scan; -+ carry_node *tmp_scan; -+ carry_node *proj; -+ -+ base = find_carry_node(doing, node); -+ assert("nikita-2204", base != NULL); -+ -+ 
for_all_nodes(todo, scan, tmp_scan) { -+ proj = find_carry_node(doing, scan->node); -+ assert("nikita-2205", proj != NULL); -+ if (carry_node_cmp(doing, proj, base) != LESS_THAN) -+ break; -+ } -+ return scan; -+} -+ -+static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo, -+ znode * node) -+{ -+ carry_node *reference; -+ -+ assert("nikita-2994", doing != NULL); -+ assert("nikita-2995", todo != NULL); -+ assert("nikita-2996", node != NULL); -+ -+ reference = insert_carry_node(doing, todo, node); -+ assert("nikita-2997", reference != NULL); -+ -+ return reiser4_add_carry(todo, POOLO_BEFORE, reference); -+} -+ -+/* like reiser4_post_carry(), but designed to be called from node plugin methods. -+ This function is different from reiser4_post_carry() in that it finds proper -+ place to insert node in the queue. */ -+carry_op *node_post_carry(carry_plugin_info * info /* carry parameters -+ * passed down to node -+ * plugin */ , -+ carry_opcode op /* opcode of operation */ , -+ znode * node /* node on which this -+ * operation will operate */ , -+ int apply_to_parent_p /* whether operation will -+ * operate directly on @node -+ * or on it parent. 
*/ ) -+{ -+ carry_op *result; -+ carry_node *child; -+ -+ assert("nikita-2207", info != NULL); -+ assert("nikita-2208", info->todo != NULL); -+ -+ if (info->doing == NULL) -+ return reiser4_post_carry(info->todo, op, node, -+ apply_to_parent_p); -+ -+ result = add_op(info->todo, POOLO_LAST, NULL); -+ if (IS_ERR(result)) -+ return result; -+ child = add_carry_atplace(info->doing, info->todo, node); -+ if (IS_ERR(child)) { -+ reiser4_pool_free(&info->todo->pool->op_pool, &result->header); -+ return (carry_op *) child; -+ } -+ result->node = child; -+ result->op = op; -+ child->parent = apply_to_parent_p; -+ if (ZF_ISSET(node, JNODE_ORPHAN)) -+ child->left_before = 1; -+ child->node = node; -+ return result; -+} -+ -+/* lock all carry nodes in @level */ -+static int lock_carry_level(carry_level * level /* level to lock */ ) -+{ -+ int result; -+ carry_node *node; -+ carry_node *tmp_node; -+ -+ assert("nikita-881", level != NULL); -+ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO)); -+ -+ /* lock nodes from left to right */ -+ result = 0; -+ for_all_nodes(level, node, tmp_node) { -+ result = lock_carry_node(level, node); -+ if (result != 0) -+ break; -+ } -+ return result; -+} -+ -+/* Synchronize delimiting keys between @node and its left neighbor. -+ -+ To reduce contention on dk key and simplify carry code, we synchronize -+ delimiting keys only when carry ultimately leaves tree level (carrying -+ changes upward) and unlocks nodes at this level. -+ -+ This function first finds left neighbor of @node and then updates left -+ neighbor's right delimiting key to conincide with least key in @node. 
-+ -+*/ -+ -+ON_DEBUG(extern atomic_t delim_key_version; -+ ) -+ -+static void sync_dkeys(znode * spot /* node to update */ ) -+{ -+ reiser4_key pivot; -+ reiser4_tree *tree; -+ -+ assert("nikita-1610", spot != NULL); -+ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk)); -+ -+ tree = znode_get_tree(spot); -+ read_lock_tree(tree); -+ write_lock_dk(tree); -+ -+ assert("nikita-2192", znode_is_loaded(spot)); -+ -+ /* sync left delimiting key of @spot with key in its leftmost item */ -+ if (node_is_empty(spot)) -+ pivot = *znode_get_rd_key(spot); -+ else -+ leftmost_key_in_node(spot, &pivot); -+ -+ znode_set_ld_key(spot, &pivot); -+ -+ /* there can be sequence of empty nodes pending removal on the left of -+ @spot. Scan them and update their left and right delimiting keys to -+ match left delimiting key of @spot. Also, update right delimiting -+ key of first non-empty left neighbor. -+ */ -+ while (1) { -+ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED)) -+ break; -+ -+ spot = spot->left; -+ if (spot == NULL) -+ break; -+ -+ znode_set_rd_key(spot, &pivot); -+ /* don't sink into the domain of another balancing */ -+ if (!znode_is_write_locked(spot)) -+ break; -+ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE)) -+ znode_set_ld_key(spot, &pivot); -+ else -+ break; -+ } -+ -+ write_unlock_dk(tree); -+ read_unlock_tree(tree); -+} -+ -+/* unlock all carry nodes in @level */ -+static void unlock_carry_level(carry_level * level /* level to unlock */ , -+ int failure /* true if unlocking owing to -+ * failure */ ) -+{ -+ carry_node *node; -+ carry_node *tmp_node; -+ -+ assert("nikita-889", level != NULL); -+ -+ if (!failure) { -+ znode *spot; -+ -+ spot = NULL; -+ /* update delimiting keys */ -+ for_all_nodes(level, node, tmp_node) { -+ if (reiser4_carry_real(node) != spot) { -+ spot = reiser4_carry_real(node); -+ sync_dkeys(spot); -+ } -+ } -+ } -+ -+ /* nodes can be unlocked in arbitrary order. In preemptible -+ environment it's better to unlock in reverse order of locking, -+ though. 
-+ */ -+ for_all_nodes_back(level, node, tmp_node) { -+ /* all allocated nodes should be already linked to their -+ parents at this moment. */ -+ assert("nikita-1631", -+ ergo(!failure, !ZF_ISSET(reiser4_carry_real(node), -+ JNODE_ORPHAN))); -+ ON_DEBUG(check_dkeys(reiser4_carry_real(node))); -+ unlock_carry_node(level, node, failure); -+ } -+ level->new_root = NULL; -+} -+ -+/* finish with @level -+ -+ Unlock nodes and release all allocated resources */ -+static void done_carry_level(carry_level * level /* level to finish */ ) -+{ -+ carry_node *node; -+ carry_node *tmp_node; -+ carry_op *op; -+ carry_op *tmp_op; -+ -+ assert("nikita-1076", level != NULL); -+ -+ unlock_carry_level(level, 0); -+ for_all_nodes(level, node, tmp_node) { -+ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link)); -+ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link)); -+ reiser4_pool_free(&level->pool->node_pool, &node->header); -+ } -+ for_all_ops(level, op, tmp_op) -+ reiser4_pool_free(&level->pool->op_pool, &op->header); -+} -+ -+/* helper function to complete locking of carry node -+ -+ Finish locking of carry node. There are several ways in which new carry -+ node can be added into carry level and locked. Normal is through -+ lock_carry_node(), but also from find_{left|right}_neighbor(). This -+ function factors out common final part of all locking scenarios. It -+ supposes that @node -> lock_handle is lock handle for lock just taken and -+ fills ->real_node from this lock handle. -+ -+*/ -+int lock_carry_node_tail(carry_node * node /* node to complete locking of */ ) -+{ -+ assert("nikita-1052", node != NULL); -+ assert("nikita-1187", reiser4_carry_real(node) != NULL); -+ assert("nikita-1188", !node->unlock); -+ -+ node->unlock = 1; -+ /* Load node content into memory and install node plugin by -+ looking at the node header. -+ -+ Most of the time this call is cheap because the node is -+ already in memory. 
-+ -+ Corresponding zrelse() is in unlock_carry_node() -+ */ -+ return zload(reiser4_carry_real(node)); -+} -+ -+/* lock carry node -+ -+ "Resolve" node to real znode, lock it and mark as locked. -+ This requires recursive locking of znodes. -+ -+ When operation is posted to the parent level, node it will be applied to is -+ not yet known. For example, when shifting data between two nodes, -+ delimiting has to be updated in parent or parents of nodes involved. But -+ their parents is not yet locked and, moreover said nodes can be reparented -+ by concurrent balancing. -+ -+ To work around this, carry operation is applied to special "carry node" -+ rather than to the znode itself. Carry node consists of some "base" or -+ "reference" znode and flags indicating how to get to the target of carry -+ operation (->real_node field of carry_node) from base. -+ -+*/ -+int lock_carry_node(carry_level * level /* level @node is in */ , -+ carry_node * node /* node to lock */ ) -+{ -+ int result; -+ znode *reference_point; -+ lock_handle lh; -+ lock_handle tmp_lh; -+ reiser4_tree *tree; -+ -+ assert("nikita-887", level != NULL); -+ assert("nikita-882", node != NULL); -+ -+ result = 0; -+ reference_point = node->node; -+ init_lh(&lh); -+ init_lh(&tmp_lh); -+ if (node->left_before) { -+ /* handling of new nodes, allocated on the previous level: -+ -+ some carry ops were propably posted from the new node, but -+ this node neither has parent pointer set, nor is -+ connected. This will be done in ->create_hook() for -+ internal item. -+ -+ No then less, parent of new node has to be locked. To do -+ this, first go to the "left" in the carry order. This -+ depends on the decision to always allocate new node on the -+ right of existing one. -+ -+ Loop handles case when multiple nodes, all orphans, were -+ inserted. 
-+ -+ Strictly speaking, taking tree lock is not necessary here, -+ because all nodes scanned by loop in -+ find_begetting_brother() are write-locked by this thread, -+ and thus, their sibling linkage cannot change. -+ -+ */ -+ tree = znode_get_tree(reference_point); -+ read_lock_tree(tree); -+ reference_point = find_begetting_brother(node, level)->node; -+ read_unlock_tree(tree); -+ assert("nikita-1186", reference_point != NULL); -+ } -+ if (node->parent && (result == 0)) { -+ result = -+ reiser4_get_parent(&tmp_lh, reference_point, -+ ZNODE_WRITE_LOCK); -+ if (result != 0) { -+ ; /* nothing */ -+ } else if (znode_get_level(tmp_lh.node) == 0) { -+ assert("nikita-1347", znode_above_root(tmp_lh.node)); -+ result = add_new_root(level, node, tmp_lh.node); -+ if (result == 0) { -+ reference_point = level->new_root; -+ move_lh(&lh, &node->lock_handle); -+ } -+ } else if ((level->new_root != NULL) -+ && (level->new_root != -+ znode_parent_nolock(reference_point))) { -+ /* parent of node exists, but this level aready -+ created different new root, so */ -+ warning("nikita-1109", -+ /* it should be "radicis", but tradition is -+ tradition. do banshees read latin? 
*/ -+ "hodie natus est radici frater"); -+ result = -EIO; -+ } else { -+ move_lh(&lh, &tmp_lh); -+ reference_point = lh.node; -+ } -+ } -+ if (node->left && (result == 0)) { -+ assert("nikita-1183", node->parent); -+ assert("nikita-883", reference_point != NULL); -+ result = -+ reiser4_get_left_neighbor(&tmp_lh, reference_point, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result == 0) { -+ done_lh(&lh); -+ move_lh(&lh, &tmp_lh); -+ reference_point = lh.node; -+ } -+ } -+ if (!node->parent && !node->left && !node->left_before) { -+ result = -+ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_HIPRI); -+ } -+ if (result == 0) { -+ move_lh(&node->lock_handle, &lh); -+ result = lock_carry_node_tail(node); -+ } -+ done_lh(&tmp_lh); -+ done_lh(&lh); -+ return result; -+} -+ -+/* release a lock on &carry_node. -+ -+ Release if necessary lock on @node. This opearion is pair of -+ lock_carry_node() and is idempotent: you can call it more than once on the -+ same node. -+ -+*/ -+static void -+unlock_carry_node(carry_level * level, -+ carry_node * node /* node to be released */ , -+ int failure /* 0 if node is unlocked due -+ * to some error */ ) -+{ -+ znode *real_node; -+ -+ assert("nikita-884", node != NULL); -+ -+ real_node = reiser4_carry_real(node); -+ /* pair to zload() in lock_carry_node_tail() */ -+ zrelse(real_node); -+ if (node->unlock && (real_node != NULL)) { -+ assert("nikita-899", real_node == node->lock_handle.node); -+ longterm_unlock_znode(&node->lock_handle); -+ } -+ if (failure) { -+ if (node->deallocate && (real_node != NULL)) { -+ /* free node in bitmap -+ -+ Prepare node for removal. Last zput() will finish -+ with it. 
-+ */ -+ ZF_SET(real_node, JNODE_HEARD_BANSHEE); -+ } -+ if (node->free) { -+ assert("nikita-2177", -+ list_empty_careful(&node->lock_handle.locks_link)); -+ assert("nikita-2112", -+ list_empty_careful(&node->lock_handle.owners_link)); -+ reiser4_pool_free(&level->pool->node_pool, -+ &node->header); -+ } -+ } -+} -+ -+/* fatal_carry_error() - all-catching error handling function -+ -+ It is possible that carry faces unrecoverable error, like unability to -+ insert pointer at the internal level. Our simple solution is just panic in -+ this situation. More sophisticated things like attempt to remount -+ file-system as read-only can be implemented without much difficlties. -+ -+ It is believed, that: -+ -+ 1. in stead of panicking, all current transactions can be aborted rolling -+ system back to the consistent state. -+ -+Umm, if you simply panic without doing anything more at all, then all current -+transactions are aborted and the system is rolled back to a consistent state, -+by virtue of the design of the transactional mechanism. Well, wait, let's be -+precise. If an internal node is corrupted on disk due to hardware failure, -+then there may be no consistent state that can be rolled back to, so instead -+we should say that it will rollback the transactions, which barring other -+factors means rolling back to a consistent state. -+ -+# Nikita: there is a subtle difference between panic and aborting -+# transactions: machine doesn't reboot. Processes aren't killed. Processes -+# don't using reiser4 (not that we care about such processes), or using other -+# reiser4 mounts (about them we do care) will simply continue to run. With -+# some luck, even application using aborted file system can survive: it will -+# get some error, like EBADF, from each file descriptor on failed file system, -+# but applications that do care about tolerance will cope with this (squid -+# will). 
-+ -+It would be a nice feature though to support rollback without rebooting -+followed by remount, but this can wait for later versions. -+ -+ 2. once isolated transactions will be implemented it will be possible to -+ roll back offending transaction. -+ -+2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about -+it more before deciding if it should be done. -Hans -+ -+*/ -+static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level -+ * where -+ * unrecoverable -+ * error -+ * occurred */ , -+ int ecode /* error code */ ) -+{ -+ assert("nikita-1230", doing != NULL); -+ assert("nikita-1231", ecode < 0); -+ -+ reiser4_panic("nikita-1232", "Carry failed: %i", ecode); -+} -+ -+/* add new root to the tree -+ -+ This function itself only manages changes in carry structures and delegates -+ all hard work (allocation of znode for new root, changes of parent and -+ sibling pointers to the reiser4_add_tree_root(). -+ -+ Locking: old tree root is locked by carry at this point. Fake znode is also -+ locked. -+ -+*/ -+static int add_new_root(carry_level * level /* carry level in context of which -+ * operation is performed */ , -+ carry_node * node /* carry node for existing root */ , -+ znode * fake /* "fake" znode already locked by -+ * us */ ) -+{ -+ int result; -+ -+ assert("nikita-1104", level != NULL); -+ assert("nikita-1105", node != NULL); -+ -+ assert("nikita-1403", znode_is_write_locked(node->node)); -+ assert("nikita-1404", znode_is_write_locked(fake)); -+ -+ /* trying to create new root. */ -+ /* @node is root and it's already locked by us. This -+ means that nobody else can be trying to add/remove -+ tree root right now. 
-+ */ -+ if (level->new_root == NULL) -+ level->new_root = reiser4_add_tree_root(node->node, fake); -+ if (!IS_ERR(level->new_root)) { -+ assert("nikita-1210", znode_is_root(level->new_root)); -+ node->deallocate = 1; -+ result = -+ longterm_lock_znode(&node->lock_handle, level->new_root, -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); -+ if (result == 0) -+ zput(level->new_root); -+ } else { -+ result = PTR_ERR(level->new_root); -+ level->new_root = NULL; -+ } -+ return result; -+} -+ -+/* allocate new znode and add the operation that inserts the -+ pointer to it into the parent node into the todo level -+ -+ Allocate new znode, add it into carry queue and post into @todo queue -+ request to add pointer to new node into its parent. -+ -+ This is carry related routing that calls reiser4_new_node() to allocate new -+ node. -+*/ -+carry_node *add_new_znode(znode * brother /* existing left neighbor of new -+ * node */ , -+ carry_node * ref /* carry node after which new -+ * carry node is to be inserted -+ * into queue. This affects -+ * locking. */ , -+ carry_level * doing /* carry queue where new node is -+ * to be added */ , -+ carry_level * todo /* carry queue where COP_INSERT -+ * operation to add pointer to -+ * new node will ne added */ ) -+{ -+ carry_node *fresh; -+ znode *new_znode; -+ carry_op *add_pointer; -+ carry_plugin_info info; -+ -+ assert("nikita-1048", brother != NULL); -+ assert("nikita-1049", todo != NULL); -+ -+ /* There is a lot of possible variations here: to what parent -+ new node will be attached and where. For simplicity, always -+ do the following: -+ -+ (1) new node and @brother will have the same parent. -+ -+ (2) new node is added on the right of @brother -+ -+ */ -+ -+ fresh = reiser4_add_carry_skip(doing, -+ ref ? 
POOLO_AFTER : POOLO_LAST, ref); -+ if (IS_ERR(fresh)) -+ return fresh; -+ -+ fresh->deallocate = 1; -+ fresh->free = 1; -+ -+ new_znode = reiser4_new_node(brother, znode_get_level(brother)); -+ if (IS_ERR(new_znode)) -+ /* @fresh will be deallocated automatically by error -+ handling code in the caller. */ -+ return (carry_node *) new_znode; -+ -+ /* new_znode returned znode with x_count 1. Caller has to decrease -+ it. make_space() does. */ -+ -+ ZF_SET(new_znode, JNODE_ORPHAN); -+ fresh->node = new_znode; -+ -+ while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) { -+ ref = carry_node_prev(ref); -+ assert("nikita-1606", !carry_node_end(doing, ref)); -+ } -+ -+ info.todo = todo; -+ info.doing = doing; -+ add_pointer = node_post_carry(&info, COP_INSERT, -+ reiser4_carry_real(ref), 1); -+ if (IS_ERR(add_pointer)) { -+ /* no need to deallocate @new_znode here: it will be -+ deallocated during carry error handling. */ -+ return (carry_node *) add_pointer; -+ } -+ -+ add_pointer->u.insert.type = COPT_CHILD; -+ add_pointer->u.insert.child = fresh; -+ add_pointer->u.insert.brother = brother; -+ /* initially new node spawns empty key range */ -+ write_lock_dk(znode_get_tree(brother)); -+ znode_set_ld_key(new_znode, -+ znode_set_rd_key(new_znode, -+ znode_get_rd_key(brother))); -+ write_unlock_dk(znode_get_tree(brother)); -+ return fresh; -+} -+ -+/* DEBUGGING FUNCTIONS. -+ -+ Probably we also should leave them on even when -+ debugging is turned off to print dumps at errors. 
-+*/ -+#if REISER4_DEBUG -+static int carry_level_invariant(carry_level * level, carry_queue_state state) -+{ -+ carry_node *node; -+ carry_node *tmp_node; -+ -+ if (level == NULL) -+ return 0; -+ -+ if (level->track_type != 0 && -+ level->track_type != CARRY_TRACK_NODE && -+ level->track_type != CARRY_TRACK_CHANGE) -+ return 0; -+ -+ /* check that nodes are in ascending order */ -+ for_all_nodes(level, node, tmp_node) { -+ znode *left; -+ znode *right; -+ -+ reiser4_key lkey; -+ reiser4_key rkey; -+ -+ if (node != carry_node_front(level)) { -+ if (state == CARRY_TODO) { -+ right = node->node; -+ left = carry_node_prev(node)->node; -+ } else { -+ right = reiser4_carry_real(node); -+ left = reiser4_carry_real(carry_node_prev(node)); -+ } -+ if (right == NULL || left == NULL) -+ continue; -+ if (node_is_empty(right) || node_is_empty(left)) -+ continue; -+ if (!keyle(leftmost_key_in_node(left, &lkey), -+ leftmost_key_in_node(right, &rkey))) { -+ warning("", "wrong key order"); -+ return 0; -+ } -+ } -+ } -+ return 1; -+} -+#endif -+ -+/* get symbolic name for boolean */ -+static const char *tf(int boolean /* truth value */ ) -+{ -+ return boolean ? "t" : "f"; -+} -+ -+/* symbolic name for carry operation */ -+static const char *carry_op_name(carry_opcode op /* carry opcode */ ) -+{ -+ switch (op) { -+ case COP_INSERT: -+ return "COP_INSERT"; -+ case COP_DELETE: -+ return "COP_DELETE"; -+ case COP_CUT: -+ return "COP_CUT"; -+ case COP_PASTE: -+ return "COP_PASTE"; -+ case COP_UPDATE: -+ return "COP_UPDATE"; -+ case COP_EXTENT: -+ return "COP_EXTENT"; -+ case COP_INSERT_FLOW: -+ return "COP_INSERT_FLOW"; -+ default:{ -+ /* not mt safe, but who cares? 
*/ -+ static char buf[20]; -+ -+ sprintf(buf, "unknown op: %x", op); -+ return buf; -+ } -+ } -+} -+ -+/* dump information about carry node */ -+static void print_carry(const char *prefix /* prefix to print */ , -+ carry_node * node /* node to print */ ) -+{ -+ if (node == NULL) { -+ printk("%s: null\n", prefix); -+ return; -+ } -+ printk -+ ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n", -+ prefix, node, tf(node->parent), tf(node->left), tf(node->unlock), -+ tf(node->free), tf(node->deallocate)); -+} -+ -+/* dump information about carry operation */ -+static void print_op(const char *prefix /* prefix to print */ , -+ carry_op * op /* operation to print */ ) -+{ -+ if (op == NULL) { -+ printk("%s: null\n", prefix); -+ return; -+ } -+ printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op)); -+ print_carry("\tnode", op->node); -+ switch (op->op) { -+ case COP_INSERT: -+ case COP_PASTE: -+ print_coord("\tcoord", -+ op->u.insert.d ? op->u.insert.d->coord : NULL, 0); -+ reiser4_print_key("\tkey", -+ op->u.insert.d ? 
op->u.insert.d->key : NULL); -+ print_carry("\tchild", op->u.insert.child); -+ break; -+ case COP_DELETE: -+ print_carry("\tchild", op->u.delete.child); -+ break; -+ case COP_CUT: -+ if (op->u.cut_or_kill.is_cut) { -+ print_coord("\tfrom", -+ op->u.cut_or_kill.u.kill->params.from, 0); -+ print_coord("\tto", op->u.cut_or_kill.u.kill->params.to, -+ 0); -+ } else { -+ print_coord("\tfrom", -+ op->u.cut_or_kill.u.cut->params.from, 0); -+ print_coord("\tto", op->u.cut_or_kill.u.cut->params.to, -+ 0); -+ } -+ break; -+ case COP_UPDATE: -+ print_carry("\tleft", op->u.update.left); -+ break; -+ default: -+ /* do nothing */ -+ break; -+ } -+} -+ -+/* dump information about all nodes and operations in a @level */ -+static void print_level(const char *prefix /* prefix to print */ , -+ carry_level * level /* level to print */ ) -+{ -+ carry_node *node; -+ carry_node *tmp_node; -+ carry_op *op; -+ carry_op *tmp_op; -+ -+ if (level == NULL) { -+ printk("%s: null\n", prefix); -+ return; -+ } -+ printk("%s: %p, restartable: %s\n", -+ prefix, level, tf(level->restartable)); -+ -+ for_all_nodes(level, node, tmp_node) -+ print_carry("\tcarry node", node); -+ for_all_ops(level, op, tmp_op) -+ print_op("\tcarry op", op); -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/carry.h linux-2.6.20/fs/reiser4/carry.h ---- linux-2.6.20.orig/fs/reiser4/carry.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/carry.h 2007-05-06 14:50:43.690973225 +0400 -@@ -0,0 +1,442 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Functions and data types to "carry" tree modification(s) upward. -+ See fs/reiser4/carry.c for details. 
*/ -+ -+#if !defined( __FS_REISER4_CARRY_H__ ) -+#define __FS_REISER4_CARRY_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "pool.h" -+#include "znode.h" -+ -+#include -+ -+/* &carry_node - "location" of carry node. -+ -+ "location" of node that is involved or going to be involved into -+ carry process. Node where operation will be carried to on the -+ parent level cannot be recorded explicitly. Operation will be carried -+ usually to the parent of some node (where changes are performed at -+ the current level) or, to the left neighbor of its parent. But while -+ modifications are performed at the current level, parent may -+ change. So, we have to allow some indirection (or, positevly, -+ flexibility) in locating carry nodes. -+ -+*/ -+typedef struct carry_node { -+ /* pool linkage */ -+ reiser4_pool_header header; -+ -+ /* base node from which real_node is calculated. See -+ fs/reiser4/carry.c:lock_carry_node(). */ -+ znode *node; -+ -+ /* how to get ->real_node */ -+ /* to get ->real_node obtain parent of ->node */ -+ __u32 parent:1; -+ /* to get ->real_node obtain left neighbor of parent of -+ ->node */ -+ __u32 left:1; -+ __u32 left_before:1; -+ -+ /* locking */ -+ -+ /* this node was locked by carry process and should be -+ unlocked when carry leaves a level */ -+ __u32 unlock:1; -+ -+ /* disk block for this node was allocated by carry process and -+ should be deallocated when carry leaves a level */ -+ __u32 deallocate:1; -+ /* this carry node was allocated by carry process and should be -+ freed when carry leaves a level */ -+ __u32 free:1; -+ -+ /* type of lock we want to take on this node */ -+ lock_handle lock_handle; -+} carry_node; -+ -+/* &carry_opcode - elementary operations that can be carried upward -+ -+ Operations that carry() can handle. This list is supposed to be -+ expanded. -+ -+ Each carry operation (cop) is handled by appropriate function defined -+ in fs/reiser4/carry.c. 
For example COP_INSERT is handled by -+ fs/reiser4/carry.c:carry_insert() etc. These functions in turn -+ call plugins of nodes affected by operation to modify nodes' content -+ and to gather operations to be performed on the next level. -+ -+*/ -+typedef enum { -+ /* insert new item into node. */ -+ COP_INSERT, -+ /* delete pointer from parent node */ -+ COP_DELETE, -+ /* remove part of or whole node. */ -+ COP_CUT, -+ /* increase size of item. */ -+ COP_PASTE, -+ /* insert extent (that is sequence of unformatted nodes). */ -+ COP_EXTENT, -+ /* update delimiting key in least common ancestor of two -+ nodes. This is performed when items are moved between two -+ nodes. -+ */ -+ COP_UPDATE, -+ /* insert flow */ -+ COP_INSERT_FLOW, -+ COP_LAST_OP, -+} carry_opcode; -+ -+#define CARRY_FLOW_NEW_NODES_LIMIT 20 -+ -+/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target -+ item is determined. */ -+typedef enum { -+ /* target item is one containing pointer to the ->child node */ -+ COPT_CHILD, -+ /* target item is given explicitly by @coord */ -+ COPT_ITEM_DATA, -+ /* target item is given by key */ -+ COPT_KEY, -+ /* see insert_paste_common() for more comments on this. */ -+ COPT_PASTE_RESTARTED, -+} cop_insert_pos_type; -+ -+/* flags to cut and delete */ -+typedef enum { -+ /* don't kill node even if it became completely empty as results of -+ * cut. This is needed for eottl handling. See carry_extent() for -+ * details. */ -+ DELETE_RETAIN_EMPTY = (1 << 0) -+} cop_delete_flag; -+ -+/* -+ * carry() implements "lock handle tracking" feature. -+ * -+ * Callers supply carry with node where to perform initial operation and lock -+ * handle on this node. Trying to optimize node utilization carry may actually -+ * move insertion point to different node. Callers expect that lock handle -+ * will rebe transferred to the new node also. 
-+ * -+ */ -+typedef enum { -+ /* transfer lock handle along with insertion point */ -+ CARRY_TRACK_CHANGE = 1, -+ /* acquire new lock handle to the node where insertion point is. This -+ * is used when carry() client doesn't initially possess lock handle -+ * on the insertion point node, for example, by extent insertion -+ * code. See carry_extent(). */ -+ CARRY_TRACK_NODE = 2 -+} carry_track_type; -+ -+/* data supplied to COP_{INSERT|PASTE} by callers */ -+typedef struct carry_insert_data { -+ /* position where new item is to be inserted */ -+ coord_t *coord; -+ /* new item description */ -+ reiser4_item_data *data; -+ /* key of new item */ -+ const reiser4_key *key; -+} carry_insert_data; -+ -+/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */ -+struct cut_kill_params { -+ /* coord where cut starts (inclusive) */ -+ coord_t *from; -+ /* coord where cut stops (inclusive, this item/unit will also be -+ * cut) */ -+ coord_t *to; -+ /* starting key. This is necessary when item and unit pos don't -+ * uniquely identify what portion or tree to remove. For example, this -+ * indicates what portion of extent unit will be affected. */ -+ const reiser4_key *from_key; -+ /* exclusive stop key */ -+ const reiser4_key *to_key; -+ /* if this is not NULL, smallest actually removed key is stored -+ * here. */ -+ reiser4_key *smallest_removed; -+ /* kill_node_content() is called for file truncate */ -+ int truncate; -+}; -+ -+struct carry_cut_data { -+ struct cut_kill_params params; -+}; -+ -+struct carry_kill_data { -+ struct cut_kill_params params; -+ /* parameter to be passed to the ->kill_hook() method of item -+ * plugin */ -+ /*void *iplug_params; *//* FIXME: unused currently */ -+ /* if not NULL---inode whose items are being removed. This is needed -+ * for ->kill_hook() of extent item to update VM structures when -+ * removing pages. 
*/ -+ struct inode *inode; -+ /* sibling list maintenance is complicated by existence of eottl. When -+ * eottl whose left and right neighbors are formatted leaves is -+ * removed, one has to connect said leaves in the sibling list. This -+ * cannot be done when extent removal is just started as locking rules -+ * require sibling list update to happen atomically with removal of -+ * extent item. Therefore: 1. pointers to left and right neighbors -+ * have to be passed down to the ->kill_hook() of extent item, and -+ * 2. said neighbors have to be locked. */ -+ lock_handle *left; -+ lock_handle *right; -+ /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */ -+ unsigned flags; -+ char *buf; -+}; -+ -+/* &carry_tree_op - operation to "carry" upward. -+ -+ Description of an operation we want to "carry" to the upper level of -+ a tree: e.g, when we insert something and there is not enough space -+ we allocate a new node and "carry" the operation of inserting a -+ pointer to the new node to the upper level, on removal of empty node, -+ we carry up operation of removing appropriate entry from parent. -+ -+ There are two types of carry ops: when adding or deleting node we -+ node at the parent level where appropriate modification has to be -+ performed is known in advance. When shifting items between nodes -+ (split, merge), delimiting key should be changed in the least common -+ parent of the nodes involved that is not known in advance. -+ -+ For the operations of the first type we store in &carry_op pointer to -+ the &carry_node at the parent level. For the operation of the second -+ type we store &carry_node or parents of the left and right nodes -+ modified and keep track of them upward until they coincide. 
-+ -+*/ -+typedef struct carry_op { -+ /* pool linkage */ -+ reiser4_pool_header header; -+ carry_opcode op; -+ /* node on which operation is to be performed: -+ -+ for insert, paste: node where new item is to be inserted -+ -+ for delete: node where pointer is to be deleted -+ -+ for cut: node to cut from -+ -+ for update: node where delimiting key is to be modified -+ -+ for modify: parent of modified node -+ -+ */ -+ carry_node *node; -+ union { -+ struct { -+ /* (sub-)type of insertion/paste. Taken from -+ cop_insert_pos_type. */ -+ __u8 type; -+ /* various operation flags. Taken from -+ cop_insert_flag. */ -+ __u8 flags; -+ carry_insert_data *d; -+ carry_node *child; -+ znode *brother; -+ } insert, paste, extent; -+ -+ struct { -+ int is_cut; -+ union { -+ carry_kill_data *kill; -+ carry_cut_data *cut; -+ } u; -+ } cut_or_kill; -+ -+ struct { -+ carry_node *left; -+ } update; -+ struct { -+ /* changed child */ -+ carry_node *child; -+ /* bitmask of changes. See &cop_modify_flag */ -+ __u32 flag; -+ } modify; -+ struct { -+ /* flags to deletion operation. Are taken from -+ cop_delete_flag */ -+ __u32 flags; -+ /* child to delete from parent. If this is -+ NULL, delete op->node. */ -+ carry_node *child; -+ } delete; -+ struct { -+ /* various operation flags. Taken from -+ cop_insert_flag. */ -+ __u32 flags; -+ flow_t *flow; -+ coord_t *insert_point; -+ reiser4_item_data *data; -+ /* flow insertion is limited by number of new blocks -+ added in that operation which do not get any data -+ but part of flow. This limit is set by macro -+ CARRY_FLOW_NEW_NODES_LIMIT. 
This field stores number -+ of nodes added already during one carry_flow */ -+ int new_nodes; -+ } insert_flow; -+ } u; -+} carry_op; -+ -+/* &carry_op_pool - preallocated pool of carry operations, and nodes */ -+typedef struct carry_pool { -+ carry_op op[CARRIES_POOL_SIZE]; -+ reiser4_pool op_pool; -+ carry_node node[NODES_LOCKED_POOL_SIZE]; -+ reiser4_pool node_pool; -+} carry_pool; -+ -+/* &carry_tree_level - carry process on given level -+ -+ Description of balancing process on the given level. -+ -+ No need for locking here, as carry_tree_level is essentially per -+ thread thing (for now). -+ -+*/ -+struct carry_level { -+ /* this level may be restarted */ -+ __u32 restartable:1; -+ /* list of carry nodes on this level, ordered by key order */ -+ struct list_head nodes; -+ struct list_head ops; -+ /* pool where new objects are allocated from */ -+ carry_pool *pool; -+ int ops_num; -+ int nodes_num; -+ /* new root created on this level, if any */ -+ znode *new_root; -+ /* This is set by caller (insert_by_key(), rreiser4_esize_item(), etc.) -+ when they want ->tracked to automagically wander to the node where -+ insertion point moved after insert or paste. -+ */ -+ carry_track_type track_type; -+ /* lock handle supplied by user that we are tracking. See -+ above. 
*/ -+ lock_handle *tracked; -+}; -+ -+/* information carry passes to plugin methods that may add new operations to -+ the @todo queue */ -+struct carry_plugin_info { -+ carry_level *doing; -+ carry_level *todo; -+}; -+ -+int reiser4_carry(carry_level * doing, carry_level * done); -+ -+carry_node *reiser4_add_carry(carry_level * level, pool_ordering order, -+ carry_node * reference); -+carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order, -+ carry_node * reference); -+ -+extern carry_node *insert_carry_node(carry_level * doing, -+ carry_level * todo, const znode * node); -+ -+extern carry_pool *init_carry_pool(int); -+extern void done_carry_pool(carry_pool * pool); -+ -+extern void init_carry_level(carry_level * level, carry_pool * pool); -+ -+extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op, -+ znode * node, int apply_to_parent); -+extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op, -+ znode * node, int apply_to_parent_p); -+ -+carry_node *add_new_znode(znode * brother, carry_node * reference, -+ carry_level * doing, carry_level * todo); -+ -+carry_node *find_carry_node(carry_level * level, const znode * node); -+ -+extern znode *reiser4_carry_real(const carry_node * node); -+ -+/* helper macros to iterate over carry queues */ -+ -+#define carry_node_next( node ) \ -+ list_entry((node)->header.level_linkage.next, carry_node, \ -+ header.level_linkage) -+ -+#define carry_node_prev( node ) \ -+ list_entry((node)->header.level_linkage.prev, carry_node, \ -+ header.level_linkage) -+ -+#define carry_node_front( level ) \ -+ list_entry((level)->nodes.next, carry_node, header.level_linkage) -+ -+#define carry_node_back( level ) \ -+ list_entry((level)->nodes.prev, carry_node, header.level_linkage) -+ -+#define carry_node_end( level, node ) \ -+ (&(level)->nodes == &(node)->header.level_linkage) -+ -+/* macro to iterate over all operations in a @level */ -+#define for_all_ops( level /* carry level (of 
type carry_level *) */, \ -+ op /* pointer to carry operation, modified by loop (of \ -+ * type carry_op *) */, \ -+ tmp /* pointer to carry operation (of type carry_op *), \ -+ * used to make iterator stable in the face of \ -+ * deletions from the level */ ) \ -+for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \ -+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \ -+ &op->header.level_linkage != &level->ops; \ -+ op = tmp, \ -+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage)) -+ -+#if 0 -+for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \ -+ tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \ -+ ! pool_level_list_end( &level -> ops, &op -> header ) ; \ -+ op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ) -+#endif -+ -+/* macro to iterate over all nodes in a @level */ \ -+#define for_all_nodes( level /* carry level (of type carry_level *) */, \ -+ node /* pointer to carry node, modified by loop (of \ -+ * type carry_node *) */, \ -+ tmp /* pointer to carry node (of type carry_node *), \ -+ * used to make iterator stable in the face of * \ -+ * deletions from the level */ ) \ -+for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \ -+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \ -+ &node->header.level_linkage != &level->nodes; \ -+ node = tmp, \ -+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage)) -+ -+#if 0 -+for( node = carry_node_front( level ), \ -+ tmp = carry_node_next( node ) ; ! 
carry_node_end( level, node ) ; \ -+ node = tmp, tmp = carry_node_next( node ) ) -+#endif -+ -+/* macro to iterate over all nodes in a @level in reverse order -+ -+ This is used, because nodes are unlocked in reversed order of locking */ -+#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \ -+ node /* pointer to carry node, modified by loop \ -+ * (of type carry_node *) */, \ -+ tmp /* pointer to carry node (of type carry_node \ -+ * *), used to make iterator stable in the \ -+ * face of deletions from the level */ ) \ -+for( node = carry_node_back( level ), \ -+ tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \ -+ node = tmp, tmp = carry_node_prev( node ) ) -+ -+/* __FS_REISER4_CARRY_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/carry_ops.c linux-2.6.20/fs/reiser4/carry_ops.c ---- linux-2.6.20.orig/fs/reiser4/carry_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/carry_ops.c 2007-05-06 14:50:43.694974475 +0400 -@@ -0,0 +1,2131 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* implementation of carry operations */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "pool.h" -+#include "tree_mod.h" -+#include "carry.h" -+#include "carry_ops.h" -+#include "tree.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include -+#include -+ -+static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node, -+ carry_level * doing, carry_level * todo, -+ unsigned int including_insert_coord_p); -+ -+extern int lock_carry_node(carry_level * level, carry_node * 
node); -+extern int lock_carry_node_tail(carry_node * node); -+ -+/* find left neighbor of a carry node -+ -+ Look for left neighbor of @node and add it to the @doing queue. See -+ comments in the body. -+ -+*/ -+static carry_node *find_left_neighbor(carry_op * op /* node to find left -+ * neighbor of */ , -+ carry_level * doing /* level to scan */ ) -+{ -+ int result; -+ carry_node *node; -+ carry_node *left; -+ int flags; -+ reiser4_tree *tree; -+ -+ node = op->node; -+ -+ tree = current_tree; -+ read_lock_tree(tree); -+ /* first, check whether left neighbor is already in a @doing queue */ -+ if (reiser4_carry_real(node)->left != NULL) { -+ /* NOTE: there is locking subtlety here. Look into -+ * find_right_neighbor() for more info */ -+ if (find_carry_node(doing, -+ reiser4_carry_real(node)->left) != NULL) { -+ read_unlock_tree(tree); -+ left = node; -+ do { -+ left = list_entry(left->header.level_linkage.prev, -+ carry_node, header.level_linkage); -+ assert("nikita-3408", !carry_node_end(doing, -+ left)); -+ } while (reiser4_carry_real(left) == -+ reiser4_carry_real(node)); -+ return left; -+ } -+ } -+ read_unlock_tree(tree); -+ -+ left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node); -+ if (IS_ERR(left)) -+ return left; -+ -+ left->node = node->node; -+ left->free = 1; -+ -+ flags = GN_TRY_LOCK; -+ if (!op->u.insert.flags & COPI_LOAD_LEFT) -+ flags |= GN_NO_ALLOC; -+ -+ /* then, feeling lucky, peek left neighbor in the cache. */ -+ result = reiser4_get_left_neighbor(&left->lock_handle, -+ reiser4_carry_real(node), -+ ZNODE_WRITE_LOCK, flags); -+ if (result == 0) { -+ /* ok, node found and locked. */ -+ result = lock_carry_node_tail(left); -+ if (result != 0) -+ left = ERR_PTR(result); -+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) { -+ /* node is leftmost node in a tree, or neighbor wasn't in -+ cache, or there is an extent on the left. 
*/ -+ reiser4_pool_free(&doing->pool->node_pool, &left->header); -+ left = NULL; -+ } else if (doing->restartable) { -+ /* if left neighbor is locked, and level is restartable, add -+ new node to @doing and restart. */ -+ assert("nikita-913", node->parent != 0); -+ assert("nikita-914", node->node != NULL); -+ left->left = 1; -+ left->free = 0; -+ left = ERR_PTR(-E_REPEAT); -+ } else { -+ /* left neighbor is locked, level cannot be restarted. Just -+ ignore left neighbor. */ -+ reiser4_pool_free(&doing->pool->node_pool, &left->header); -+ left = NULL; -+ } -+ return left; -+} -+ -+/* find right neighbor of a carry node -+ -+ Look for right neighbor of @node and add it to the @doing queue. See -+ comments in the body. -+ -+*/ -+static carry_node *find_right_neighbor(carry_op * op /* node to find right -+ * neighbor of */ , -+ carry_level * doing /* level to scan */ ) -+{ -+ int result; -+ carry_node *node; -+ carry_node *right; -+ lock_handle lh; -+ int flags; -+ reiser4_tree *tree; -+ -+ init_lh(&lh); -+ -+ node = op->node; -+ -+ tree = current_tree; -+ read_lock_tree(tree); -+ /* first, check whether right neighbor is already in a @doing queue */ -+ if (reiser4_carry_real(node)->right != NULL) { -+ /* -+ * Tree lock is taken here anyway, because, even if _outcome_ -+ * of (find_carry_node() != NULL) doesn't depends on -+ * concurrent updates to ->right, find_carry_node() cannot -+ * work with second argument NULL. Hence, following comment is -+ * of historic importance only. -+ * -+ * Subtle: -+ * -+ * Q: why don't we need tree lock here, looking for the right -+ * neighbor? -+ * -+ * A: even if value of node->real_node->right were changed -+ * during find_carry_node() execution, outcome of execution -+ * wouldn't change, because (in short) other thread cannot add -+ * elements to the @doing, and if node->real_node->right -+ * already was in @doing, value of node->real_node->right -+ * couldn't change, because node cannot be inserted between -+ * locked neighbors. 
-+ */ -+ if (find_carry_node(doing, -+ reiser4_carry_real(node)->right) != NULL) { -+ read_unlock_tree(tree); -+ /* -+ * What we are doing here (this is also applicable to -+ * the find_left_neighbor()). -+ * -+ * tree_walk.c code requires that insertion of a -+ * pointer to a child, modification of parent pointer -+ * in the child, and insertion of the child into -+ * sibling list are atomic (see -+ * plugin/item/internal.c:create_hook_internal()). -+ * -+ * carry allocates new node long before pointer to it -+ * is inserted into parent and, actually, long before -+ * parent is even known. Such allocated-but-orphaned -+ * nodes are only trackable through carry level lists. -+ * -+ * Situation that is handled here is following: @node -+ * has valid ->right pointer, but there is -+ * allocated-but-orphaned node in the carry queue that -+ * is logically between @node and @node->right. Here -+ * we are searching for it. Critical point is that -+ * this is only possible if @node->right is also in -+ * the carry queue (this is checked above), because -+ * this is the only way new orphaned node could be -+ * inserted between them (before inserting new node, -+ * make_space() first tries to shift to the right, so, -+ * right neighbor will be locked and queued). -+ * -+ */ -+ right = node; -+ do { -+ right = list_entry(right->header.level_linkage.next, -+ carry_node, header.level_linkage); -+ assert("nikita-3408", !carry_node_end(doing, -+ right)); -+ } while (reiser4_carry_real(right) == -+ reiser4_carry_real(node)); -+ return right; -+ } -+ } -+ read_unlock_tree(tree); -+ -+ flags = GN_CAN_USE_UPPER_LEVELS; -+ if (!op->u.insert.flags & COPI_LOAD_RIGHT) -+ flags = GN_NO_ALLOC; -+ -+ /* then, try to lock right neighbor */ -+ init_lh(&lh); -+ result = reiser4_get_right_neighbor(&lh, -+ reiser4_carry_real(node), -+ ZNODE_WRITE_LOCK, flags); -+ if (result == 0) { -+ /* ok, node found and locked. 
*/ -+ right = reiser4_add_carry_skip(doing, POOLO_AFTER, node); -+ if (!IS_ERR(right)) { -+ right->node = lh.node; -+ move_lh(&right->lock_handle, &lh); -+ right->free = 1; -+ result = lock_carry_node_tail(right); -+ if (result != 0) -+ right = ERR_PTR(result); -+ } -+ } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) { -+ /* node is rightmost node in a tree, or neighbor wasn't in -+ cache, or there is an extent on the right. */ -+ right = NULL; -+ } else -+ right = ERR_PTR(result); -+ done_lh(&lh); -+ return right; -+} -+ -+/* how much free space in a @node is needed for @op -+ -+ How much space in @node is required for completion of @op, where @op is -+ insert or paste operation. -+*/ -+static unsigned int space_needed_for_op(znode * node /* znode data are -+ * inserted or -+ * pasted in */ , -+ carry_op * op /* carry -+ operation */ ) -+{ -+ assert("nikita-919", op != NULL); -+ -+ switch (op->op) { -+ default: -+ impossible("nikita-1701", "Wrong opcode"); -+ case COP_INSERT: -+ return space_needed(node, NULL, op->u.insert.d->data, 1); -+ case COP_PASTE: -+ return space_needed(node, op->u.insert.d->coord, -+ op->u.insert.d->data, 0); -+ } -+} -+ -+/* how much space in @node is required to insert or paste @data at -+ @coord. */ -+unsigned int space_needed(const znode * node /* node data are inserted or -+ * pasted in */ , -+ const coord_t * coord /* coord where data are -+ * inserted or pasted -+ * at */ , -+ const reiser4_item_data * data /* data to insert or -+ * paste */ , -+ int insertion /* non-0 is inserting, 0---paste */ ) -+{ -+ int result; -+ item_plugin *iplug; -+ -+ assert("nikita-917", node != NULL); -+ assert("nikita-918", node_plugin_by_node(node) != NULL); -+ assert("vs-230", !insertion || (coord == NULL)); -+ -+ result = 0; -+ iplug = data->iplug; -+ if (iplug->b.estimate != NULL) { -+ /* ask item plugin how much space is needed to insert this -+ item */ -+ result += iplug->b.estimate(insertion ? 
NULL : coord, data); -+ } else { -+ /* reasonable default */ -+ result += data->length; -+ } -+ if (insertion) { -+ node_plugin *nplug; -+ -+ nplug = node->nplug; -+ /* and add node overhead */ -+ if (nplug->item_overhead != NULL) { -+ result += nplug->item_overhead(node, NULL); -+ } -+ } -+ return result; -+} -+ -+/* find &coord in parent where pointer to new child is to be stored. */ -+static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to -+ * insert pointer to new -+ * child */ ) -+{ -+ int result; -+ znode *node; -+ znode *child; -+ -+ assert("nikita-941", op != NULL); -+ assert("nikita-942", op->op == COP_INSERT); -+ -+ node = reiser4_carry_real(op->node); -+ assert("nikita-943", node != NULL); -+ assert("nikita-944", node_plugin_by_node(node) != NULL); -+ -+ child = reiser4_carry_real(op->u.insert.child); -+ result = -+ find_new_child_ptr(node, child, op->u.insert.brother, -+ op->u.insert.d->coord); -+ -+ build_child_ptr_data(child, op->u.insert.d->data); -+ return result; -+} -+ -+/* additional amount of free space in @node required to complete @op */ -+static int free_space_shortage(znode * node /* node to check */ , -+ carry_op * op /* operation being performed */ ) -+{ -+ assert("nikita-1061", node != NULL); -+ assert("nikita-1062", op != NULL); -+ -+ switch (op->op) { -+ default: -+ impossible("nikita-1702", "Wrong opcode"); -+ case COP_INSERT: -+ case COP_PASTE: -+ return space_needed_for_op(node, op) - znode_free_space(node); -+ case COP_EXTENT: -+ /* when inserting extent shift data around until insertion -+ point is utmost in the node. */ -+ if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE) -+ return +1; -+ else -+ return -1; -+ } -+} -+ -+/* helper function: update node pointer in operation after insertion -+ point was probably shifted into @target. 
*/ -+static znode *sync_op(carry_op * op, carry_node * target) -+{ -+ znode *insertion_node; -+ -+ /* reget node from coord: shift might move insertion coord to -+ the neighbor */ -+ insertion_node = op->u.insert.d->coord->node; -+ /* if insertion point was actually moved into new node, -+ update carry node pointer in operation. */ -+ if (insertion_node != reiser4_carry_real(op->node)) { -+ op->node = target; -+ assert("nikita-2540", -+ reiser4_carry_real(target) == insertion_node); -+ } -+ assert("nikita-2541", -+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node); -+ return insertion_node; -+} -+ -+/* -+ * complete make_space() call: update tracked lock handle if necessary. See -+ * comments for fs/reiser4/carry.h:carry_track_type -+ */ -+static int -+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node) -+{ -+ int result; -+ carry_track_type tracking; -+ znode *node; -+ -+ tracking = doing->track_type; -+ node = op->u.insert.d->coord->node; -+ -+ if (tracking == CARRY_TRACK_NODE || -+ (tracking == CARRY_TRACK_CHANGE && node != orig_node)) { -+ /* inserting or pasting into node different from -+ original. Update lock handle supplied by caller. */ -+ assert("nikita-1417", doing->tracked != NULL); -+ done_lh(doing->tracked); -+ init_lh(doing->tracked); -+ result = longterm_lock_znode(doing->tracked, node, -+ ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_HIPRI); -+ } else -+ result = 0; -+ return result; -+} -+ -+/* This is insertion policy function. It shifts data to the left and right -+ neighbors of insertion coord and allocates new nodes until there is enough -+ free space to complete @op. -+ -+ See comments in the body. -+ -+ Assumes that the node format favors insertions at the right end of the node -+ as node40 does. 
-+ -+ See carry_flow() on detail about flow insertion -+*/ -+static int make_space(carry_op * op /* carry operation, insert or paste */ , -+ carry_level * doing /* current carry queue */ , -+ carry_level * todo /* carry queue on the parent level */ ) -+{ -+ znode *node; -+ int result; -+ int not_enough_space; -+ int blk_alloc; -+ znode *orig_node; -+ __u32 flags; -+ -+ coord_t *coord; -+ -+ assert("nikita-890", op != NULL); -+ assert("nikita-891", todo != NULL); -+ assert("nikita-892", -+ op->op == COP_INSERT || -+ op->op == COP_PASTE || op->op == COP_EXTENT); -+ assert("nikita-1607", -+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node); -+ -+ flags = op->u.insert.flags; -+ -+ /* NOTE check that new node can only be allocated after checking left -+ * and right neighbors. This is necessary for proper work of -+ * find_{left,right}_neighbor(). */ -+ assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE, -+ flags & COPI_DONT_SHIFT_LEFT)); -+ assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE, -+ flags & COPI_DONT_SHIFT_RIGHT)); -+ -+ coord = op->u.insert.d->coord; -+ orig_node = node = coord->node; -+ -+ assert("nikita-908", node != NULL); -+ assert("nikita-909", node_plugin_by_node(node) != NULL); -+ -+ result = 0; -+ /* If there is not enough space in a node, try to shift something to -+ the left neighbor. This is a bit tricky, as locking to the left is -+ low priority. This is handled by restart logic in carry(). -+ */ -+ not_enough_space = free_space_shortage(node, op); -+ if (not_enough_space <= 0) -+ /* it is possible that carry was called when there actually -+ was enough space in the node. For example, when inserting -+ leftmost item so that delimiting keys have to be updated. 
-+ */ -+ return make_space_tail(op, doing, orig_node); -+ if (!(flags & COPI_DONT_SHIFT_LEFT)) { -+ carry_node *left; -+ /* make note in statistics of an attempt to move -+ something into the left neighbor */ -+ left = find_left_neighbor(op, doing); -+ if (unlikely(IS_ERR(left))) { -+ if (PTR_ERR(left) == -E_REPEAT) -+ return -E_REPEAT; -+ else { -+ /* some error other than restart request -+ occurred. This shouldn't happen. Issue a -+ warning and continue as if left neighbor -+ weren't existing. -+ */ -+ warning("nikita-924", -+ "Error accessing left neighbor: %li", -+ PTR_ERR(left)); -+ } -+ } else if (left != NULL) { -+ -+ /* shift everything possible on the left of and -+ including insertion coord into the left neighbor */ -+ result = carry_shift_data(LEFT_SIDE, coord, -+ reiser4_carry_real(left), -+ doing, todo, -+ flags & COPI_GO_LEFT); -+ -+ /* reget node from coord: shift_left() might move -+ insertion coord to the left neighbor */ -+ node = sync_op(op, left); -+ -+ not_enough_space = free_space_shortage(node, op); -+ /* There is not enough free space in @node, but -+ may be, there is enough free space in -+ @left. Various balancing decisions are valid here. -+ The same for the shifiting to the right. -+ */ -+ } -+ } -+ /* If there still is not enough space, shift to the right */ -+ if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) { -+ carry_node *right; -+ -+ right = find_right_neighbor(op, doing); -+ if (IS_ERR(right)) { -+ warning("nikita-1065", -+ "Error accessing right neighbor: %li", -+ PTR_ERR(right)); -+ } else if (right != NULL) { -+ /* node containing insertion point, and its right -+ neighbor node are write locked by now. 
-+ -+ shift everything possible on the right of but -+ excluding insertion coord into the right neighbor -+ */ -+ result = carry_shift_data(RIGHT_SIDE, coord, -+ reiser4_carry_real(right), -+ doing, todo, -+ flags & COPI_GO_RIGHT); -+ /* reget node from coord: shift_right() might move -+ insertion coord to the right neighbor */ -+ node = sync_op(op, right); -+ not_enough_space = free_space_shortage(node, op); -+ } -+ } -+ /* If there is still not enough space, allocate new node(s). -+ -+ We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in -+ the carry operation flags (currently this is needed during flush -+ only). -+ */ -+ for (blk_alloc = 0; -+ not_enough_space > 0 && result == 0 && blk_alloc < 2 && -+ !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) { -+ carry_node *fresh; /* new node we are allocating */ -+ coord_t coord_shadow; /* remembered insertion point before -+ * shifting data into new node */ -+ carry_node *node_shadow; /* remembered insertion node before -+ * shifting */ -+ unsigned int gointo; /* whether insertion point should move -+ * into newly allocated node */ -+ -+ /* allocate new node on the right of @node. Znode and disk -+ fake block number for new node are allocated. -+ -+ add_new_znode() posts carry operation COP_INSERT with -+ COPT_CHILD option to the parent level to add -+ pointer to newly created node to its parent. -+ -+ Subtle point: if several new nodes are required to complete -+ insertion operation at this level, they will be inserted -+ into their parents in the order of creation, which means -+ that @node will be valid "cookie" at the time of insertion. -+ -+ */ -+ fresh = add_new_znode(node, op->node, doing, todo); -+ if (IS_ERR(fresh)) -+ return PTR_ERR(fresh); -+ -+ /* Try to shift into new node. 
*/ -+ result = lock_carry_node(doing, fresh); -+ zput(reiser4_carry_real(fresh)); -+ if (result != 0) { -+ warning("nikita-947", -+ "Cannot lock new node: %i", result); -+ return result; -+ } -+ -+ /* both nodes are write locked by now. -+ -+ shift everything possible on the right of and -+ including insertion coord into the right neighbor. -+ */ -+ coord_dup(&coord_shadow, op->u.insert.d->coord); -+ node_shadow = op->node; -+ /* move insertion point into newly created node if: -+ -+ . insertion point is rightmost in the source node, or -+ . this is not the first node we are allocating in a row. -+ */ -+ gointo = -+ (blk_alloc > 0) || -+ coord_is_after_rightmost(op->u.insert.d->coord); -+ -+ if (gointo && -+ op->op == COP_PASTE && -+ coord_is_existing_item(op->u.insert.d->coord) && -+ is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) { -+ /* paste into solid (atomic) item, which can contain -+ only one unit, so we need to shift it right, where -+ insertion point supposed to be */ -+ -+ assert("edward-1444", op->u.insert.d->data->iplug == -+ item_plugin_by_id(STATIC_STAT_DATA_ID)); -+ assert("edward-1445", -+ op->u.insert.d->data->length > -+ node_plugin_by_node(coord->node)->free_space -+ (coord->node)); -+ -+ op->u.insert.d->coord->between = BEFORE_UNIT; -+ } -+ -+ result = carry_shift_data(RIGHT_SIDE, coord, -+ reiser4_carry_real(fresh), -+ doing, todo, gointo); -+ /* if insertion point was actually moved into new node, -+ update carry node pointer in operation. */ -+ node = sync_op(op, fresh); -+ not_enough_space = free_space_shortage(node, op); -+ if ((not_enough_space > 0) && (node != coord_shadow.node)) { -+ /* there is not enough free in new node. Shift -+ insertion point back to the @shadow_node so that -+ next new node would be inserted between -+ @shadow_node and @fresh. 
-+ */ -+ coord_normalize(&coord_shadow); -+ coord_dup(coord, &coord_shadow); -+ node = coord->node; -+ op->node = node_shadow; -+ if (1 || (flags & COPI_STEP_BACK)) { -+ /* still not enough space?! Maybe there is -+ enough space in the source node (i.e., node -+ data are moved from) now. -+ */ -+ not_enough_space = -+ free_space_shortage(node, op); -+ } -+ } -+ } -+ if (not_enough_space > 0) { -+ if (!(flags & COPI_DONT_ALLOCATE)) -+ warning("nikita-948", "Cannot insert new item"); -+ result = -E_NODE_FULL; -+ } -+ assert("nikita-1622", ergo(result == 0, -+ reiser4_carry_real(op->node) == coord->node)); -+ assert("nikita-2616", coord == op->u.insert.d->coord); -+ if (result == 0) -+ result = make_space_tail(op, doing, orig_node); -+ return result; -+} -+ -+/* insert_paste_common() - common part of insert and paste operations -+ -+ This function performs common part of COP_INSERT and COP_PASTE. -+ -+ There are two ways in which insertion/paste can be requested: -+ -+ . by directly supplying reiser4_item_data. In this case, op -> -+ u.insert.type is set to COPT_ITEM_DATA. -+ -+ . by supplying child pointer to which is to inserted into parent. In this -+ case op -> u.insert.type == COPT_CHILD. -+ -+ . by supplying key of new item/unit. This is currently only used during -+ extent insertion -+ -+ This is required, because when new node is allocated we don't know at what -+ position pointer to it is to be stored in the parent. Actually, we don't -+ even know what its parent will be, because parent can be re-balanced -+ concurrently and new node re-parented, and because parent can be full and -+ pointer to the new node will go into some other node. -+ -+ insert_paste_common() resolves pointer to child node into position in the -+ parent by calling find_new_child_coord(), that fills -+ reiser4_item_data. After this, insertion/paste proceeds uniformly. -+ -+ Another complication is with finding free space during pasting. 
It may -+ happen that while shifting items to the neighbors and newly allocated -+ nodes, insertion coord can no longer be in the item we wanted to paste -+ into. At this point, paste becomes (morphs) into insert. Moreover free -+ space analysis has to be repeated, because amount of space required for -+ insertion is different from that of paste (item header overhead, etc). -+ -+ This function "unifies" different insertion modes (by resolving child -+ pointer or key into insertion coord), and then calls make_space() to free -+ enough space in the node by shifting data to the left and right and by -+ allocating new nodes if necessary. Carry operation knows amount of space -+ required for its completion. After enough free space is obtained, caller of -+ this function (carry_{insert,paste,etc.}) performs actual insertion/paste -+ by calling item plugin method. -+ -+*/ -+static int insert_paste_common(carry_op * op /* carry operation being -+ * performed */ , -+ carry_level * doing /* current carry level */ , -+ carry_level * todo /* next carry level */ , -+ carry_insert_data * cdata /* pointer to -+ * cdata */ , -+ coord_t * coord /* insertion/paste coord */ , -+ reiser4_item_data * data /* data to be -+ * inserted/pasted */ ) -+{ -+ assert("nikita-981", op != NULL); -+ assert("nikita-980", todo != NULL); -+ assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE) -+ || (op->op == COP_EXTENT)); -+ -+ if (op->u.insert.type == COPT_PASTE_RESTARTED) { -+ /* nothing to do. Fall through to make_space(). */ -+ ; -+ } else if (op->u.insert.type == COPT_KEY) { -+ node_search_result intra_node; -+ znode *node; -+ /* Problem with doing batching at the lowest level, is that -+ operations here are given by coords where modification is -+ to be performed, and one modification can invalidate coords -+ of all following operations. 
-+ -+ So, we are implementing yet another type for operation that -+ will use (the only) "locator" stable across shifting of -+ data between nodes, etc.: key (COPT_KEY). -+ -+ This clause resolves key to the coord in the node. -+ -+ But node can change also. Probably some pieces have to be -+ added to the lock_carry_node(), to lock node by its key. -+ -+ */ -+ /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain -+ if you need something else. */ -+ op->u.insert.d->coord = coord; -+ node = reiser4_carry_real(op->node); -+ intra_node = node_plugin_by_node(node)->lookup -+ (node, op->u.insert.d->key, FIND_EXACT, -+ op->u.insert.d->coord); -+ if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) { -+ warning("nikita-1715", "Intra node lookup failure: %i", -+ intra_node); -+ return intra_node; -+ } -+ } else if (op->u.insert.type == COPT_CHILD) { -+ /* if we are asked to insert pointer to the child into -+ internal node, first convert pointer to the child into -+ coord within parent node. -+ */ -+ znode *child; -+ int result; -+ -+ op->u.insert.d = cdata; -+ op->u.insert.d->coord = coord; -+ op->u.insert.d->data = data; -+ op->u.insert.d->coord->node = reiser4_carry_real(op->node); -+ result = find_new_child_coord(op); -+ child = reiser4_carry_real(op->u.insert.child); -+ if (result != NS_NOT_FOUND) { -+ warning("nikita-993", -+ "Cannot find a place for child pointer: %i", -+ result); -+ return result; -+ } -+ /* This only happens when we did multiple insertions at -+ the previous level, trying to insert single item and -+ it so happened, that insertion of pointers to all new -+ nodes before this one already caused parent node to -+ split (may be several times). -+ -+ I am going to come up with better solution. -+ -+ You are not expected to understand this. 
-+ -- v6root/usr/sys/ken/slp.c -+ -+ Basically, what happens here is the following: carry came -+ to the parent level and is about to insert internal item -+ pointing to the child node that it just inserted in the -+ level below. Position where internal item is to be inserted -+ was found by find_new_child_coord() above, but node of the -+ current carry operation (that is, parent node of child -+ inserted on the previous level), was determined earlier in -+ the lock_carry_level/lock_carry_node. It could so happen -+ that other carry operations already performed on the parent -+ level already split parent node, so that insertion point -+ moved into another node. Handle this by creating new carry -+ node for insertion point if necessary. -+ */ -+ if (reiser4_carry_real(op->node) != -+ op->u.insert.d->coord->node) { -+ pool_ordering direction; -+ znode *z1; -+ znode *z2; -+ reiser4_key k1; -+ reiser4_key k2; -+ -+ /* -+ * determine in what direction insertion point -+ * moved. Do this by comparing delimiting keys. -+ */ -+ z1 = op->u.insert.d->coord->node; -+ z2 = reiser4_carry_real(op->node); -+ if (keyle(leftmost_key_in_node(z1, &k1), -+ leftmost_key_in_node(z2, &k2))) -+ /* insertion point moved to the left */ -+ direction = POOLO_BEFORE; -+ else -+ /* insertion point moved to the right */ -+ direction = POOLO_AFTER; -+ -+ op->node = reiser4_add_carry_skip(doing, -+ direction, op->node); -+ if (IS_ERR(op->node)) -+ return PTR_ERR(op->node); -+ op->node->node = op->u.insert.d->coord->node; -+ op->node->free = 1; -+ result = lock_carry_node(doing, op->node); -+ if (result != 0) -+ return result; -+ } -+ -+ /* -+ * set up key of an item being inserted: we are inserting -+ * internal item and its key is (by the very definition of -+ * search tree) is leftmost key in the child node. 
-+ */ -+ write_lock_dk(znode_get_tree(child)); -+ op->u.insert.d->key = leftmost_key_in_node(child, -+ znode_get_ld_key(child)); -+ write_unlock_dk(znode_get_tree(child)); -+ op->u.insert.d->data->arg = op->u.insert.brother; -+ } else { -+ assert("vs-243", op->u.insert.d->coord != NULL); -+ op->u.insert.d->coord->node = reiser4_carry_real(op->node); -+ } -+ -+ /* find free space. */ -+ return make_space(op, doing, todo); -+} -+ -+/* handle carry COP_INSERT operation. -+ -+ Insert new item into node. New item can be given in one of two ways: -+ -+ - by passing &tree_coord and &reiser4_item_data as part of @op. This is -+ only applicable at the leaf/twig level. -+ -+ - by passing a child node pointer to which is to be inserted by this -+ operation. -+ -+*/ -+static int carry_insert(carry_op * op /* operation to perform */ , -+ carry_level * doing /* queue of operations @op -+ * is part of */ , -+ carry_level * todo /* queue where new operations -+ * are accumulated */ ) -+{ -+ znode *node; -+ carry_insert_data cdata; -+ coord_t coord; -+ reiser4_item_data data; -+ carry_plugin_info info; -+ int result; -+ -+ assert("nikita-1036", op != NULL); -+ assert("nikita-1037", todo != NULL); -+ assert("nikita-1038", op->op == COP_INSERT); -+ -+ coord_init_zero(&coord); -+ -+ /* perform common functionality of insert and paste. */ -+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data); -+ if (result != 0) -+ return result; -+ -+ node = op->u.insert.d->coord->node; -+ assert("nikita-1039", node != NULL); -+ assert("nikita-1040", node_plugin_by_node(node) != NULL); -+ -+ assert("nikita-949", -+ space_needed_for_op(node, op) <= znode_free_space(node)); -+ -+ /* ask node layout to create new item. 
*/ -+ info.doing = doing; -+ info.todo = todo; -+ result = node_plugin_by_node(node)->create_item -+ (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data, -+ &info); -+ doing->restartable = 0; -+ znode_make_dirty(node); -+ -+ return result; -+} -+ -+/* -+ * Flow insertion code. COP_INSERT_FLOW is special tree operation that is -+ * supplied with a "flow" (that is, a stream of data) and inserts it into tree -+ * by slicing into multiple items. -+ */ -+ -+#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point ) -+#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow ) -+#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data ) -+ -+static size_t item_data_overhead(carry_op * op) -+{ -+ if (flow_insert_data(op)->iplug->b.estimate == NULL) -+ return 0; -+ return (flow_insert_data(op)->iplug->b. -+ estimate(NULL /* estimate insertion */ , flow_insert_data(op)) - -+ flow_insert_data(op)->length); -+} -+ -+/* FIXME-VS: this is called several times during one make_flow_for_insertion -+ and it will always return the same result. Some optimization could be made -+ by calculating this value once at the beginning and passing it around. 
That -+ would reduce some flexibility in future changes -+*/ -+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *); -+static size_t flow_insertion_overhead(carry_op * op) -+{ -+ znode *node; -+ size_t insertion_overhead; -+ -+ node = flow_insert_point(op)->node; -+ insertion_overhead = 0; -+ if (node->nplug->item_overhead && -+ !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key, -+ flow_insert_data(op))) -+ insertion_overhead = -+ node->nplug->item_overhead(node, NULL) + -+ item_data_overhead(op); -+ return insertion_overhead; -+} -+ -+/* how many bytes of flow does fit to the node */ -+static int what_can_fit_into_node(carry_op * op) -+{ -+ size_t free, overhead; -+ -+ overhead = flow_insertion_overhead(op); -+ free = znode_free_space(flow_insert_point(op)->node); -+ if (free <= overhead) -+ return 0; -+ free -= overhead; -+ /* FIXME: flow->length is loff_t only to not get overflowed in case of expandign truncate */ -+ if (free < op->u.insert_flow.flow->length) -+ return free; -+ return (int)op->u.insert_flow.flow->length; -+} -+ -+/* in make_space_for_flow_insertion we need to check either whether whole flow -+ fits into a node or whether minimal fraction of flow fits into a node */ -+static int enough_space_for_whole_flow(carry_op * op) -+{ -+ return (unsigned)what_can_fit_into_node(op) == -+ op->u.insert_flow.flow->length; -+} -+ -+#define MIN_FLOW_FRACTION 1 -+static int enough_space_for_min_flow_fraction(carry_op * op) -+{ -+ assert("vs-902", coord_is_after_rightmost(flow_insert_point(op))); -+ -+ return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION; -+} -+ -+/* this returns 0 if left neighbor was obtained successfully and everything -+ upto insertion point including it were shifted and left neighbor still has -+ some free space to put minimal fraction of flow into it */ -+static int -+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo) -+{ -+ carry_node *left; -+ znode *orig; -+ -+ left = 
find_left_neighbor(op, doing); -+ if (unlikely(IS_ERR(left))) { -+ warning("vs-899", -+ "make_space_by_shift_left: " -+ "error accessing left neighbor: %li", PTR_ERR(left)); -+ return 1; -+ } -+ if (left == NULL) -+ /* left neighbor either does not exist or is unformatted -+ node */ -+ return 1; -+ -+ orig = flow_insert_point(op)->node; -+ /* try to shift content of node @orig from its head upto insert point -+ including insertion point into the left neighbor */ -+ carry_shift_data(LEFT_SIDE, flow_insert_point(op), -+ reiser4_carry_real(left), doing, todo, -+ 1 /* including insert point */); -+ if (reiser4_carry_real(left) != flow_insert_point(op)->node) { -+ /* insertion point did not move */ -+ return 1; -+ } -+ -+ /* insertion point is set after last item in the node */ -+ assert("vs-900", coord_is_after_rightmost(flow_insert_point(op))); -+ -+ if (!enough_space_for_min_flow_fraction(op)) { -+ /* insertion point node does not have enough free space to put -+ even minimal portion of flow into it, therefore, move -+ insertion point back to orig node (before first item) */ -+ coord_init_before_first_item(flow_insert_point(op), orig); -+ return 1; -+ } -+ -+ /* part of flow is to be written to the end of node */ -+ op->node = left; -+ return 0; -+} -+ -+/* this returns 0 if right neighbor was obtained successfully and everything to -+ the right of insertion point was shifted to it and node got enough free -+ space to put minimal fraction of flow into it */ -+static int -+make_space_by_shift_right(carry_op * op, carry_level * doing, -+ carry_level * todo) -+{ -+ carry_node *right; -+ -+ right = find_right_neighbor(op, doing); -+ if (unlikely(IS_ERR(right))) { -+ warning("nikita-1065", "shift_right_excluding_insert_point: " -+ "error accessing right neighbor: %li", PTR_ERR(right)); -+ return 1; -+ } -+ if (right) { -+ /* shift everything possible on the right of but excluding -+ insertion coord into the right neighbor */ -+ carry_shift_data(RIGHT_SIDE, 
flow_insert_point(op), -+ reiser4_carry_real(right), doing, todo, -+ 0 /* not including insert point */); -+ } else { -+ /* right neighbor either does not exist or is unformatted -+ node */ -+ ; -+ } -+ if (coord_is_after_rightmost(flow_insert_point(op))) { -+ if (enough_space_for_min_flow_fraction(op)) { -+ /* part of flow is to be written to the end of node */ -+ return 0; -+ } -+ } -+ -+ /* new node is to be added if insert point node did not get enough -+ space for whole flow */ -+ return 1; -+} -+ -+/* this returns 0 when insert coord is set at the node end and fraction of flow -+ fits into that node */ -+static int -+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo) -+{ -+ int result; -+ znode *node; -+ carry_node *new; -+ -+ node = flow_insert_point(op)->node; -+ -+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT) -+ return RETERR(-E_NODE_FULL); -+ /* add new node after insert point node */ -+ new = add_new_znode(node, op->node, doing, todo); -+ if (unlikely(IS_ERR(new))) { -+ return PTR_ERR(new); -+ } -+ result = lock_carry_node(doing, new); -+ zput(reiser4_carry_real(new)); -+ if (unlikely(result)) { -+ return result; -+ } -+ op->u.insert_flow.new_nodes++; -+ if (!coord_is_after_rightmost(flow_insert_point(op))) { -+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op), -+ reiser4_carry_real(new), doing, todo, -+ 0 /* not including insert point */); -+ assert("vs-901", -+ coord_is_after_rightmost(flow_insert_point(op))); -+ -+ if (enough_space_for_min_flow_fraction(op)) { -+ return 0; -+ } -+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT) -+ return RETERR(-E_NODE_FULL); -+ -+ /* add one more new node */ -+ new = add_new_znode(node, op->node, doing, todo); -+ if (unlikely(IS_ERR(new))) { -+ return PTR_ERR(new); -+ } -+ result = lock_carry_node(doing, new); -+ zput(reiser4_carry_real(new)); -+ if (unlikely(result)) { -+ return result; -+ } -+ op->u.insert_flow.new_nodes++; -+ } -+ -+ /* move insertion 
point to new node */ -+ coord_init_before_first_item(flow_insert_point(op), -+ reiser4_carry_real(new)); -+ op->node = new; -+ return 0; -+} -+ -+static int -+make_space_for_flow_insertion(carry_op * op, carry_level * doing, -+ carry_level * todo) -+{ -+ __u32 flags = op->u.insert_flow.flags; -+ -+ if (enough_space_for_whole_flow(op)) { -+ /* whole flow fits into insert point node */ -+ return 0; -+ } -+ -+ if (!(flags & COPI_DONT_SHIFT_LEFT) -+ && (make_space_by_shift_left(op, doing, todo) == 0)) { -+ /* insert point is shifted to left neighbor of original insert -+ point node and is set after last unit in that node. It has -+ enough space to fit at least minimal fraction of flow. */ -+ return 0; -+ } -+ -+ if (enough_space_for_whole_flow(op)) { -+ /* whole flow fits into insert point node */ -+ return 0; -+ } -+ -+ if (!(flags & COPI_DONT_SHIFT_RIGHT) -+ && (make_space_by_shift_right(op, doing, todo) == 0)) { -+ /* insert point is still set to the same node, but there is -+ nothing to the right of insert point. 
*/ -+ return 0; -+ } -+ -+ if (enough_space_for_whole_flow(op)) { -+ /* whole flow fits into insert point node */ -+ return 0; -+ } -+ -+ return make_space_by_new_nodes(op, doing, todo); -+} -+ -+/* implements COP_INSERT_FLOW operation */ -+static int -+carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo) -+{ -+ int result; -+ flow_t *f; -+ coord_t *insert_point; -+ node_plugin *nplug; -+ carry_plugin_info info; -+ znode *orig_node; -+ lock_handle *orig_lh; -+ -+ f = op->u.insert_flow.flow; -+ result = 0; -+ -+ /* carry system needs this to work */ -+ info.doing = doing; -+ info.todo = todo; -+ -+ orig_node = flow_insert_point(op)->node; -+ orig_lh = doing->tracked; -+ -+ while (f->length) { -+ result = make_space_for_flow_insertion(op, doing, todo); -+ if (result) -+ break; -+ -+ insert_point = flow_insert_point(op); -+ nplug = node_plugin_by_node(insert_point->node); -+ -+ /* compose item data for insertion/pasting */ -+ flow_insert_data(op)->data = f->data; -+ flow_insert_data(op)->length = what_can_fit_into_node(op); -+ -+ if (can_paste(insert_point, &f->key, flow_insert_data(op))) { -+ /* insert point is set to item of file we are writing to and we have to append to it */ -+ assert("vs-903", insert_point->between == AFTER_UNIT); -+ nplug->change_item_size(insert_point, -+ flow_insert_data(op)->length); -+ flow_insert_data(op)->iplug->b.paste(insert_point, -+ flow_insert_data -+ (op), &info); -+ } else { -+ /* new item must be inserted */ -+ pos_in_node_t new_pos; -+ flow_insert_data(op)->length += item_data_overhead(op); -+ -+ /* FIXME-VS: this is because node40_create_item changes -+ insert_point for obscure reasons */ -+ switch (insert_point->between) { -+ case AFTER_ITEM: -+ new_pos = insert_point->item_pos + 1; -+ break; -+ case EMPTY_NODE: -+ new_pos = 0; -+ break; -+ case BEFORE_ITEM: -+ assert("vs-905", insert_point->item_pos == 0); -+ new_pos = 0; -+ break; -+ default: -+ impossible("vs-906", -+ "carry_insert_flow: invalid coord"); 
-+ new_pos = 0; -+ break; -+ } -+ -+ nplug->create_item(insert_point, &f->key, -+ flow_insert_data(op), &info); -+ coord_set_item_pos(insert_point, new_pos); -+ } -+ coord_init_after_item_end(insert_point); -+ doing->restartable = 0; -+ znode_make_dirty(insert_point->node); -+ -+ move_flow_forward(f, (unsigned)flow_insert_data(op)->length); -+ } -+ -+ if (orig_node != flow_insert_point(op)->node) { -+ /* move lock to new insert point */ -+ done_lh(orig_lh); -+ init_lh(orig_lh); -+ result = -+ longterm_lock_znode(orig_lh, flow_insert_point(op)->node, -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI); -+ } -+ -+ return result; -+} -+ -+/* implements COP_DELETE operation -+ -+ Remove pointer to @op -> u.delete.child from it's parent. -+ -+ This function also handles killing of a tree root is last pointer from it -+ was removed. This is complicated by our handling of "twig" level: root on -+ twig level is never killed. -+ -+*/ -+static int carry_delete(carry_op * op /* operation to be performed */ , -+ carry_level * doing UNUSED_ARG /* current carry -+ * level */ , -+ carry_level * todo /* next carry level */ ) -+{ -+ int result; -+ coord_t coord; -+ coord_t coord2; -+ znode *parent; -+ znode *child; -+ carry_plugin_info info; -+ reiser4_tree *tree; -+ -+ /* -+ * This operation is called to delete internal item pointing to the -+ * child node that was removed by carry from the tree on the previous -+ * tree level. -+ */ -+ -+ assert("nikita-893", op != NULL); -+ assert("nikita-894", todo != NULL); -+ assert("nikita-895", op->op == COP_DELETE); -+ -+ coord_init_zero(&coord); -+ coord_init_zero(&coord2); -+ -+ parent = reiser4_carry_real(op->node); -+ child = op->u.delete.child ? -+ reiser4_carry_real(op->u.delete.child) : op->node->node; -+ tree = znode_get_tree(child); -+ read_lock_tree(tree); -+ -+ /* -+ * @parent was determined when carry entered parent level -+ * (lock_carry_level/lock_carry_node). 
Since then, actual parent of -+ * @child node could change due to other carry operations performed on -+ * the parent level. Check for this. -+ */ -+ -+ if (znode_parent(child) != parent) { -+ /* NOTE-NIKITA add stat counter for this. */ -+ parent = znode_parent(child); -+ assert("nikita-2581", find_carry_node(doing, parent)); -+ } -+ read_unlock_tree(tree); -+ -+ assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL); -+ -+ /* Twig level horrors: tree should be of height at least 2. So, last -+ pointer from the root at twig level is preserved even if child is -+ empty. This is ugly, but so it was architectured. -+ */ -+ -+ if (znode_is_root(parent) && -+ znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT && -+ node_num_items(parent) == 1) { -+ /* Delimiting key manipulations. */ -+ write_lock_dk(tree); -+ znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key())); -+ znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key())); -+ ZF_SET(child, JNODE_DKSET); -+ write_unlock_dk(tree); -+ -+ /* @child escaped imminent death! */ -+ ZF_CLR(child, JNODE_HEARD_BANSHEE); -+ return 0; -+ } -+ -+ /* convert child pointer to the coord_t */ -+ result = find_child_ptr(parent, child, &coord); -+ if (result != NS_FOUND) { -+ warning("nikita-994", "Cannot find child pointer: %i", result); -+ print_coord_content("coord", &coord); -+ return result; -+ } -+ -+ coord_dup(&coord2, &coord); -+ info.doing = doing; -+ info.todo = todo; -+ { -+ /* -+ * Actually kill internal item: prepare structure with -+ * arguments for ->cut_and_kill() method... -+ */ -+ -+ struct carry_kill_data kdata; -+ kdata.params.from = &coord; -+ kdata.params.to = &coord2; -+ kdata.params.from_key = NULL; -+ kdata.params.to_key = NULL; -+ kdata.params.smallest_removed = NULL; -+ kdata.params.truncate = 1; -+ kdata.flags = op->u.delete.flags; -+ kdata.inode = NULL; -+ kdata.left = NULL; -+ kdata.right = NULL; -+ kdata.buf = NULL; -+ /* ... and call it. 
*/ -+ result = node_plugin_by_node(parent)->cut_and_kill(&kdata, -+ &info); -+ } -+ doing->restartable = 0; -+ -+ /* check whether root should be killed violently */ -+ if (znode_is_root(parent) && -+ /* don't kill roots at and lower than twig level */ -+ znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT && -+ node_num_items(parent) == 1) { -+ result = reiser4_kill_tree_root(coord.node); -+ } -+ -+ return result < 0 ? : 0; -+} -+ -+/* implements COP_CUT opration -+ -+ Cuts part or whole content of node. -+ -+*/ -+static int carry_cut(carry_op * op /* operation to be performed */ , -+ carry_level * doing /* current carry level */ , -+ carry_level * todo /* next carry level */ ) -+{ -+ int result; -+ carry_plugin_info info; -+ node_plugin *nplug; -+ -+ assert("nikita-896", op != NULL); -+ assert("nikita-897", todo != NULL); -+ assert("nikita-898", op->op == COP_CUT); -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ nplug = node_plugin_by_node(reiser4_carry_real(op->node)); -+ if (op->u.cut_or_kill.is_cut) -+ result = nplug->cut(op->u.cut_or_kill.u.cut, &info); -+ else -+ result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info); -+ -+ doing->restartable = 0; -+ return result < 0 ? : 0; -+} -+ -+/* helper function for carry_paste(): returns true if @op can be continued as -+ paste */ -+static int -+can_paste(coord_t * icoord, const reiser4_key * key, -+ const reiser4_item_data * data) -+{ -+ coord_t circa; -+ item_plugin *new_iplug; -+ item_plugin *old_iplug; -+ int result = 0; /* to keep gcc shut */ -+ -+ assert("", icoord->between != AT_UNIT); -+ -+ /* obviously, one cannot paste when node is empty---there is nothing -+ to paste into. 
*/ -+ if (node_is_empty(icoord->node)) -+ return 0; -+ /* if insertion point is at the middle of the item, then paste */ -+ if (!coord_is_between_items(icoord)) -+ return 1; -+ coord_dup(&circa, icoord); -+ circa.between = AT_UNIT; -+ -+ old_iplug = item_plugin_by_coord(&circa); -+ new_iplug = data->iplug; -+ -+ /* check whether we can paste to the item @icoord is "at" when we -+ ignore ->between field */ -+ if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) { -+ result = 1; -+ } else if (icoord->between == BEFORE_UNIT -+ || icoord->between == BEFORE_ITEM) { -+ /* otherwise, try to glue to the item at the left, if any */ -+ coord_dup(&circa, icoord); -+ if (coord_set_to_left(&circa)) { -+ result = 0; -+ coord_init_before_item(icoord); -+ } else { -+ old_iplug = item_plugin_by_coord(&circa); -+ result = (old_iplug == new_iplug) -+ && item_can_contain_key(icoord, key, data); -+ if (result) { -+ coord_dup(icoord, &circa); -+ icoord->between = AFTER_UNIT; -+ } -+ } -+ } else if (icoord->between == AFTER_UNIT -+ || icoord->between == AFTER_ITEM) { -+ coord_dup(&circa, icoord); -+ /* otherwise, try to glue to the item at the right, if any */ -+ if (coord_set_to_right(&circa)) { -+ result = 0; -+ coord_init_after_item(icoord); -+ } else { -+ int (*cck) (const coord_t *, const reiser4_key *, -+ const reiser4_item_data *); -+ -+ old_iplug = item_plugin_by_coord(&circa); -+ -+ cck = old_iplug->b.can_contain_key; -+ if (cck == NULL) -+ /* item doesn't define ->can_contain_key -+ method? So it is not expandable. 
*/ -+ result = 0; -+ else { -+ result = (old_iplug == new_iplug) -+ && cck(&circa /*icoord */ , key, data); -+ if (result) { -+ coord_dup(icoord, &circa); -+ icoord->between = BEFORE_UNIT; -+ } -+ } -+ } -+ } else -+ impossible("nikita-2513", "Nothing works"); -+ if (result) { -+ if (icoord->between == BEFORE_ITEM) { -+ assert("vs-912", icoord->unit_pos == 0); -+ icoord->between = BEFORE_UNIT; -+ } else if (icoord->between == AFTER_ITEM) { -+ coord_init_after_item_end(icoord); -+ } -+ } -+ return result; -+} -+ -+/* implements COP_PASTE operation -+ -+ Paste data into existing item. This is complicated by the fact that after -+ we shifted something to the left or right neighbors trying to free some -+ space, item we were supposed to paste into can be in different node than -+ insertion coord. If so, we are no longer doing paste, but insert. See -+ comments in insert_paste_common(). -+ -+*/ -+static int carry_paste(carry_op * op /* operation to be performed */ , -+ carry_level * doing UNUSED_ARG /* current carry -+ * level */ , -+ carry_level * todo /* next carry level */ ) -+{ -+ znode *node; -+ carry_insert_data cdata; -+ coord_t dcoord; -+ reiser4_item_data data; -+ int result; -+ int real_size; -+ item_plugin *iplug; -+ carry_plugin_info info; -+ coord_t *coord; -+ -+ assert("nikita-982", op != NULL); -+ assert("nikita-983", todo != NULL); -+ assert("nikita-984", op->op == COP_PASTE); -+ -+ coord_init_zero(&dcoord); -+ -+ result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data); -+ if (result != 0) -+ return result; -+ -+ coord = op->u.insert.d->coord; -+ -+ /* handle case when op -> u.insert.coord doesn't point to the item -+ of required type. restart as insert. 
*/ -+ if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) { -+ op->op = COP_INSERT; -+ op->u.insert.type = COPT_PASTE_RESTARTED; -+ result = op_dispatch_table[COP_INSERT].handler(op, doing, todo); -+ -+ return result; -+ } -+ -+ node = coord->node; -+ iplug = item_plugin_by_coord(coord); -+ assert("nikita-992", iplug != NULL); -+ -+ assert("nikita-985", node != NULL); -+ assert("nikita-986", node_plugin_by_node(node) != NULL); -+ -+ assert("nikita-987", -+ space_needed_for_op(node, op) <= znode_free_space(node)); -+ -+ assert("nikita-1286", coord_is_existing_item(coord)); -+ -+ /* -+ * if item is expanded as a result of this operation, we should first -+ * change item size, than call ->b.paste item method. If item is -+ * shrunk, it should be done other way around: first call ->b.paste -+ * method, then reduce item size. -+ */ -+ -+ real_size = space_needed_for_op(node, op); -+ if (real_size > 0) -+ node->nplug->change_item_size(coord, real_size); -+ -+ doing->restartable = 0; -+ info.doing = doing; -+ info.todo = todo; -+ -+ result = iplug->b.paste(coord, op->u.insert.d->data, &info); -+ -+ if (real_size < 0) -+ node->nplug->change_item_size(coord, real_size); -+ -+ /* if we pasted at the beginning of the item, update item's key. */ -+ if (coord->unit_pos == 0 && coord->between != AFTER_UNIT) -+ node->nplug->update_item_key(coord, op->u.insert.d->key, &info); -+ -+ znode_make_dirty(node); -+ return result; -+} -+ -+/* handle carry COP_EXTENT operation. 
*/ -+static int carry_extent(carry_op * op /* operation to perform */ , -+ carry_level * doing /* queue of operations @op -+ * is part of */ , -+ carry_level * todo /* queue where new operations -+ * are accumulated */ ) -+{ -+ znode *node; -+ carry_insert_data cdata; -+ coord_t coord; -+ reiser4_item_data data; -+ carry_op *delete_dummy; -+ carry_op *insert_extent; -+ int result; -+ carry_plugin_info info; -+ -+ assert("nikita-1751", op != NULL); -+ assert("nikita-1752", todo != NULL); -+ assert("nikita-1753", op->op == COP_EXTENT); -+ -+ /* extent insertion overview: -+ -+ extents live on the TWIG LEVEL, which is level one above the leaf -+ one. This complicates extent insertion logic somewhat: it may -+ happen (and going to happen all the time) that in logical key -+ ordering extent has to be placed between items I1 and I2, located -+ at the leaf level, but I1 and I2 are in the same formatted leaf -+ node N1. To insert extent one has to -+ -+ (1) reach node N1 and shift data between N1, its neighbors and -+ possibly newly allocated nodes until I1 and I2 fall into different -+ nodes. Since I1 and I2 are still neighboring items in logical key -+ order, they will be necessary utmost items in their respective -+ nodes. -+ -+ (2) After this new extent item is inserted into node on the twig -+ level. -+ -+ Fortunately this process can reuse almost all code from standard -+ insertion procedure (viz. make_space() and insert_paste_common()), -+ due to the following observation: make_space() only shifts data up -+ to and excluding or including insertion point. It never -+ "over-moves" through insertion point. Thus, one can use -+ make_space() to perform step (1). All required for this is just to -+ instruct free_space_shortage() to keep make_space() shifting data -+ until insertion point is at the node border. -+ -+ */ -+ -+ /* perform common functionality of insert and paste. 
*/ -+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data); -+ if (result != 0) -+ return result; -+ -+ node = op->u.extent.d->coord->node; -+ assert("nikita-1754", node != NULL); -+ assert("nikita-1755", node_plugin_by_node(node) != NULL); -+ assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE); -+ -+ /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that -+ extent fits between items. */ -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ /* there is another complication due to placement of extents on the -+ twig level: extents are "rigid" in the sense that key-range -+ occupied by extent cannot grow indefinitely to the right as it is -+ for the formatted leaf nodes. Because of this when search finds two -+ adjacent extents on the twig level, it has to "drill" to the leaf -+ level, creating new node. Here we are removing this node. -+ */ -+ if (node_is_empty(node)) { -+ delete_dummy = node_post_carry(&info, COP_DELETE, node, 1); -+ if (IS_ERR(delete_dummy)) -+ return PTR_ERR(delete_dummy); -+ delete_dummy->u.delete.child = NULL; -+ delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY; -+ ZF_SET(node, JNODE_HEARD_BANSHEE); -+ } -+ -+ /* proceed with inserting extent item into parent. We are definitely -+ inserting rather than pasting if we get that far. */ -+ insert_extent = node_post_carry(&info, COP_INSERT, node, 1); -+ if (IS_ERR(insert_extent)) -+ /* @delete_dummy will be automatically destroyed on the level -+ exiting */ -+ return PTR_ERR(insert_extent); -+ /* NOTE-NIKITA insertion by key is simplest option here. Another -+ possibility is to insert on the left or right of already existing -+ item. 
-+ */ -+ insert_extent->u.insert.type = COPT_KEY; -+ insert_extent->u.insert.d = op->u.extent.d; -+ assert("nikita-1719", op->u.extent.d->key != NULL); -+ insert_extent->u.insert.d->data->arg = op->u.extent.d->coord; -+ insert_extent->u.insert.flags = -+ znode_get_tree(node)->carry.new_extent_flags; -+ -+ /* -+ * if carry was asked to track lock handle we should actually track -+ * lock handle on the twig node rather than on the leaf where -+ * operation was started from. Transfer tracked lock handle. -+ */ -+ if (doing->track_type) { -+ assert("nikita-3242", doing->tracked != NULL); -+ assert("nikita-3244", todo->tracked == NULL); -+ todo->tracked = doing->tracked; -+ todo->track_type = CARRY_TRACK_NODE; -+ doing->tracked = NULL; -+ doing->track_type = 0; -+ } -+ -+ return 0; -+} -+ -+/* update key in @parent between pointers to @left and @right. -+ -+ Find coords of @left and @right and update delimiting key between them. -+ This is helper function called by carry_update(). Finds position of -+ internal item involved. Updates item key. Updates delimiting keys of child -+ nodes involved. 
-+*/ -+static int update_delimiting_key(znode * parent /* node key is updated -+ * in */ , -+ znode * left /* child of @parent */ , -+ znode * right /* child of @parent */ , -+ carry_level * doing /* current carry -+ * level */ , -+ carry_level * todo /* parent carry -+ * level */ , -+ const char **error_msg /* place to -+ * store error -+ * message */ ) -+{ -+ coord_t left_pos; -+ coord_t right_pos; -+ int result; -+ reiser4_key ldkey; -+ carry_plugin_info info; -+ -+ assert("nikita-1177", right != NULL); -+ /* find position of right left child in a parent */ -+ result = find_child_ptr(parent, right, &right_pos); -+ if (result != NS_FOUND) { -+ *error_msg = "Cannot find position of right child"; -+ return result; -+ } -+ -+ if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) { -+ /* find position of the left child in a parent */ -+ result = find_child_ptr(parent, left, &left_pos); -+ if (result != NS_FOUND) { -+ *error_msg = "Cannot find position of left child"; -+ return result; -+ } -+ assert("nikita-1355", left_pos.node != NULL); -+ } else -+ left_pos.node = NULL; -+ -+ /* check that they are separated by exactly one key and are basically -+ sane */ -+ if (REISER4_DEBUG) { -+ if ((left_pos.node != NULL) -+ && !coord_is_existing_unit(&left_pos)) { -+ *error_msg = "Left child is bastard"; -+ return RETERR(-EIO); -+ } -+ if (!coord_is_existing_unit(&right_pos)) { -+ *error_msg = "Right child is bastard"; -+ return RETERR(-EIO); -+ } -+ if (left_pos.node != NULL && -+ !coord_are_neighbors(&left_pos, &right_pos)) { -+ *error_msg = "Children are not direct siblings"; -+ return RETERR(-EIO); -+ } -+ } -+ *error_msg = NULL; -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ /* -+ * If child node is not empty, new key of internal item is a key of -+ * leftmost item in the child node. If the child is empty, take its -+ * right delimiting key as a new key of the internal item. 
Precise key -+ * in the latter case is not important per se, because the child (and -+ * the internal item) are going to be killed shortly anyway, but we -+ * have to preserve correct order of keys in the parent node. -+ */ -+ -+ if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE)) -+ leftmost_key_in_node(right, &ldkey); -+ else { -+ read_lock_dk(znode_get_tree(parent)); -+ ldkey = *znode_get_rd_key(right); -+ read_unlock_dk(znode_get_tree(parent)); -+ } -+ node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info); -+ doing->restartable = 0; -+ znode_make_dirty(parent); -+ return 0; -+} -+ -+/* implements COP_UPDATE opration -+ -+ Update delimiting keys. -+ -+*/ -+static int carry_update(carry_op * op /* operation to be performed */ , -+ carry_level * doing /* current carry level */ , -+ carry_level * todo /* next carry level */ ) -+{ -+ int result; -+ carry_node *missing UNUSED_ARG; -+ znode *left; -+ znode *right; -+ carry_node *lchild; -+ carry_node *rchild; -+ const char *error_msg; -+ reiser4_tree *tree; -+ -+ /* -+ * This operation is called to update key of internal item. This is -+ * necessary when carry shifted of cut data on the child -+ * level. Arguments of this operation are: -+ * -+ * @right --- child node. Operation should update key of internal -+ * item pointing to @right. -+ * -+ * @left --- left neighbor of @right. This parameter is optional. -+ */ -+ -+ assert("nikita-902", op != NULL); -+ assert("nikita-903", todo != NULL); -+ assert("nikita-904", op->op == COP_UPDATE); -+ -+ lchild = op->u.update.left; -+ rchild = op->node; -+ -+ if (lchild != NULL) { -+ assert("nikita-1001", lchild->parent); -+ assert("nikita-1003", !lchild->left); -+ left = reiser4_carry_real(lchild); -+ } else -+ left = NULL; -+ -+ tree = znode_get_tree(rchild->node); -+ read_lock_tree(tree); -+ right = znode_parent(rchild->node); -+ read_unlock_tree(tree); -+ -+ if (right != NULL) { -+ result = update_delimiting_key(right, -+ lchild ? 
lchild->node : NULL, -+ rchild->node, -+ doing, todo, &error_msg); -+ } else { -+ error_msg = "Cannot find node to update key in"; -+ result = RETERR(-EIO); -+ } -+ /* operation will be reposted to the next level by the -+ ->update_item_key() method of node plugin, if necessary. */ -+ -+ if (result != 0) { -+ warning("nikita-999", "Error updating delimiting key: %s (%i)", -+ error_msg ? : "", result); -+ } -+ return result; -+} -+ -+/* move items from @node during carry */ -+static int carry_shift_data(sideof side /* in what direction to move data */ , -+ coord_t * insert_coord /* coord where new item -+ * is to be inserted */ , -+ znode * node /* node which data are moved from */ , -+ carry_level * doing /* active carry queue */ , -+ carry_level * todo /* carry queue where new -+ * operations are to be put -+ * in */ , -+ unsigned int including_insert_coord_p /* true if -+ * @insertion_coord -+ * can be moved */ ) -+{ -+ int result; -+ znode *source; -+ carry_plugin_info info; -+ node_plugin *nplug; -+ -+ source = insert_coord->node; -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ nplug = node_plugin_by_node(node); -+ result = nplug->shift(insert_coord, node, -+ (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0, -+ (int)including_insert_coord_p, &info); -+ /* the only error ->shift() method of node plugin can return is -+ -ENOMEM due to carry node/operation allocation. */ -+ assert("nikita-915", result >= 0 || result == -ENOMEM); -+ if (result > 0) { -+ /* -+ * if some number of bytes was actually shifted, mark nodes -+ * dirty, and carry level as non-restartable. 
-+ */ -+ doing->restartable = 0; -+ znode_make_dirty(source); -+ znode_make_dirty(node); -+ } -+ -+ assert("nikita-2077", coord_check(insert_coord)); -+ return 0; -+} -+ -+typedef carry_node *(*carry_iterator) (carry_node * node); -+static carry_node *find_dir_carry(carry_node * node, carry_level * level, -+ carry_iterator iterator); -+ -+static carry_node *pool_level_list_prev(carry_node *node) -+{ -+ return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage); -+} -+ -+/* look for the left neighbor of given carry node in a carry queue. -+ -+ This is used by find_left_neighbor(), but I am not sure that this -+ really gives any advantage. More statistics required. -+ -+*/ -+carry_node *find_left_carry(carry_node * node /* node to find left neighbor -+ * of */ , -+ carry_level * level /* level to scan */ ) -+{ -+ return find_dir_carry(node, level, -+ (carry_iterator) pool_level_list_prev); -+} -+ -+static carry_node *pool_level_list_next(carry_node *node) -+{ -+ return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); -+} -+ -+/* look for the right neighbor of given carry node in a -+ carry queue. -+ -+ This is used by find_right_neighbor(), but I am not sure that this -+ really gives any advantage. More statistics required. -+ -+*/ -+carry_node *find_right_carry(carry_node * node /* node to find right neighbor -+ * of */ , -+ carry_level * level /* level to scan */ ) -+{ -+ return find_dir_carry(node, level, -+ (carry_iterator) pool_level_list_next); -+} -+ -+/* look for the left or right neighbor of given carry node in a carry -+ queue. -+ -+ Helper function used by find_{left|right}_carry(). 
-+*/ -+static carry_node *find_dir_carry(carry_node * node /* node to start scanning -+ * from */ , -+ carry_level * level /* level to scan */ , -+ carry_iterator iterator /* operation to -+ * move to the next -+ * node */ ) -+{ -+ carry_node *neighbor; -+ -+ assert("nikita-1059", node != NULL); -+ assert("nikita-1060", level != NULL); -+ -+ /* scan list of carry nodes on this list dir-ward, skipping all -+ carry nodes referencing the same znode. */ -+ neighbor = node; -+ while (1) { -+ neighbor = iterator(neighbor); -+ if (carry_node_end(level, neighbor)) -+ /* list head is reached */ -+ return NULL; -+ if (reiser4_carry_real(neighbor) != reiser4_carry_real(node)) -+ return neighbor; -+ } -+} -+ -+/* -+ * Memory reservation estimation. -+ * -+ * Carry process proceeds through tree levels upwards. Carry assumes that it -+ * takes tree in consistent state (e.g., that search tree invariants hold), -+ * and leaves tree consistent after it finishes. This means that when some -+ * error occurs carry cannot simply return if there are pending carry -+ * operations. Generic solution for this problem is carry-undo either as -+ * transaction manager feature (requiring checkpoints and isolation), or -+ * through some carry specific mechanism. -+ * -+ * Our current approach is to panic if carry hits an error while tree is -+ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around -+ * this "memory reservation" mechanism was added. -+ * -+ * Memory reservation is implemented by perthread-pages.diff patch from -+ * core-patches. Its API is defined in -+ * -+ * int perthread_pages_reserve(int nrpages, gfp_t gfp); -+ * void perthread_pages_release(int nrpages); -+ * int perthread_pages_count(void); -+ * -+ * carry estimates its worst case memory requirements at the entry, reserved -+ * enough memory, and released unused pages before returning. -+ * -+ * Code below estimates worst case memory requirements for a given carry -+ * queue. 
This is dome by summing worst case memory requirements for each -+ * operation in the queue. -+ * -+ */ -+ -+/* -+ * Memory memory requirements of many operations depends on the tree -+ * height. For example, item insertion requires new node to be inserted at -+ * each tree level in the worst case. What tree height should be used for -+ * estimation? Current tree height is wrong, because tree height can change -+ * between the time when estimation was done and the time when operation is -+ * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT) -+ * is also not desirable, because it would lead to the huge over-estimation -+ * all the time. Plausible solution is "capped tree height": if current tree -+ * height is less than some TREE_HEIGHT_CAP constant, capped tree height is -+ * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is -+ * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely -+ * to be increased even more during short interval of time. -+ */ -+#define TREE_HEIGHT_CAP (5) -+ -+/* return capped tree height for the @tree. See comment above. */ -+static int cap_tree_height(reiser4_tree * tree) -+{ -+ return max_t(int, tree->height, TREE_HEIGHT_CAP); -+} -+ -+/* return capped tree height for the current tree. */ -+static int capped_height(void) -+{ -+ return cap_tree_height(current_tree); -+} -+ -+/* return number of pages required to store given number of bytes */ -+static int bytes_to_pages(int bytes) -+{ -+ return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -+} -+ -+/* how many pages are required to allocate znodes during item insertion. */ -+static int carry_estimate_znodes(void) -+{ -+ /* -+ * Note, that there we have some problem here: there is no way to -+ * reserve pages specifically for the given slab. This means that -+ * these pages can be hijacked for some other end. 
-+ */ -+ -+ /* in the worst case we need 3 new znode on each tree level */ -+ return bytes_to_pages(capped_height() * sizeof(znode) * 3); -+} -+ -+/* -+ * how many pages are required to load bitmaps. One bitmap per level. -+ */ -+static int carry_estimate_bitmaps(void) -+{ -+ if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) { -+ int bytes; -+ -+ bytes = capped_height() * (0 + /* bnode should be added, but its is private to -+ * bitmap.c, skip for now. */ -+ 2 * sizeof(jnode)); /* working and commit jnodes */ -+ return bytes_to_pages(bytes) + 2; /* and their contents */ -+ } else -+ /* bitmaps were pre-loaded during mount */ -+ return 0; -+} -+ -+/* worst case item insertion memory requirements */ -+static int carry_estimate_insert(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */ -+ capped_height() + /* new block on each level */ -+ 1 + /* and possibly extra new block at the leaf level */ -+ 3; /* loading of leaves into memory */ -+} -+ -+/* worst case item deletion memory requirements */ -+static int carry_estimate_delete(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */ -+ 3; /* loading of leaves into memory */ -+} -+ -+/* worst case tree cut memory requirements */ -+static int carry_estimate_cut(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */ -+ 3; /* loading of leaves into memory */ -+} -+ -+/* worst case memory requirements of pasting into item */ -+static int carry_estimate_paste(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */ -+ capped_height() + /* new block on each level */ -+ 1 + /* and possibly extra new block at the leaf level */ -+ 3; /* loading of leaves into memory */ -+} -+ -+/* worst case memory requirements of extent insertion */ -+static 
int carry_estimate_extent(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_insert(op, level) + /* insert extent */ -+ carry_estimate_delete(op, level); /* kill leaf */ -+} -+ -+/* worst case memory requirements of key update */ -+static int carry_estimate_update(carry_op * op, carry_level * level) -+{ -+ return 0; -+} -+ -+/* worst case memory requirements of flow insertion */ -+static int carry_estimate_insert_flow(carry_op * op, carry_level * level) -+{ -+ int newnodes; -+ -+ newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length), -+ CARRY_FLOW_NEW_NODES_LIMIT); -+ /* -+ * roughly estimate insert_flow as a sequence of insertions. -+ */ -+ return newnodes * carry_estimate_insert(op, level); -+} -+ -+/* This is dispatch table for carry operations. It can be trivially -+ abstracted into useful plugin: tunable balancing policy is a good -+ thing. */ -+carry_op_handler op_dispatch_table[COP_LAST_OP] = { -+ [COP_INSERT] = { -+ .handler = carry_insert, -+ .estimate = carry_estimate_insert} -+ , -+ [COP_DELETE] = { -+ .handler = carry_delete, -+ .estimate = carry_estimate_delete} -+ , -+ [COP_CUT] = { -+ .handler = carry_cut, -+ .estimate = carry_estimate_cut} -+ , -+ [COP_PASTE] = { -+ .handler = carry_paste, -+ .estimate = carry_estimate_paste} -+ , -+ [COP_EXTENT] = { -+ .handler = carry_extent, -+ .estimate = carry_estimate_extent} -+ , -+ [COP_UPDATE] = { -+ .handler = carry_update, -+ .estimate = carry_estimate_update} -+ , -+ [COP_INSERT_FLOW] = { -+ .handler = carry_insert_flow, -+ .estimate = carry_estimate_insert_flow} -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/carry_ops.h linux-2.6.20/fs/reiser4/carry_ops.h ---- linux-2.6.20.orig/fs/reiser4/carry_ops.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/carry_ops.h 2007-05-06 14:50:43.694974475 +0400 -@@ -0,0 +1,42 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* implementation of carry operations. See carry_ops.c for details. */ -+ -+#if !defined( __CARRY_OPS_H__ ) -+#define __CARRY_OPS_H__ -+ -+#include "forward.h" -+#include "znode.h" -+#include "carry.h" -+ -+/* carry operation handlers */ -+typedef struct carry_op_handler { -+ /* perform operation */ -+ int (*handler) (carry_op * op, carry_level * doing, carry_level * todo); -+ /* estimate memory requirements for @op */ -+ int (*estimate) (carry_op * op, carry_level * level); -+} carry_op_handler; -+ -+/* This is dispatch table for carry operations. It can be trivially -+ abstracted into useful plugin: tunable balancing policy is a good -+ thing. */ -+extern carry_op_handler op_dispatch_table[COP_LAST_OP]; -+ -+unsigned int space_needed(const znode * node, const coord_t * coord, -+ const reiser4_item_data * data, int inserting); -+extern carry_node *find_left_carry(carry_node * node, carry_level * level); -+extern carry_node *find_right_carry(carry_node * node, carry_level * level); -+ -+/* __CARRY_OPS_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/context.c linux-2.6.20/fs/reiser4/context.c ---- linux-2.6.20.orig/fs/reiser4/context.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/context.c 2007-05-06 14:50:43.694974475 +0400 -@@ -0,0 +1,288 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Manipulation of reiser4_context */ -+ -+/* -+ * global context used during system call. Variable of this type is allocated -+ * on the stack at the beginning of the reiser4 part of the system call and -+ * pointer to it is stored in the current->fs_context. This allows us to avoid -+ * passing pointer to current transaction and current lockstack (both in -+ * one-to-one mapping with threads) all over the call chain. -+ * -+ * It's kind of like those global variables the prof used to tell you not to -+ * use in CS1, except thread specific.;-) Nikita, this was a good idea. -+ * -+ * In some situations it is desirable to have ability to enter reiser4_context -+ * more than once for the same thread (nested contexts). For example, there -+ * are some functions that can be called either directly from VFS/VM or from -+ * already active reiser4 context (->writepage, for example). -+ * -+ * In such situations "child" context acts like dummy: all activity is -+ * actually performed in the top level context, and get_current_context() -+ * always returns top level context. -+ * Of course, reiser4_init_context()/reiser4_done_context() have to be properly -+ * nested any way. -+ * -+ * Note that there is an important difference between reiser4 uses -+ * ->fs_context and the way other file systems use it. Other file systems -+ * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_ -+ * (this is why ->fs_context was initially called ->journal_info). 
This means, -+ * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry -+ * to the file system, they assume that some transaction is already underway, -+ * and usually bail out, because starting nested transaction would most likely -+ * lead to the deadlock. This gives false positives with reiser4, because we -+ * set ->fs_context before starting transaction. -+ */ -+ -+#include "debug.h" -+#include "super.h" -+#include "context.h" -+ -+#include /* balance_dirty_pages() */ -+#include -+ -+static void _reiser4_init_context(reiser4_context * context, -+ struct super_block *super) -+{ -+ memset(context, 0, sizeof(*context)); -+ -+ context->super = super; -+ context->magic = context_magic; -+ context->outer = current->journal_info; -+ current->journal_info = (void *)context; -+ context->nr_children = 0; -+ context->gfp_mask = GFP_KERNEL; -+ -+ init_lock_stack(&context->stack); -+ -+ reiser4_txn_begin(context); -+ -+ /* initialize head of tap list */ -+ INIT_LIST_HEAD(&context->taps); -+#if REISER4_DEBUG -+ context->task = current; -+#endif -+ grab_space_enable(); -+} -+ -+/* initialize context and bind it to the current thread -+ -+ This function should be called at the beginning of reiser4 part of -+ syscall. 
-+*/ -+reiser4_context * reiser4_init_context(struct super_block * super) -+{ -+ reiser4_context *context; -+ -+ assert("nikita-2662", !in_interrupt() && !in_irq()); -+ assert("nikita-3357", super != NULL); -+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super)); -+ -+ context = get_current_context_check(); -+ if (context && context->super == super) { -+ context = (reiser4_context *) current->journal_info; -+ context->nr_children++; -+ return context; -+ } -+ -+ context = kmalloc(sizeof(*context), GFP_KERNEL); -+ if (context == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ _reiser4_init_context(context, super); -+ return context; -+} -+ -+/* this is used in scan_mgr which is called with spinlock held and in -+ reiser4_fill_super magic */ -+void init_stack_context(reiser4_context *context, struct super_block *super) -+{ -+ assert("nikita-2662", !in_interrupt() && !in_irq()); -+ assert("nikita-3357", super != NULL); -+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super)); -+ assert("vs-12", !is_in_reiser4_context()); -+ -+ _reiser4_init_context(context, super); -+ context->on_stack = 1; -+ return; -+} -+ -+/* cast lock stack embedded into reiser4 context up to its container */ -+reiser4_context *get_context_by_lock_stack(lock_stack * owner) -+{ -+ return container_of(owner, reiser4_context, stack); -+} -+ -+/* true if there is already _any_ reiser4 context for the current thread */ -+int is_in_reiser4_context(void) -+{ -+ reiser4_context *ctx; -+ -+ ctx = current->journal_info; -+ return ctx != NULL && ((unsigned long)ctx->magic) == context_magic; -+} -+ -+/* -+ * call balance dirty pages for the current context. -+ * -+ * File system is expected to call balance_dirty_pages_ratelimited() whenever -+ * it dirties a page. 
reiser4 does this for unformatted nodes (that is, during -+ * write---this covers vast majority of all dirty traffic), but we cannot do -+ * this immediately when formatted node is dirtied, because long term lock is -+ * usually held at that time. To work around this, dirtying of formatted node -+ * simply increases ->nr_marked_dirty counter in the current reiser4 -+ * context. When we are about to leave this context, -+ * balance_dirty_pages_ratelimited() is called, if necessary. -+ * -+ * This introduces another problem: sometimes we do not want to run -+ * balance_dirty_pages_ratelimited() when leaving a context, for example -+ * because some important lock (like ->i_mutex on the parent directory) is -+ * held. To achieve this, ->nobalance flag can be set in the current context. -+ */ -+static void balance_dirty_pages_at(reiser4_context *context) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(context->super); -+ -+ /* -+ * call balance_dirty_pages_ratelimited() to process formatted nodes -+ * dirtied during this system call. Do that only if we are not in mount -+ * and there were nodes dirtied in this context and we are not in -+ * writepage (to avoid deadlock) and not in pdflush -+ */ -+ if (sbinfo != NULL && sbinfo->fake != NULL && -+ context->nr_marked_dirty != 0 && -+ !(current->flags & PF_MEMALLOC) && -+ !current_is_pdflush()) -+ balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping); -+} -+ -+/* release resources associated with context. -+ -+ This function should be called at the end of "session" with reiser4, -+ typically just before leaving reiser4 driver back to VFS. -+ -+ This is good place to put some degugging consistency checks, like that -+ thread released all locks and closed transcrash etc. 
-+ -+*/ -+static void reiser4_done_context(reiser4_context * context /* context being released */ ) -+{ -+ assert("nikita-860", context != NULL); -+ assert("nikita-859", context->magic == context_magic); -+ assert("vs-646", (reiser4_context *) current->journal_info == context); -+ assert("zam-686", !in_interrupt() && !in_irq()); -+ -+ /* only do anything when leaving top-level reiser4 context. All nested -+ * contexts are just dummies. */ -+ if (context->nr_children == 0) { -+ assert("jmacd-673", context->trans == NULL); -+ assert("jmacd-1002", lock_stack_isclean(&context->stack)); -+ assert("nikita-1936", reiser4_no_counters_are_held()); -+ assert("nikita-2626", list_empty_careful(reiser4_taps_list())); -+ assert("zam-1004", ergo(get_super_private(context->super), -+ get_super_private(context->super)->delete_mutex_owner != -+ current)); -+ -+ /* release all grabbed but as yet unused blocks */ -+ if (context->grabbed_blocks != 0) -+ all_grabbed2free(); -+ -+ /* -+ * synchronize against longterm_unlock_znode(): -+ * wake_up_requestor() wakes up requestors without holding -+ * zlock (otherwise they will immediately bump into that lock -+ * after wake up on another CPU). To work around (rare) -+ * situation where requestor has been woken up asynchronously -+ * and managed to run until completion (and destroy its -+ * context and lock stack) before wake_up_requestor() called -+ * wake_up() on it, wake_up_requestor() synchronize on lock -+ * stack spin lock. It has actually been observed that spin -+ * lock _was_ locked at this point, because -+ * wake_up_requestor() took interrupt. 
-+ */ -+ spin_lock_stack(&context->stack); -+ spin_unlock_stack(&context->stack); -+ -+ assert("zam-684", context->nr_children == 0); -+ /* restore original ->fs_context value */ -+ current->journal_info = context->outer; -+ if (context->on_stack == 0) -+ kfree(context); -+ } else { -+ context->nr_children--; -+#if REISER4_DEBUG -+ assert("zam-685", context->nr_children >= 0); -+#endif -+ } -+} -+ -+/* -+ * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close -+ * transaction. Call done_context() to do context related book-keeping. -+ */ -+void reiser4_exit_context(reiser4_context * context) -+{ -+ assert("nikita-3021", reiser4_schedulable()); -+ -+ if (context->nr_children == 0) { -+ if (!context->nobalance) { -+ reiser4_txn_restart(context); -+ balance_dirty_pages_at(context); -+ } -+ -+ /* if filesystem is mounted with -o sync or -o dirsync - commit -+ transaction. FIXME: TXNH_DONT_COMMIT is used to avoid -+ commiting on exit_context when inode semaphore is held and -+ to have ktxnmgrd to do commit instead to get better -+ concurrent filesystem accesses. But, when one mounts with -o -+ sync, he cares more about reliability than about -+ performance. So, for now we have this simple mount -o sync -+ support. 
*/ -+ if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) { -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked_nocheck(); -+ if (atom) { -+ atom->flags |= ATOM_FORCE_COMMIT; -+ context->trans->flags &= ~TXNH_DONT_COMMIT; -+ spin_unlock_atom(atom); -+ } -+ } -+ reiser4_txn_end(context); -+ } -+ reiser4_done_context(context); -+} -+ -+void reiser4_ctx_gfp_mask_set(void) -+{ -+ reiser4_context *ctx; -+ -+ ctx = get_current_context(); -+ if (ctx->entd == 0 && -+ list_empty(&ctx->stack.locks) && -+ ctx->trans->atom == NULL) -+ ctx->gfp_mask = GFP_KERNEL; -+ else -+ ctx->gfp_mask = GFP_NOFS; -+} -+ -+void reiser4_ctx_gfp_mask_force (gfp_t mask) -+{ -+ reiser4_context *ctx; -+ ctx = get_current_context(); -+ -+ assert("edward-1454", ctx != NULL); -+ -+ ctx->gfp_mask = mask; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/context.h linux-2.6.20/fs/reiser4/context.h ---- linux-2.6.20.orig/fs/reiser4/context.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/context.h 2007-05-06 14:50:43.698975725 +0400 -@@ -0,0 +1,228 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Reiser4 context. See context.c for details. */ -+ -+#if !defined( __REISER4_CONTEXT_H__ ) -+#define __REISER4_CONTEXT_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "tap.h" -+#include "lock.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block */ -+#include -+#include /* for struct task_struct */ -+ -+/* reiser4 per-thread context */ -+struct reiser4_context { -+ /* magic constant. For identification of reiser4 contexts. */ -+ __u32 magic; -+ -+ /* current lock stack. See lock.[ch]. This is where list of all -+ locks taken by current thread is kept. This is also used in -+ deadlock detection. 
*/ -+ lock_stack stack; -+ -+ /* current transcrash. */ -+ txn_handle *trans; -+ /* transaction handle embedded into reiser4_context. ->trans points -+ * here by default. */ -+ txn_handle trans_in_ctx; -+ -+ /* super block we are working with. To get the current tree -+ use &get_super_private (reiser4_get_current_sb ())->tree. */ -+ struct super_block *super; -+ -+ /* parent fs activation */ -+ struct fs_activation *outer; -+ -+ /* per-thread grabbed (for further allocation) blocks counter */ -+ reiser4_block_nr grabbed_blocks; -+ -+ /* list of taps currently monitored. See tap.c */ -+ struct list_head taps; -+ -+ /* grabbing space is enabled */ -+ unsigned int grab_enabled:1; -+ /* should be set when we are write dirty nodes to disk in jnode_flush or -+ * reiser4_write_logs() */ -+ unsigned int writeout_mode:1; -+ /* true, if current thread is an ent thread */ -+ unsigned int entd:1; -+ /* true, if balance_dirty_pages() should not be run when leaving this -+ * context. This is used to avoid lengthly balance_dirty_pages() -+ * operation when holding some important resource, like directory -+ * ->i_mutex */ -+ unsigned int nobalance:1; -+ -+ /* this bit is used on reiser4_done_context to decide whether context is -+ kmalloc-ed and has to be kfree-ed */ -+ unsigned int on_stack:1; -+ -+ /* count non-trivial jnode_set_dirty() calls */ -+ unsigned long nr_marked_dirty; -+ -+ /* reiser4_sync_inodes calls (via generic_sync_sb_inodes) -+ * reiser4_writepages for each of dirty inodes. Reiser4_writepages -+ * captures pages. 
When number of pages captured in one -+ * reiser4_sync_inodes reaches some threshold - some atoms get -+ * flushed */ -+ int nr_captured; -+ int nr_children; /* number of child contexts */ -+#if REISER4_DEBUG -+ /* debugging information about reiser4 locks held by the current -+ * thread */ -+ reiser4_lock_counters_info locks; -+ struct task_struct *task; /* so we can easily find owner of the stack */ -+ -+ /* -+ * disk space grabbing debugging support -+ */ -+ /* how many disk blocks were grabbed by the first call to -+ * reiser4_grab_space() in this context */ -+ reiser4_block_nr grabbed_initially; -+ -+ /* list of all threads doing flush currently */ -+ struct list_head flushers_link; -+ /* information about last error encountered by reiser4 */ -+ err_site err; -+#endif -+ void *vp; -+ gfp_t gfp_mask; -+}; -+ -+extern reiser4_context *get_context_by_lock_stack(lock_stack *); -+ -+/* Debugging helps. */ -+#if REISER4_DEBUG -+extern void print_contexts(void); -+#endif -+ -+#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree)) -+#define current_blocksize reiser4_get_current_sb()->s_blocksize -+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits -+ -+extern reiser4_context *reiser4_init_context(struct super_block *); -+extern void init_stack_context(reiser4_context *, struct super_block *); -+extern void reiser4_exit_context(reiser4_context *); -+ -+/* magic constant we store in reiser4_context allocated at the stack. Used to -+ catch accesses to staled or uninitialized contexts. 
*/ -+#define context_magic ((__u32) 0x4b1b5d0b) -+ -+extern int is_in_reiser4_context(void); -+ -+/* -+ * return reiser4_context for the thread @tsk -+ */ -+static inline reiser4_context *get_context(const struct task_struct *tsk) -+{ -+ assert("vs-1682", -+ ((reiser4_context *) tsk->journal_info)->magic == context_magic); -+ return (reiser4_context *) tsk->journal_info; -+} -+ -+/* -+ * return reiser4 context of the current thread, or NULL if there is none. -+ */ -+static inline reiser4_context *get_current_context_check(void) -+{ -+ if (is_in_reiser4_context()) -+ return get_context(current); -+ else -+ return NULL; -+} -+ -+static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */ -+ -+/* return context associated with current thread */ -+static inline reiser4_context *get_current_context(void) -+{ -+ return get_context(current); -+} -+ -+static inline gfp_t reiser4_ctx_gfp_mask_get(void) -+{ -+ reiser4_context *ctx; -+ -+ ctx = get_current_context_check(); -+ return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask; -+} -+ -+void reiser4_ctx_gfp_mask_set(void); -+void reiser4_ctx_gfp_mask_force (gfp_t mask); -+ -+/* -+ * true if current thread is in the write-out mode. Thread enters write-out -+ * mode during jnode_flush and reiser4_write_logs(). 
-+ */ -+static inline int is_writeout_mode(void) -+{ -+ return get_current_context()->writeout_mode; -+} -+ -+/* -+ * enter write-out mode -+ */ -+static inline void writeout_mode_enable(void) -+{ -+ assert("zam-941", !get_current_context()->writeout_mode); -+ get_current_context()->writeout_mode = 1; -+} -+ -+/* -+ * leave write-out mode -+ */ -+static inline void writeout_mode_disable(void) -+{ -+ assert("zam-942", get_current_context()->writeout_mode); -+ get_current_context()->writeout_mode = 0; -+} -+ -+static inline void grab_space_enable(void) -+{ -+ get_current_context()->grab_enabled = 1; -+} -+ -+static inline void grab_space_disable(void) -+{ -+ get_current_context()->grab_enabled = 0; -+} -+ -+static inline void grab_space_set_enabled(int enabled) -+{ -+ get_current_context()->grab_enabled = enabled; -+} -+ -+static inline int is_grab_enabled(reiser4_context * ctx) -+{ -+ return ctx->grab_enabled; -+} -+ -+/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or -+ * flush would be performed when it is closed. This is necessary when handle -+ * has to be closed under some coarse semaphore, like i_mutex of -+ * directory. Commit will be performed by ktxnmgrd. */ -+static inline void context_set_commit_async(reiser4_context * context) -+{ -+ context->nobalance = 1; -+ context->trans->flags |= TXNH_DONT_COMMIT; -+} -+ -+/* __REISER4_CONTEXT_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/coord.c linux-2.6.20/fs/reiser4/coord.c ---- linux-2.6.20.orig/fs/reiser4/coord.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/coord.c 2007-05-06 14:50:43.698975725 +0400 -@@ -0,0 +1,935 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "tree.h" -+#include "plugin/item/item.h" -+#include "znode.h" -+#include "coord.h" -+ -+/* Internal constructor. */ -+static inline void -+coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos, -+ pos_in_node_t unit_pos, between_enum between) -+{ -+ coord->node = (znode *) node; -+ coord_set_item_pos(coord, item_pos); -+ coord->unit_pos = unit_pos; -+ coord->between = between; -+ ON_DEBUG(coord->plug_v = 0); -+ ON_DEBUG(coord->body_v = 0); -+ -+ /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */ -+} -+ -+/* after shifting of node content, coord previously set properly may become -+ invalid, try to "normalize" it. 
*/ -+void coord_normalize(coord_t * coord) -+{ -+ znode *node; -+ -+ node = coord->node; -+ assert("vs-683", node); -+ -+ coord_clear_iplug(coord); -+ -+ if (node_is_empty(node)) { -+ coord_init_first_unit(coord, node); -+ } else if ((coord->between == AFTER_ITEM) -+ || (coord->between == AFTER_UNIT)) { -+ return; -+ } else if (coord->item_pos == coord_num_items(coord) -+ && coord->between == BEFORE_ITEM) { -+ coord_dec_item_pos(coord); -+ coord->between = AFTER_ITEM; -+ } else if (coord->unit_pos == coord_num_units(coord) -+ && coord->between == BEFORE_UNIT) { -+ coord->unit_pos--; -+ coord->between = AFTER_UNIT; -+ } else if (coord->item_pos == coord_num_items(coord) -+ && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) { -+ coord_dec_item_pos(coord); -+ coord->unit_pos = 0; -+ coord->between = AFTER_ITEM; -+ } -+} -+ -+/* Copy a coordinate. */ -+void coord_dup(coord_t * coord, const coord_t * old_coord) -+{ -+ assert("jmacd-9800", coord_check(old_coord)); -+ coord_dup_nocheck(coord, old_coord); -+} -+ -+/* Copy a coordinate without check. Useful when old_coord->node is not -+ loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */ -+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord) -+{ -+ coord->node = old_coord->node; -+ coord_set_item_pos(coord, old_coord->item_pos); -+ coord->unit_pos = old_coord->unit_pos; -+ coord->between = old_coord->between; -+ coord->iplugid = old_coord->iplugid; -+ ON_DEBUG(coord->plug_v = old_coord->plug_v); -+ ON_DEBUG(coord->body_v = old_coord->body_v); -+} -+ -+/* Initialize an invalid coordinate. */ -+void coord_init_invalid(coord_t * coord, const znode * node) -+{ -+ coord_init_values(coord, node, 0, 0, INVALID_COORD); -+} -+ -+void coord_init_first_unit_nocheck(coord_t * coord, const znode * node) -+{ -+ coord_init_values(coord, node, 0, 0, AT_UNIT); -+} -+ -+/* Initialize a coordinate to point at the first unit of the first item. If the node is -+ empty, it is positioned at the EMPTY_NODE. 
*/ -+void coord_init_first_unit(coord_t * coord, const znode * node) -+{ -+ int is_empty = node_is_empty(node); -+ -+ coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT)); -+ -+ assert("jmacd-9801", coord_check(coord)); -+} -+ -+/* Initialize a coordinate to point at the last unit of the last item. If the node is -+ empty, it is positioned at the EMPTY_NODE. */ -+void coord_init_last_unit(coord_t * coord, const znode * node) -+{ -+ int is_empty = node_is_empty(node); -+ -+ coord_init_values(coord, node, -+ (is_empty ? 0 : node_num_items(node) - 1), 0, -+ (is_empty ? EMPTY_NODE : AT_UNIT)); -+ if (!is_empty) -+ coord->unit_pos = coord_last_unit_pos(coord); -+ assert("jmacd-9802", coord_check(coord)); -+} -+ -+/* Initialize a coordinate to before the first item. If the node is empty, it is -+ positioned at the EMPTY_NODE. */ -+void coord_init_before_first_item(coord_t * coord, const znode * node) -+{ -+ int is_empty = node_is_empty(node); -+ -+ coord_init_values(coord, node, 0, 0, -+ (is_empty ? EMPTY_NODE : BEFORE_UNIT)); -+ -+ assert("jmacd-9803", coord_check(coord)); -+} -+ -+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned -+ at the EMPTY_NODE. */ -+void coord_init_after_last_item(coord_t * coord, const znode * node) -+{ -+ int is_empty = node_is_empty(node); -+ -+ coord_init_values(coord, node, -+ (is_empty ? 0 : node_num_items(node) - 1), 0, -+ (is_empty ? EMPTY_NODE : AFTER_ITEM)); -+ -+ assert("jmacd-9804", coord_check(coord)); -+} -+ -+/* Initialize a coordinate to after last unit in the item. Coord must be set -+ already to existing item */ -+void coord_init_after_item_end(coord_t * coord) -+{ -+ coord->between = AFTER_UNIT; -+ coord->unit_pos = coord_last_unit_pos(coord); -+} -+ -+/* Initialize a coordinate to before the item. 
Coord must be set already to existing item */ -+void coord_init_before_item(coord_t * coord) -+{ -+ coord->unit_pos = 0; -+ coord->between = BEFORE_ITEM; -+} -+ -+/* Initialize a coordinate to after the item. Coord must be set already to existing item */ -+void coord_init_after_item(coord_t * coord) -+{ -+ coord->unit_pos = 0; -+ coord->between = AFTER_ITEM; -+} -+ -+/* Initialize a coordinate by 0s. Used in places where init_coord was used and -+ it was not clear how actually */ -+void coord_init_zero(coord_t * coord) -+{ -+ memset(coord, 0, sizeof(*coord)); -+} -+ -+/* Return the number of units at the present item. Asserts coord_is_existing_item(). */ -+unsigned coord_num_units(const coord_t * coord) -+{ -+ assert("jmacd-9806", coord_is_existing_item(coord)); -+ -+ return item_plugin_by_coord(coord)->b.nr_units(coord); -+} -+ -+/* Returns true if the coord was initializewd by coord_init_invalid (). */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_invalid(const coord_t * coord) -+{ -+ return coord->between == INVALID_COORD; -+} -+ -+/* Returns true if the coordinate is positioned at an existing item, not before or after -+ an item. It may be placed at, before, or after any unit within the item, whether -+ existing or not. */ -+int coord_is_existing_item(const coord_t * coord) -+{ -+ switch (coord->between) { -+ case EMPTY_NODE: -+ case BEFORE_ITEM: -+ case AFTER_ITEM: -+ case INVALID_COORD: -+ return 0; -+ -+ case BEFORE_UNIT: -+ case AT_UNIT: -+ case AFTER_UNIT: -+ return coord->item_pos < coord_num_items(coord); -+ } -+ -+ impossible("jmacd-9900", "unreachable coord: %p", coord); -+ return 0; -+} -+ -+/* Returns true if the coordinate is positioned at an existing unit, not before or after a -+ unit. 
*/
-+/* Audited by: green(2002.06.15) */
-+int coord_is_existing_unit(const coord_t * coord)
-+{
-+	switch (coord->between) {
-+	case EMPTY_NODE:
-+	case BEFORE_UNIT:
-+	case AFTER_UNIT:
-+	case BEFORE_ITEM:
-+	case AFTER_ITEM:
-+	case INVALID_COORD:
-+		return 0;	/* only AT_UNIT can denote an existing unit */
-+
-+	case AT_UNIT:
-+		/* both the item and the unit have to be in range */
-+		return (coord->item_pos < coord_num_items(coord)
-+			&& coord->unit_pos < coord_num_units(coord));
-+	}
-+
-+	impossible("jmacd-9902", "unreachable");	/* `between' holds no known tag */
-+	return 0;
-+}
-+
-+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
-+   true for empty nodes nor coordinates positioned before the first item.
-+   Only the three fields of the coord are compared; the node is not consulted. */
-+/* Audited by: green(2002.06.15) */
-+int coord_is_leftmost_unit(const coord_t * coord)
-+{
-+	return (coord->between == AT_UNIT && coord->item_pos == 0
-+		&& coord->unit_pos == 0);
-+}
-+
-+#if REISER4_DEBUG
-+/* For assertions only, checks for a valid coordinate. Returns 1 when the
-+   coord looks consistent and 0 otherwise. A coord without a node is invalid;
-+   a coord whose node is znode_above_root() is accepted without further
-+   checks. */
-+int coord_check(const coord_t * coord)
-+{
-+	if (coord->node == NULL) {
-+		return 0;
-+	}
-+	if (znode_above_root(coord->node))
-+		return 1;
-+
-+	switch (coord->between) {
-+	default:
-+	case INVALID_COORD:
-+		return 0;
-+	case EMPTY_NODE:
-+		/* EMPTY_NODE is only valid at 0/0 of a node that really is empty */
-+		if (!node_is_empty(coord->node)) {
-+			return 0;
-+		}
-+		return coord->item_pos == 0 && coord->unit_pos == 0;
-+
-+	case BEFORE_UNIT:
-+	case AFTER_UNIT:
-+		/* before/after unit 0 of item 0 is tolerated in an empty node */
-+		if (node_is_empty(coord->node) && (coord->item_pos == 0)
-+		    && (coord->unit_pos == 0))
-+			return 1;	/* otherwise FALLTHROUGH to the range checks */
-+	case AT_UNIT:
-+		break;
-+	case AFTER_ITEM:
-+	case BEFORE_ITEM:
-+		/* before/after item should not set unit_pos. */
-+		if (coord->unit_pos != 0) {
-+			return 0;
-+		}
-+		break;
-+	}
-+
-+	/* every remaining tag requires item_pos to name an item of the node */
-+	if (coord->item_pos >= node_num_items(coord->node)) {
-+		return 0;
-+	}
-+
-+	/* FIXME-VS: we are going to check unit_pos. This makes no sense when
-+	   between is set either AFTER_ITEM or BEFORE_ITEM */
-+	if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
-+		return 1;
-+
-+	/* unit_pos is range-checked only when a plugin id is cached in the coord */
-+	if (coord_is_iplug_set(coord) &&
-+	    coord->unit_pos >
-+	    item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
-+		return 0;
-+	}
-+	return 1;
-+}
-+#endif
-+
-+/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
-+   @items is the current number of items in the node; @is_next is non-zero when the
-+   caller is about to move right and zero when it is about to move left.
-+   Returns 1 if the new position does not exist (the caller then returns 1 itself),
-+   0 if the caller should go on and step from the possibly adjusted coord. */
-+static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
-+{
-+	/* If the node is invalid, leave it. */
-+	if (coord->between == INVALID_COORD) {
-+		return 1;
-+	}
-+
-+	/* If the node is empty, set it appropriately. */
-+	if (items == 0) {
-+		coord->between = EMPTY_NODE;
-+		coord_set_item_pos(coord, 0);
-+		coord->unit_pos = 0;
-+		return 1;
-+	}
-+
-+	/* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM:
-+	   before item 0 when moving right, after item 0 when moving left. */
-+	if (coord->between == EMPTY_NODE) {
-+		coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
-+		coord_set_item_pos(coord, 0);
-+		coord->unit_pos = 0;
-+		return 0;
-+	}
-+
-+	/* If the item_pos is out-of-range, set it appropriately: clamp to
-+	   "after the last item". */
-+	if (coord->item_pos >= items) {
-+		coord->between = AFTER_ITEM;
-+		coord_set_item_pos(coord, items - 1);
-+		coord->unit_pos = 0;
-+		/* If is_next, return 1 (can't go any further). */
-+		return is_next;
-+	}
-+
-+	return 0;
-+}
-+
-+/* Advances the coordinate by one unit to the right. If empty, no change. If
-+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
-+   existing unit, 1 if there is no unit to the right (or the coord is invalid). */
-+int coord_next_unit(coord_t * coord)
-+{
-+	unsigned items = coord_num_items(coord);
-+
-+	if (coord_adjust_items(coord, items, 1) == 1) {
-+		return 1;
-+	}
-+
-+	switch (coord->between) {
-+	case BEFORE_UNIT:
-+		/* Now it is positioned at the same unit. */
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case AFTER_UNIT:
-+	case AT_UNIT:
-+		/* If it was at or after a unit and there are more units in this item,
-+		   advance to the next one. */
-+		if (coord->unit_pos < coord_last_unit_pos(coord)) {
-+			coord->unit_pos += 1;
-+			coord->between = AT_UNIT;
-+			return 0;
-+		}
-+
-+		/* Otherwise, it is crossing an item boundary and treated as if it was
-+		   after the current item. */
-+		coord->between = AFTER_ITEM;
-+		coord->unit_pos = 0;
-+		/* FALLTHROUGH */
-+
-+	case AFTER_ITEM:
-+		/* Check for end-of-node. The coord is left tagged AFTER_ITEM at
-+		   the last item in that case. */
-+		if (coord->item_pos == items - 1) {
-+			return 1;
-+		}
-+
-+		coord_inc_item_pos(coord);
-+		coord->unit_pos = 0;
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case BEFORE_ITEM:
-+		/* The adjust_items checks ensure that we are valid here. */
-+		coord->unit_pos = 0;
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case INVALID_COORD:
-+	case EMPTY_NODE:
-+		/* Handled in coord_adjust_items(). */
-+		break;
-+	}
-+
-+	impossible("jmacd-9902", "unreachable");
-+	return 0;
-+}
-+
-+/* Advances the coordinate by one item to the right. If empty, no change. If
-+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
-+   an existing item, 1 if there is no item to the right (or the coord is invalid).
-+   Note that BEFORE_UNIT is handled like any other position inside the item,
-+   i.e. the coord moves on to the following item. */
-+int coord_next_item(coord_t * coord)
-+{
-+	unsigned items = coord_num_items(coord);
-+
-+	if (coord_adjust_items(coord, items, 1) == 1) {
-+		return 1;
-+	}
-+
-+	switch (coord->between) {
-+	case AFTER_UNIT:
-+	case AT_UNIT:
-+	case BEFORE_UNIT:
-+	case AFTER_ITEM:
-+		/* Check for end-of-node. */
-+		if (coord->item_pos == items - 1) {
-+			coord->between = AFTER_ITEM;
-+			coord->unit_pos = 0;
-+			coord_clear_iplug(coord);
-+			return 1;
-+		}
-+
-+		/* Anywhere in an item, go to the next one. */
-+		coord->between = AT_UNIT;
-+		coord_inc_item_pos(coord);
-+		coord->unit_pos = 0;
-+		return 0;
-+
-+	case BEFORE_ITEM:
-+		/* The out-of-range check ensures that we are valid here. */
-+		coord->unit_pos = 0;
-+		coord->between = AT_UNIT;
-+		return 0;
-+	case INVALID_COORD:
-+	case EMPTY_NODE:
-+		/* Handled in coord_adjust_items(). */
-+		break;
-+	}
-+
-+	impossible("jmacd-9903", "unreachable");
-+	return 0;
-+}
-+
-+/* Advances the coordinate by one unit to the left. If empty, no change. If
-+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
-+   is an existing unit, 1 if there is no unit to the left (or the coord is invalid). */
-+int coord_prev_unit(coord_t * coord)
-+{
-+	unsigned items = coord_num_items(coord);
-+
-+	if (coord_adjust_items(coord, items, 0) == 1) {
-+		return 1;
-+	}
-+
-+	switch (coord->between) {
-+	case AT_UNIT:
-+	case BEFORE_UNIT:
-+		if (coord->unit_pos > 0) {
-+			coord->unit_pos -= 1;
-+			coord->between = AT_UNIT;
-+			return 0;
-+		}
-+
-+		if (coord->item_pos == 0) {
-+			coord->between = BEFORE_ITEM;
-+			return 1;
-+		}
-+
-+		/* step to the previous item first, then ask it for its last unit */
-+		coord_dec_item_pos(coord);
-+		coord->unit_pos = coord_last_unit_pos(coord);
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case AFTER_UNIT:
-+		/* What if unit_pos is out-of-range? */
-+		assert("jmacd-5442",
-+		       coord->unit_pos <= coord_last_unit_pos(coord));
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case BEFORE_ITEM:
-+		if (coord->item_pos == 0) {
-+			return 1;
-+		}
-+
-+		coord_dec_item_pos(coord);
-+		/* FALLTHROUGH */
-+
-+	case AFTER_ITEM:
-+		/* the tag becomes unit-relative before the unit count is queried */
-+		coord->between = AT_UNIT;
-+		coord->unit_pos = coord_last_unit_pos(coord);
-+		return 0;
-+
-+	case INVALID_COORD:
-+	case EMPTY_NODE:
-+		break;	/* both are dealt with by coord_adjust_items() above */
-+	}
-+
-+	impossible("jmacd-9904", "unreachable");
-+	return 0;
-+}
-+
-+/* Advances the coordinate by one item to the left. If empty, no change. If
-+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
-+   is an existing item.
*/ -+int coord_prev_item(coord_t * coord) -+{ -+ unsigned items = coord_num_items(coord); -+ -+ if (coord_adjust_items(coord, items, 0) == 1) { -+ return 1; -+ } -+ -+ switch (coord->between) { -+ case AT_UNIT: -+ case AFTER_UNIT: -+ case BEFORE_UNIT: -+ case BEFORE_ITEM: -+ -+ if (coord->item_pos == 0) { -+ coord->between = BEFORE_ITEM; -+ coord->unit_pos = 0; -+ return 1; -+ } -+ -+ coord_dec_item_pos(coord); -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case AFTER_ITEM: -+ coord->between = AT_UNIT; -+ coord->unit_pos = 0; -+ return 0; -+ -+ case INVALID_COORD: -+ case EMPTY_NODE: -+ break; -+ } -+ -+ impossible("jmacd-9905", "unreachable"); -+ return 0; -+} -+ -+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */ -+void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir) -+{ -+ assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE); -+ if (dir == LEFT_SIDE) { -+ coord_init_first_unit(coord, node); -+ } else { -+ coord_init_last_unit(coord, node); -+ } -+} -+ -+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof -+ argument. */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_after_sideof_unit(coord_t * coord, sideof dir) -+{ -+ assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE); -+ if (dir == LEFT_SIDE) { -+ return coord_is_before_leftmost(coord); -+ } else { -+ return coord_is_after_rightmost(coord); -+ } -+} -+ -+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. 
*/ -+/* Audited by: green(2002.06.15) */ -+int coord_sideof_unit(coord_t * coord, sideof dir) -+{ -+ assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE); -+ if (dir == LEFT_SIDE) { -+ return coord_prev_unit(coord); -+ } else { -+ return coord_next_unit(coord); -+ } -+} -+ -+#if REISER4_DEBUG -+int coords_equal(const coord_t * c1, const coord_t * c2) -+{ -+ assert("nikita-2840", c1 != NULL); -+ assert("nikita-2841", c2 != NULL); -+ -+ return -+ c1->node == c2->node && -+ c1->item_pos == c2->item_pos && -+ c1->unit_pos == c2->unit_pos && c1->between == c2->between; -+} -+#endif /* REISER4_DEBUG */ -+ -+/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost -+ return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */ -+/* Audited by: green(2002.06.15) */ -+coord_wrt_node coord_wrt(const coord_t * coord) -+{ -+ if (coord_is_before_leftmost(coord)) { -+ return COORD_ON_THE_LEFT; -+ } -+ -+ if (coord_is_after_rightmost(coord)) { -+ return COORD_ON_THE_RIGHT; -+ } -+ -+ return COORD_INSIDE; -+} -+ -+/* Returns true if the coordinate is positioned after the last item or after the last unit -+ of the last item or it is an empty node. */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_after_rightmost(const coord_t * coord) -+{ -+ assert("jmacd-7313", coord_check(coord)); -+ -+ switch (coord->between) { -+ case INVALID_COORD: -+ case AT_UNIT: -+ case BEFORE_UNIT: -+ case BEFORE_ITEM: -+ return 0; -+ -+ case EMPTY_NODE: -+ return 1; -+ -+ case AFTER_ITEM: -+ return (coord->item_pos == node_num_items(coord->node) - 1); -+ -+ case AFTER_UNIT: -+ return ((coord->item_pos == node_num_items(coord->node) - 1) && -+ coord->unit_pos == coord_last_unit_pos(coord)); -+ } -+ -+ impossible("jmacd-9908", "unreachable"); -+ return 0; -+} -+ -+/* Returns true if the coordinate is positioned before the first item or it is an empty -+ node. 
*/ -+int coord_is_before_leftmost(const coord_t * coord) -+{ -+ /* FIXME-VS: coord_check requires node to be loaded whereas it is not -+ necessary to check if coord is set before leftmost -+ assert ("jmacd-7313", coord_check (coord)); */ -+ switch (coord->between) { -+ case INVALID_COORD: -+ case AT_UNIT: -+ case AFTER_ITEM: -+ case AFTER_UNIT: -+ return 0; -+ -+ case EMPTY_NODE: -+ return 1; -+ -+ case BEFORE_ITEM: -+ case BEFORE_UNIT: -+ return (coord->item_pos == 0) && (coord->unit_pos == 0); -+ } -+ -+ impossible("jmacd-9908", "unreachable"); -+ return 0; -+} -+ -+/* Returns true if the coordinate is positioned after a item, before a item, after the -+ last unit of an item, before the first unit of an item, or at an empty node. */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_between_items(const coord_t * coord) -+{ -+ assert("jmacd-7313", coord_check(coord)); -+ -+ switch (coord->between) { -+ case INVALID_COORD: -+ case AT_UNIT: -+ return 0; -+ -+ case AFTER_ITEM: -+ case BEFORE_ITEM: -+ case EMPTY_NODE: -+ return 1; -+ -+ case BEFORE_UNIT: -+ return coord->unit_pos == 0; -+ -+ case AFTER_UNIT: -+ return coord->unit_pos == coord_last_unit_pos(coord); -+ } -+ -+ impossible("jmacd-9908", "unreachable"); -+ return 0; -+} -+ -+#if REISER4_DEBUG -+/* Returns true if the coordinates are positioned at adjacent units, regardless of -+ before-after or item boundaries. 
*/ -+int coord_are_neighbors(coord_t * c1, coord_t * c2) -+{ -+ coord_t *left; -+ coord_t *right; -+ -+ assert("nikita-1241", c1 != NULL); -+ assert("nikita-1242", c2 != NULL); -+ assert("nikita-1243", c1->node == c2->node); -+ assert("nikita-1244", coord_is_existing_unit(c1)); -+ assert("nikita-1245", coord_is_existing_unit(c2)); -+ -+ left = right = NULL; -+ switch (coord_compare(c1, c2)) { -+ case COORD_CMP_ON_LEFT: -+ left = c1; -+ right = c2; -+ break; -+ case COORD_CMP_ON_RIGHT: -+ left = c2; -+ right = c1; -+ break; -+ case COORD_CMP_SAME: -+ return 0; -+ default: -+ wrong_return_value("nikita-1246", "compare_coords()"); -+ } -+ assert("vs-731", left && right); -+ if (left->item_pos == right->item_pos) { -+ return left->unit_pos + 1 == right->unit_pos; -+ } else if (left->item_pos + 1 == right->item_pos) { -+ return (left->unit_pos == coord_last_unit_pos(left)) -+ && (right->unit_pos == 0); -+ } else { -+ return 0; -+ } -+} -+#endif /* REISER4_DEBUG */ -+ -+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT, -+ COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */ -+/* Audited by: green(2002.06.15) */ -+coord_cmp coord_compare(coord_t * c1, coord_t * c2) -+{ -+ assert("vs-209", c1->node == c2->node); -+ assert("vs-194", coord_is_existing_unit(c1) -+ && coord_is_existing_unit(c2)); -+ -+ if (c1->item_pos > c2->item_pos) -+ return COORD_CMP_ON_RIGHT; -+ if (c1->item_pos < c2->item_pos) -+ return COORD_CMP_ON_LEFT; -+ if (c1->unit_pos > c2->unit_pos) -+ return COORD_CMP_ON_RIGHT; -+ if (c1->unit_pos < c2->unit_pos) -+ return COORD_CMP_ON_LEFT; -+ return COORD_CMP_SAME; -+} -+ -+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and -+ non-zero if there is no position to the right. 
*/
-+int coord_set_to_right(coord_t * coord)
-+{
-+	unsigned items = coord_num_items(coord);
-+	/* invalid, empty and out-of-range coords are settled by coord_adjust_items() */
-+	if (coord_adjust_items(coord, items, 1) == 1) {
-+		return 1;
-+	}
-+
-+	switch (coord->between) {
-+	case AT_UNIT:
-+		return 0;	/* already at a unit: nothing to shift */
-+
-+	case BEFORE_ITEM:
-+	case BEFORE_UNIT:
-+		/* "before X" resolves to X itself; positions are kept */
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case AFTER_UNIT:
-+		if (coord->unit_pos < coord_last_unit_pos(coord)) {
-+			/* next unit of the same item */
-+			coord->unit_pos += 1;
-+			coord->between = AT_UNIT;
-+			return 0;
-+		} else {
-+			/* after the last unit: continue with the next item, if any */
-+			coord->unit_pos = 0;
-+
-+			if (coord->item_pos == items - 1) {
-+				coord->between = AFTER_ITEM;
-+				return 1;
-+			}
-+
-+			coord_inc_item_pos(coord);
-+			coord->between = AT_UNIT;
-+			return 0;
-+		}
-+
-+	case AFTER_ITEM:
-+		if (coord->item_pos == items - 1) {
-+			return 1;	/* after the last item: coord is left unchanged */
-+		}
-+
-+		coord_inc_item_pos(coord);
-+		coord->unit_pos = 0;
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case EMPTY_NODE:
-+		return 1;
-+
-+	case INVALID_COORD:
-+		break;	/* coord_adjust_items() has already returned 1 for it */
-+	}
-+
-+	impossible("jmacd-9920", "unreachable");
-+	return 0;
-+}
-+
-+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
-+   non-zero if there is no position to the left.
*/ -+int coord_set_to_left(coord_t * coord) -+{ -+ unsigned items = coord_num_items(coord); -+ -+ if (coord_adjust_items(coord, items, 0) == 1) { -+ return 1; -+ } -+ -+ switch (coord->between) { -+ case AT_UNIT: -+ return 0; -+ -+ case AFTER_UNIT: -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case AFTER_ITEM: -+ coord->between = AT_UNIT; -+ coord->unit_pos = coord_last_unit_pos(coord); -+ return 0; -+ -+ case BEFORE_UNIT: -+ if (coord->unit_pos > 0) { -+ coord->unit_pos -= 1; -+ coord->between = AT_UNIT; -+ return 0; -+ } else { -+ -+ if (coord->item_pos == 0) { -+ coord->between = BEFORE_ITEM; -+ return 1; -+ } -+ -+ coord->unit_pos = coord_last_unit_pos(coord); -+ coord_dec_item_pos(coord); -+ coord->between = AT_UNIT; -+ return 0; -+ } -+ -+ case BEFORE_ITEM: -+ if (coord->item_pos == 0) { -+ return 1; -+ } -+ -+ coord_dec_item_pos(coord); -+ coord->unit_pos = coord_last_unit_pos(coord); -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case EMPTY_NODE: -+ return 1; -+ -+ case INVALID_COORD: -+ break; -+ } -+ -+ impossible("jmacd-9920", "unreachable"); -+ return 0; -+} -+ -+static const char *coord_tween_tostring(between_enum n) -+{ -+ switch (n) { -+ case BEFORE_UNIT: -+ return "before unit"; -+ case BEFORE_ITEM: -+ return "before item"; -+ case AT_UNIT: -+ return "at unit"; -+ case AFTER_UNIT: -+ return "after unit"; -+ case AFTER_ITEM: -+ return "after item"; -+ case EMPTY_NODE: -+ return "empty node"; -+ case INVALID_COORD: -+ return "invalid"; -+ default: -+ { -+ static char buf[30]; -+ -+ sprintf(buf, "unknown: %i", n); -+ return buf; -+ } -+ } -+} -+ -+void print_coord(const char *mes, const coord_t * coord, int node) -+{ -+ if (coord == NULL) { -+ printk("%s: null\n", mes); -+ return; -+ } -+ printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n", -+ mes, coord->item_pos, coord->unit_pos, -+ coord_tween_tostring(coord->between), coord->iplugid); -+} -+ -+int -+item_utmost_child_real_block(const coord_t * coord, sideof side, -+ reiser4_block_nr * 
blk) -+{ -+ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord, -+ side, -+ blk); -+} -+ -+int item_utmost_child(const coord_t * coord, sideof side, jnode ** child) -+{ -+ return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child); -+} -+ -+/* @count bytes of flow @f got written, update correspondingly f->length, -+ f->data and f->key */ -+void move_flow_forward(flow_t * f, unsigned count) -+{ -+ if (f->data) -+ f->data += count; -+ f->length -= count; -+ set_key_offset(&f->key, get_key_offset(&f->key) + count); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/coord.h linux-2.6.20/fs/reiser4/coord.h ---- linux-2.6.20.orig/fs/reiser4/coord.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/coord.h 2007-05-06 14:50:43.698975725 +0400 -@@ -0,0 +1,389 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Coords */ -+ -+#if !defined( __REISER4_COORD_H__ ) -+#define __REISER4_COORD_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+ -+/* insertions happen between coords in the tree, so we need some means -+ of specifying the sense of betweenness. */ -+typedef enum { -+ BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. */ -+ AT_UNIT, -+ AFTER_UNIT, -+ BEFORE_ITEM, -+ AFTER_ITEM, -+ INVALID_COORD, -+ EMPTY_NODE, -+} between_enum; -+ -+/* location of coord w.r.t. 
its node */
-+typedef enum {
-+	COORD_ON_THE_LEFT = -1,
-+	COORD_ON_THE_RIGHT = +1,
-+	COORD_INSIDE = 0
-+} coord_wrt_node;
-+/* result of comparing the positions of two coords within one node */
-+typedef enum {
-+	COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
-+} coord_cmp;
-+/* A position inside a tree node: which item, which unit of that item, and how
-+   the position relates to that unit or item (->between). The numbers in front
-+   of the fields look like byte offsets; they appear to assume a 4-byte
-+   pointer -- TODO confirm. */
-+struct coord {
-+	/* node in a tree */
-+	/*  0 */ znode *node;
-+
-+	/* position of item within node */
-+	/*  4 */ pos_in_node_t item_pos;
-+	/* position of unit within item */
-+	/*  6 */ pos_in_node_t unit_pos;
-+	/* optimization: plugin of item is stored in coord_t. Until this was
-+	   implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
-+	   is invalidated (set to 0xff) on each modification of ->item_pos,
-+	   and all such modifications are funneled through coord_*_item_pos()
-+	   functions below.
-+	 */
-+	/*  8 */ char iplugid;
-+	/* position of coord w.r.t. to neighboring items and/or units.
-+	   Values are taken from &between_enum above.
-+	 */
-+	/*  9 */ char between;
-+	/* padding. It will be added by the compiler anyway to conform to the
-+	 * C language alignment requirements. We keep it here to be on the
-+	 * safe side and to have a clear picture of the memory layout of this
-+	 * structure. */
-+	/* 10 */ __u16 pad;
-+	/* 12 */ int offset;	/* reset to INVALID_OFFSET together with ->iplugid; its use is not visible in this header */
-+#if REISER4_DEBUG
-+	unsigned long plug_v;	/* debug only: both stamps are set by coord_update_v() */
-+	unsigned long body_v;
-+#endif
-+};
-+
-+#define INVALID_PLUGID  ((char)((1 << 8) - 1))	/* 255 converted to char: "no plugin id cached" */
-+#define INVALID_OFFSET -1	/* NOTE(review): expansion is not parenthesized */
-+/* Forget the cached item plugin: both ->iplugid and ->offset become invalid. */
-+static inline void coord_clear_iplug(coord_t * coord)
-+{
-+	assert("nikita-2835", coord != NULL);
-+	coord->iplugid = INVALID_PLUGID;
-+	coord->offset = INVALID_OFFSET;
-+}
-+/* True iff a plugin id is currently cached in @coord. */
-+static inline int coord_is_iplug_set(const coord_t * coord)
-+{
-+	assert("nikita-2836", coord != NULL);
-+	return coord->iplugid != INVALID_PLUGID;
-+}
-+/* Set ->item_pos to @pos and drop the cached plugin, which belonged to the old item. */
-+static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
-+{
-+	assert("nikita-2478", coord != NULL);
-+	coord->item_pos = pos;
-+	coord_clear_iplug(coord);
-+}
-+/* Move to the previous item and drop the cached plugin. No range check is made. */
-+static inline void coord_dec_item_pos(coord_t * coord)
-+{
-+	assert("nikita-2480", coord != NULL);
-+	--coord->item_pos;
-+	coord_clear_iplug(coord);
-+}
-+/* Move to the next item and drop the cached plugin. No range check is made. */
-+static inline void coord_inc_item_pos(coord_t * coord)
-+{
-+	assert("nikita-2481", coord != NULL);
-+	++coord->item_pos;
-+	coord_clear_iplug(coord);
-+}
-+/* Shift ->item_pos by @delta (which may be negative) and drop the cached plugin. */
-+static inline void coord_add_item_pos(coord_t * coord, int delta)
-+{
-+	assert("nikita-2482", coord != NULL);
-+	coord->item_pos += delta;
-+	coord_clear_iplug(coord);
-+}
-+/* Set ->item_pos to the all-ones unsigned short value and drop the cached plugin. */
-+static inline void coord_invalid_item_pos(coord_t * coord)
-+{
-+	assert("nikita-2832", coord != NULL);
-+	coord->item_pos = (unsigned short)~0;
-+	coord_clear_iplug(coord);
-+}
-+
-+/* Reverse a direction: LEFT_SIDE yields RIGHT_SIDE, any other value yields
-+   LEFT_SIDE. */
-+static inline sideof sideof_reverse(sideof side)
-+{
-+	return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
-+}
-+
-+/* NOTE: There is a somewhat odd mixture of the following opposed terms:
-+
-+   "first" and "last"
-+   "next" and "prev"
-+   "before" and "after"
-+   "leftmost" and "rightmost"
-+
-+   But I think the chosen names are decent the way they are.
-+*/
-+
-+/* COORD INITIALIZERS */
-+
-+/* Initialize an invalid coordinate.
*/ -+extern void coord_init_invalid(coord_t * coord, const znode * node); -+ -+extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node); -+ -+/* Initialize a coordinate to point at the first unit of the first item. If the node is -+ empty, it is positioned at the EMPTY_NODE. */ -+extern void coord_init_first_unit(coord_t * coord, const znode * node); -+ -+/* Initialize a coordinate to point at the last unit of the last item. If the node is -+ empty, it is positioned at the EMPTY_NODE. */ -+extern void coord_init_last_unit(coord_t * coord, const znode * node); -+ -+/* Initialize a coordinate to before the first item. If the node is empty, it is -+ positioned at the EMPTY_NODE. */ -+extern void coord_init_before_first_item(coord_t * coord, const znode * node); -+ -+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned -+ at the EMPTY_NODE. */ -+extern void coord_init_after_last_item(coord_t * coord, const znode * node); -+ -+/* Initialize a coordinate to after last unit in the item. Coord must be set -+ already to existing item */ -+void coord_init_after_item_end(coord_t * coord); -+ -+/* Initialize a coordinate to before the item. Coord must be set already to existing item */ -+void coord_init_before_item(coord_t *); -+/* Initialize a coordinate to after the item. Coord must be set already to existing item */ -+void coord_init_after_item(coord_t *); -+ -+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */ -+extern void coord_init_sideof_unit(coord_t * coord, const znode * node, -+ sideof dir); -+ -+/* Initialize a coordinate by 0s. Used in places where init_coord was used and -+ it was not clear how actually -+ FIXME-VS: added by vs (2002, june, 8) */ -+extern void coord_init_zero(coord_t * coord); -+ -+/* COORD METHODS */ -+ -+/* after shifting of node content, coord previously set properly may become -+ invalid, try to "normalize" it. 
*/
-+void coord_normalize(coord_t * coord);
-+
-+/* Copy a coordinate. */
-+extern void coord_dup(coord_t * coord, const coord_t * old_coord);
-+
-+/* Copy a coordinate without check. */
-+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
-+/* Return the number of units in the item the coordinate is set to. */
-+unsigned coord_num_units(const coord_t * coord);
-+
-+/* Return the last valid unit number at the present item (i.e.,
-+   coord_num_units() - 1). The subtraction is unsigned, so an item reporting
-+   zero units would make this wrap around to UINT_MAX. */
-+static inline unsigned coord_last_unit_pos(const coord_t * coord)
-+{
-+	return coord_num_units(coord) - 1;
-+}
-+
-+#if REISER4_DEBUG
-+/* For assertions only, checks for a valid coordinate. */
-+extern int coord_check(const coord_t * coord);
-+
-+extern unsigned long znode_times_locked(const znode * z);
-+/* Debug only: stamp both ->plug_v and ->body_v with znode_times_locked() of the coord's node. */
-+static inline void coord_update_v(coord_t * coord)
-+{
-+	coord->plug_v = coord->body_v = znode_times_locked(coord->node);
-+}
-+#endif
-+
-+extern int coords_equal(const coord_t * c1, const coord_t * c2);
-+
-+extern void print_coord(const char *mes, const coord_t * coord, int print_node);
-+
-+/* If coord_is_before_leftmost return COORD_ON_THE_LEFT, if coord_is_after_rightmost
-+   return COORD_ON_THE_RIGHT, otherwise return COORD_INSIDE. */
-+extern coord_wrt_node coord_wrt(const coord_t * coord);
-+
-+/* Returns true if the coordinates are positioned at adjacent units, regardless of
-+   before-after or item boundaries. */
-+extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
-+
-+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
-+   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
-+extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
-+
-+/* COORD PREDICATES */
-+
-+/* Returns true if the coord was initialized by coord_init_invalid(). */
-+extern int coord_is_invalid(const coord_t * coord);
-+
-+/* Returns true if the coordinate is positioned at an existing item, not before or after
-+   an item.
It may be placed at, before, or after any unit within the item, whether -+ existing or not. If this is true you can call methods of the item plugin. */ -+extern int coord_is_existing_item(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned after a item, before a item, after the -+ last unit of an item, before the first unit of an item, or at an empty node. */ -+extern int coord_is_between_items(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned at an existing unit, not before or after a -+ unit. */ -+extern int coord_is_existing_unit(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned at an empty node. */ -+extern int coord_is_empty(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned at the first unit of the first item. Not -+ true for empty nodes nor coordinates positioned before the first item. */ -+extern int coord_is_leftmost_unit(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned after the last item or after the last unit -+ of the last item or it is an empty node. */ -+extern int coord_is_after_rightmost(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned before the first item or it is an empty -+ node. */ -+extern int coord_is_before_leftmost(const coord_t * coord); -+ -+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof -+ argument. */ -+extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir); -+ -+/* COORD MODIFIERS */ -+ -+/* Advances the coordinate by one unit to the right. If empty, no change. If -+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is -+ an existing unit. */ -+extern int coord_next_unit(coord_t * coord); -+ -+/* Advances the coordinate by one item to the right. If empty, no change. If -+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is -+ an existing item. 
*/
-+extern int coord_next_item(coord_t * coord);
-+
-+/* Advances the coordinate by one unit to the left. If empty, no change. If
-+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
-+   is an existing unit. */
-+extern int coord_prev_unit(coord_t * coord);
-+
-+/* Advances the coordinate by one item to the left. If empty, no change. If
-+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
-+   is an existing item. */
-+extern int coord_prev_item(coord_t * coord);
-+
-+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
-+   non-zero if there is no position to the right. */
-+extern int coord_set_to_right(coord_t * coord);
-+
-+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
-+   non-zero if there is no position to the left. */
-+extern int coord_set_to_left(coord_t * coord);
-+
-+/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
-+   and non-zero if the unit did not exist. */
-+extern int coord_set_after_unit(coord_t * coord);
-+
-+/* Calls coord_prev_unit for LEFT_SIDE and coord_next_unit for RIGHT_SIDE. */
-+extern int coord_sideof_unit(coord_t * coord, sideof dir);
-+
-+/* iterate over all units in @node. @coord is a coord_t pointer used as the
-+   cursor: it starts before the first item and is advanced with
-+   coord_next_unit() until that returns non-zero. The loop body is not entered
-+   for an empty node. */
-+#define for_all_units( coord, node )					\
-+	for( coord_init_before_first_item( ( coord ), ( node ) ) ; 	\
-+	     coord_next_unit( coord ) == 0 ; )
-+
-+/* iterate over all items in @node, advancing @coord with coord_next_item().
-+   NOTE(review): coord_init_before_first_item() tags a non-empty node
-+   BEFORE_UNIT, and coord_next_item() moves a BEFORE_UNIT coord on to the
-+   following item, so this loop appears to start at the second item rather
-+   than the first -- confirm before relying on it. */
-+#define for_all_items( coord, node )					\
-+	for( coord_init_before_first_item( ( coord ), ( node ) ) ; 	\
-+	     coord_next_item( coord ) == 0 ; )
-+
-+/* COORD/ITEM METHODS */
-+
-+extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
-+					reiser4_block_nr * blk);
-+extern int item_utmost_child(const coord_t * coord, sideof side,
-+			     jnode ** child);
-+
-+/* a flow is a sequence of bytes being written to or read from the tree.
The -+ tree will slice the flow into items while storing it into nodes, but all of -+ that is hidden from anything outside the tree. */ -+ -+struct flow { -+ reiser4_key key; /* key of start of flow's sequence of bytes */ -+ loff_t length; /* length of flow's sequence of bytes */ -+ char *data; /* start of flow's sequence of bytes */ -+ int user; /* if 1 data is user space, 0 - kernel space */ -+ rw_op op; /* NIKITA-FIXME-HANS: comment is where? */ -+}; -+ -+void move_flow_forward(flow_t * f, unsigned count); -+ -+/* &reiser4_item_data - description of data to be inserted or pasted -+ -+ Q: articulate the reasons for the difference between this and flow. -+ -+ A: Besides flow we insert other things into the tree: stat data, directory -+ entry, etc. To insert them into tree one has to provide this structure. If -+ one is going to insert flow - he can use insert_flow, where this structure -+ does not have to be created -+*/ -+struct reiser4_item_data { -+ /* actual data to be inserted. If NULL, ->create_item() will not -+ do xmemcpy itself, leaving this up to the caller. This can -+ save some amount of unnecessary memory copying, for example, -+ during insertion of stat data. -+ -+ */ -+ char *data; -+ /* 1 if 'char * data' contains pointer to user space and 0 if it is -+ kernel space */ -+ int user; -+ /* amount of data we are going to insert or paste */ -+ int length; -+ /* "Arg" is opaque data that is passed down to the -+ ->create_item() method of node layout, which in turn -+ hands it to the ->create_hook() of item being created. This -+ arg is currently used by: -+ -+ . ->create_hook() of internal item -+ (fs/reiser4/plugin/item/internal.c:internal_create_hook()), -+ . ->paste() method of directory item. -+ . ->create_hook() of extent item -+ -+ For internal item, this is left "brother" of new node being -+ inserted and it is used to add new node into sibling list -+ after parent to it was just inserted into parent.
-+ -+ While ->arg does look like a somewhat unnecessary complication, -+ it actually saves a lot of headache in many places, because -+ all data necessary to insert or paste new data into tree are -+ collected in one place, and this eliminates a lot of extra -+ argument passing and storing everywhere. -+ -+ */ -+ void *arg; -+ /* plugin of item we are inserting */ -+ item_plugin *iplug; -+}; -+ -+/* __REISER4_COORD_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/debug.c linux-2.6.20/fs/reiser4/debug.c ---- linux-2.6.20.orig/fs/reiser4/debug.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/debug.c 2007-05-06 14:50:43.702976975 +0400 -@@ -0,0 +1,308 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Debugging facilities. */ -+ -+/* -+ * This file contains generic debugging functions used by reiser4. Roughly -+ * following: -+ * -+ * panicking: reiser4_do_panic(), reiser4_print_prefix(). -+ * -+ * locking: -+ * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(), -+ * reiser4_no_counters_are_held(), reiser4_commit_check_locks() -+ * -+ * error code monitoring (see comment before RETERR macro): -+ * reiser4_return_err(), reiser4_report_err(). -+ * -+ * stack back-tracing: fill_backtrace() -+ * -+ * miscellaneous: reiser4_preempt_point(), call_on_each_assert(), -+ * reiser4_debugtrap().
-+ * -+ */ -+ -+#include "reiser4.h" -+#include "context.h" -+#include "super.h" -+#include "txnmgr.h" -+#include "znode.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#if 0 -+#if REISER4_DEBUG -+static void reiser4_report_err(void); -+#else -+#define reiser4_report_err() noop -+#endif -+#endif /* 0 */ -+ -+/* -+ * global buffer where message given to reiser4_panic is formatted. -+ */ -+static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE]; -+ -+/* -+ * lock protecting consistency of panic_buf under concurrent panics -+ */ -+static DEFINE_SPINLOCK(panic_guard); -+ -+/* Your best friend. Call it on each occasion. This is called by -+ fs/reiser4/debug.h:reiser4_panic(). */ -+void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ ) -+{ -+ static int in_panic = 0; -+ va_list args; -+ -+ /* -+ * check for recursive panic. -+ */ -+ if (in_panic == 0) { -+ in_panic = 1; -+ -+ spin_lock(&panic_guard); -+ va_start(args, format); -+ vsnprintf(panic_buf, sizeof(panic_buf), format, args); -+ va_end(args); -+ printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf); -+ spin_unlock(&panic_guard); -+ -+ /* -+ * if kernel debugger is configured---drop in. Early dropping -+ * into kgdb is not always convenient, because panic message -+ * is not yet printed most of the times. But: -+ * -+ * (1) message can be extracted from printk_buf[] -+ * (declared static inside of printk()), and -+ * -+ * (2) sometimes serial/kgdb combo dies while printing -+ * long panic message, so it's more prudent to break into -+ * debugger earlier. 
-+ * -+ */ -+ DEBUGON(1); -+ } -+ /* to make gcc happy about noreturn attribute */ -+ panic("%s", panic_buf); -+} -+ -+#if 0 -+void -+reiser4_print_prefix(const char *level, int reperr, const char *mid, -+ const char *function, const char *file, int lineno) -+{ -+ const char *comm; -+ int pid; -+ -+ if (unlikely(in_interrupt() || in_irq())) { -+ comm = "interrupt"; -+ pid = 0; -+ } else { -+ comm = current->comm; -+ pid = current->pid; -+ } -+ printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n", -+ level, comm, pid, function, file, lineno, mid); -+ if (reperr) -+ reiser4_report_err(); -+} -+#endif /* 0 */ -+ -+/* Preemption point: this should be called periodically during long running -+ operations (carry, allocate, and squeeze are best examples) */ -+int reiser4_preempt_point(void) -+{ -+ assert("nikita-3008", reiser4_schedulable()); -+ cond_resched(); -+ return signal_pending(current); -+} -+ -+#if REISER4_DEBUG -+/* Debugging aid: return struct where information about locks taken by current -+ thread is accumulated. This can be used to formulate lock ordering -+ constraints and various assertions. -+ -+*/ -+reiser4_lock_counters_info *reiser4_lock_counters(void) -+{ -+ reiser4_context *ctx = get_current_context(); -+ assert("jmacd-1123", ctx != NULL); -+ return &ctx->locks; -+} -+ -+/* -+ * print human readable information about locks held by the reiser4 context. 
-+ */ -+static void print_lock_counters(const char *prefix, -+ const reiser4_lock_counters_info * info) -+{ -+ printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n" -+ "jload: %i, " -+ "txnh: %i, atom: %i, stack: %i, txnmgr: %i, " -+ "ktxnmgrd: %i, fq: %i\n" -+ "inode: %i, " -+ "cbk_cache: %i (r:%i,w%i), " -+ "eflush: %i, " -+ "zlock: %i,\n" -+ "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n" -+ "d: %i, x: %i, t: %i\n", prefix, -+ info->spin_locked_jnode, -+ info->rw_locked_tree, info->read_locked_tree, -+ info->write_locked_tree, -+ info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk, -+ info->spin_locked_jload, -+ info->spin_locked_txnh, -+ info->spin_locked_atom, info->spin_locked_stack, -+ info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd, -+ info->spin_locked_fq, -+ info->spin_locked_inode, -+ info->rw_locked_cbk_cache, -+ info->read_locked_cbk_cache, -+ info->write_locked_cbk_cache, -+ info->spin_locked_super_eflush, -+ info->spin_locked_zlock, -+ info->spin_locked, -+ info->long_term_locked_znode, -+ info->inode_sem_r, info->inode_sem_w, -+ info->d_refs, info->x_refs, info->t_refs); -+} -+ -+/* check that no spinlocks are held */ -+int reiser4_schedulable(void) -+{ -+ if (get_current_context_check() != NULL) { -+ if (!LOCK_CNT_NIL(spin_locked)) { -+ print_lock_counters("in atomic", reiser4_lock_counters()); -+ return 0; -+ } -+ } -+ might_sleep(); -+ return 1; -+} -+/* -+ * return true, iff no locks are held. 
-+ */ -+int reiser4_no_counters_are_held(void) -+{ -+ reiser4_lock_counters_info *counters; -+ -+ counters = reiser4_lock_counters(); -+ return -+ (counters->spin_locked_zlock == 0) && -+ (counters->spin_locked_jnode == 0) && -+ (counters->rw_locked_tree == 0) && -+ (counters->read_locked_tree == 0) && -+ (counters->write_locked_tree == 0) && -+ (counters->rw_locked_dk == 0) && -+ (counters->read_locked_dk == 0) && -+ (counters->write_locked_dk == 0) && -+ (counters->spin_locked_txnh == 0) && -+ (counters->spin_locked_atom == 0) && -+ (counters->spin_locked_stack == 0) && -+ (counters->spin_locked_txnmgr == 0) && -+ (counters->spin_locked_inode == 0) && -+ (counters->spin_locked == 0) && -+ (counters->long_term_locked_znode == 0) && -+ (counters->inode_sem_r == 0) && -+ (counters->inode_sem_w == 0) && (counters->d_refs == 0); -+} -+ -+/* -+ * return true, iff transaction commit can be done under locks held by the -+ * current thread. -+ */ -+int reiser4_commit_check_locks(void) -+{ -+ reiser4_lock_counters_info *counters; -+ int inode_sem_r; -+ int inode_sem_w; -+ int result; -+ -+ /* -+ * inode's read/write semaphore is the only reiser4 lock that can be -+ * held during commit. -+ */ -+ -+ counters = reiser4_lock_counters(); -+ inode_sem_r = counters->inode_sem_r; -+ inode_sem_w = counters->inode_sem_w; -+ -+ counters->inode_sem_r = counters->inode_sem_w = 0; -+ result = reiser4_no_counters_are_held(); -+ counters->inode_sem_r = inode_sem_r; -+ counters->inode_sem_w = inode_sem_w; -+ return result; -+} -+ -+/* -+ * fill "error site" in the current reiser4 context. See comment before RETERR -+ * macro for more details. 
-+ */ -+void reiser4_return_err(int code, const char *file, int line) -+{ -+ if (code < 0 && is_in_reiser4_context()) { -+ reiser4_context *ctx = get_current_context(); -+ -+ if (ctx != NULL) { -+ ctx->err.code = code; -+ ctx->err.file = file; -+ ctx->err.line = line; -+ } -+ } -+} -+ -+#if 0 -+/* -+ * report error information recorded by reiser4_return_err(). -+ */ -+static void reiser4_report_err(void) -+{ -+ reiser4_context *ctx = get_current_context_check(); -+ -+ if (ctx != NULL) { -+ if (ctx->err.code != 0) { -+ printk("code: %i at %s:%i\n", -+ ctx->err.code, ctx->err.file, ctx->err.line); -+ } -+ } -+} -+#endif /* 0 */ -+ -+#endif /* REISER4_DEBUG */ -+ -+#if KERNEL_DEBUGGER -+ -+/* -+ * this function just drops into kernel debugger. It is a convenient place to -+ * put breakpoint in. -+ */ -+void reiser4_debugtrap(void) -+{ -+ /* do nothing. Put break point here. */ -+#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE) -+ extern void breakpoint(void); -+ breakpoint(); -+#endif -+} -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/debug.h linux-2.6.20/fs/reiser4/debug.h ---- linux-2.6.20.orig/fs/reiser4/debug.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/debug.h 2007-05-06 14:50:43.702976975 +0400 -@@ -0,0 +1,350 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Declarations of debug macros. */ -+ -+#if !defined( __FS_REISER4_DEBUG_H__ ) -+#define __FS_REISER4_DEBUG_H__ -+ -+#include "forward.h" -+#include "reiser4.h" -+ -+/* generic function to produce formatted output, decorating it with -+ whatever standard prefixes/postfixes we want. "Fun" is a function -+ that will be actually called, can be printk, panic etc. -+ This is for use by other debugging macros, not by users.
*/ -+#define DCALL(lev, fun, reperr, label, format, ...) \ -+({ \ -+ fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \ -+ current->comm, current->pid, __FUNCTION__, \ -+ __FILE__, __LINE__, label, ## __VA_ARGS__); \ -+}) -+ -+/* -+ * cause kernel to crash -+ */ -+#define reiser4_panic(mid, format, ...) \ -+ DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__) -+ -+/* print message with indication of current process, file, line and -+ function */ -+#define reiser4_log(label, format, ...) \ -+ DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__) -+ -+/* Assertion checked during compilation. -+ If "cond" is false (0) we get duplicate case label in switch. -+ Use this to check something like famous -+ cassert (sizeof(struct reiserfs_journal_commit) == 4096) ; -+ in 3.x journal.c. If cassertion fails you get compiler error, -+ so no "maintainer-id". -+*/ -+#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } }) -+ -+#define noop do {;} while(0) -+ -+#if REISER4_DEBUG -+/* version of info that only actually prints anything when _d_ebugging -+ is on */ -+#define dinfo(format, ...) printk(format , ## __VA_ARGS__) -+/* macro to catch logical errors. Put it into `default' clause of -+ switch() statement. */ -+#define impossible(label, format, ...) \ -+ reiser4_panic(label, "impossible: " format , ## __VA_ARGS__) -+/* assert assures that @cond is true. If it is not, reiser4_panic() is -+ called. Use this for checking logical consistency and _never_ call -+ this to check correctness of external data: disk blocks and user-input . */ -+#define assert(label, cond) \ -+({ \ -+ /* call_on_each_assert(); */ \ -+ if (cond) { \ -+ /* put negated check to avoid using !(cond) that would lose \ -+ * warnings for things like assert(a = b); */ \ -+ ; \ -+ } else { \ -+ DEBUGON(1); \ -+ reiser4_panic(label, "assertion failed: %s", #cond); \ -+ } \ -+}) -+ -+/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. 
*/ -+#define check_me( label, expr ) assert( label, ( expr ) ) -+ -+#define ON_DEBUG( exp ) exp -+ -+extern int reiser4_schedulable(void); -+extern void call_on_each_assert(void); -+ -+#else -+ -+#define dinfo( format, args... ) noop -+#define impossible( label, format, args... ) noop -+#define assert( label, cond ) noop -+#define check_me( label, expr ) ( ( void ) ( expr ) ) -+#define ON_DEBUG( exp ) -+#define reiser4_schedulable() might_sleep() -+ -+/* REISER4_DEBUG */ -+#endif -+ -+#if REISER4_DEBUG -+/* per-thread information about lock acquired by this thread. Used by lock -+ * ordering checking in spin_macros.h */ -+typedef struct reiser4_lock_counters_info { -+ int rw_locked_tree; -+ int read_locked_tree; -+ int write_locked_tree; -+ -+ int rw_locked_dk; -+ int read_locked_dk; -+ int write_locked_dk; -+ -+ int rw_locked_cbk_cache; -+ int read_locked_cbk_cache; -+ int write_locked_cbk_cache; -+ -+ int spin_locked_zlock; -+ int spin_locked_jnode; -+ int spin_locked_jload; -+ int spin_locked_txnh; -+ int spin_locked_atom; -+ int spin_locked_stack; -+ int spin_locked_txnmgr; -+ int spin_locked_ktxnmgrd; -+ int spin_locked_fq; -+ int spin_locked_inode; -+ int spin_locked_super_eflush; -+ int spin_locked; -+ int long_term_locked_znode; -+ -+ int inode_sem_r; -+ int inode_sem_w; -+ -+ int d_refs; -+ int x_refs; -+ int t_refs; -+} reiser4_lock_counters_info; -+ -+extern reiser4_lock_counters_info *reiser4_lock_counters(void); -+#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b)) -+ -+/* increment lock-counter @counter, if present */ -+#define LOCK_CNT_INC(counter) \ -+ IN_CONTEXT(++(reiser4_lock_counters()->counter), 0) -+ -+/* decrement lock-counter @counter, if present */ -+#define LOCK_CNT_DEC(counter) \ -+ IN_CONTEXT(--(reiser4_lock_counters()->counter), 0) -+ -+/* check that lock-counter is zero. 
This is for use in assertions */ -+#define LOCK_CNT_NIL(counter) \ -+ IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1) -+ -+/* check that lock-counter is greater than zero. This is for use in -+ * assertions */ -+#define LOCK_CNT_GTZ(counter) \ -+ IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1) -+#define LOCK_CNT_LT(counter,n) \ -+ IN_CONTEXT(reiser4_lock_counters()->counter < n, 1) -+ -+#else /* REISER4_DEBUG */ -+ -+/* no-op versions on the above */ -+ -+typedef struct reiser4_lock_counters_info { -+} reiser4_lock_counters_info; -+ -+#define reiser4_lock_counters() ((reiser4_lock_counters_info *)NULL) -+#define LOCK_CNT_INC(counter) noop -+#define LOCK_CNT_DEC(counter) noop -+#define LOCK_CNT_NIL(counter) (1) -+#define LOCK_CNT_GTZ(counter) (1) -+#define LOCK_CNT_LT(counter,n) (1) -+ -+#endif /* REISER4_DEBUG */ -+ -+#define assert_spin_not_locked(lock) BUG_ON(0) -+#define assert_rw_write_locked(lock) BUG_ON(0) -+#define assert_rw_read_locked(lock) BUG_ON(0) -+#define assert_rw_locked(lock) BUG_ON(0) -+#define assert_rw_not_write_locked(lock) BUG_ON(0) -+#define assert_rw_not_read_locked(lock) BUG_ON(0) -+#define assert_rw_not_locked(lock) BUG_ON(0) -+ -+/* flags controlling debugging behavior. Are set through debug_flags=N mount -+ option. */ -+typedef enum { -+ /* print a lot of information during panic. When this is on all jnodes -+ * are listed. This can be *very* large output. Usually you don't want -+ * this. Especially over serial line. 
*/ -+ REISER4_VERBOSE_PANIC = 0x00000001, -+ /* print a lot of information during umount */ -+ REISER4_VERBOSE_UMOUNT = 0x00000002, -+ /* print gathered statistics on umount */ -+ REISER4_STATS_ON_UMOUNT = 0x00000004, -+ /* check node consistency */ -+ REISER4_CHECK_NODE = 0x00000008 -+} reiser4_debug_flags; -+ -+extern int is_in_reiser4_context(void); -+ -+/* -+ * evaluate expression @e only if with reiser4 context -+ */ -+#define ON_CONTEXT(e) do { \ -+ if(is_in_reiser4_context()) { \ -+ e; \ -+ } } while(0) -+ -+/* -+ * evaluate expression @e only when within reiser4_context and debugging is -+ * on. -+ */ -+#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) ) -+ -+/* -+ * complain about unexpected function result and crash. Used in "default" -+ * branches of switch statements and alike to assert that invalid results are -+ * not silently ignored. -+ */ -+#define wrong_return_value( label, function ) \ -+ impossible( label, "wrong return value from " function ) -+ -+/* Issue different types of reiser4 messages to the console */ -+#define warning( label, format, ... ) \ -+ DCALL( KERN_WARNING, \ -+ printk, 1, label, "WARNING: " format , ## __VA_ARGS__ ) -+#define notice( label, format, ... ) \ -+ DCALL( KERN_NOTICE, \ -+ printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ ) -+ -+/* mark not yet implemented functionality */ -+#define not_yet( label, format, ... ) \ -+ reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ ) -+ -+extern void reiser4_do_panic(const char *format, ...) -+ __attribute__ ((noreturn, format(printf, 1, 2))); -+ -+extern int reiser4_preempt_point(void); -+extern void reiser4_print_stats(void); -+ -+#if REISER4_DEBUG -+extern int reiser4_no_counters_are_held(void); -+extern int reiser4_commit_check_locks(void); -+#else -+#define reiser4_no_counters_are_held() (1) -+#define reiser4_commit_check_locks() (1) -+#endif -+ -+/* true if @i is power-of-two. Useful for rate-limited warnings, etc. 
*/ -+#define IS_POW(i) \ -+({ \ -+ typeof(i) __i; \ -+ \ -+ __i = (i); \ -+ !(__i & (__i - 1)); \ -+}) -+ -+#define KERNEL_DEBUGGER (1) -+ -+#if KERNEL_DEBUGGER -+ -+extern void reiser4_debugtrap(void); -+ -+/* -+ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If -+ * kgdb is not compiled in, do nothing. -+ */ -+#define DEBUGON(cond) \ -+({ \ -+ if (unlikely(cond)) \ -+ reiser4_debugtrap(); \ -+}) -+#else -+#define DEBUGON(cond) noop -+#endif -+ -+/* -+ * Error code tracing facility. (Idea is borrowed from XFS code.) -+ * -+ * Suppose some strange and/or unexpected code is returned from some function -+ * (for example, write(2) returns -EEXIST). It is possible to place a -+ * breakpoint in the reiser4_write(), but it is too late here. How to find out -+ * in what particular place -EEXIST was generated first? -+ * -+ * In reiser4 all places where actual error codes are produced (that is, -+ * statements of the form -+ * -+ * return -EFOO; // (1), or -+ * -+ * result = -EFOO; // (2) -+ * -+ * are replaced with -+ * -+ * return RETERR(-EFOO); // (1a), and -+ * -+ * result = RETERR(-EFOO); // (2a) respectively -+ * -+ * RETERR() macro fills a backtrace in reiser4_context. This back-trace is -+ * printed in error and warning messages. Moreover, it's possible to put a -+ * conditional breakpoint in reiser4_return_err (low-level function called -+ * by RETERR() to do the actual work) to break into debugger immediately -+ * when particular error happens. -+ * -+ */ -+ -+#if REISER4_DEBUG -+ -+/* -+ * data-type to store information about where error happened ("error site"). -+ */ -+typedef struct err_site { -+ int code; /* error code */ -+ const char *file; /* source file, filled by __FILE__ */ -+ int line; /* source file line, filled by __LINE__ */ -+} err_site; -+ -+extern void reiser4_return_err(int code, const char *file, int line); -+ -+/* -+ * fill &get_current_context()->err_site with error information. 
-+ */ -+#define RETERR(code) \ -+({ \ -+ typeof(code) __code; \ -+ \ -+ __code = (code); \ -+ reiser4_return_err(__code, __FILE__, __LINE__); \ -+ __code; \ -+}) -+ -+#else -+ -+/* -+ * no-op versions of the above -+ */ -+ -+typedef struct err_site { -+} err_site; -+#define RETERR(code) code -+#endif -+ -+#if REISER4_LARGE_KEY -+/* -+ * conditionally compile arguments only if REISER4_LARGE_KEY is on. -+ */ -+#define ON_LARGE_KEY(...) __VA_ARGS__ -+#else -+#define ON_LARGE_KEY(...) -+#endif -+ -+/* __FS_REISER4_DEBUG_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/dformat.h linux-2.6.20/fs/reiser4/dformat.h ---- linux-2.6.20.orig/fs/reiser4/dformat.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/dformat.h 2007-05-06 14:50:43.702976975 +0400 -@@ -0,0 +1,70 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Formats of on-disk data and conversion functions. */ -+ -+/* put all item formats in the files describing the particular items, -+ our model is, everything you need to do to add an item to reiser4, -+ (excepting the changes to the plugin that uses the item which go -+ into the file defining that plugin), you put into one file. */ -+/* Data on disk are stored in little-endian format. -+ To declare fields of on-disk structures, use d8, d16, d32 and d64. -+ d??tocpu() and cputod??() to convert. 
*/ -+ -+#if !defined( __FS_REISER4_DFORMAT_H__ ) -+#define __FS_REISER4_DFORMAT_H__ -+ -+#include -+#include -+#include -+ -+typedef __u8 d8; -+typedef __le16 d16; -+typedef __le32 d32; -+typedef __le64 d64; -+ -+#define PACKED __attribute__((packed)) -+ -+/* data-type for block number */ -+typedef __u64 reiser4_block_nr; -+ -+/* data-type for block number on disk, disk format */ -+typedef __le64 reiser4_dblock_nr; -+ -+/** -+ * disk_addr_eq - compare disk addresses -+ * @b1: pointer to block number to compare -+ * @b2: pointer to block number to compare -+ * -+ * Returns true if the disk addresses are the same -+ */ -+static inline int disk_addr_eq(const reiser4_block_nr *b1, -+ const reiser4_block_nr * b2) -+{ -+ assert("nikita-1033", b1 != NULL); -+ assert("nikita-1266", b2 != NULL); -+ -+ return !memcmp(b1, b2, sizeof *b1); -+} -+ -+/* structure of master reiser4 super block */ -+typedef struct reiser4_master_sb { -+ char magic[16]; /* "ReIsEr4" */ -+ __le16 disk_plugin_id; /* id of disk layout plugin */ -+ __le16 blocksize; -+ char uuid[16]; /* unique id */ -+ char label[16]; /* filesystem label */ -+ __le64 diskmap; /* location of the diskmap. 0 if not present */ -+} reiser4_master_sb; -+ -+/* __FS_REISER4_DFORMAT_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/dscale.c linux-2.6.20/fs/reiser4/dscale.c ---- linux-2.6.20.orig/fs/reiser4/dscale.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/dscale.c 2007-05-06 14:50:43.702976975 +0400 -@@ -0,0 +1,174 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Scalable on-disk integers */ -+ -+/* -+ * Various on-disk structures contain integer-like structures.
Stat-data -+ * contain [yes, "data" is plural, check the dictionary] file size, link -+ * count; extent unit contains extent width etc. To accommodate for general -+ * case enough space is reserved to keep largest possible value. 64 bits in -+ * all cases above. But in overwhelming majority of cases numbers actually -+ * stored in these fields will be comparatively small and reserving 8 bytes is -+ * a waste of precious disk bandwidth. -+ * -+ * Scalable integers are one way to solve this problem. dscale_write() -+ * function stores __u64 value in the given area consuming from 1 to 9 bytes, -+ * depending on the magnitude of the value supplied. dscale_read() reads value -+ * previously stored by dscale_write(). -+ * -+ * dscale_write() produces format not completely unlike of UTF: two highest -+ * bits of the first byte are used to store "tag". One of 4 possible tag -+ * values is chosen depending on the number being encoded: -+ * -+ * 0 ... 0x3f => 0 [table 1] -+ * 0x40 ... 0x3fff => 1 -+ * 0x4000 ... 0x3fffffff => 2 -+ * 0x40000000 ... 0xffffffffffffffff => 3 -+ * -+ * (see dscale_range() function) -+ * -+ * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes -+ * to be stored, so in this case there is no place in the first byte to store -+ * tag. For such values tag is stored in an extra 9th byte. -+ * -+ * As _highest_ bits are used for the test (which is natural) scaled integers -+ * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which -+ * uses LITTLE-ENDIAN. -+ * -+ */ -+ -+#include "debug.h" -+#include "dscale.h" -+ -+/* return tag of scaled integer stored at @address */ -+static int gettag(const unsigned char *address) -+{ -+ /* tag is stored in two highest bits */ -+ return (*address) >> 6; -+} -+ -+/* clear tag from value. Clear tag embedded into @value. */ -+static void cleartag(__u64 * value, int tag) -+{ -+ /* -+ * W-w-what ?! 
-+ * -+ * Actually, this is rather simple: @value passed here was read by -+ * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by -+ * zeroes. Tag is still stored in the highest (arithmetically) -+ * non-zero bits of @value, but relative position of tag within __u64 -+ * depends on @tag. -+ * -+ * For example if @tag is 0, it's stored 2 highest bits of lowest -+ * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits. -+ * -+ * If tag is 1, it's stored in two highest bits of 2nd lowest byte, -+ * and it's offset if (2 * 8) - 2 == 14 bits. -+ * -+ * See table 1 above for details. -+ * -+ * All these cases are captured by the formula: -+ */ -+ *value &= ~(3 << (((1 << tag) << 3) - 2)); -+ /* -+ * That is, clear two (3 == 0t11) bits at the offset -+ * -+ * 8 * (2 ^ tag) - 2, -+ * -+ * that is, two highest bits of (2 ^ tag)-th byte of @value. -+ */ -+} -+ -+/* return tag for @value. See table 1 above for details. */ -+static int dscale_range(__u64 value) -+{ -+ if (value > 0x3fffffff) -+ return 3; -+ if (value > 0x3fff) -+ return 2; -+ if (value > 0x3f) -+ return 1; -+ return 0; -+} -+ -+/* restore value stored at @adderss by dscale_write() and return number of -+ * bytes consumed */ -+int dscale_read(unsigned char *address, __u64 * value) -+{ -+ int tag; -+ -+ /* read tag */ -+ tag = gettag(address); -+ switch (tag) { -+ case 3: -+ /* In this case tag is stored in an extra byte, skip this byte -+ * and decode value stored in the next 8 bytes.*/ -+ *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1))); -+ /* worst case: 8 bytes for value itself plus one byte for -+ * tag. 
*/ -+ return 9; -+ case 0: -+ *value = get_unaligned(address); -+ break; -+ case 1: -+ *value = __be16_to_cpu(get_unaligned((__be16 *)address)); -+ break; -+ case 2: -+ *value = __be32_to_cpu(get_unaligned((__be32 *)address)); -+ break; -+ default: -+ return RETERR(-EIO); -+ } -+ /* clear tag embedded into @value */ -+ cleartag(value, tag); -+ /* number of bytes consumed is (2 ^ tag)---see table 1. */ -+ return 1 << tag; -+} -+ -+/* store @value at @address and return number of bytes consumed */ -+int dscale_write(unsigned char *address, __u64 value) -+{ -+ int tag; -+ int shift; -+ __be64 v; -+ unsigned char *valarr; -+ -+ tag = dscale_range(value); -+ v = __cpu_to_be64(value); -+ valarr = (unsigned char *)&v; -+ shift = (tag == 3) ? 1 : 0; -+ memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag); -+ *address |= (tag << 6); -+ return shift + (1 << tag); -+} -+ -+/* number of bytes required to store @value */ -+int dscale_bytes(__u64 value) -+{ -+ int bytes; -+ -+ bytes = 1 << dscale_range(value); -+ if (bytes == 8) -+ ++bytes; -+ return bytes; -+} -+ -+/* returns true if @value and @other require the same number of bytes to be -+ * stored. Used by detect when data structure (like stat-data) has to be -+ * expanded or contracted. */ -+int dscale_fit(__u64 value, __u64 other) -+{ -+ return dscale_range(value) == dscale_range(other); -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/dscale.h linux-2.6.20/fs/reiser4/dscale.h ---- linux-2.6.20.orig/fs/reiser4/dscale.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/dscale.h 2007-05-06 14:50:43.702976975 +0400 -@@ -0,0 +1,27 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Scalable on-disk integers. See dscale.h for details. 
*/ -+ -+#if !defined( __FS_REISER4_DSCALE_H__ ) -+#define __FS_REISER4_DSCALE_H__ -+ -+#include "dformat.h" -+ -+extern int dscale_read(unsigned char *address, __u64 * value); -+extern int dscale_write(unsigned char *address, __u64 value); -+extern int dscale_bytes(__u64 value); -+extern int dscale_fit(__u64 value, __u64 other); -+ -+/* __FS_REISER4_DSCALE_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/entd.c linux-2.6.20/fs/reiser4/entd.c ---- linux-2.6.20.orig/fs/reiser4/entd.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/entd.c 2007-05-06 14:50:43.702976975 +0400 -@@ -0,0 +1,335 @@ -+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Ent daemon. */ -+ -+#include "debug.h" -+#include "txnmgr.h" -+#include "tree.h" -+#include "entd.h" -+#include "super.h" -+#include "context.h" -+#include "reiser4.h" -+#include "vfs_ops.h" -+#include "page_cache.h" -+#include "inode.h" -+ -+#include /* struct task_struct */ -+#include -+#include -+#include -+#include /* INITIAL_JIFFIES */ -+#include /* bdi_write_congested */ -+#include -+#include -+#include -+ -+#define DEF_PRIORITY 12 -+#define MAX_ENTD_ITERS 10 -+ -+static void entd_flush(struct super_block *, struct wbq *); -+static int entd(void *arg); -+ -+/* -+ * set ->comm field of ent thread to make its state visible to the user level. Note: the macro refers to a variable named super taken from the scope where it is expanded. -+ */ -+#define entd_set_comm(state) \ -+ snprintf(current->comm, sizeof(current->comm), \ -+ "ent:%s%s", super->s_id, (state)) -+ -+/** -+ * reiser4_init_entd - initialize entd context and start kernel daemon -+ * @super: super block to start ent thread for -+ * -+ * Creates entd contexts, starts kernel thread and waits until it -+ * initializes.
-+ */ -+int reiser4_init_entd(struct super_block *super) -+{ -+ entd_context *ctx; -+ -+ assert("nikita-3104", super != NULL); -+ -+ ctx = get_entd_context(super); -+ -+ memset(ctx, 0, sizeof *ctx); -+ spin_lock_init(&ctx->guard); -+ init_waitqueue_head(&ctx->wait); -+#if REISER4_DEBUG -+ INIT_LIST_HEAD(&ctx->flushers_list); -+#endif -+ /* lists of writepage requests */ -+ INIT_LIST_HEAD(&ctx->todo_list); -+ INIT_LIST_HEAD(&ctx->done_list); -+ /* start entd */ -+ ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id); -+ if (IS_ERR(ctx->tsk)) -+ return PTR_ERR(ctx->tsk); -+ return 0; -+} -+ -+static void put_wbq(struct wbq *rq) -+{ -+ iput(rq->mapping->host); -+ complete(&rq->completion); -+} -+ -+/* ent should be locked */ -+static struct wbq *__get_wbq(entd_context * ent) -+{ -+ struct wbq *wbq; -+ -+ if (list_empty(&ent->todo_list)) -+ return NULL; -+ -+ ent->nr_todo_reqs --; -+ wbq = list_entry(ent->todo_list.next, struct wbq, link); -+ list_del_init(&wbq->link); -+ return wbq; -+} -+ -+/* ent thread function */ -+static int entd(void *arg) -+{ -+ struct super_block *super; -+ entd_context *ent; -+ int done = 0; -+ -+ super = arg; -+ /* do_fork() just copies task_struct into the new -+ thread. ->fs_context shouldn't be copied of course. This shouldn't -+ be a problem for the rest of the code though. 
-+ */ -+ current->journal_info = NULL; -+ -+ ent = get_entd_context(super); -+ -+ while (!done) { -+ try_to_freeze(); -+ -+ spin_lock(&ent->guard); -+ while (ent->nr_todo_reqs != 0) { -+ struct wbq *rq; -+ -+ assert("", list_empty(&ent->done_list)); -+ -+ /* take request from the queue head */ -+ rq = __get_wbq(ent); -+ assert("", rq != NULL); -+ ent->cur_request = rq; -+ spin_unlock(&ent->guard); -+ -+ entd_set_comm("!"); -+ entd_flush(super, rq); -+ -+ put_wbq(rq); -+ -+ /* -+ * wakeup all requestors and iput their inodes -+ */ -+ spin_lock(&ent->guard); -+ while (!list_empty(&ent->done_list)) { -+ rq = list_entry(ent->done_list.next, struct wbq, link); -+ list_del_init(&rq->link); -+ ent->nr_done_reqs --; -+ spin_unlock(&ent->guard); -+ assert("", rq->written == 1); -+ put_wbq(rq); -+ spin_lock(&ent->guard); -+ } -+ } -+ spin_unlock(&ent->guard); -+ -+ entd_set_comm("."); -+ -+ { -+ DEFINE_WAIT(__wait); -+ -+ do { -+ prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE); -+ if (kthread_should_stop()) { -+ done = 1; -+ break; -+ } -+ if (ent->nr_todo_reqs != 0) -+ break; -+ schedule(); -+ } while (0); -+ finish_wait(&ent->wait, &__wait); -+ } -+ } -+ BUG_ON(ent->nr_todo_reqs != 0); -+ return 0; -+} -+ -+/** -+ * reiser4_done_entd - stop entd kernel thread -+ * @super: super block to stop ent thread for -+ * -+ * It is called on umount. Sends stop signal to entd and wait until it handles -+ * it. 
-+ */ -+void reiser4_done_entd(struct super_block *super) -+{ -+ entd_context *ent; -+ -+ assert("nikita-3103", super != NULL); -+ -+ ent = get_entd_context(super); -+ assert("zam-1055", ent->tsk != NULL); -+ kthread_stop(ent->tsk); -+} -+ -+/* called at the beginning of jnode_flush to register flusher thread with ent -+ * daemon */ -+void reiser4_enter_flush(struct super_block *super) -+{ -+ entd_context *ent; -+ -+ assert("zam-1029", super != NULL); -+ ent = get_entd_context(super); -+ -+ assert("zam-1030", ent != NULL); -+ -+ spin_lock(&ent->guard); -+ ent->flushers++; -+#if REISER4_DEBUG -+ list_add(&get_current_context()->flushers_link, &ent->flushers_list); -+#endif -+ spin_unlock(&ent->guard); -+} -+ -+/* called at the end of jnode_flush */ -+void reiser4_leave_flush(struct super_block *super) -+{ -+ entd_context *ent; -+ int wake_up_ent; -+ -+ assert("zam-1027", super != NULL); -+ ent = get_entd_context(super); -+ -+ assert("zam-1028", ent != NULL); -+ -+ spin_lock(&ent->guard); -+ ent->flushers--; -+ wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0); -+#if REISER4_DEBUG -+ list_del_init(&get_current_context()->flushers_link); -+#endif -+ spin_unlock(&ent->guard); -+ if (wake_up_ent) -+ wake_up(&ent->wait); -+} -+ -+#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX -+ -+static void entd_flush(struct super_block *super, struct wbq *rq) -+{ -+ reiser4_context ctx; -+ int tmp; -+ -+ init_stack_context(&ctx, super); -+ ctx.entd = 1; -+ ctx.gfp_mask = GFP_NOFS; -+ -+ rq->wbc->range_start = page_offset(rq->page); -+ rq->wbc->range_end = rq->wbc->range_start + -+ (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT); -+ tmp = rq->wbc->nr_to_write; -+ rq->mapping->a_ops->writepages(rq->mapping, rq->wbc); -+ -+ if (rq->wbc->nr_to_write > 0) { -+ rq->wbc->range_start = 0; -+ rq->wbc->range_end = LLONG_MAX; -+ generic_sync_sb_inodes(super, rq->wbc); -+ } -+ rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST; -+ reiser4_writeout(super, rq->wbc); -+ -+ 
context_set_commit_async(&ctx); -+ reiser4_exit_context(&ctx); -+} -+ -+/** -+ * write_page_by_ent - ask entd thread to flush this page as part of slum -+ * @page: page to be written -+ * @wbc: writeback control passed to reiser4_writepage -+ * -+ * Creates a request, puts it on entd list of requests, wakeups entd if -+ * necessary, waits until entd completes with the request. -+ */ -+int write_page_by_ent(struct page *page, struct writeback_control *wbc) -+{ -+ struct super_block *sb; -+ struct inode *inode; -+ entd_context *ent; -+ struct wbq rq; -+ -+ assert("", PageLocked(page)); -+ assert("", page->mapping != NULL); -+ -+ sb = page->mapping->host->i_sb; -+ ent = get_entd_context(sb); -+ assert("", ent && ent->done == 0); -+ -+ /* -+ * we are going to unlock page and ask ent thread to write the -+ * page. Re-dirty page before unlocking so that if ent thread fails to -+ * write it - it will remain dirty -+ */ -+ reiser4_set_page_dirty_internal(page); -+ -+ /* -+ * pin inode in memory, unlock page, entd_flush will iput. We can not -+ * iput here becasue we can not allow delete_inode to be called here -+ */ -+ inode = igrab(page->mapping->host); -+ unlock_page(page); -+ if (inode == NULL) -+ /* inode is getting freed */ -+ return 0; -+ -+ /* init wbq */ -+ INIT_LIST_HEAD(&rq.link); -+ rq.magic = WBQ_MAGIC; -+ rq.wbc = wbc; -+ rq.page = page; -+ rq.mapping = inode->i_mapping; -+ rq.node = NULL; -+ rq.written = 0; -+ init_completion(&rq.completion); -+ -+ /* add request to entd's list of writepage requests */ -+ spin_lock(&ent->guard); -+ ent->nr_todo_reqs++; -+ list_add_tail(&rq.link, &ent->todo_list); -+ if (ent->nr_todo_reqs == 1) -+ wake_up(&ent->wait); -+ -+ spin_unlock(&ent->guard); -+ -+ /* wait until entd finishes */ -+ wait_for_completion(&rq.completion); -+ -+ if (rq.written) -+ /* Eventually ENTD has written the page to disk. 
*/ -+ return 0; -+ return 0; -+} -+ -+int wbq_available(void) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ entd_context *ent = get_entd_context(sb); -+ return ent->nr_todo_reqs; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/entd.h linux-2.6.20/fs/reiser4/entd.h ---- linux-2.6.20.orig/fs/reiser4/entd.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/entd.h 2007-05-06 14:50:43.706978224 +0400 -@@ -0,0 +1,90 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Ent daemon. */ -+ -+#ifndef __ENTD_H__ -+#define __ENTD_H__ -+ -+#include "context.h" -+ -+#include -+#include -+#include -+#include -+#include /* for struct task_struct */ -+ -+#define WBQ_MAGIC 0x7876dc76 -+ -+/* write-back request. */ -+struct wbq { -+ int magic; -+ struct list_head link; /* list head of this list is in entd context */ -+ struct writeback_control *wbc; -+ struct page *page; -+ struct address_space *mapping; -+ struct completion completion; -+ jnode *node; /* set if ent thread captured requested page */ -+ int written; /* set if ent thread wrote requested page */ -+}; -+ -+/* ent-thread context. This is used to synchronize starting/stopping ent -+ * threads. */ -+typedef struct entd_context { -+ /* wait queue that ent thread waits on for more work. It's -+ * signaled by write_page_by_ent(). */ -+ wait_queue_head_t wait; -+ /* spinlock protecting other fields */ -+ spinlock_t guard; -+ /* ent thread */ -+ struct task_struct *tsk; -+ /* set to indicate that ent thread should leave. 
*/ -+ int done; -+ /* counter of active flushers */ -+ int flushers; -+ /* -+ * when reiser4_writepage asks entd to write a page - it adds struct -+ * wbq to this list -+ */ -+ struct list_head todo_list; -+ /* number of elements on the above list */ -+ int nr_todo_reqs; -+ -+ struct wbq *cur_request; -+ /* -+ * when entd writes a page it moves write-back request from todo_list -+ * to done_list. This list is used at the end of entd iteration to -+ * wakeup requestors and iput inodes. -+ */ -+ struct list_head done_list; -+ /* number of elements on the above list */ -+ int nr_done_reqs; -+ -+#if REISER4_DEBUG -+ /* list of all active flushers */ -+ struct list_head flushers_list; -+#endif -+} entd_context; -+ -+extern int reiser4_init_entd(struct super_block *); -+extern void reiser4_done_entd(struct super_block *); -+ -+extern void reiser4_enter_flush(struct super_block *); -+extern void reiser4_leave_flush(struct super_block *); -+ -+extern int write_page_by_ent(struct page *, struct writeback_control *); -+extern int wbq_available(void); -+extern void ent_writes_page(struct super_block *, struct page *); -+ -+extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *); -+/* __ENTD_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/eottl.c linux-2.6.20/fs/reiser4/eottl.c ---- linux-2.6.20.orig/fs/reiser4/eottl.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/eottl.c 2007-05-06 14:50:43.706978224 +0400 -@@ -0,0 +1,509 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "tree_mod.h" -+#include "carry.h" -+#include "tree.h" -+#include "super.h" -+ -+#include /* for __u?? */ -+ -+/* -+ * Extents on the twig level (EOTTL) handling. -+ * -+ * EOTTL poses some problems to the tree traversal, that are better explained -+ * by example. -+ * -+ * Suppose we have block B1 on the twig level with the following items: -+ * -+ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id, -+ * offset) -+ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each -+ * 2. internal item I2 with key (10:0:0:0) -+ * -+ * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and -+ * then intra-node lookup is done. This lookup finished on the E1, because the -+ * key we are looking for is larger than the key of E1 and is smaller than key -+ * the of I2. -+ * -+ * Here search is stuck. -+ * -+ * After some thought it is clear what is wrong here: extents on the twig level -+ * break some basic property of the *search* tree (on the pretext, that they -+ * restore property of balanced tree). -+ * -+ * Said property is the following: if in the internal node of the search tree -+ * we have [ ... Key1 Pointer Key2 ... 
] then, all data that are or will be -+ * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible -+ * through the Pointer. -+ * -+ * This is not true, when Pointer is Extent-Pointer, simply because extent -+ * cannot expand indefinitely to the right to include any item with -+ * -+ * Key1 <= Key <= Key2. -+ * -+ * For example, our E1 extent is only responsible for the data with keys -+ * -+ * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and -+ * -+ * so, key range -+ * -+ * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) ) -+ * -+ * is orphaned: there is no way to get there from the tree root. -+ * -+ * In other words, extent pointers are different than normal child pointers as -+ * far as search tree is concerned, and this creates such problems. -+ * -+ * Possible solution for this problem is to insert our item into node pointed -+ * to by I2. There are some problems through: -+ * -+ * (1) I2 can be in a different node. -+ * (2) E1 can be immediately followed by another extent E2. -+ * -+ * (1) is solved by calling reiser4_get_right_neighbor() and accounting -+ * for locks/coords as necessary. -+ * -+ * (2) is more complex. Solution here is to insert new empty leaf node and -+ * insert internal item between E1 and E2 pointing to said leaf node. This is -+ * further complicated by possibility that E2 is in a different node, etc. -+ * -+ * Problems: -+ * -+ * (1) if there was internal item I2 immediately on the right of an extent E1 -+ * we and we decided to insert new item S1 into node N2 pointed to by I2, then -+ * key of S1 will be less than smallest key in the N2. Normally, search key -+ * checks that key we are looking for is in the range of keys covered by the -+ * node key is being looked in. To work around of this situation, while -+ * preserving useful consistency check new flag CBK_TRUST_DK was added to the -+ * cbk falgs bitmask. 
This flag is automatically set on entrance to the -+ * coord_by_key() and is only cleared when we are about to enter situation -+ * described above. -+ * -+ * (2) If extent E1 is immediately followed by another extent E2 and we are -+ * searching for the key that is between E1 and E2 we only have to insert new -+ * empty leaf node when coord_by_key was called for insertion, rather than just -+ * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to -+ * the cbk falgs bitmask. This flag is automatically set by coord_by_key calls -+ * performed by insert_by_key() and friends. -+ * -+ * (3) Insertion of new empty leaf node (possibly) requires balancing. In any -+ * case it requires modification of node content which is only possible under -+ * write lock. It may well happen that we only have read lock on the node where -+ * new internal pointer is to be inserted (common case: lookup of non-existent -+ * stat-data that fells between two extents). If only read lock is held, tree -+ * traversal is restarted with lock_level modified so that next time we hit -+ * this problem, write lock will be held. Once we have write lock, balancing -+ * will be performed. -+ */ -+ -+/** -+ * is_next_item_internal - check whether next item is internal -+ * @coord: coordinate of extent item in twig node -+ * @key: search key -+ * @lh: twig node lock handle -+ * -+ * Looks at the unit next to @coord. If it is an internal one - 1 is returned, -+ * @coord is set to that unit. If that unit is in right neighbor, @lh is moved -+ * to that node, @coord is set to its first unit. If next item is not internal -+ * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2 -+ * is returned if search restart has to be done. 
-+ */ -+static int -+is_next_item_internal(coord_t *coord, const reiser4_key *key, -+ lock_handle *lh) -+{ -+ coord_t next; -+ lock_handle rn; -+ int result; -+ -+ coord_dup(&next, coord); -+ if (coord_next_unit(&next) == 0) { -+ /* next unit is in this node */ -+ if (item_is_internal(&next)) { -+ coord_dup(coord, &next); -+ return 1; -+ } -+ assert("vs-3", item_is_extent(&next)); -+ return 0; -+ } -+ -+ /* -+ * next unit either does not exist or is in right neighbor. If it is in -+ * right neighbor we have to check right delimiting key because -+ * concurrent thread could get their first and insert item with a key -+ * smaller than @key -+ */ -+ read_lock_dk(current_tree); -+ result = keycmp(key, znode_get_rd_key(coord->node)); -+ read_unlock_dk(current_tree); -+ assert("vs-6", result != EQUAL_TO); -+ if (result == GREATER_THAN) -+ return 2; -+ -+ /* lock right neighbor */ -+ init_lh(&rn); -+ result = reiser4_get_right_neighbor(&rn, coord->node, -+ znode_is_wlocked(coord->node) ? -+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result == -E_NO_NEIGHBOR) { -+ /* we are on the rightmost edge of the tree */ -+ done_lh(&rn); -+ return 0; -+ } -+ -+ if (result) { -+ assert("vs-4", result < 0); -+ done_lh(&rn); -+ return result; -+ } -+ -+ /* -+ * check whether concurrent thread managed to insert item with a key -+ * smaller than @key -+ */ -+ read_lock_dk(current_tree); -+ result = keycmp(key, znode_get_ld_key(rn.node)); -+ read_unlock_dk(current_tree); -+ assert("vs-6", result != EQUAL_TO); -+ if (result == GREATER_THAN) { -+ done_lh(&rn); -+ return 2; -+ } -+ -+ result = zload(rn.node); -+ if (result) { -+ assert("vs-5", result < 0); -+ done_lh(&rn); -+ return result; -+ } -+ -+ coord_init_first_unit(&next, rn.node); -+ if (item_is_internal(&next)) { -+ /* -+ * next unit is in right neighbor and it is an unit of internal -+ * item. Unlock coord->node. Move @lh to right neighbor. @coord -+ * is set to the first unit of right neighbor. 
-+ */ -+ coord_dup(coord, &next); -+ zrelse(rn.node); -+ done_lh(lh); -+ move_lh(lh, &rn); -+ return 1; -+ } -+ -+ /* -+ * next unit is unit of extent item. Return without chaning @lh and -+ * @coord. -+ */ -+ assert("vs-6", item_is_extent(&next)); -+ zrelse(rn.node); -+ done_lh(&rn); -+ return 0; -+} -+ -+/** -+ * rd_key - calculate key of an item next to the given one -+ * @coord: position in a node -+ * @key: storage for result key -+ * -+ * @coord is set between items or after the last item in a node. Calculate key -+ * of item to the right of @coord. -+ */ -+static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key) -+{ -+ coord_t dup; -+ -+ assert("nikita-2281", coord_is_between_items(coord)); -+ coord_dup(&dup, coord); -+ -+ if (coord_set_to_right(&dup) == 0) -+ /* next item is in this node. Return its key. */ -+ unit_key_by_coord(&dup, key); -+ else { -+ /* -+ * next item either does not exist or is in right -+ * neighbor. Return znode's right delimiting key. -+ */ -+ read_lock_dk(current_tree); -+ *key = *znode_get_rd_key(coord->node); -+ read_unlock_dk(current_tree); -+ } -+ return key; -+} -+ -+/** -+ * add_empty_leaf - insert empty leaf between two extents -+ * @insert_coord: position in twig node between two extents -+ * @lh: twig node lock handle -+ * @key: left delimiting key of new node -+ * @rdkey: right delimiting key of new node -+ * -+ * Inserts empty leaf node between two extent items. It is necessary when we -+ * have to insert an item on leaf level between two extents (items on the twig -+ * level). 
-+ */ -+static int -+add_empty_leaf(coord_t *insert_coord, lock_handle *lh, -+ const reiser4_key *key, const reiser4_key *rdkey) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *todo; -+ reiser4_item_data *item; -+ carry_insert_data *cdata; -+ carry_op *op; -+ znode *node; -+ reiser4_tree *tree; -+ -+ assert("vs-49827", znode_contains_key_lock(insert_coord->node, key)); -+ tree = znode_get_tree(insert_coord->node); -+ node = reiser4_new_node(insert_coord->node, LEAF_LEVEL); -+ if (IS_ERR(node)) -+ return PTR_ERR(node); -+ -+ /* setup delimiting keys for node being inserted */ -+ write_lock_dk(tree); -+ znode_set_ld_key(node, key); -+ znode_set_rd_key(node, rdkey); -+ ON_DEBUG(node->creator = current); -+ ON_DEBUG(node->first_key = *key); -+ write_unlock_dk(tree); -+ -+ ZF_SET(node, JNODE_ORPHAN); -+ -+ /* -+ * allocate carry_pool, 3 carry_level-s, reiser4_item_data and -+ * carry_insert_data -+ */ -+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) + -+ sizeof(*item) + sizeof(*cdata)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ todo = (carry_level *) (pool + 1); -+ init_carry_level(todo, pool); -+ -+ item = (reiser4_item_data *) (todo + 3); -+ cdata = (carry_insert_data *) (item + 1); -+ -+ op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0); -+ if (!IS_ERR(op)) { -+ cdata->coord = insert_coord; -+ cdata->key = key; -+ cdata->data = item; -+ op->u.insert.d = cdata; -+ op->u.insert.type = COPT_ITEM_DATA; -+ build_child_ptr_data(node, item); -+ item->arg = NULL; -+ /* have @insert_coord to be set at inserted item after -+ insertion is done */ -+ todo->track_type = CARRY_TRACK_CHANGE; -+ todo->tracked = lh; -+ -+ result = reiser4_carry(todo, NULL); -+ if (result == 0) { -+ /* -+ * pin node in memory. This is necessary for -+ * znode_make_dirty() below. 
-+ */ -+ result = zload(node); -+ if (result == 0) { -+ lock_handle local_lh; -+ -+ /* -+ * if we inserted new child into tree we have -+ * to mark it dirty so that flush will be able -+ * to process it. -+ */ -+ init_lh(&local_lh); -+ result = longterm_lock_znode(&local_lh, node, -+ ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_LOPRI); -+ if (result == 0) { -+ znode_make_dirty(node); -+ -+ /* -+ * when internal item pointing to @node -+ * was inserted into twig node -+ * create_hook_internal did not connect -+ * it properly because its right -+ * neighbor was not known. Do it -+ * here -+ */ -+ write_lock_tree(tree); -+ assert("nikita-3312", -+ znode_is_right_connected(node)); -+ assert("nikita-2984", -+ node->right == NULL); -+ ZF_CLR(node, JNODE_RIGHT_CONNECTED); -+ write_unlock_tree(tree); -+ result = -+ connect_znode(insert_coord, node); -+ ON_DEBUG(if (result == 0) check_dkeys(node);); -+ -+ done_lh(lh); -+ move_lh(lh, &local_lh); -+ assert("vs-1676", node_is_empty(node)); -+ coord_init_first_unit(insert_coord, -+ node); -+ } else { -+ warning("nikita-3136", -+ "Cannot lock child"); -+ } -+ done_lh(&local_lh); -+ zrelse(node); -+ } -+ } -+ } else -+ result = PTR_ERR(op); -+ zput(node); -+ done_carry_pool(pool); -+ return result; -+} -+ -+/** -+ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal -+ * @h: search handle -+ * @outcome: flag saying whether search has to restart or is done -+ * -+ * Handles search on twig level. If this function completes search itself then -+ * it returns 1. If search has to go one level down then 0 is returned. If -+ * error happens then LOOKUP_DONE is returned via @outcome and error code is saved -+ * in @h->result. -+ */ -+int handle_eottl(cbk_handle *h, int *outcome) -+{ -+ int result; -+ reiser4_key key; -+ coord_t *coord; -+ -+ coord = h->coord; -+ -+ if (h->level != TWIG_LEVEL || -+ (coord_is_existing_item(coord) && item_is_internal(coord))) { -+ /* Continue to traverse tree downward. 
*/ -+ return 0; -+ } -+ -+ /* -+ * make sure that @h->coord is set to twig node and that it is either -+ * set to extent item or after extent item -+ */ -+ assert("vs-356", h->level == TWIG_LEVEL); -+ assert("vs-357", ( { -+ coord_t lcoord; -+ coord_dup(&lcoord, coord); -+ check_me("vs-733", coord_set_to_left(&lcoord) == 0); -+ item_is_extent(&lcoord); -+ } -+ )); -+ -+ if (*outcome == NS_FOUND) { -+ /* we have found desired key on twig level in extent item */ -+ h->result = CBK_COORD_FOUND; -+ *outcome = LOOKUP_DONE; -+ return 1; -+ } -+ -+ if (!(h->flags & CBK_FOR_INSERT)) { -+ /* tree traversal is not for insertion. Just return -+ CBK_COORD_NOTFOUND. */ -+ h->result = CBK_COORD_NOTFOUND; -+ *outcome = LOOKUP_DONE; -+ return 1; -+ } -+ -+ /* take a look at the item to the right of h -> coord */ -+ result = is_next_item_internal(coord, h->key, h->active_lh); -+ if (unlikely(result < 0)) { -+ h->error = "get_right_neighbor failed"; -+ h->result = result; -+ *outcome = LOOKUP_DONE; -+ return 1; -+ } -+ if (result == 0) { -+ /* -+ * item to the right is also an extent one. Allocate a new node -+ * and insert pointer to it after item h -> coord. -+ * -+ * This is a result of extents being located at the twig -+ * level. For explanation, see comment just above -+ * is_next_item_internal(). 
-+ */ -+ znode *loaded; -+ -+ if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) { -+ /* -+ * we got node read locked, restart coord_by_key to -+ * have write lock on twig level -+ */ -+ h->lock_level = TWIG_LEVEL; -+ h->lock_mode = ZNODE_WRITE_LOCK; -+ *outcome = LOOKUP_REST; -+ return 1; -+ } -+ -+ loaded = coord->node; -+ result = -+ add_empty_leaf(coord, h->active_lh, h->key, -+ rd_key(coord, &key)); -+ if (result) { -+ h->error = "could not add empty leaf"; -+ h->result = result; -+ *outcome = LOOKUP_DONE; -+ return 1; -+ } -+ /* added empty leaf is locked (h->active_lh), its parent node -+ is unlocked, h->coord is set as EMPTY */ -+ assert("vs-13", coord->between == EMPTY_NODE); -+ assert("vs-14", znode_is_write_locked(coord->node)); -+ assert("vs-15", -+ WITH_DATA(coord->node, node_is_empty(coord->node))); -+ assert("vs-16", jnode_is_leaf(ZJNODE(coord->node))); -+ assert("vs-17", coord->node == h->active_lh->node); -+ *outcome = LOOKUP_DONE; -+ h->result = CBK_COORD_NOTFOUND; -+ return 1; -+ } else if (result == 1) { -+ /* -+ * this is special case mentioned in the comment on -+ * tree.h:cbk_flags. We have found internal item immediately on -+ * the right of extent, and we are going to insert new item -+ * there. Key of item we are going to insert is smaller than -+ * leftmost key in the node pointed to by said internal item -+ * (otherwise search wouldn't come to the extent in the first -+ * place). -+ * -+ * This is a result of extents being located at the twig -+ * level. For explanation, see comment just above -+ * is_next_item_internal(). 
-+ */ -+ h->flags &= ~CBK_TRUST_DK; -+ } else { -+ assert("vs-8", result == 2); -+ *outcome = LOOKUP_REST; -+ return 1; -+ } -+ assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord))); -+ return 0; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/estimate.c linux-2.6.20/fs/reiser4/estimate.c ---- linux-2.6.20.orig/fs/reiser4/estimate.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/estimate.c 2007-05-06 14:50:43.706978224 +0400 -@@ -0,0 +1,111 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "tree.h" -+#include "carry.h" -+#include "inode.h" -+#include "plugin/cluster.h" -+#include "plugin/item/ctail.h" -+ -+/* this returns how many nodes might get dirty and added nodes if @children nodes are dirtied -+ -+ Amount of internals which will get dirty or get allocated we estimate as 5% of the childs + 1 balancing. 1 balancing -+ is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour nodes + the current (or 1 -+ neighbour and 1 new and the current) on twig level, 2 neighbour nodes on upper levels and 1 for a new root. So 5 for -+ leaf level, 3 for twig level, 2 on upper + 1 for root. -+ -+ Do not calculate the current node of the lowest level here - this is overhead only. -+ -+ children is almost always 1 here. Exception is flow insertion -+*/ -+static reiser4_block_nr -+max_balance_overhead(reiser4_block_nr childen, tree_level tree_height) -+{ -+ reiser4_block_nr ten_percent; -+ -+ ten_percent = ((103 * childen) >> 10); -+ -+ /* If we have too many balancings at the time, tree height can raise on more -+ then 1. Assume that if tree_height is 5, it can raise on 1 only. */ -+ return ((tree_height < 5 ? 
5 : tree_height) * 2 + (4 + ten_percent)); -+} -+ -+/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to -+ perform insertion of one item into the tree */ -+/* it is only called when tree height changes, or gets initialized */ -+reiser4_block_nr calc_estimate_one_insert(tree_level height) -+{ -+ return 1 + max_balance_overhead(1, height); -+} -+ -+reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree) -+{ -+ return tree->estimate_one_insert; -+} -+ -+/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to -+ perform insertion of one unit into an item in the tree */ -+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree) -+{ -+ /* estimate insert into item just like item insertion */ -+ return tree->estimate_one_insert; -+} -+ -+reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree) -+{ -+ /* on item removal reiser4 does not try to pack nodes more complact, so, only one node may be dirtied on leaf -+ level */ -+ return tree->estimate_one_insert; -+} -+ -+/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and -+ both its neighbors). Max_balance_overhead should estimate number of blocks which may change/get added on internal -+ levels */ -+reiser4_block_nr estimate_insert_flow(tree_level height) -+{ -+ return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 + -+ CARRY_FLOW_NEW_NODES_LIMIT, -+ height); -+} -+ -+/* returnes max number of nodes can be occupied by disk cluster */ -+static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped) -+{ -+ int per_cluster; -+ per_cluster = (unprepped ? 
1 : cluster_nrpages(inode)); -+ return 3 + per_cluster + -+ max_balance_overhead(3 + per_cluster, -+ REISER4_MAX_ZTREE_HEIGHT); -+} -+ -+/* how many nodes might get dirty and added -+ during insertion of a disk cluster */ -+reiser4_block_nr estimate_insert_cluster(struct inode * inode) -+{ -+ return estimate_cluster(inode, 1); /* 24 */ -+} -+ -+/* how many nodes might get dirty and added -+ during update of a (prepped or unprepped) disk cluster */ -+reiser4_block_nr estimate_update_cluster(struct inode * inode) -+{ -+ return estimate_cluster(inode, 0); /* 44, for 64K-cluster */ -+} -+ -+/* how many nodes occupied by a disk cluster might get dirty */ -+reiser4_block_nr estimate_dirty_cluster(struct inode * inode) -+{ -+ return cluster_nrpages(inode) + 4; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/export_ops.c linux-2.6.20/fs/reiser4/export_ops.c ---- linux-2.6.20.orig/fs/reiser4/export_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/export_ops.c 2007-05-06 14:50:43.706978224 +0400 -@@ -0,0 +1,295 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "inode.h" -+#include "plugin/plugin.h" -+ -+/* -+ * Supported file-handle types -+ */ -+typedef enum { -+ FH_WITH_PARENT = 0x10, /* file handle with parent */ -+ FH_WITHOUT_PARENT = 0x11 /* file handle without parent */ -+} reiser4_fhtype; -+ -+#define NFSERROR (255) -+ -+/* initialize place-holder for object */ -+static void object_on_wire_init(reiser4_object_on_wire *o) -+{ -+ o->plugin = NULL; -+} -+ -+/* finish with @o */ -+static void object_on_wire_done(reiser4_object_on_wire *o) -+{ -+ if (o->plugin != NULL) -+ o->plugin->wire.done(o); -+} -+ -+/* -+ * read serialized object identity from @addr and store information about -+ * object in @obj. 
This is dual to encode_inode(). -+ */ -+static char *decode_inode(struct super_block *s, char *addr, -+ reiser4_object_on_wire * obj) -+{ -+ file_plugin *fplug; -+ -+ /* identifier of object plugin is stored in the first two bytes, -+ * followed by... */ -+ fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr); -+ if (fplug != NULL) { -+ addr += sizeof(d16); -+ obj->plugin = fplug; -+ assert("nikita-3520", fplug->wire.read != NULL); -+ /* plugin specific encoding of object identity. */ -+ addr = fplug->wire.read(addr, obj); -+ } else -+ addr = ERR_PTR(RETERR(-EINVAL)); -+ return addr; -+} -+ -+/** -+ * reiser4_decode_fh - decode_fh of export operations -+ * @super: super block -+ * @fh: nfsd file handle -+ * @len: length of file handle -+ * @fhtype: type of file handle -+ * @acceptable: acceptability testing function -+ * @context: argument for @acceptable -+ * -+ * Returns dentry referring to the same file as @fh. -+ */ -+static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh, -+ int len, int fhtype, -+ int (*acceptable) (void *context, -+ struct dentry *de), -+ void *context) -+{ -+ reiser4_context *ctx; -+ reiser4_object_on_wire object; -+ reiser4_object_on_wire parent; -+ char *addr; -+ int with_parent; -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) -+ return (struct dentry *)ctx; -+ -+ assert("vs-1482", -+ fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT); -+ -+ with_parent = (fhtype == FH_WITH_PARENT); -+ -+ addr = (char *)fh; -+ -+ object_on_wire_init(&object); -+ object_on_wire_init(&parent); -+ -+ addr = decode_inode(super, addr, &object); -+ if (!IS_ERR(addr)) { -+ if (with_parent) -+ addr = decode_inode(super, addr, &parent); -+ if (!IS_ERR(addr)) { -+ struct dentry *d; -+ typeof(super->s_export_op->find_exported_dentry) fn; -+ -+ fn = super->s_export_op->find_exported_dentry; -+ assert("nikita-3521", fn != NULL); -+ d = fn(super, &object, with_parent ? 
&parent : NULL, -+ acceptable, context); -+ if (d != NULL && !IS_ERR(d)) -+ /* FIXME check for -ENOMEM */ -+ reiser4_get_dentry_fsdata(d)->stateless = 1; -+ addr = (char *)d; -+ } -+ } -+ -+ object_on_wire_done(&object); -+ object_on_wire_done(&parent); -+ -+ reiser4_exit_context(ctx); -+ return (void *)addr; -+} -+ -+/* -+ * Object serialization support. -+ * -+ * To support knfsd file system provides export_operations that are used to -+ * construct and interpret NFS file handles. As a generalization of this, -+ * reiser4 object plugins have serialization support: it provides methods to -+ * create on-wire representation of identity of reiser4 object, and -+ * re-create/locate object given its on-wire identity. -+ * -+ */ -+ -+/* -+ * return number of bytes that on-wire representation of @inode's identity -+ * consumes. -+ */ -+static int encode_inode_size(struct inode *inode) -+{ -+ assert("nikita-3514", inode != NULL); -+ assert("nikita-3515", inode_file_plugin(inode) != NULL); -+ assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL); -+ -+ return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16); -+} -+ -+/* -+ * store on-wire representation of @inode's identity at the area beginning at -+ * @start. -+ */ -+static char *encode_inode(struct inode *inode, char *start) -+{ -+ assert("nikita-3517", inode != NULL); -+ assert("nikita-3518", inode_file_plugin(inode) != NULL); -+ assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL); -+ -+ /* -+ * first, store two-byte identifier of object plugin, then -+ */ -+ save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)), -+ (d16 *) start); -+ start += sizeof(d16); -+ /* -+ * call plugin to serialize object's identity -+ */ -+ return inode_file_plugin(inode)->wire.write(inode, start); -+} -+ -+/* this returns number of 32 bit long numbers encoded in @lenp. 
255 is -+ * returned if file handle can not be stored */ -+/** -+ * reiser4_encode_fh - encode_fh of export operations -+ * @dentry: -+ * @fh: -+ * @lenp: -+ * @need_parent: -+ * -+ */ -+static int -+reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp, -+ int need_parent) -+{ -+ struct inode *inode; -+ struct inode *parent; -+ char *addr; -+ int need; -+ int delta; -+ int result; -+ reiser4_context *ctx; -+ -+ /* -+ * knfsd asks as to serialize object in @dentry, and, optionally its -+ * parent (if need_parent != 0). -+ * -+ * encode_inode() and encode_inode_size() is used to build -+ * representation of object and its parent. All hard work is done by -+ * object plugins. -+ */ -+ inode = dentry->d_inode; -+ parent = dentry->d_parent->d_inode; -+ -+ addr = (char *)fh; -+ -+ need = encode_inode_size(inode); -+ if (need < 0) -+ return NFSERROR; -+ if (need_parent) { -+ delta = encode_inode_size(parent); -+ if (delta < 0) -+ return NFSERROR; -+ need += delta; -+ } -+ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ if (need <= sizeof(__u32) * (*lenp)) { -+ addr = encode_inode(inode, addr); -+ if (need_parent) -+ addr = encode_inode(parent, addr); -+ -+ /* store in lenp number of 32bit words required for file -+ * handle. */ -+ *lenp = (need + sizeof(__u32) - 1) >> 2; -+ result = need_parent ? 
FH_WITH_PARENT : FH_WITHOUT_PARENT; -+ } else -+ /* no enough space in file handle */ -+ result = NFSERROR; -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * reiser4_get_dentry_parent - get_parent of export operations -+ * @child: -+ * -+ */ -+static struct dentry *reiser4_get_dentry_parent(struct dentry *child) -+{ -+ struct inode *dir; -+ dir_plugin *dplug; -+ -+ assert("nikita-3527", child != NULL); -+ /* see comment in reiser4_get_dentry() about following assertion */ -+ assert("nikita-3528", is_in_reiser4_context()); -+ -+ dir = child->d_inode; -+ assert("nikita-3529", dir != NULL); -+ dplug = inode_dir_plugin(dir); -+ assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL)); -+ if (dplug != NULL) -+ return dplug->get_parent(dir); -+ else -+ return ERR_PTR(RETERR(-ENOTDIR)); -+} -+ -+/** -+ * reiser4_get_dentry - get_dentry of export operations -+ * @super: -+ * @data: -+ * -+ * -+ */ -+static struct dentry *reiser4_get_dentry(struct super_block *super, void *data) -+{ -+ reiser4_object_on_wire *o; -+ -+ assert("nikita-3522", super != NULL); -+ assert("nikita-3523", data != NULL); -+ /* -+ * this is only supposed to be called by -+ * -+ * reiser4_decode_fh->find_exported_dentry -+ * -+ * so, reiser4_context should be here already. 
-+ */ -+ assert("nikita-3526", is_in_reiser4_context()); -+ -+ o = (reiser4_object_on_wire *)data; -+ assert("nikita-3524", o->plugin != NULL); -+ assert("nikita-3525", o->plugin->wire.get != NULL); -+ -+ return o->plugin->wire.get(super, o); -+} -+ -+struct export_operations reiser4_export_operations = { -+ .encode_fh = reiser4_encode_fh, -+ .decode_fh = reiser4_decode_fh, -+ .get_parent = reiser4_get_dentry_parent, -+ .get_dentry = reiser4_get_dentry -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/flush.c linux-2.6.20/fs/reiser4/flush.c ---- linux-2.6.20.orig/fs/reiser4/flush.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/flush.c 2007-05-06 14:50:43.000000000 +0400 -@@ -0,0 +1,3622 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* The design document for this file is at http://www.namesys.com/v4/v4.html. 
*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/plugin.h" -+#include "plugin/object.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "carry.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "page_cache.h" -+#include "wander.h" -+#include "super.h" -+#include "entd.h" -+#include "reiser4.h" -+#include "flush.h" -+#include "writeout.h" -+ -+#include -+#include /* for struct super_block */ -+#include /* for struct page */ -+#include /* for struct bio */ -+#include -+#include -+ -+/* IMPLEMENTATION NOTES */ -+ -+/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total -+ order to the nodes of the tree in which the parent is placed before its children, which -+ are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it -+ describes the node that "came before in forward parent-first order". When we speak of a -+ "parent-first follower", it describes the node that "comes next in parent-first -+ order" (alternatively the node that "came before in reverse parent-first order"). -+ -+ The following pseudo-code prints the nodes of a tree in forward parent-first order: -+ -+ void parent_first (node) -+ { -+ print_node (node); -+ if (node->level > leaf) { -+ for (i = 0; i < num_children; i += 1) { -+ parent_first (node->child[i]); -+ } -+ } -+ } -+*/ -+ -+/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so -+ that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order) -+ can be accomplished with sequential reads, which results in reading nodes in their -+ parent-first order. 
This is a read-optimization aspect of the flush algorithm, and -+ there is also a write-optimization aspect, which is that we wish to make large -+ sequential writes to the disk by allocating or reallocating blocks so that they can be -+ written in sequence. Sometimes the read-optimization and write-optimization goals -+ conflict with each other, as we discuss in more detail below. -+*/ -+ -+/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are -+ the relevant jnode->state bits and their relevence to flush: -+ -+ JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it -+ must be allocated first. In order to be considered allocated, the jnode must have -+ exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and -+ all dirtied jnodes eventually have one of these bits set during each transaction. -+ -+ JNODE_CREATED: The node was freshly created in its transaction and has no previous -+ block address, so it is unconditionally assigned to be relocated, although this is -+ mainly for code-convenience. It is not being 'relocated' from anything, but in -+ almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit -+ remains set even after JNODE_RELOC is set, so the actual relocate can be -+ distinguished from the created-and-allocated set easily: relocate-set members -+ (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which -+ have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set. -+ -+ JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the -+ decision to maintain the pre-existing location for this node and it will be written -+ to the wandered-log. -+ -+ JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was -+ not created, see note above). 
A block with JNODE_RELOC set is eligible for -+ early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC -+ bit is set on a znode, the parent node's internal item is modified and the znode is -+ rehashed. -+ -+ JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node -+ and calls plugin->f.squeeze() method for its items. By this technology we update disk -+ clusters of cryptcompress objects. Also if leftmost point that was found by flush scan -+ has this flag (races with write(), rare case) the flush algorythm makes the decision -+ to pass it to squalloc() in spite of its flushprepped status for squeezing, not for -+ repeated allocation. -+ -+ JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its -+ flush queue. This means the jnode is not on any clean or dirty list, instead it is -+ moved to one of the flush queue (see flush_queue.h) object private list. This -+ prevents multiple concurrent flushes from attempting to start flushing from the -+ same node. -+ -+ (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up -+ squeeze-and-allocate on a node while its children are actively being squeezed and -+ allocated. This flag was created to avoid submitting a write request for a node -+ while its children are still being allocated and squeezed. Then flush queue was -+ re-implemented to allow unlimited number of nodes be queued. This flag support was -+ commented out in source code because we decided that there was no reason to submit -+ queued nodes before jnode_flush() finishes. However, current code calls fq_write() -+ during a slum traversal and may submit "busy nodes" to disk. Probably we can -+ re-enable the JNODE_FLUSH_BUSY bit support in future. -+ -+ With these state bits, we describe a test used frequently in the code below, -+ jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). 
The -+ test for "flushprepped" returns true if any of the following are true: -+ -+ - The node is not dirty -+ - The node has JNODE_RELOC set -+ - The node has JNODE_OVRWR set -+ -+ If either the node is not dirty or it has already been processed by flush (and assigned -+ JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns -+ true then flush has work to do on that node. -+*/ -+ -+/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never -+ flushprepped twice (unless an explicit call to flush_unprep is made as described in -+ detail below). For example a node is dirtied, allocated, and then early-flushed to -+ disk and set clean. Before the transaction commits, the page is dirtied again and, due -+ to memory pressure, the node is flushed again. The flush algorithm will not relocate -+ the node to a new disk location, it will simply write it to the same, previously -+ relocated position again. -+*/ -+ -+/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we -+ start at a leaf node and allocate in parent-first order by iterating to the right. At -+ each step of the iteration, we check for the right neighbor. Before advancing to the -+ right neighbor, we check if the current position and the right neighbor share the same -+ parent. If they do not share the same parent, the parent is allocated before the right -+ neighbor. -+ -+ This process goes recursively up the tree and squeeze nodes level by level as long as -+ the right neighbor and the current position have different parents, then it allocates -+ the right-neighbors-with-different-parents on the way back down. This process is -+ described in more detail in flush_squalloc_changed_ancestor and the recursive function -+ squalloc_one_changed_ancestor. But the purpose here is not to discuss the -+ specifics of the bottom-up approach as it is to contrast the bottom-up and top-down -+ approaches. 
-+ -+ The top-down algorithm was implemented earlier (April-May 2002). In the top-down -+ approach, we find a starting point by scanning left along each level past dirty nodes, -+ then going up and repeating the process until the left node and the parent node are -+ clean. We then perform a parent-first traversal from the starting point, which makes -+ allocating in parent-first order trivial. After one subtree has been allocated in this -+ manner, we move to the right, try moving upward, then repeat the parent-first -+ traversal. -+ -+ Both approaches have problems that need to be addressed. Both are approximately the -+ same amount of code, but the bottom-up approach has advantages in the order it acquires -+ locks which, at the very least, make it the better approach. At first glance each one -+ makes the other one look simpler, so it is important to remember a few of the problems -+ with each one. -+ -+ Main problem with the top-down approach: When you encounter a clean child during the -+ parent-first traversal, what do you do? You would like to avoid searching through a -+ large tree of nodes just to find a few dirty leaves at the bottom, and there is not an -+ obvious solution. One of the advantages of the top-down approach is that during the -+ parent-first traversal you check every child of a parent to see if it is dirty. In -+ this way, the top-down approach easily handles the main problem of the bottom-up -+ approach: unallocated children. -+ -+ The unallocated children problem is that before writing a node to disk we must make -+ sure that all of its children are allocated. Otherwise, the writing the node means -+ extra I/O because the node will have to be written again when the child is finally -+ allocated. -+ -+ WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. 
Except for bugs, this -+ should not cause any file system corruption, it only degrades I/O performance because a -+ node may be written when it is sure to be written at least one more time in the same -+ transaction when the remaining children are allocated. What follows is a description -+ of how we will solve the problem. -+*/ -+ -+/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then, -+ proceeding in parent first order, allocate some of its left-children, then encounter a -+ clean child in the middle of the parent. We do not allocate the clean child, but there -+ may remain unallocated (dirty) children to the right of the clean child. If we were to -+ stop flushing at this moment and write everything to disk, the parent might still -+ contain unallocated children. -+ -+ We could try to allocate all the descendents of every node that we allocate, but this -+ is not necessary. Doing so could result in allocating the entire tree: if the root -+ node is allocated then every unallocated node would have to be allocated before -+ flushing. Actually, we do not have to write a node just because we allocate it. It is -+ possible to allocate but not write a node during flush, when it still has unallocated -+ children. However, this approach is probably not optimal for the following reason. -+ -+ The flush algorithm is designed to allocate nodes in parent-first order in an attempt -+ to optimize reads that occur in the same order. Thus we are read-optimizing for a -+ left-to-right scan through all the leaves in the system, and we are hoping to -+ write-optimize at the same time because those nodes will be written together in batch. -+ What happens, however, if we assign a block number to a node in its read-optimized -+ order but then avoid writing it because it has unallocated children? 
In that -+ situation, we lose out on the write-optimization aspect because a node will have to be -+ written again to the its location on the device, later, which likely means seeking back -+ to that location. -+ -+ So there are tradeoffs. We can choose either: -+ -+ A. Allocate all unallocated children to preserve both write-optimization and -+ read-optimization, but this is not always desirable because it may mean having to -+ allocate and flush very many nodes at once. -+ -+ B. Defer writing nodes with unallocated children, keep their read-optimized locations, -+ but sacrifice write-optimization because those nodes will be written again. -+ -+ C. Defer writing nodes with unallocated children, but do not keep their read-optimized -+ locations. Instead, choose to write-optimize them later, when they are written. To -+ facilitate this, we "undo" the read-optimized allocation that was given to the node so -+ that later it can be write-optimized, thus "unpreparing" the flush decision. This is a -+ case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a -+ call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit; -+ if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block -+ location, and set the JNODE_CREATED bit, effectively setting the node back to an -+ unallocated state. -+ -+ We will take the following approach in v4.0: for twig nodes we will always finish -+ allocating unallocated children (A). For nodes with (level > TWIG) we will defer -+ writing and choose write-optimization (C). -+ -+ To summarize, there are several parts to a solution that avoids the problem with -+ unallocated children: -+ -+ FIXME-ZAM: Still no one approach is implemented to eliminate the "UNALLOCATED CHILDREN" -+ problem because there was an experiment which was done showed that we have 1-2 nodes -+ with unallocated children for thousands of written nodes. 
The experiment was simple -+ like coping / deletion of linux kernel sources. However the problem can arise in more -+ complex tests. I think we have jnode_io_hook to insert a check for unallocated -+ children and see what kind of problem we have. -+ -+ 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling -+ squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to -+ implement: should be simple -- amounts to adding a while loop to jnode_flush, see -+ comments in that function. -+ -+ 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still -+ have unallocated children. If the twig level has unallocated children it is an -+ assertion failure. If a higher-level node has unallocated children, then it should be -+ explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement: -+ should be simple. -+ -+ 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more -+ CPU cycles than we would like, and it is possible (but medium complexity) to optimize -+ this somewhat in the case where large sub-trees are flushed. The following observation -+ helps: if both the left- and right-neighbor of a node are processed by the flush -+ algorithm then the node itself is guaranteed to have all of its children allocated. -+ However, the cost of this check may not be so expensive after all: it is not needed for -+ leaves and flush can guarantee this property for twigs. That leaves only (level > -+ TWIG) nodes that have to be checked, so this optimization only helps if at least three -+ (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless -+ there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes -+ then the number of blocks being written will be very large, so the savings may be -+ insignificant. 
That said, the idea is to maintain both the left and right edges of -+ nodes that are processed in flush. When flush_empty_queue() is called, a relatively -+ simple test will tell whether the (level > TWIG) node is on the edge. If it is on the -+ edge, the slow check is necessary, but if it is in the interior then it can be assumed -+ to have all of its children allocated. FIXME: medium complexity to implement, but -+ simple to verify given that we must have a slow check anyway. -+ -+ 4. (Optional) This part is optional, not for v4.0--flush should work independently of -+ whether this option is used or not. Called RAPID_SCAN, the idea is to amend the -+ left-scan operation to take unallocated children into account. Normally, the left-scan -+ operation goes left as long as adjacent nodes are dirty up until some large maximum -+ value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left -+ may stop at a position where there are unallocated children to the left with the same -+ parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after -+ FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then procedes -+ with a rapid scan. The rapid scan skips all the interior children of a node--if the -+ leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the -+ twig to the left). If the left neighbor of the leftmost child is also dirty, then -+ continue the scan at the left twig and repeat. This option will cause flush to -+ allocate more twigs in a single pass, but it also has the potential to write many more -+ nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN -+ was partially implemented, code removed August 12, 2002 by JMACD. -+*/ -+ -+/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the -+ starting point for flush is a leaf node, but actually the flush code cares very little -+ about whether or not this is true. 
It is possible that all the leaf nodes are flushed -+ and dirty parent nodes still remain, in which case jnode_flush() is called on a -+ non-leaf argument. Flush doesn't care--it treats the argument node as if it were a -+ leaf, even when it is not. This is a simple approach, and there may be a more optimal -+ policy but until a problem with this approach is discovered, simplest is probably best. -+ -+ NOTE: In this case, the ordering produced by flush is parent-first only if you ignore -+ the leaves. This is done as a matter of simplicity and there is only one (shaky) -+ justification. When an atom commits, it flushes all leaf level nodes first, followed -+ by twigs, and so on. With flushing done in this order, if flush is eventually called -+ on a non-leaf node it means that (somehow) we reached a point where all leaves are -+ clean and only internal nodes need to be flushed. If that it the case, then it means -+ there were no leaves that were the parent-first preceder/follower of the parent. This -+ is expected to be a rare case, which is why we do nothing special about it. However, -+ memory pressure may pass an internal node to flush when there are still dirty leaf -+ nodes that need to be flushed, which could prove our original assumptions -+ "inoperative". If this needs to be fixed, then scan_left/right should have -+ special checks for the non-leaf levels. For example, instead of passing from a node to -+ the left neighbor, it should pass from the node to the left neighbor's rightmost -+ descendent (if dirty). -+ -+*/ -+ -+/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting -+ it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the -+ logical device--the left (starting) end of the device if we are walking from left to right, the right end of the -+ device if we are walking from right to left. 
We then make passes in alternating directions, and as we do this the -+ device becomes sorted such that tree order and block number order fully correlate. -+ -+ Resizing is done by shifting everything either all the way to the left or all the way -+ to the right, and then reporting the last block. -+*/ -+ -+/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This -+ descibes the policy from the highest level: -+ -+ The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the -+ leaf level during flush-scan (right, left), then we unconditionally decide to relocate -+ leaf nodes. -+ -+ Otherwise, there are two contexts in which we make a decision to relocate: -+ -+ 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test(). -+ During the initial stages of flush, after scan-right completes, we want to ask the -+ question: should we relocate this leaf node and thus dirty the parent node. Then if -+ the node is a leftmost child its parent is its own parent-first preceder, thus we repeat -+ the question at the next level up, and so on. In these cases we are moving in the -+ reverse-parent first direction. -+ -+ There is another case which is considered the reverse direction, which comes at the end -+ of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may -+ reach a point where there is a clean twig to the right with a dirty leftmost child. In -+ this case, we may wish to relocate the child by testing if it should be relocated -+ relative to its parent. -+ -+ 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in -+ allocate_znode. What distinguishes the forward parent-first case from the -+ reverse-parent first case is that the preceder has already been allocated in the -+ forward case, whereas in the reverse case we don't know what the preceder is until we -+ finish "going in reverse". 
That simplifies the forward case considerably, and there we -+ actually use the block allocator to determine whether, e.g., a block closer to the -+ preceder is available. -+*/ -+ -+/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we -+ finish scan-left and find a starting point, if the parent's left neighbor is dirty then -+ squeeze the parent's left neighbor and the parent. This may change the -+ flush-starting-node's parent. Repeat until the child's parent is stable. If the child -+ is a leftmost child, repeat this left-edge squeezing operation at the next level up. -+ Note that we cannot allocate extents during this or they will be out of parent-first -+ order. There is also some difficult coordinate maintenence issues. We can't do a tree -+ search to find coordinates again (because we hold locks), we have to determine them -+ from the two nodes being squeezed. Looks difficult, but has potential to increase -+ space utilization. */ -+ -+/* Flush-scan helper functions. */ -+static void scan_init(flush_scan * scan); -+static void scan_done(flush_scan * scan); -+ -+/* Flush-scan algorithm. */ -+static int scan_left(flush_scan * scan, flush_scan * right, jnode * node, -+ unsigned limit); -+static int scan_right(flush_scan * scan, jnode * node, unsigned limit); -+static int scan_common(flush_scan * scan, flush_scan * other); -+static int scan_formatted(flush_scan * scan); -+static int scan_unformatted(flush_scan * scan, flush_scan * other); -+static int scan_by_coord(flush_scan * scan); -+ -+/* Initial flush-point ancestor allocation. */ -+static int alloc_pos_and_ancestors(flush_pos_t * pos); -+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos); -+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos); -+ -+/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */ -+static int squalloc(flush_pos_t * pos); -+ -+/* Flush squeeze implementation. 
*/ -+static int squeeze_right_non_twig(znode * left, znode * right); -+static int shift_one_internal_unit(znode * left, znode * right); -+ -+/* Flush reverse parent-first relocation routines. */ -+static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, -+ const reiser4_block_nr * nblk); -+static int reverse_relocate_test(jnode * node, const coord_t * parent_coord, -+ flush_pos_t * pos); -+static int reverse_relocate_check_dirty_parent(jnode * node, -+ const coord_t * parent_coord, -+ flush_pos_t * pos); -+ -+/* Flush allocate write-queueing functions: */ -+static int allocate_znode(znode * node, const coord_t * parent_coord, -+ flush_pos_t * pos); -+static int allocate_znode_update(znode * node, const coord_t * parent_coord, -+ flush_pos_t * pos); -+static int lock_parent_and_allocate_znode(znode *, flush_pos_t *); -+ -+/* Flush helper functions: */ -+static int jnode_lock_parent_coord(jnode * node, -+ coord_t * coord, -+ lock_handle * parent_lh, -+ load_count * parent_zh, -+ znode_lock_mode mode, int try); -+static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side, -+ znode_lock_mode mode, int check_dirty); -+static int znode_same_parents(znode * a, znode * b); -+ -+static int znode_check_flushprepped(znode * node) -+{ -+ return jnode_check_flushprepped(ZJNODE(node)); -+} -+ -+/* Flush position functions */ -+static void pos_init(flush_pos_t * pos); -+static int pos_valid(flush_pos_t * pos); -+static void pos_done(flush_pos_t * pos); -+static int pos_stop(flush_pos_t * pos); -+ -+/* check that @org is first jnode extent unit, if extent is unallocated, -+ * because all jnodes of unallocated extent are dirty and of the same atom. 
*/ -+#define checkchild(scan) \ -+assert("nikita-3435", \ -+ ergo(scan->direction == LEFT_SIDE && \ -+ (scan->parent_coord.node->level == TWIG_LEVEL) && \ -+ jnode_is_unformatted(scan->node) && \ -+ extent_is_unallocated(&scan->parent_coord), \ -+ extent_unit_index(&scan->parent_coord) == index_jnode(scan->node))) -+ -+/* This flush_cnt variable is used to track the number of concurrent flush operations, -+ useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has -+ no static initializer function...) */ -+ON_DEBUG(atomic_t flush_cnt; -+ ) -+ -+/* check fs backing device for write congestion */ -+static int check_write_congestion(void) -+{ -+ struct super_block *sb; -+ struct backing_dev_info *bdi; -+ -+ sb = reiser4_get_current_sb(); -+ bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info; -+ return bdi_write_congested(bdi); -+} -+ -+/* conditionally write flush queue */ -+static int write_prepped_nodes(flush_pos_t * pos) -+{ -+ int ret; -+ -+ assert("zam-831", pos); -+ assert("zam-832", pos->fq); -+ -+ if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS)) -+ return 0; -+ -+ if (check_write_congestion()) -+ return 0; -+ -+ ret = reiser4_write_fq(pos->fq, pos->nr_written, -+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM); -+ return ret; -+} -+ -+/* Proper release all flush pos. 
resources then move flush position to new -+ locked node */ -+static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock, -+ load_count * new_load, const coord_t * new_coord) -+{ -+ assert("zam-857", new_lock->node == new_load->node); -+ -+ if (new_coord) { -+ assert("zam-858", new_coord->node == new_lock->node); -+ coord_dup(&pos->coord, new_coord); -+ } else { -+ coord_init_first_unit(&pos->coord, new_lock->node); -+ } -+ -+ if (pos->child) { -+ jput(pos->child); -+ pos->child = NULL; -+ } -+ -+ move_load_count(&pos->load, new_load); -+ done_lh(&pos->lock); -+ move_lh(&pos->lock, new_lock); -+} -+ -+/* delete empty node which link from the parent still exists. */ -+static int delete_empty_node(znode * node) -+{ -+ reiser4_key smallest_removed; -+ -+ assert("zam-1019", node != NULL); -+ assert("zam-1020", node_is_empty(node)); -+ assert("zam-1023", znode_is_wlocked(node)); -+ -+ return reiser4_delete_node(node, &smallest_removed, NULL, 1); -+} -+ -+/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */ -+static int prepare_flush_pos(flush_pos_t * pos, jnode * org) -+{ -+ int ret; -+ load_count load; -+ lock_handle lock; -+ -+ init_lh(&lock); -+ init_load_count(&load); -+ -+ if (jnode_is_znode(org)) { -+ ret = longterm_lock_znode(&lock, JZNODE(org), -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI); -+ if (ret) -+ return ret; -+ -+ ret = incr_load_count_znode(&load, JZNODE(org)); -+ if (ret) -+ return ret; -+ -+ pos->state = -+ (jnode_get_level(org) == -+ LEAF_LEVEL) ? 
POS_ON_LEAF : POS_ON_INTERNAL; -+ move_flush_pos(pos, &lock, &load, NULL); -+ } else { -+ coord_t parent_coord; -+ ret = jnode_lock_parent_coord(org, &parent_coord, &lock, -+ &load, ZNODE_WRITE_LOCK, 0); -+ if (ret) -+ goto done; -+ if (!item_is_extent(&parent_coord)) { -+ /* file was converted to tail, org became HB, we found internal -+ item */ -+ ret = -EAGAIN; -+ goto done; -+ } -+ -+ pos->state = POS_ON_EPOINT; -+ move_flush_pos(pos, &lock, &load, &parent_coord); -+ pos->child = jref(org); -+ if (extent_is_unallocated(&parent_coord) -+ && extent_unit_index(&parent_coord) != index_jnode(org)) { -+ /* @org is not first child of its parent unit. This may happen -+ because longerm lock of its parent node was released between -+ scan_left and scan_right. For now work around this having flush to repeat */ -+ ret = -EAGAIN; -+ } -+ } -+ -+ done: -+ done_load_count(&load); -+ done_lh(&lock); -+ return ret; -+} -+ -+/* TODO LIST (no particular order): */ -+/* I have labelled most of the legitimate FIXME comments in this file with letters to -+ indicate which issue they relate to. There are a few miscellaneous FIXMEs with -+ specific names mentioned instead that need to be inspected/resolved. */ -+/* B. There is an issue described in reverse_relocate_test having to do with an -+ imprecise is_preceder? check having to do with partially-dirty extents. The code that -+ sets preceder hints and computes the preceder is basically untested. Careful testing -+ needs to be done that preceder calculations are done correctly, since if it doesn't -+ affect correctness we will not catch this stuff during regular testing. */ -+/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are -+ considered expected but unlikely conditions. Flush currently returns 0 (i.e., success -+ but no progress, i.e., restart) whenever it receives any of these in jnode_flush(). 
-+ Many of the calls that may produce one of these return values (i.e., -+ longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these -+ values themselves and, for instance, stop flushing instead of resulting in a restart. -+ If any of these results are true error conditions then flush will go into a busy-loop, -+ as we noticed during testing when a corrupt tree caused find_child_ptr to return -+ ENOENT. It needs careful thought and testing of corner conditions. -+*/ -+/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created -+ block is assigned a block number then early-flushed to disk. It is dirtied again and -+ flush is called again. Concurrently, that block is deleted, and the de-allocation of -+ its block number does not need to be deferred, since it is not part of the preserve set -+ (i.e., it didn't exist before the transaction). I think there may be a race condition -+ where flush writes the dirty, created block after the non-deferred deallocated block -+ number is re-allocated, making it possible to write deleted data on top of non-deleted -+ data. Its just a theory, but it needs to be thought out. */ -+/* F. bio_alloc() failure is not handled gracefully. */ -+/* G. Unallocated children. */ -+/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */ -+/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */ -+ -+/* JNODE_FLUSH: MAIN ENTRY POINT */ -+/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty -+ neighborhood is named "slum"). Jnode_flush() is called if reiser4 has to write dirty -+ blocks to disk, it happens when Linux VM decides to reduce number of dirty pages or as -+ a part of transaction commit. -+ -+ Our objective here is to prep and flush the slum the jnode belongs to. 
We want to -+ squish the slum together, and allocate the nodes in it as we squish because allocation -+ of children affects squishing of parents. -+ -+ The "argument" @node tells flush where to start. From there, flush finds the left edge -+ of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a -+ "better place" to start squalloc first we perform a flush_scan. -+ -+ Flush-scanning may be performed in both left and right directions, but for different -+ purposes. When scanning to the left, we are searching for a node that precedes a -+ sequence of parent-first-ordered nodes which we will then flush in parent-first order. -+ During flush-scanning, we also take the opportunity to count the number of consecutive -+ leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we -+ make a decision to reallocate leaf nodes (thus favoring write-optimization). -+ -+ Since the flush argument node can be anywhere in a sequence of dirty leaves, there may -+ also be dirty nodes to the right of the argument. If the scan-left operation does not -+ count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan -+ operation to see whether there is, in fact, enough nodes to meet the relocate -+ threshold. Each right- and left-scan operation uses a single flush_scan object. -+ -+ After left-scan and possibly right-scan, we prepare a flush_position object with the -+ starting flush point or parent coordinate, which was determined using scan-left. -+ -+ Next we call the main flush routine, squalloc, which iterates along the -+ leaf level, squeezing and allocating nodes (and placing them into the flush queue). -+ -+ After squalloc returns we take extra steps to ensure that all the children -+ of the final twig node are allocated--this involves repeating squalloc -+ until we finish at a twig with no unallocated children. -+ -+ Finally, we call flush_empty_queue to submit write-requests to disk. 
If we encounter -+ any above-twig nodes during flush_empty_queue that still have unallocated children, we -+ flush_unprep them. -+ -+ Flush treats several "failure" cases as non-failures, essentially causing them to start -+ over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should -+ probably be handled properly rather than restarting, but there are a bunch of cases to -+ audit. -+*/ -+ -+static int -+jnode_flush(jnode * node, long nr_to_write, long *nr_written, -+ flush_queue_t * fq, int flags) -+{ -+ long ret = 0; -+ flush_scan *right_scan; -+ flush_scan *left_scan; -+ flush_pos_t *flush_pos; -+ int todo; -+ struct super_block *sb; -+ reiser4_super_info_data *sbinfo; -+ jnode *leftmost_in_slum = NULL; -+ -+ assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack())); -+ assert("nikita-3022", reiser4_schedulable()); -+ -+ assert("nikita-3185", -+ get_current_super_private()->delete_mutex_owner != current); -+ -+ /* allocate right_scan, left_scan and flush_pos */ -+ right_scan = -+ kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos), -+ reiser4_ctx_gfp_mask_get()); -+ if (right_scan == NULL) -+ return RETERR(-ENOMEM); -+ left_scan = right_scan + 1; -+ flush_pos = (flush_pos_t *) (left_scan + 1); -+ -+ sb = reiser4_get_current_sb(); -+ sbinfo = get_super_private(sb); -+ -+ /* Flush-concurrency debug code */ -+#if REISER4_DEBUG -+ atomic_inc(&flush_cnt); -+#endif -+ -+ reiser4_enter_flush(sb); -+ -+ /* Initialize a flush position. */ -+ pos_init(flush_pos); -+ -+ flush_pos->nr_written = nr_written; -+ flush_pos->fq = fq; -+ flush_pos->flags = flags; -+ flush_pos->nr_to_write = nr_to_write; -+ -+ scan_init(right_scan); -+ scan_init(left_scan); -+ -+ /* First scan left and remember the leftmost scan position. If the leftmost -+ position is unformatted we remember its parent_coord. We scan until counting -+ FLUSH_SCAN_MAXNODES. 
-+ -+ If starting @node is unformatted, at the beginning of left scan its -+ parent (twig level node, containing extent item) will be long term -+ locked and lock handle will be stored in the -+ @right_scan->parent_lock. This lock is used to start the rightward -+ scan without redoing the tree traversal (necessary to find parent) -+ and, hence, is kept during leftward scan. As a result, we have to -+ use try-lock when taking long term locks during the leftward scan. -+ */ -+ ret = scan_left(left_scan, right_scan, -+ node, sbinfo->flush.scan_maxnodes); -+ if (ret != 0) -+ goto failed; -+ -+ leftmost_in_slum = jref(left_scan->node); -+ scan_done(left_scan); -+ -+ /* Then possibly go right to decide if we will use a policy of relocating leaves. -+ This is only done if we did not scan past (and count) enough nodes during the -+ leftward scan. If we do scan right, we only care to go far enough to establish -+ that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The -+ scan limit is the difference between left_scan.count and the threshold. */ -+ -+ todo = sbinfo->flush.relocate_threshold - left_scan->count; -+ /* scan right is inherently deadlock prone, because we are -+ * (potentially) holding a lock on the twig node at this moment. -+ * FIXME: this is incorrect comment: lock is not held */ -+ if (todo > 0) { -+ ret = scan_right(right_scan, node, (unsigned)todo); -+ if (ret != 0) -+ goto failed; -+ } -+ -+ /* Only the right-scan count is needed, release any rightward locks right away. */ -+ scan_done(right_scan); -+ -+ /* ... and the answer is: we should relocate leaf nodes if at least -+ FLUSH_RELOCATE_THRESHOLD nodes were found. */ -+ flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) || -+ (left_scan->count + right_scan->count >= -+ sbinfo->flush.relocate_threshold); -+ -+ /* Funny business here. We set the 'point' in the flush_position at prior to -+ starting squalloc regardless of whether the first point is -+ formatted or unformatted. 
Without this there would be an invariant, in the -+ rest of the code, that if the flush_position is unformatted then -+ flush_position->point is NULL and flush_position->parent_{lock,coord} is set, -+ and if the flush_position is formatted then flush_position->point is non-NULL -+ and no parent info is set. -+ -+ This seems lazy, but it makes the initial calls to reverse_relocate_test -+ (which ask "is it the pos->point the leftmost child of its parent") much easier -+ because we know the first child already. Nothing is broken by this, but the -+ reasoning is subtle. Holding an extra reference on a jnode during flush can -+ cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not -+ removed from sibling lists until they have zero reference count. Flush would -+ never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only -+ deleted to the right. So if nothing is broken, why fix it? -+ -+ NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any -+ point and in any moment, because of the concurrent file system -+ activity (for example, truncate). */ -+ -+ /* Check jnode state after flush_scan completed. Having a lock on this -+ node or its parent (in case of unformatted) helps us in case of -+ concurrent flushing. */ -+ if (jnode_check_flushprepped(leftmost_in_slum) -+ && !jnode_convertible(leftmost_in_slum)) { -+ ret = 0; -+ goto failed; -+ } -+ -+ /* Now setup flush_pos using scan_left's endpoint. 
*/ -+ ret = prepare_flush_pos(flush_pos, leftmost_in_slum); -+ if (ret) -+ goto failed; -+ -+ if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL -+ && node_is_empty(flush_pos->coord.node)) { -+ znode *empty = flush_pos->coord.node; -+ -+ assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE)); -+ ret = delete_empty_node(empty); -+ goto failed; -+ } -+ -+ if (jnode_check_flushprepped(leftmost_in_slum) -+ && !jnode_convertible(leftmost_in_slum)) { -+ ret = 0; -+ goto failed; -+ } -+ -+ /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */ -+ ret = alloc_pos_and_ancestors(flush_pos); -+ if (ret) -+ goto failed; -+ -+ /* Do the main rightward-bottom-up squeeze and allocate loop. */ -+ ret = squalloc(flush_pos); -+ pos_stop(flush_pos); -+ if (ret) -+ goto failed; -+ -+ /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children. -+ First, the pos_stop() and pos_valid() routines should be modified -+ so that pos_stop() sets a flush_position->stop flag to 1 without -+ releasing the current position immediately--instead release it in -+ pos_done(). This is a better implementation than the current one anyway. -+ -+ It is not clear that all fields of the flush_position should not be released, -+ but at the very least the parent_lock, parent_coord, and parent_load should -+ remain held because they are hold the last twig when pos_stop() is -+ called. -+ -+ When we reach this point in the code, if the parent_coord is set to after the -+ last item then we know that flush reached the end of a twig (and according to -+ the new flush queueing design, we will return now). If parent_coord is not -+ past the last item, we should check if the current twig has any unallocated -+ children to the right (we are not concerned with unallocated children to the -+ left--in that case the twig itself should not have been allocated). 
If the -+ twig has unallocated children to the right, set the parent_coord to that -+ position and then repeat the call to squalloc. -+ -+ Testing for unallocated children may be defined in two ways: if any internal -+ item has a fake block number, it is unallocated; if any extent item is -+ unallocated then all of its children are unallocated. But there is a more -+ aggressive approach: if there are any dirty children of the twig to the right -+ of the current position, we may wish to relocate those nodes now. Checking for -+ potential relocation is more expensive as it requires knowing whether there are -+ any dirty children that are not unallocated. The extent_needs_allocation -+ should be used after setting the correct preceder. -+ -+ When we reach the end of a twig at this point in the code, if the flush can -+ continue (when the queue is ready) it will need some information on the future -+ starting point. That should be stored away in the flush_handle using a seal, I -+ believe. Holding a jref() on the future starting point may break other code -+ that deletes that node. -+ */ -+ -+ /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called -+ above the twig level. If the VM calls flush above the twig level, do nothing -+ and return (but figure out why this happens). The txnmgr should be modified to -+ only flush its leaf-level dirty list. This will do all the necessary squeeze -+ and allocate steps but leave unallocated branches and possibly unallocated -+ twigs (when the twig's leftmost child is not dirty). After flushing the leaf -+ level, the remaining unallocated nodes should be given write-optimized -+ locations. (Possibly, the remaining unallocated twigs should be allocated just -+ before their leftmost child.) -+ */ -+ -+ /* Any failure reaches this point. 
*/ -+ failed: -+ -+ switch (ret) { -+ case -E_REPEAT: -+ case -EINVAL: -+ case -E_DEADLOCK: -+ case -E_NO_NEIGHBOR: -+ case -ENOENT: -+ /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly -+ in each case. They already are handled in many cases. */ -+ /* Something bad happened, but difficult to avoid... Try again! */ -+ ret = 0; -+ } -+ -+ if (leftmost_in_slum) -+ jput(leftmost_in_slum); -+ -+ pos_done(flush_pos); -+ scan_done(left_scan); -+ scan_done(right_scan); -+ kfree(right_scan); -+ -+ ON_DEBUG(atomic_dec(&flush_cnt)); -+ -+ reiser4_leave_flush(sb); -+ -+ return ret; -+} -+ -+/* The reiser4 flush subsystem can be turned into "rapid flush mode" means that -+ * flusher should submit all prepped nodes immediately without keeping them in -+ * flush queues for long time. The reason for rapid flush mode is to free -+ * memory as fast as possible. */ -+ -+#if REISER4_USE_RAPID_FLUSH -+ -+/** -+ * submit all prepped nodes if rapid flush mode is set, -+ * turn rapid flush mode off. -+ */ -+ -+static int rapid_flush(flush_pos_t * pos) -+{ -+ if (!wbq_available()) -+ return 0; -+ -+ return write_prepped_nodes(pos); -+} -+ -+#else -+ -+#define rapid_flush(pos) (0) -+ -+#endif /* REISER4_USE_RAPID_FLUSH */ -+ -+static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom, -+ flush_queue_t *fq, int *nr_queued, -+ int flags) -+{ -+ jnode * node; -+ -+ if (start != NULL) { -+ spin_lock_jnode(start); -+ if (!jnode_is_flushprepped(start)) { -+ assert("zam-1056", start->atom == atom); -+ node = start; -+ goto enter; -+ } -+ spin_unlock_jnode(start); -+ } -+ /* -+ * In this loop we process all already prepped (RELOC or OVRWR) and dirtied again -+ * nodes. The atom spin lock is not released until all dirty nodes processed or -+ * not prepped node found in the atom dirty lists. 
-+ */ -+ while ((node = find_first_dirty_jnode(atom, flags))) { -+ spin_lock_jnode(node); -+ enter: -+ assert("zam-881", JF_ISSET(node, JNODE_DIRTY)); -+ assert("zam-898", !JF_ISSET(node, JNODE_OVRWR)); -+ -+ if (JF_ISSET(node, JNODE_WRITEBACK)) { -+ /* move node to the end of atom's writeback list */ -+ list_move_tail(&node->capture_link, ATOM_WB_LIST(atom)); -+ -+ /* -+ * jnode is not necessarily on dirty list: if it was dirtied when -+ * it was on flush queue - it does not get moved to dirty list -+ */ -+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), -+ WB_LIST, 1)); -+ -+ } else if (jnode_is_znode(node) -+ && znode_above_root(JZNODE(node))) { -+ /* -+ * A special case for znode-above-root. The above-root (fake) -+ * znode is captured and dirtied when the tree height changes or -+ * when the root node is relocated. This causes atoms to fuse so -+ * that changes at the root are serialized. However, this node is -+ * never flushed. This special case used to be in lock.c to -+ * prevent the above-root node from ever being captured, but now -+ * that it is captured we simply prevent it from flushing. The -+ * log-writer code relies on this to properly log superblock -+ * modifications of the tree height. -+ */ -+ jnode_make_wander_nolock(node); -+ } else if (JF_ISSET(node, JNODE_RELOC)) { -+ queue_jnode(fq, node); -+ ++(*nr_queued); -+ } else -+ break; -+ -+ spin_unlock_jnode(node); -+ } -+ return node; -+} -+ -+/* Flush some nodes of current atom, usually slum, return -E_REPEAT if there are more nodes -+ * to flush, return 0 if atom's dirty lists empty and keep current atom locked, return -+ * other errors as they are. 
*/ -+int -+flush_current_atom(int flags, long nr_to_write, long *nr_submitted, -+ txn_atom ** atom, jnode *start) -+{ -+ reiser4_super_info_data *sinfo = get_current_super_private(); -+ flush_queue_t *fq = NULL; -+ jnode *node; -+ int nr_queued; -+ int ret; -+ -+ assert("zam-889", atom != NULL && *atom != NULL); -+ assert_spin_locked(&((*atom)->alock)); -+ assert("zam-892", get_current_context()->trans->atom == *atom); -+ -+ nr_to_write = LONG_MAX; -+ while (1) { -+ ret = reiser4_fq_by_atom(*atom, &fq); -+ if (ret != -E_REPEAT) -+ break; -+ *atom = get_current_atom_locked(); -+ } -+ if (ret) -+ return ret; -+ -+ assert_spin_locked(&((*atom)->alock)); -+ -+ /* parallel flushers limit */ -+ if (sinfo->tmgr.atom_max_flushers != 0) { -+ while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) { -+ /* An reiser4_atom_send_event() call is inside -+ reiser4_fq_put_nolock() which is called when flush is -+ finished and nr_flushers is decremented. */ -+ reiser4_atom_wait_event(*atom); -+ *atom = get_current_atom_locked(); -+ } -+ } -+ -+ /* count ourself as a flusher */ -+ (*atom)->nr_flushers++; -+ -+ writeout_mode_enable(); -+ -+ nr_queued = 0; -+ node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags); -+ -+ if (node == NULL) { -+ if (nr_queued == 0) { -+ (*atom)->nr_flushers--; -+ reiser4_fq_put_nolock(fq); -+ reiser4_atom_send_event(*atom); -+ /* current atom remains locked */ -+ writeout_mode_disable(); -+ return 0; -+ } -+ spin_unlock_atom(*atom); -+ } else { -+ jref(node); -+ BUG_ON((*atom)->super != node->tree->super); -+ spin_unlock_atom(*atom); -+ spin_unlock_jnode(node); -+ BUG_ON(nr_to_write == 0); -+ ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags); -+ jput(node); -+ } -+ -+ ret = -+ reiser4_write_fq(fq, nr_submitted, -+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM); -+ -+ *atom = get_current_atom_locked(); -+ (*atom)->nr_flushers--; -+ reiser4_fq_put_nolock(fq); -+ reiser4_atom_send_event(*atom); -+ spin_unlock_atom(*atom); 
-+ -+ writeout_mode_disable(); -+ -+ if (ret == 0) -+ ret = -E_REPEAT; -+ -+ return ret; -+} -+ -+/* REVERSE PARENT-FIRST RELOCATION POLICIES */ -+ -+/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the -+ reverse parent-first relocate context. Here all we know is the preceder and the block -+ number. Since we are going in reverse, the preceder may still be relocated as well, so -+ we can't ask the block allocator "is there a closer block available to relocate?" here. -+ In the _forward_ parent-first relocate context (not here) we actually call the block -+ allocator to try and find a closer location. */ -+static int -+reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, -+ const reiser4_block_nr * nblk) -+{ -+ reiser4_block_nr dist; -+ -+ assert("jmacd-7710", *pblk != 0 && *nblk != 0); -+ assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk)); -+ assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk)); -+ -+ /* Distance is the absolute value. */ -+ dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk); -+ -+ /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder -+ block, do not relocate. */ -+ if (dist <= get_current_super_private()->flush.relocate_distance) { -+ return 0; -+ } -+ -+ return 1; -+} -+ -+/* This function is a predicate that tests for relocation. Always called in the -+ reverse-parent-first context, when we are asking whether the current node should be -+ relocated in order to expand the flush by dirtying the parent level (and thus -+ proceeding to flush that level). When traversing in the forward parent-first direction -+ (not here), relocation decisions are handled in two places: allocate_znode() and -+ extent_needs_allocation(). 
*/ -+static int -+reverse_relocate_test(jnode * node, const coord_t * parent_coord, -+ flush_pos_t * pos) -+{ -+ reiser4_block_nr pblk = 0; -+ reiser4_block_nr nblk = 0; -+ -+ assert("jmacd-8989", !jnode_is_root(node)); -+ -+ /* -+ * This function is called only from the -+ * reverse_relocate_check_dirty_parent() and only if the parent -+ * node is clean. This implies that the parent has the real (i.e., not -+ * fake) block number, and, so does the child, because otherwise the -+ * parent would be dirty. -+ */ -+ -+ /* New nodes are treated as if they are being relocated. */ -+ if (JF_ISSET (node, JNODE_CREATED) || -+ (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) { -+ return 1; -+ } -+ -+ /* Find the preceder. FIXME(B): When the child is an unformatted, previously -+ existing node, the coord may be leftmost even though the child is not the -+ parent-first preceder of the parent. If the first dirty node appears somewhere -+ in the middle of the first extent unit, this preceder calculation is wrong. -+ Needs more logic in here. */ -+ if (coord_is_leftmost_unit(parent_coord)) { -+ pblk = *znode_get_block(parent_coord->node); -+ } else { -+ pblk = pos->preceder.blk; -+ } -+ check_preceder(pblk); -+ -+ /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */ -+ if (pblk == 0) { -+ return 1; -+ } -+ -+ nblk = *jnode_get_block(node); -+ -+ if (reiser4_blocknr_is_fake(&nblk)) -+ /* child is unallocated, mark parent dirty */ -+ return 1; -+ -+ return reverse_relocate_if_close_enough(&pblk, &nblk); -+} -+ -+/* This function calls reverse_relocate_test to make a reverse-parent-first -+ relocation decision and then, if yes, it marks the parent dirty. 
*/ -+static int -+reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord, -+ flush_pos_t * pos) -+{ -+ int ret; -+ -+ if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) { -+ -+ ret = reverse_relocate_test(node, parent_coord, pos); -+ if (ret < 0) { -+ return ret; -+ } -+ -+ /* FIXME-ZAM -+ if parent is already relocated - we do not want to grab space, right? */ -+ if (ret == 1) { -+ int grabbed; -+ -+ grabbed = get_current_context()->grabbed_blocks; -+ if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) != -+ 0) -+ reiser4_panic("umka-1250", -+ "No space left during flush."); -+ -+ assert("jmacd-18923", -+ znode_is_write_locked(parent_coord->node)); -+ znode_make_dirty(parent_coord->node); -+ grabbed2free_mark(grabbed); -+ } -+ } -+ -+ return 0; -+} -+ -+/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD -+ PARENT-FIRST LOOP BEGINS) */ -+ -+/* Get the leftmost child for given coord. */ -+static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child) -+{ -+ int ret; -+ -+ ret = item_utmost_child(coord, LEFT_SIDE, child); -+ -+ if (ret) -+ return ret; -+ -+ if (IS_ERR(*child)) -+ return PTR_ERR(*child); -+ -+ return 0; -+} -+ -+/* This step occurs after the left- and right-scans are completed, before starting the -+ forward parent-first traversal. Here we attempt to allocate ancestors of the starting -+ flush point, which means continuing in the reverse parent-first direction to the -+ parent, grandparent, and so on (as long as the child is a leftmost child). This -+ routine calls a recursive process, alloc_one_ancestor, which does the real work, -+ except there is special-case handling here for the first ancestor, which may be a twig. -+ At each level (here and alloc_one_ancestor), we check for relocation and then, if -+ the child is a leftmost child, repeat at the next level. On the way back down (the -+ recursion), we allocate the ancestors in parent-first order. 
*/ -+static int alloc_pos_and_ancestors(flush_pos_t * pos) -+{ -+ int ret = 0; -+ lock_handle plock; -+ load_count pload; -+ coord_t pcoord; -+ -+ if (znode_check_flushprepped(pos->lock.node)) -+ return 0; -+ -+ coord_init_invalid(&pcoord, NULL); -+ init_lh(&plock); -+ init_load_count(&pload); -+ -+ if (pos->state == POS_ON_EPOINT) { -+ /* a special case for pos on twig level, where we already have -+ a lock on parent node. */ -+ /* The parent may not be dirty, in which case we should decide -+ whether to relocate the child now. If decision is made to -+ relocate the child, the parent is marked dirty. */ -+ ret = -+ reverse_relocate_check_dirty_parent(pos->child, &pos->coord, -+ pos); -+ if (ret) -+ goto exit; -+ -+ /* FIXME_NFQUCMPD: We only need to allocate the twig (if child -+ is leftmost) and the leaf/child, so recursion is not needed. -+ Levels above the twig will be allocated for -+ write-optimization before the transaction commits. */ -+ -+ /* Do the recursive step, allocating zero or more of our -+ * ancestors. */ -+ ret = alloc_one_ancestor(&pos->coord, pos); -+ -+ } else { -+ if (!znode_is_root(pos->lock.node)) { -+ /* all formatted nodes except tree root */ -+ ret = -+ reiser4_get_parent(&plock, pos->lock.node, -+ ZNODE_WRITE_LOCK); -+ if (ret) -+ goto exit; -+ -+ ret = incr_load_count_znode(&pload, plock.node); -+ if (ret) -+ goto exit; -+ -+ ret = -+ find_child_ptr(plock.node, pos->lock.node, &pcoord); -+ if (ret) -+ goto exit; -+ -+ ret = -+ reverse_relocate_check_dirty_parent(ZJNODE -+ (pos->lock. -+ node), &pcoord, -+ pos); -+ if (ret) -+ goto exit; -+ -+ ret = alloc_one_ancestor(&pcoord, pos); -+ if (ret) -+ goto exit; -+ } -+ -+ ret = allocate_znode(pos->lock.node, &pcoord, pos); -+ } -+ exit: -+ done_load_count(&pload); -+ done_lh(&plock); -+ return ret; -+} -+ -+/* This is the recursive step described in alloc_pos_and_ancestors, above. 
Ignoring the -+ call to set_preceder, which is the next function described, this checks if the -+ child is a leftmost child and returns if it is not. If the child is a leftmost child -+ it checks for relocation, possibly dirtying the parent. Then it performs the recursive -+ step. */ -+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos) -+{ -+ int ret = 0; -+ lock_handle alock; -+ load_count aload; -+ coord_t acoord; -+ -+ /* As we ascend at the left-edge of the region to flush, take this opportunity at -+ the twig level to find our parent-first preceder unless we have already set -+ it. */ -+ if (pos->preceder.blk == 0) { -+ ret = set_preceder(coord, pos); -+ if (ret != 0) -+ return ret; -+ } -+ -+ /* If the ancestor is clean or already allocated, or if the child is not a -+ leftmost child, stop going up, even leaving coord->node not flushprepped. */ -+ if (znode_check_flushprepped(coord->node) -+ || !coord_is_leftmost_unit(coord)) -+ return 0; -+ -+ init_lh(&alock); -+ init_load_count(&aload); -+ coord_init_invalid(&acoord, NULL); -+ -+ /* Only ascend to the next level if it is a leftmost child, but write-lock the -+ parent in case we will relocate the child. */ -+ if (!znode_is_root(coord->node)) { -+ -+ ret = -+ jnode_lock_parent_coord(ZJNODE(coord->node), &acoord, -+ &alock, &aload, ZNODE_WRITE_LOCK, -+ 0); -+ if (ret != 0) { -+ /* FIXME(C): check EINVAL, E_DEADLOCK */ -+ goto exit; -+ } -+ -+ ret = -+ reverse_relocate_check_dirty_parent(ZJNODE(coord->node), -+ &acoord, pos); -+ if (ret != 0) { -+ goto exit; -+ } -+ -+ /* Recursive call. */ -+ if (!znode_check_flushprepped(acoord.node)) { -+ ret = alloc_one_ancestor(&acoord, pos); -+ if (ret) -+ goto exit; -+ } -+ } -+ -+ /* Note: we call allocate with the parent write-locked (except at the root) in -+ case we relocate the child, in which case it will modify the parent during this -+ call. 
*/ -+ ret = allocate_znode(coord->node, &acoord, pos); -+ -+ exit: -+ done_load_count(&aload); -+ done_lh(&alock); -+ return ret; -+} -+ -+/* During the reverse parent-first alloc_pos_and_ancestors process described above there is -+ a call to this function at the twig level. During alloc_pos_and_ancestors we may ask: -+ should this node be relocated (in reverse parent-first context)? We repeat this -+ process as long as the child is the leftmost child, eventually reaching an ancestor of -+ the flush point that is not a leftmost child. The preceder of that ancestors, which is -+ not a leftmost child, is actually on the leaf level. The preceder of that block is the -+ left-neighbor of the flush point. The preceder of that block is the rightmost child of -+ the twig on the left. So, when alloc_pos_and_ancestors passes upward through the twig -+ level, it stops momentarily to remember the block of the rightmost child of the twig on -+ the left and sets it to the flush_position's preceder_hint. -+ -+ There is one other place where we may set the flush_position's preceder hint, which is -+ during scan-left. -+*/ -+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos) -+{ -+ int ret; -+ coord_t coord; -+ lock_handle left_lock; -+ load_count left_load; -+ -+ coord_dup(&coord, coord_in); -+ -+ init_lh(&left_lock); -+ init_load_count(&left_load); -+ -+ /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test. -+ coord_is_leftmost_unit is not the right test if the unformatted child is in the -+ middle of the first extent unit. */ -+ if (!coord_is_leftmost_unit(&coord)) { -+ coord_prev_unit(&coord); -+ } else { -+ ret = -+ reiser4_get_left_neighbor(&left_lock, coord.node, -+ ZNODE_READ_LOCK, GN_SAME_ATOM); -+ if (ret) { -+ /* If we fail for any reason it doesn't matter because the -+ preceder is only a hint. We are low-priority at this point, so -+ this must be the case. 
*/ -+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR || -+ ret == -ENOENT || ret == -EINVAL -+ || ret == -E_DEADLOCK) { -+ ret = 0; -+ } -+ goto exit; -+ } -+ -+ ret = incr_load_count_znode(&left_load, left_lock.node); -+ if (ret) -+ goto exit; -+ -+ coord_init_last_unit(&coord, left_lock.node); -+ } -+ -+ ret = -+ item_utmost_child_real_block(&coord, RIGHT_SIDE, -+ &pos->preceder.blk); -+ exit: -+ check_preceder(pos->preceder.blk); -+ done_load_count(&left_load); -+ done_lh(&left_lock); -+ return ret; -+} -+ -+/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */ -+ -+/* This procedure implements the outer loop of the flush algorithm. To put this in -+ context, here is the general list of steps taken by the flush routine as a whole: -+ -+ 1. Scan-left -+ 2. Scan-right (maybe) -+ 3. Allocate initial flush position and its ancestors -+ 4. -+ 5. -+ 6. -+ -+ This procedure implements the loop in steps 4 through 6 in the above listing. -+ -+ Step 4: if the current flush position is an extent item (position on the twig level), -+ it allocates the extent (allocate_extent_item_in_place) then shifts to the next -+ coordinate. If the next coordinate's leftmost child needs flushprep, we will continue. -+ If the next coordinate is an internal item, we descend back to the leaf level, -+ otherwise we repeat a step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate" -+ brings us past the end of the twig level, then we call -+ reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to -+ step #5 which moves to the right. -+ -+ Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the -+ tree to allocate any ancestors of the next-right flush position that are not also -+ ancestors of the current position. Those ancestors (in top-down order) are the next in -+ parent-first order. We squeeze adjacent nodes on the way up until the right node and -+ current node share the same parent, then allocate on the way back down. 
Finally, this -+ step sets the flush position to the next-right node. Then repeat steps 4 and 5. -+*/ -+ -+/* SQUEEZE CODE */ -+ -+/* squeeze_right_twig() helper: cut content out of node to->node, starting at -+ its first unit and going up to coord @to. The key range handed to -+ cut_node_content() runs from the key of the first item to @to_key. @left is -+ not referenced in the body. Returns the result of cut_node_content(). */ -+static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key, -+ znode * left) -+{ -+ coord_t from; -+ reiser4_key from_key; -+ -+ coord_init_first_unit(&from, to->node); -+ item_key_by_coord(&from, &from_key); -+ -+ return cut_node_content(&from, to, &from_key, to_key, NULL); -+} -+ -+/* Copy as much of the leading extents from @right to @left, allocating -+ unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or -+ SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an -+ internal item it calls shift_one_internal_unit and may then return -+ SUBTREE_MOVED. -+ -+ A negative result is an error code. When the result is SQUEEZE_TARGET_FULL, -+ the prepped nodes are submitted with write_prepped_nodes(@pos) before -+ returning; if that call fails its negative result is returned instead. */ -+static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos) -+{ -+ int ret = SUBTREE_MOVED; -+ coord_t coord; /* used to iterate over items */ -+ reiser4_key stop_key; -+ -+ assert("jmacd-2008", !node_is_empty(right)); -+ coord_init_first_unit(&coord, right); -+ -+ /* FIXME: can be optimized to cut once */ -+ while (!node_is_empty(coord.node) && item_is_extent(&coord)) { -+ ON_DEBUG(void *vp); -+ -+ assert("vs-1468", coord_is_leftmost_unit(&coord)); -+ ON_DEBUG(vp = shift_check_prepare(left, coord.node)); -+ -+ /* stop_key is used to find what was copied and what to cut */ -+ stop_key = *reiser4_min_key(); -+ ret = squalloc_extent(left, &coord, pos, &stop_key); -+ if (ret != SQUEEZE_CONTINUE) { -+ ON_DEBUG(kfree(vp)); -+ break; -+ } -+ assert("vs-1465", !keyeq(&stop_key, reiser4_min_key())); -+ -+ /* Cut what was just copied out of the source node: step stop_key back by -+ one offset unit and let squalloc_right_twig_cut() do the cutting.
*/ -+ set_key_offset(&stop_key, get_key_offset(&stop_key) - 1); -+ check_me("vs-1466", -+ squalloc_right_twig_cut(&coord, &stop_key, left) == 0); -+ -+ ON_DEBUG(shift_check(vp, left, coord.node)); -+ } -+ -+ if (node_is_empty(coord.node)) -+ ret = SQUEEZE_SOURCE_EMPTY; -+ -+ if (ret == SQUEEZE_TARGET_FULL) { -+ goto out; -+ } -+ -+ if (node_is_empty(right)) { -+ /* The whole right node was copied into @left. */ -+ assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY); -+ goto out; -+ } -+ -+ /* the extent loop is over; look at what is now first in @right */ -+ coord_init_first_unit(&coord, right); -+ -+ if (!item_is_internal(&coord)) { -+ /* we do not want to squeeze anything else to left neighbor because "slum" -+ is over */ -+ ret = SQUEEZE_TARGET_FULL; -+ goto out; -+ } -+ assert("jmacd-433", item_is_internal(&coord)); -+ -+ /* Shift an internal unit. The child must be allocated before shifting any more -+ extents, so we stop here. */ -+ ret = shift_one_internal_unit(left, right); -+ -+ out: -+ assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL -+ || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY); -+ -+ if (ret == SQUEEZE_TARGET_FULL) { -+ /* We submit prepped nodes here and expect that this @left twig -+ * will not be modified again during this jnode_flush() call. */ -+ int ret1; -+ -+ /* NOTE: seems like io is done under long term locks.
*/ -+ ret1 = write_prepped_nodes(pos); -+ if (ret1 < 0) -+ return ret1; -+ } -+ -+ return ret; -+} -+/* Debug-only consistency check of the convert state at pos->coord: the coord -+ must be on an existing item; if chaining data is present the plugin returned -+ by item_convert_plug() must be the plugin of the item at pos->coord and must -+ have a ->convert method, otherwise pos->child must be NULL. Expands to noop -+ when REISER4_DEBUG is off. */ -+#if REISER4_DEBUG -+static void item_convert_invariant(flush_pos_t * pos) -+{ -+ assert("edward-1225", coord_is_existing_item(&pos->coord)); -+ if (chaining_data_present(pos)) { -+ item_plugin *iplug = item_convert_plug(pos); -+ -+ assert("edward-1000", -+ iplug == item_plugin_by_coord(&pos->coord)); -+ assert("edward-1001", iplug->f.convert != NULL); -+ } else -+ assert("edward-1226", pos->child == NULL); -+} -+#else -+ -+#define item_convert_invariant(pos) noop -+ -+#endif -+ -+/* Scan node items starting from the first one and apply for each -+ item its flush ->convert() method (if any). This method may -+ resize/kill the item so the tree will be changed. -+ -+ Only LEAF_LEVEL nodes are processed; for any other level the function -+ returns 0 without touching the node. @node must be write-locked, -+ convertible and not empty, and pos->child must be NULL (all asserted). -+ After a complete scan JNODE_CONVERTIBLE is cleared and the node is made -+ dirty. Returns 0, or the nonzero result of a failing ->convert() call. -+*/ -+static int convert_node(flush_pos_t * pos, znode * node) -+{ -+ int ret = 0; -+ item_plugin *iplug; -+ -+ assert("edward-304", pos != NULL); -+ assert("edward-305", pos->child == NULL); -+ assert("edward-475", znode_convertible(node)); -+ assert("edward-669", znode_is_wlocked(node)); -+ assert("edward-1210", !node_is_empty(node)); -+ -+ if (znode_get_level(node) != LEAF_LEVEL) -+ /* unsupported */ -+ goto exit; -+ -+ coord_init_first_unit(&pos->coord, node); -+ -+ while (1) { -+ ret = 0; -+ coord_set_to_left(&pos->coord); -+ item_convert_invariant(pos); -+ -+ iplug = item_plugin_by_coord(&pos->coord); -+ assert("edward-844", iplug != NULL); -+ -+ if (iplug->f.convert) { -+ ret = iplug->f.convert(pos); -+ if (ret) -+ goto exit; -+ } -+ assert("edward-307", pos->child == NULL); -+ -+ if (coord_next_item(&pos->coord)) { -+ /* node is over */ -+ -+ if (!chaining_data_present(pos)) -+ /* finished this node */ -+ break; -+ if (should_chain_next_node(pos)) { -+ /* go to next node */ -+ move_chaining_data(pos, 0 /* to next node */ ); -+ break; -+ } -+ /* repeat this node */ -+ move_chaining_data(pos, 1 /* this node */ ); -+ continue; -+ } -+ /* Node is not over. -+ Check if there is attached convert data.
-+ If so roll one item position back and repeat -+ on this node -+ */ -+ if (chaining_data_present(pos)) { -+ -+ if (iplug != item_plugin_by_coord(&pos->coord)) -+ set_item_convert_count(pos, 0); -+ -+ ret = coord_prev_item(&pos->coord); -+ assert("edward-1003", !ret); -+ -+ move_chaining_data(pos, 1 /* this node */ ); -+ } -+ } -+ /* reached only after a complete scan; both "goto exit" paths skip this */ -+ JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE); -+ znode_make_dirty(node); -+ exit: -+ assert("edward-1004", !ret); -+ return ret; -+} -+ -+/* Squeeze and allocate the right neighbor. This is called after @left and -+ its current children have been squeezed and allocated already. This -+ procedure's job is to squeeze and items from @right to @left. -+ -+ If at the leaf level, use the shift_everything_left memcpy-optimized -+ version of shifting (squeeze_right_leaf). -+ -+ If at the twig level, extents are allocated as they are shifted from @right -+ to @left (squalloc_right_twig). -+ -+ At any other level, shift one internal item and return to the caller -+ (squalloc_parent_first) so that the shifted-subtree can be processed in -+ parent-first order. -+ -+ When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is -+ returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is -+ returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL -+ is returned.
-+*/ -+/* NOTE(review): the helper names used in the comment above do not match the -+ code below: the twig case calls squeeze_right_twig() and every other level -+ (leaf and internal alike) calls squeeze_right_non_twig(). A negative -+ return value is an error code. */ -+static int squeeze_right_neighbor(flush_pos_t * pos, znode * left, -+ znode * right) -+{ -+ int ret; -+ -+ /* FIXME it is possible to see empty hasn't-heard-banshee node in a -+ * tree owing to error (for example, ENOSPC) in write */ -+ /* assert("jmacd-9321", !node_is_empty(left)); */ -+ assert("jmacd-9322", !node_is_empty(right)); -+ assert("jmacd-9323", znode_get_level(left) == znode_get_level(right)); -+ -+ switch (znode_get_level(left)) { -+ case TWIG_LEVEL: -+ /* Shift with extent allocating until either an internal item -+ is encountered or everything is shifted or no free space -+ left in @left */ -+ ret = squeeze_right_twig(left, right, pos); -+ break; -+ -+ default: -+ /* All other levels can use shift_everything until we implement per-item -+ flush plugins. */ -+ ret = squeeze_right_non_twig(left, right); -+ break; -+ } -+ -+ assert("jmacd-2011", (ret < 0 || -+ ret == SQUEEZE_SOURCE_EMPTY -+ || ret == SQUEEZE_TARGET_FULL -+ || ret == SUBTREE_MOVED)); -+ return ret; -+} -+/* Squeeze twig @right into the twig held by pos->lock and reposition -+ pos->coord inside pos->lock.node afterwards. A negative result of -+ squeeze_right_twig() is returned with pos->coord untouched. On a positive -+ result pos->coord is set after the last item and the result is returned. -+ On a zero result pos->coord is set to the last unit and 0 is returned. */ -+static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos, -+ znode * right) -+{ -+ int ret; -+ -+ ret = squeeze_right_twig(pos->lock.node, right, pos); -+ if (ret < 0) -+ return ret; -+ if (ret > 0) { -+ coord_init_after_last_item(&pos->coord, pos->lock.node); -+ return ret; -+ } -+ -+ coord_init_last_unit(&pos->coord, pos->lock.node); -+ return 0; -+} -+ -+/* forward declaration */ -+static int squalloc_upper_levels(flush_pos_t *, znode *, znode *); -+ -+/* Do a fast check for the "same parents" condition before calling -+ * squalloc_upper_levels(): returns 0 when znode_same_parents() reports that -+ * @left and @right have the same parent, otherwise the result of -+ * squalloc_upper_levels(). */ -+static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos, -+ znode * left, -+ znode * right) -+{ -+ if (znode_same_parents(left, right)) -+ return 0; -+ -+ return squalloc_upper_levels(pos, left, right); -+} -+ -+/* Check whether the parent of given @right node needs to be processes -+ ((re)allocated) prior to processing of the child.
If @left and @right do not -+ share at least the parent of the @right is after the @left but before the -+ @right in parent-first order, we have to (re)allocate it before the @right -+ gets (re)allocated. */ /* Both parents are write-locked first. Returns 0 when they turn out to be -+ the same node, when the right parent is already flushprepped, or when -+ squeezing the parents returns 0; otherwise the result of the first failing -+ step or of allocating the right parent. The locks and load counts taken -+ here are dropped before returning. */ -+static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right) -+{ -+ int ret; -+ -+ lock_handle left_parent_lock; -+ lock_handle right_parent_lock; -+ -+ load_count left_parent_load; -+ load_count right_parent_load; -+ -+ init_lh(&left_parent_lock); -+ init_lh(&right_parent_lock); -+ -+ init_load_count(&left_parent_load); -+ init_load_count(&right_parent_load); -+ -+ ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK); -+ if (ret) -+ goto out; -+ -+ ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK); -+ if (ret) -+ goto out; -+ -+ /* Check for same parents */ -+ if (left_parent_lock.node == right_parent_lock.node) -+ goto out; -+ -+ if (znode_check_flushprepped(right_parent_lock.node)) { -+ /* Keep parent-first order. In the order, the right parent node stands -+ before the @right node. If it is already allocated, we set the -+ preceder (next block search start point) to its block number, @right -+ node should be allocated after it. -+ -+ However, preceder is set only if the right parent is on twig level. -+ The explanation is the following: new branch nodes are allocated over -+ already allocated children while the tree grows, it is difficult to -+ keep tree ordered, we assume that only leaves and twigs are correctly -+ allocated. So, only twigs are used as a preceder for allocating of the -+ rest of the slum.
*/ -+ if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) { -+ pos->preceder.blk = -+ *znode_get_block(right_parent_lock.node); -+ check_preceder(pos->preceder.blk); -+ } -+ goto out; -+ } -+ -+ ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node); -+ if (ret) -+ goto out; -+ -+ ret = -+ squeeze_right_neighbor(pos, left_parent_lock.node, -+ right_parent_lock.node); -+ /* We stop on error. We also stop if some items/units were shifted -+ * (ret == 0) and thus @right changed its parent: in that case we do not -+ * have to process the right_parent node prior to processing of @right. -+ * Positive return values say that no items were shifted because of -+ * "empty source" or "target full" conditions. */ -+ if (ret <= 0) -+ goto out; -+ -+ /* parent(@left) and parent(@right) may have different parents also. We -+ * do a recursive call for checking that. */ -+ ret = -+ check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node, -+ right_parent_lock.node); -+ if (ret) -+ goto out; -+ -+ /* allocate znode when going down */ -+ ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos); -+ -+ out: -+ done_load_count(&left_parent_load); -+ done_load_count(&right_parent_load); -+ -+ done_lh(&left_parent_lock); -+ done_lh(&right_parent_lock); -+ -+ return ret; -+} -+ -+/* Check the "flushprepped" status of the leftmost child of the unit at @coord. -+ * Also returns true (1) if the child node was not found in cache. A nonzero -+ * result of get_leftmost_child_of_unit() is returned unchanged, so callers -+ * see any nonzero value (status or error) as "do not continue". */ -+static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord) -+{ -+ int ret; -+ int prepped; -+ -+ jnode *child; -+ -+ ret = get_leftmost_child_of_unit(coord, &child); -+ -+ if (ret) -+ return ret; -+ -+ if (child) { -+ prepped = jnode_check_flushprepped(child); -+ jput(child); -+ } else { -+ /* We consider not existing child as a node which slum -+ processing should not continue to. Not cached node is clean, -+ so it is flushprepped.
*/ -+ prepped = 1; -+ } -+ -+ return prepped; -+} -+ -+/* (Re)allocate znode @node, looking its parent up automatically: the parent is -+ write-locked and loaded, the pointer to @node is located in it with -+ find_child_ptr() and allocate_znode() is called with that coord. @node -+ must already be write-locked (asserted). The parent lock and load count -+ are released before returning. Returns 0 or the first nonzero result. */ -+static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos) -+{ -+ int ret; -+ lock_handle parent_lock; -+ load_count parent_load; -+ coord_t pcoord; -+ -+ assert("zam-851", znode_is_write_locked(node)); -+ -+ init_lh(&parent_lock); -+ init_load_count(&parent_load); -+ -+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&parent_load, parent_lock.node); -+ if (ret) -+ goto out; -+ -+ ret = find_child_ptr(parent_lock.node, node, &pcoord); -+ if (ret) -+ goto out; -+ -+ ret = allocate_znode(node, &pcoord, pos); -+ -+ out: -+ done_load_count(&parent_load); -+ done_lh(&parent_lock); -+ return ret; -+} -+ -+/* Process formatted nodes to the right of pos->lock.node. Shared by -+ * handle_pos_on_leaf() and handle_pos_on_internal(), so it is not limited to -+ * the leaf level. For each right slum neighbor: convert it if required, -+ * squeeze it into the current node, process its parent first, allocate it -+ * and then advance the flush position to it. The loop is left when a helper -+ * returns nonzero (negative for squeeze_right_neighbor()), when an already -+ * flushprepped node is met (pos_stop()) or when should_terminate_squalloc() -+ * says so. The return value is what the last helper returned at that point: -+ * 0, a positive squeeze code or a negative error such as -E_NO_NEIGHBOR -+ * (see handle_pos_on_leaf()). */ -+static int handle_pos_on_formatted(flush_pos_t * pos) -+{ -+ int ret; -+ lock_handle right_lock; -+ load_count right_load; -+ -+ init_lh(&right_lock); -+ init_load_count(&right_load); -+ -+ if (should_convert_node(pos, pos->lock.node)) { -+ ret = convert_node(pos, pos->lock.node); -+ if (ret) -+ return ret; -+ } -+ -+ while (1) { -+ ret = -+ neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE, -+ ZNODE_WRITE_LOCK, -+ !should_convert_next_node(pos, -+ right_lock. -+ node)); -+ if (ret) -+ break; -+ -+ /* we don't prep(allocate) nodes for flushing twice. This can be suboptimal, or it -+ * can be optimal. For now we choose to live with the risk that it will -+ * be suboptimal because it would be quite complex to code it to be -+ * smarter.
*/ -+ if (znode_check_flushprepped(right_lock.node) -+ && !znode_convertible(right_lock.node)) { -+ assert("edward-1005", -+ !should_convert_next_node(pos, right_lock.node)); -+ pos_stop(pos); -+ break; -+ } -+ -+ ret = incr_load_count_znode(&right_load, right_lock.node); -+ if (ret) -+ break; -+ -+ if (should_convert_node(pos, right_lock.node)) { -+ ret = convert_node(pos, right_lock.node); -+ if (ret) -+ break; -+ if (node_is_empty(right_lock.node)) { -+ /* node became empty after converting, repeat */ -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ continue; -+ } -+ } -+ -+ /* squeeze _before_ going upward. */ -+ ret = -+ squeeze_right_neighbor(pos, pos->lock.node, -+ right_lock.node); -+ if (ret < 0) -+ break; -+ -+ if (znode_check_flushprepped(right_lock.node)) { -+ if (should_convert_next_node(pos, right_lock.node)) { -+ /* in spite of flushprepped status of the node, -+ its right slum neighbor should be converted */ -+ assert("edward-953", convert_data(pos)); -+ assert("edward-954", item_convert_data(pos)); -+ -+ if (node_is_empty(right_lock.node)) { -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ } else -+ move_flush_pos(pos, &right_lock, -+ &right_load, NULL); -+ continue; -+ } -+ pos_stop(pos); -+ break; -+ } -+ -+ if (node_is_empty(right_lock.node)) { -+ /* repeat if right node was squeezed completely */ -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ continue; -+ } -+ -+ /* parent(right_lock.node) has to be processed before -+ * (right_lock.node) due to "parent-first" allocation order.
*/ -+ ret = -+ check_parents_and_squalloc_upper_levels(pos, pos->lock.node, -+ right_lock.node); -+ if (ret) -+ break; -+ /* (re)allocate _after_ going upward */ -+ ret = lock_parent_and_allocate_znode(right_lock.node, pos); -+ if (ret) -+ break; -+ -+ if (should_terminate_squalloc(pos)) { -+ set_item_convert_count(pos, 0); -+ break; -+ } -+ -+ /* advance the flush position to the right neighbor */ -+ move_flush_pos(pos, &right_lock, &right_load, NULL); -+ -+ ret = rapid_flush(pos); -+ if (ret) -+ break; -+ } -+ -+ assert("edward-1006", !convert_data(pos) || !item_convert_data(pos)); -+ -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ -+ /* This function indicates via pos whether to stop or go to twig or continue on current -+ * level. */ -+ return ret; -+ -+} -+ -+/* POS_ON_LEAF handler: process formatted nodes on the leaf level with -+ * handle_pos_on_formatted(). When that returns -E_NO_NEIGHBOR the state is -+ * switched to POS_TO_TWIG and 0 is returned; any other result is returned -+ * unchanged. */ -+static int handle_pos_on_leaf(flush_pos_t * pos) -+{ -+ int ret; -+ -+ assert("zam-845", pos->state == POS_ON_LEAF); -+ -+ ret = handle_pos_on_formatted(pos); -+ -+ if (ret == -E_NO_NEIGHBOR) { -+ /* cannot get right neighbor, go process extents. */ -+ pos->state = POS_TO_TWIG; -+ return 0; -+ } -+ -+ return ret; -+} -+ -+/* POS_ON_INTERNAL handler: process slum on level > 1. The result of -+ * handle_pos_on_formatted() is returned as is. */ -+static int handle_pos_on_internal(flush_pos_t * pos) -+{ -+ assert("zam-850", pos->state == POS_ON_INTERNAL); -+ return handle_pos_on_formatted(pos); -+} -+ -+/* Check whether squalloc should stop before processing the extent at -+ * pos->coord. A nonzero result means "stop" (see handle_pos_on_twig()). -+ * As a side effect pos->pos_in_unit is set and pos->child, if any, is -+ * released and reset to NULL. */ -+static int squalloc_extent_should_stop(flush_pos_t * pos) -+{ -+ assert("zam-869", item_is_extent(&pos->coord)); -+ -+ /* pos->child is a jnode handle_pos_on_extent() should start with -+ * instead of the first child of the first extent unit. -+ * NOTE(review): no handle_pos_on_extent() is visible here; presumably -+ * this refers to handle_pos_on_twig().
*/ -+ if (pos->child) { -+ int prepped; -+ -+ assert("vs-1383", jnode_is_unformatted(pos->child)); -+ prepped = jnode_check_flushprepped(pos->child); -+ pos->pos_in_unit = -+ jnode_get_index(pos->child) - -+ extent_unit_index(&pos->coord); -+ assert("vs-1470", -+ pos->pos_in_unit < extent_unit_width(&pos->coord)); -+ assert("nikita-3434", -+ ergo(extent_is_unallocated(&pos->coord), -+ pos->pos_in_unit == 0)); -+ jput(pos->child); -+ pos->child = NULL; -+ -+ return prepped; -+ } -+ -+ pos->pos_in_unit = 0; -+ if (extent_is_unallocated(&pos->coord)) -+ return 0; -+ -+ return leftmost_child_of_unit_check_flushprepped(&pos->coord); -+} -+ -+/* POS_ON_EPOINT handler. Handle the case when regular reiser4 tree (znodes -+ * connected one to its neighbors by sibling pointers) is interrupted on leaf -+ * level by one or more unformatted nodes. By having a lock on twig level and -+ * using extent code routines to process unformatted nodes we swim around an -+ * irregular part of reiser4 tree. On exit the state is POS_END_OF_TWIG, -+ * POS_TO_LEAF or POS_INVALID, unless squalloc_extent_should_stop() asked to -+ * stop, in which case pos_stop() is called and that result is returned. */ -+static int handle_pos_on_twig(flush_pos_t * pos) -+{ -+ int ret; -+ -+ assert("zam-844", pos->state == POS_ON_EPOINT); -+ assert("zam-843", item_is_extent(&pos->coord)); -+ -+ /* We decide should we continue slum processing with current extent -+ unit: if leftmost child of current extent unit is flushprepped -+ (i.e. clean or already processed by flush) we stop squalloc(). There -+ is a fast check for unallocated extents which we assume contain all -+ not flushprepped nodes. */ -+ /* FIXME: Here we implement simple check, we are only looking on the -+ leftmost child.
*/ -+ ret = squalloc_extent_should_stop(pos); -+ if (ret != 0) { -+ pos_stop(pos); -+ return ret; -+ } -+ -+ while (pos_valid(pos) && coord_is_existing_unit(&pos->coord) -+ && item_is_extent(&pos->coord)) { -+ ret = reiser4_alloc_extent(pos); -+ if (ret) { /* NOTE(review): this result only ends the loop; every path below returns 0, so it never reaches the caller -- confirm that is intended */ -+ break; -+ } -+ coord_next_unit(&pos->coord); -+ } -+ -+ if (coord_is_after_rightmost(&pos->coord)) { -+ pos->state = POS_END_OF_TWIG; -+ return 0; -+ } -+ if (item_is_internal(&pos->coord)) { -+ pos->state = POS_TO_LEAF; -+ return 0; -+ } -+ -+ assert("zam-860", item_is_extent(&pos->coord)); -+ -+ /* "slum" is over */ -+ pos->state = POS_INVALID; -+ return 0; -+} -+ -+/* POS_END_OF_TWIG handler. When we are about to return the flush position -+ * from twig to leaf level we can either process the right twig node or move -+ * the position to the leaf. This processes the right twig if it is possible -+ * and jumps to leaf level if not. */ -+static int handle_pos_end_of_twig(flush_pos_t * pos) -+{ -+ int ret; -+ lock_handle right_lock; -+ load_count right_load; -+ coord_t at_right; -+ jnode *child = NULL; -+ -+ assert("zam-848", pos->state == POS_END_OF_TWIG); -+ assert("zam-849", coord_is_after_rightmost(&pos->coord)); -+ -+ init_lh(&right_lock); -+ init_load_count(&right_load); -+ -+ /* We get a lock on the right twig node even it is not dirty because -+ * slum continues or discontinues on leaf level not on next twig. This -+ * lock on the right twig is needed for getting its leftmost child. */ -+ ret = -+ reiser4_get_right_neighbor(&right_lock, pos->lock.node, -+ ZNODE_WRITE_LOCK, GN_SAME_ATOM); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&right_load, right_lock.node); -+ if (ret) -+ goto out; -+ -+ /* right twig could be not dirty */ -+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) { -+ /* If right twig node is dirty we always attempt to squeeze it -+ * content to the left...
*/ -+ became_dirty: -+ ret = -+ squeeze_right_twig_and_advance_coord(pos, right_lock.node); -+ if (ret <= 0) { -+ /* pos->coord is on internal item, go to leaf level, or -+ * we have an error which will be caught in squalloc() */ -+ pos->state = POS_TO_LEAF; -+ goto out; -+ } -+ -+ /* If right twig was squeezed completely we have to re-lock -+ * right twig. now it is done through the top-level squalloc -+ * routine. */ -+ if (node_is_empty(right_lock.node)) -+ goto out; -+ -+ /* ... and prep it if it is not yet prepped */ -+ if (!znode_check_flushprepped(right_lock.node)) { -+ /* As usual, process parent before ... */ -+ ret = -+ check_parents_and_squalloc_upper_levels(pos, -+ pos->lock. -+ node, -+ right_lock. -+ node); -+ if (ret) -+ goto out; -+ -+ /* ... processing the child */ -+ ret = -+ lock_parent_and_allocate_znode(right_lock.node, -+ pos); -+ if (ret) -+ goto out; -+ } -+ } else { -+ coord_init_first_unit(&at_right, right_lock.node); -+ -+ /* check first child of next twig, should we continue there ? */ -+ ret = get_leftmost_child_of_unit(&at_right, &child); -+ if (ret || child == NULL || jnode_check_flushprepped(child)) { -+ pos_stop(pos); -+ goto out; -+ } -+ -+ /* check clean twig for possible relocation */ -+ if (!znode_check_flushprepped(right_lock.node)) { -+ ret = -+ reverse_relocate_check_dirty_parent(child, -+ &at_right, pos); -+ if (ret) -+ goto out; -+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) -+ goto became_dirty; -+ } -+ } -+ -+ assert("zam-875", znode_check_flushprepped(right_lock.node)); -+ -+ /* Update the preceder by a block number of just processed right twig -+ * node. The code above could miss the preceder updating because -+ * allocate_znode() could not be called for this node.
*/ -+ pos->preceder.blk = *znode_get_block(right_lock.node); -+ check_preceder(pos->preceder.blk); -+ -+ coord_init_first_unit(&at_right, right_lock.node); -+ assert("zam-868", coord_is_existing_unit(&at_right)); -+ -+ pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF; -+ move_flush_pos(pos, &right_lock, &right_load, &at_right); -+ -+ out: -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ -+ if (child) -+ jput(child); -+ -+ return ret; -+} -+ -+/* POS_TO_LEAF handler. Move the pos->lock to the leaf node pointed to by -+ * pos->coord and check whether we should continue there. If the child is -+ * not available flush is stopped (pos_stop()); if it is already flushprepped -+ * the state becomes POS_INVALID. Otherwise the child is write-locked, -+ * loaded and allocated, the flush position is moved to it and the state -+ * becomes POS_ON_LEAF; a child that is empty at that point is deleted and -+ * the state is set to POS_INVALID. */ -+static int handle_pos_to_leaf(flush_pos_t * pos) -+{ -+ int ret; -+ lock_handle child_lock; -+ load_count child_load; -+ jnode *child; -+ -+ assert("zam-846", pos->state == POS_TO_LEAF); -+ assert("zam-847", item_is_internal(&pos->coord)); -+ -+ init_lh(&child_lock); -+ init_load_count(&child_load); -+ -+ ret = get_leftmost_child_of_unit(&pos->coord, &child); -+ if (ret) -+ return ret; -+ if (child == NULL) { -+ pos_stop(pos); -+ return 0; -+ } -+ -+ if (jnode_check_flushprepped(child)) { -+ pos->state = POS_INVALID; -+ goto out; -+ } -+ -+ ret = -+ longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_LOPRI); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&child_load, JZNODE(child)); -+ if (ret) -+ goto out; -+ -+ ret = allocate_znode(JZNODE(child), &pos->coord, pos); -+ if (ret) -+ goto out; -+ -+ /* move flush position to leaf level */ -+ pos->state = POS_ON_LEAF; -+ move_flush_pos(pos, &child_lock, &child_load, NULL); -+ -+ if (node_is_empty(JZNODE(child))) { -+ ret = delete_empty_node(JZNODE(child)); -+ pos->state = POS_INVALID; -+ } -+ out: -+ done_load_count(&child_load); -+ done_lh(&child_lock); -+ jput(child); -+ -+ return ret; -+} -+ -+/* move pos from leaf to twig, and move lock from leaf to twig.
*/ -+/* POS_TO_TWIG handler: move pos->lock to the upper (twig) level. The parent -+ of pos->lock.node is write-locked and loaded, the pointer to the current -+ node is found in it and the coord is advanced to the next item. If there -+ is no next item the state becomes POS_END_OF_TWIG, if the next item is an -+ extent it becomes POS_ON_EPOINT, and in both cases the flush position is -+ moved to the parent with move_flush_pos(). Otherwise flush is stopped with -+ pos_stop(). Returns 0 or the first nonzero result of a helper. */ -+static int handle_pos_to_twig(flush_pos_t * pos) -+{ -+ int ret; -+ -+ lock_handle parent_lock; -+ load_count parent_load; -+ coord_t pcoord; -+ -+ assert("zam-852", pos->state == POS_TO_TWIG); -+ -+ init_lh(&parent_lock); -+ init_load_count(&parent_load); -+ -+ ret = -+ reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&parent_load, parent_lock.node); -+ if (ret) -+ goto out; -+ -+ ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord); -+ if (ret) -+ goto out; -+ -+ assert("zam-870", item_is_internal(&pcoord)); -+ coord_next_item(&pcoord); -+ -+ if (coord_is_after_rightmost(&pcoord)) -+ pos->state = POS_END_OF_TWIG; -+ else if (item_is_extent(&pcoord)) -+ pos->state = POS_ON_EPOINT; -+ else { -+ /* Here we understand that getting -E_NO_NEIGHBOR in -+ * handle_pos_on_leaf() was because of just a reaching edge of -+ * slum */ -+ pos_stop(pos); -+ goto out; -+ } -+ -+ move_flush_pos(pos, &parent_lock, &parent_load, &pcoord); -+ -+ out: -+ done_load_count(&parent_load); -+ done_lh(&parent_lock); -+ -+ return ret; -+} -+/* Dispatch table used by squalloc(): one handler per flush position state, -+ indexed by pos->state. */ -+typedef int (*pos_state_handle_t) (flush_pos_t *); -+static pos_state_handle_t flush_pos_handlers[] = { -+ /* process formatted nodes on leaf level, keep lock on a leaf node */ -+ [POS_ON_LEAF] = handle_pos_on_leaf, -+ /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently -+ * being processed */ -+ [POS_ON_EPOINT] = handle_pos_on_twig, -+ /* move a lock from leaf node to its parent for further processing of unformatted nodes */ -+ [POS_TO_TWIG] = handle_pos_to_twig, -+ /* move a lock from twig to leaf level when a processing of unformatted nodes finishes, -+ * pos->coord points to the leaf node we jump to */ -+ [POS_TO_LEAF] = handle_pos_to_leaf, -+ /* after processing last extent in the twig node, attempting to shift items from the twig's -+ * right neighbor and process them
while shifting */ -+ [POS_END_OF_TWIG] = handle_pos_end_of_twig, -+ /* process formatted nodes on internal level, keep lock on an internal node */ -+ [POS_ON_INTERNAL] = handle_pos_on_internal -+}; -+ -+/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze, -+ * encrypt) nodes and their ancestors in "parent-first" order. -+ * -+ * While pos_valid(pos) holds, the handler registered for pos->state is run, -+ * followed by rapid_flush(). The loop ends on a negative handler result or -+ * a nonzero rapid_flush() result. Positive results and -E_NO_NEIGHBOR are -+ * reported to the caller as 0; other negative values are returned as is. */ -+static int squalloc(flush_pos_t * pos) -+{ -+ int ret = 0; -+ -+ /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for -+ * greater CPU efficiency? Measure and see.... -Hans */ -+ while (pos_valid(pos)) { -+ ret = flush_pos_handlers[pos->state] (pos); -+ if (ret < 0) -+ break; -+ -+ ret = rapid_flush(pos); -+ if (ret) -+ break; -+ } -+ -+ /* any positive value or -E_NO_NEIGHBOR are legal return codes for handle_pos* -+ routines, -E_NO_NEIGHBOR means that slum edge was reached */ -+ if (ret > 0 || ret == -E_NO_NEIGHBOR) -+ ret = 0; -+ -+ return ret; -+} -+/* Set the left delimiting key of @node to the leftmost key found in the node. -+ An empty node is left untouched. The dk_lock of the node's tree must be -+ held for writing (asserted). */ -+static void update_ldkey(znode * node) -+{ -+ reiser4_key ldkey; -+ -+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); -+ if (node_is_empty(node)) -+ return; -+ -+ znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey)); -+} -+ -+/* this is to be called after calling of shift node's method to shift data from @right to -+ @left.
It sets left delimiting keys of @left and @right to keys of first items of @left -+ and @right correspondingly and sets right delimiting key of @left to first key of @right */ /* Details: update_ldkey() skips a node that is empty, and when @right is -+ empty after the shift the right delimiting key of @left is taken from the -+ right delimiting key of @right instead. Both nodes must be write-locked -+ and the tree's dk_lock held for writing (asserted). */ -+static void update_znode_dkeys(znode * left, znode * right) -+{ -+ assert_rw_write_locked(&(znode_get_tree(right)->dk_lock)); -+ assert("vs-1629", (znode_is_write_locked(left) && -+ znode_is_write_locked(right))); -+ -+ /* we need to update left delimiting of left if it was empty before shift */ -+ update_ldkey(left); -+ update_ldkey(right); -+ if (node_is_empty(right)) -+ znode_set_rd_key(left, znode_get_rd_key(right)); -+ else -+ znode_set_rd_key(left, znode_get_ld_key(right)); -+} -+ -+/* try to shift everything from @right to @left. If everything was shifted - -+ @right is removed from the tree. Result is the number of bytes shifted. -+ -+ The work is done by the ->shift() method of @right's node plugin, called -+ with a coord placed after the last item of @right and direction -+ SHIFT_LEFT. @todo is passed to the plugin through carry_plugin_info; -+ presumably it collects the carry operations the shift requires (the -+ callers run reiser4_carry() on it afterwards). The return value is -+ whatever ->shift() returns. */ -+static int -+shift_everything_left(znode * right, znode * left, carry_level * todo) -+{ -+ coord_t from; -+ node_plugin *nplug; -+ carry_plugin_info info; -+ -+ coord_init_after_last_item(&from, right); -+ -+ nplug = node_plugin_by_node(right); -+ info.doing = NULL; -+ info.todo = todo; -+ return nplug->shift(&from, left, SHIFT_LEFT, -+ 1 /* delete @right if it becomes empty */ , -+ 1 -+ /* move coord @from to node @left if everything will be shifted */ -+ , -+ &info); -+} -+ -+/* Shift as much as possible from @right to @left using the memcpy-optimized -+ shift_everything_left. @left and @right are formatted neighboring nodes on -+ leaf level.
*/ /* NOTE(review): despite the "leaf level" wording above, -+ squeeze_right_neighbor() calls this function for every level other than -+ TWIG_LEVEL. Returns SQUEEZE_TARGET_FULL right away unless both nodes are -+ JNODE_DIRTY. If something was shifted the result of reiser4_carry() is -+ returned; otherwise SQUEEZE_SOURCE_EMPTY or SQUEEZE_TARGET_FULL depending -+ on whether @right is empty. A failed carry pool allocation is returned as -+ PTR_ERR(). */ -+static int squeeze_right_non_twig(znode * left, znode * right) -+{ -+ int ret; -+ carry_pool *pool; -+ carry_level *todo; -+ -+ assert("nikita-2246", znode_get_level(left) == znode_get_level(right)); -+ -+ if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) || -+ !JF_ISSET(ZJNODE(right), JNODE_DIRTY)) -+ return SQUEEZE_TARGET_FULL; -+ -+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ todo = (carry_level *) (pool + 1); /* the carry levels follow the pool header in the same allocation */ -+ init_carry_level(todo, pool); -+ -+ ret = shift_everything_left(right, left, todo); -+ if (ret > 0) { -+ /* something was shifted */ -+ reiser4_tree *tree; -+ __u64 grabbed; -+ -+ znode_make_dirty(left); -+ znode_make_dirty(right); -+ -+ /* update delimiting keys of nodes which participated in -+ shift. FIXME: it would be better to have this in shift -+ node's operation. But it can not be done there. Nobody -+ remembers why, though */ -+ tree = znode_get_tree(left); -+ write_lock_dk(tree); -+ update_znode_dkeys(left, right); -+ write_unlock_dk(tree); -+ -+ /* Carry is called to update delimiting key and, maybe, to remove empty -+ node. */ -+ grabbed = get_current_context()->grabbed_blocks; -+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED); -+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */ /* NOTE(review): this result is only asserted; it is overwritten by the carry result below */ -+ ret = reiser4_carry(todo, NULL /* previous level */ ); -+ grabbed2free_mark(grabbed); -+ } else { -+ /* Shifting impossible, we return appropriate result code. -+ NOTE(review): a negative result of shift_everything_left() -+ also ends up here and is replaced by a squeeze code -- -+ confirm that the shift cannot fail with an error. */ -+ ret = -+ node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY : -+ SQUEEZE_TARGET_FULL; -+ } -+ -+ done_carry_pool(pool); -+ -+ return ret; -+} -+/* Debug-only check, done under the tree read lock, that @left and @right -+ point at each other through their ->right and ->left sibling pointers. */ -+#if REISER4_DEBUG -+static int sibling_link_is_ok(const znode *left, const znode *right) -+{ -+ int result; -+ -+ read_lock_tree(znode_get_tree(left)); -+ result = (left->right == right && left == right->left); -+ read_unlock_tree(znode_get_tree(left)); -+ return result; -+} -+#endif -+ -+/* Shift first unit of first item if it is an internal one.
Return -+ SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return -+ SUBTREE_MOVED. */ /* A failed carry pool allocation, a negative ->shift() result and a nonzero -+ reiser4_carry() result are returned to the caller as they are. Both nodes -+ must be write-locked siblings on the same level (asserted). */ -+static int shift_one_internal_unit(znode * left, znode * right) -+{ -+ int ret; -+ carry_pool *pool; -+ carry_level *todo; -+ coord_t *coord; -+ carry_plugin_info *info; -+ int size, moved; -+ -+ assert("nikita-2247", znode_get_level(left) == znode_get_level(right)); -+ assert("nikita-2435", znode_is_write_locked(left)); -+ assert("nikita-2436", znode_is_write_locked(right)); -+ assert("nikita-2434", sibling_link_is_ok(left, right)); -+ -+ /* one allocation holds the pool header, three carry levels, the coord and the plugin info (plus debug-only scratch space) */ -+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) + -+ sizeof(*coord) + sizeof(*info) -+#if REISER4_DEBUG -+ + sizeof(*coord) + 2 * sizeof(reiser4_key) -+#endif -+ ); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ todo = (carry_level *) (pool + 1); -+ init_carry_level(todo, pool); -+ -+ coord = (coord_t *) (todo + 3); /* placed right after the three carry levels */ -+ coord_init_first_unit(coord, right); -+ info = (carry_plugin_info *) (coord + 1); -+ -+#if REISER4_DEBUG -+ if (!node_is_empty(left)) { -+ coord_t *last; -+ reiser4_key *right_key; -+ reiser4_key *left_key; -+ -+ last = (coord_t *) (info + 1); -+ right_key = (reiser4_key *) (last + 1); -+ left_key = right_key + 1; -+ coord_init_last_unit(last, left); -+ -+ /* the last key of @left must not exceed the first key of @right */ -+ assert("nikita-2463", -+ keyle(item_key_by_coord(last, left_key), -+ item_key_by_coord(coord, right_key))); -+ } -+#endif -+ -+ assert("jmacd-2007", item_is_internal(coord)); -+ -+ size = item_length_by_coord(coord); -+ info->todo = todo; -+ info->doing = NULL; -+ -+ ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT, -+ 1 -+ /* delete @right if it becomes empty */ -+ , -+ 0 -+ /* do not move coord @coord to node @left */ -+ , -+ info); -+ -+ /* If shift returns positive, then we shifted the item.
*/ -+ assert("vs-423", ret <= 0 || size == ret); -+ moved = (ret > 0); -+ -+ if (moved) { -+ /* something was moved */ -+ reiser4_tree *tree; -+ int grabbed; /* NOTE(review): squeeze_right_non_twig() keeps the same grabbed_blocks value in a __u64 -- confirm int is wide enough here */ -+ -+ znode_make_dirty(left); -+ znode_make_dirty(right); -+ tree = znode_get_tree(left); -+ write_lock_dk(tree); -+ update_znode_dkeys(left, right); -+ write_unlock_dk(tree); -+ -+ /* reserve space for delimiting keys after shifting */ -+ grabbed = get_current_context()->grabbed_blocks; -+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED); -+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */ -+ -+ ret = reiser4_carry(todo, NULL /* previous level */ ); -+ grabbed2free_mark(grabbed); -+ } -+ -+ done_carry_pool(pool); -+ -+ if (ret != 0) { -+ /* Shift or carry operation failed. */ -+ assert("jmacd-7325", ret < 0); -+ return ret; -+ } -+ -+ return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL; -+} -+ -+/* Make the final relocate/wander decision during forward parent-first squalloc for a -+ znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). -+ -+ @node must be write-locked and not yet flushprepped; @parent_coord is -+ either invalid or points into a write-locked parent (all asserted). -+ On success pos->preceder.blk is set to the block number of @node, -+ pos->alloc_cnt is incremented and 0 is returned. A nonzero result of -+ allocate_znode_update() is returned as is, except that -ENOSPC from the -+ "closer block" attempt is tolerated. */ -+static int -+allocate_znode_loaded(znode * node, -+ const coord_t * parent_coord, flush_pos_t * pos) -+{ -+ int ret; -+ reiser4_super_info_data *sbinfo = get_current_super_private(); -+ /* FIXME(D): We have the node write-locked and should have checked for ! -+ allocated() somewhere before reaching this point, but there can be a race, so -+ this assertion is bogus. */ -+ assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node))); -+ assert("jmacd-7988", znode_is_write_locked(node)); -+ assert("jmacd-7989", coord_is_invalid(parent_coord) -+ || znode_is_write_locked(parent_coord->node)); -+ -+ if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) || -+ znode_is_root(node) || -+ /* We have enough nodes to relocate no matter what. */ -+ (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) { -+ /* No need to decide with new nodes, they are treated the same as -+ relocate.
If the root node is dirty, relocate. */ -+ if (pos->preceder.blk == 0) { -+ /* preceder is unknown and we have decided to relocate node -- -+ using of default value for search start is better than search -+ from block #0. */ -+ get_blocknr_hint_default(&pos->preceder.blk); -+ check_preceder(pos->preceder.blk); -+ } -+ -+ goto best_reloc; /* the label sits inside the innermost else branch below */ -+ -+ } else if (pos->preceder.blk == 0) { -+ /* If we don't know the preceder, leave it where it is. */ -+ jnode_make_wander(ZJNODE(node)); -+ } else { -+ /* Make a decision based on block distance. */ -+ reiser4_block_nr dist; -+ reiser4_block_nr nblk = *znode_get_block(node); -+ -+ assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk)); -+ assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk)); -+ assert("jmacd-6174", pos->preceder.blk != 0); -+ -+ if (pos->preceder.blk == nblk - 1) { -+ /* Ideal. */ -+ jnode_make_wander(ZJNODE(node)); -+ } else { -+ /* absolute distance between the current block and the preceder */ -+ dist = -+ (nblk < -+ pos->preceder.blk) ? (pos->preceder.blk - -+ nblk) : (nblk - -+ pos->preceder.blk); -+ -+ /* See if we can find a closer block (forward direction only). */ -+ pos->preceder.max_dist = -+ min((reiser4_block_nr) sbinfo->flush. -+ relocate_distance, dist); -+ pos->preceder.level = znode_get_level(node); -+ -+ ret = allocate_znode_update(node, parent_coord, pos); -+ -+ pos->preceder.max_dist = 0; -+ -+ if (ret && (ret != -ENOSPC)) -+ return ret; -+ -+ if (ret == 0) { -+ /* Got a better allocation. */ -+ znode_make_reloc(node, pos->fq); -+ } else if (dist < sbinfo->flush.relocate_distance) { -+ /* The present allocation is good enough. */ -+ jnode_make_wander(ZJNODE(node)); -+ } else { -+ /* Otherwise, try to relocate to the best position. */ -+ best_reloc: -+ ret = -+ allocate_znode_update(node, parent_coord, -+ pos); -+ if (ret != 0) -+ return ret; -+ -+ /* set JNODE_RELOC bit _after_ node gets allocated */ -+ znode_make_reloc(node, pos->fq); -+ } -+ } -+ } -+ -+ /* This is the new preceder.
*/ -+ pos->preceder.blk = *znode_get_block(node); -+ check_preceder(pos->preceder.blk); -+ pos->alloc_cnt += 1; -+ -+ assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk)); -+ -+ return 0; -+} -+/* Allocation entry point used by the flush position handlers: runs -+ allocate_znode_loaded() under WITH_DATA(@node, ...) and returns its result. */ -+static int -+allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos) -+{ -+ /* -+ * perform znode allocation with znode pinned in memory to avoid races -+ * with asynchronous emergency flush (which plays with -+ * JNODE_FLUSH_RESERVED bit). -+ */ -+ return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos)); -+} -+ -+/* A subroutine of allocate_znode, this is called first to see if there is a close -+ position to relocate to. It may return ENOSPC if there is no close position. If there -+ is no close position it may not relocate. This takes care of updating the parent node -+ with the relocated block address. */ -+static int -+allocate_znode_update(znode * node, const coord_t * parent_coord, -+ flush_pos_t * pos) -+{ -+ int ret; -+ reiser4_block_nr blk; -+ lock_handle uber_lock; -+ int flush_reserved_used = 0; -+ int grabbed; -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ init_lh(&uber_lock); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ grabbed = ctx->grabbed_blocks; -+ -+ /* discard e-flush allocation */ -+ ret = zload(node); -+ if (ret) -+ return ret; -+ -+ if (ZF_ISSET(node, JNODE_CREATED)) { -+ assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node))); -+ pos->preceder.block_stage = BLOCK_UNALLOCATED; -+ } else { -+ pos->preceder.block_stage = BLOCK_GRABBED; -+ -+ /* The disk space for relocating the @node is already reserved in "flush reserved" -+ * counter if @node is leaf, otherwise we grab space using BA_RESERVED (means grab -+ * space from whole disk not from only 95%).
*/ -+ if (znode_get_level(node) == LEAF_LEVEL) { -+ /* -+ * earlier (during do_jnode_make_dirty()) we decided -+ * that @node can possibly go into overwrite set and -+ * reserved block for its wandering location. -+ */ -+ txn_atom *atom = get_current_atom_locked(); -+ assert("nikita-3449", -+ ZF_ISSET(node, JNODE_FLUSH_RESERVED)); -+ flush_reserved2grabbed(atom, (__u64) 1); -+ spin_unlock_atom(atom); -+ /* -+ * we are trying to move node into relocate -+ * set. Allocation of relocated position "uses" -+ * reserved block. -+ */ -+ ZF_CLR(node, JNODE_FLUSH_RESERVED); -+ flush_reserved_used = 1; -+ } else { -+ ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED); -+ if (ret != 0) -+ goto exit; -+ } -+ } -+ -+ /* We may do not use 5% of reserved disk space here and flush will not pack tightly. */ -+ ret = reiser4_alloc_block(&pos->preceder, &blk, -+ BA_FORMATTED | BA_PERMANENT); -+ if (ret) -+ goto exit; -+ -+ if (!ZF_ISSET(node, JNODE_CREATED) && -+ (ret = -+ reiser4_dealloc_block(znode_get_block(node), 0, -+ BA_DEFER | BA_FORMATTED))) -+ goto exit; -+ -+ if (likely(!znode_is_root(node))) { -+ item_plugin *iplug; -+ -+ iplug = item_plugin_by_coord(parent_coord); -+ assert("nikita-2954", iplug->f.update != NULL); -+ iplug->f.update(parent_coord, &blk); -+ -+ znode_make_dirty(parent_coord->node); -+ -+ } else { -+ reiser4_tree *tree = znode_get_tree(node); -+ znode *uber; -+ -+ /* We take a longterm lock on the fake node in order to change -+ the root block number. This may cause atom fusion. */ -+ ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI, -+ &uber_lock); -+ /* The fake node cannot be deleted, and we must have priority -+ here, and may not be confused with ENOSPC. 
*/ -+ assert("jmacd-74412", -+ ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC); -+ -+ if (ret) -+ goto exit; -+ -+ uber = uber_lock.node; -+ -+ write_lock_tree(tree); -+ tree->root_block = blk; -+ write_unlock_tree(tree); -+ -+ znode_make_dirty(uber); -+ } -+ -+ ret = znode_rehash(node, &blk); -+ exit: -+ if (ret) { -+ /* Get flush reserved block back if something fails, because -+ * callers assume that on error block wasn't relocated and its -+ * flush reserved block wasn't used. */ -+ if (flush_reserved_used) { -+ /* -+ * ok, we failed to move node into relocate -+ * set. Restore status quo. -+ */ -+ grabbed2flush_reserved((__u64) 1); -+ ZF_SET(node, JNODE_FLUSH_RESERVED); -+ } -+ } -+ zrelse(node); -+ done_lh(&uber_lock); -+ grabbed2free_mark(grabbed); -+ return ret; -+} -+ -+/* JNODE INTERFACE */ -+ -+/* Lock a node (if formatted) and then get its parent locked, set the child's -+ coordinate in the parent. If the child is the root node, the above_root -+ znode is returned but the coord is not set. This function may cause atom -+ fusion, but it is only used for read locks (at this point) and therefore -+ fusion only occurs when the parent is already dirty. */ -+/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent -+ pointer in jnodes. */ -+static int -+jnode_lock_parent_coord(jnode * node, -+ coord_t * coord, -+ lock_handle * parent_lh, -+ load_count * parent_zh, -+ znode_lock_mode parent_mode, int try) -+{ -+ int ret; -+ -+ assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node)); -+ assert("edward-54", jnode_is_unformatted(node) -+ || znode_is_any_locked(JZNODE(node))); -+ -+ if (!jnode_is_znode(node)) { -+ reiser4_key key; -+ tree_level stop_level = TWIG_LEVEL; -+ lookup_bias bias = FIND_EXACT; -+ -+ assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP)); -+ -+ /* The case when node is not znode, but can have parent coord -+ (unformatted node, node which represents cluster page, -+ etc..). 
Generate a key for the appropriate entry, search -+ in the tree using coord_by_key, which handles locking for -+ us. */ -+ -+ /* -+ * nothing is locked at this moment, so, nothing prevents -+ * concurrent truncate from removing jnode from inode. To -+ * prevent this spin-lock jnode. jnode can be truncated just -+ * after call to the jnode_build_key(), but this is ok, -+ * because coord_by_key() will just fail to find appropriate -+ * extent. -+ */ -+ spin_lock_jnode(node); -+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) { -+ jnode_build_key(node, &key); -+ ret = 0; -+ } else -+ ret = RETERR(-ENOENT); -+ spin_unlock_jnode(node); -+ -+ if (ret != 0) -+ return ret; -+ -+ if (jnode_is_cluster_page(node)) -+ stop_level = LEAF_LEVEL; -+ -+ assert("jmacd-1812", coord != NULL); -+ -+ ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh, -+ parent_mode, bias, stop_level, stop_level, -+ CBK_UNIQUE, NULL /*ra_info */ ); -+ switch (ret) { -+ case CBK_COORD_NOTFOUND: -+ assert("edward-1038", -+ ergo(jnode_is_cluster_page(node), -+ JF_ISSET(node, JNODE_HEARD_BANSHEE))); -+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) -+ warning("nikita-3177", "Parent not found"); -+ return ret; -+ case CBK_COORD_FOUND: -+ if (coord->between != AT_UNIT) { -+ /* FIXME: comment needed */ -+ done_lh(parent_lh); -+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) { -+ warning("nikita-3178", -+ "Found but not happy: %i", -+ coord->between); -+ } -+ return RETERR(-ENOENT); -+ } -+ ret = incr_load_count_znode(parent_zh, parent_lh->node); -+ if (ret != 0) -+ return ret; -+ /* if (jnode_is_cluster_page(node)) { -+ races with write() are possible -+ check_child_cluster (parent_lh->node); -+ } -+ */ -+ break; -+ default: -+ return ret; -+ } -+ -+ } else { -+ int flags; -+ znode *z; -+ -+ z = JZNODE(node); -+ /* Formatted node case: */ -+ assert("jmacd-2061", !znode_is_root(z)); -+ -+ flags = GN_ALLOW_NOT_CONNECTED; -+ if (try) -+ flags |= GN_TRY_LOCK; -+ -+ ret = -+ reiser4_get_parent_flags(parent_lh, z, 
parent_mode, flags); -+ if (ret != 0) -+ /* -E_REPEAT is ok here, it is handled by the caller. */ -+ return ret; -+ -+ /* Make the child's position "hint" up-to-date. (Unless above -+ root, which caller must check.) */ -+ if (coord != NULL) { -+ -+ ret = incr_load_count_znode(parent_zh, parent_lh->node); -+ if (ret != 0) { -+ warning("jmacd-976812386", -+ "incr_load_count_znode failed: %d", -+ ret); -+ return ret; -+ } -+ -+ ret = find_child_ptr(parent_lh->node, z, coord); -+ if (ret != 0) { -+ warning("jmacd-976812", -+ "find_child_ptr failed: %d", ret); -+ return ret; -+ } -+ } -+ } -+ -+ return 0; -+} -+ -+/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom. -+ If there is no next neighbor or the neighbor is not in memory or if there is a -+ neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned. -+ In some cases the slum may include nodes which are not dirty, if so @check_dirty should be 0 */ -+static int neighbor_in_slum(znode * node, /* starting point */ -+ lock_handle * lock, /* lock on starting point */ -+ sideof side, /* left or right direction we seek the next node in */ -+ znode_lock_mode mode, /* kind of lock we want */ -+ int check_dirty) -+{ /* true if the neighbor should be dirty */ -+ int ret; -+ -+ assert("jmacd-6334", znode_is_connected(node)); -+ -+ ret = -+ reiser4_get_neighbor(lock, node, mode, -+ GN_SAME_ATOM | (side == -+ LEFT_SIDE ? GN_GO_LEFT : 0)); -+ -+ if (ret) { -+ /* May return -ENOENT or -E_NO_NEIGHBOR. */ -+ /* FIXME(C): check EINVAL, E_DEADLOCK */ -+ if (ret == -ENOENT) { -+ ret = RETERR(-E_NO_NEIGHBOR); -+ } -+ -+ return ret; -+ } -+ if (!check_dirty) -+ return 0; -+ /* Check dirty bit of locked znode, no races here */ -+ if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY)) -+ return 0; -+ -+ done_lh(lock); -+ return RETERR(-E_NO_NEIGHBOR); -+} -+ -+/* Return true if two znodes have the same parent. 
This is called with both nodes -+ write-locked (for squeezing) so no tree lock is needed. */ -+static int znode_same_parents(znode * a, znode * b) -+{ -+ int result; -+ -+ assert("jmacd-7011", znode_is_write_locked(a)); -+ assert("jmacd-7012", znode_is_write_locked(b)); -+ -+ /* We lock the whole tree for this check.... I really don't like whole tree -+ * locks... -Hans */ -+ read_lock_tree(znode_get_tree(a)); -+ result = (znode_parent(a) == znode_parent(b)); -+ read_unlock_tree(znode_get_tree(a)); -+ return result; -+} -+ -+/* FLUSH SCAN */ -+ -+/* Initialize the flush_scan data structure. */ -+static void scan_init(flush_scan * scan) -+{ -+ memset(scan, 0, sizeof(*scan)); -+ init_lh(&scan->node_lock); -+ init_lh(&scan->parent_lock); -+ init_load_count(&scan->parent_load); -+ init_load_count(&scan->node_load); -+ coord_init_invalid(&scan->parent_coord, NULL); -+} -+ -+/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */ -+static void scan_done(flush_scan * scan) -+{ -+ done_load_count(&scan->node_load); -+ if (scan->node != NULL) { -+ jput(scan->node); -+ scan->node = NULL; -+ } -+ done_load_count(&scan->parent_load); -+ done_lh(&scan->parent_lock); -+ done_lh(&scan->node_lock); -+} -+ -+/* Returns true if flush scanning is finished. */ -+int reiser4_scan_finished(flush_scan * scan) -+{ -+ return scan->stop || (scan->direction == RIGHT_SIDE && -+ scan->count >= scan->max_count); -+} -+ -+/* Return true if the scan should continue to the @tonode. True if the node meets the -+ same_slum_check condition. If not, deref the "left" node and stop the scan. 
*/ -+int reiser4_scan_goto(flush_scan * scan, jnode * tonode) -+{ -+ int go = same_slum_check(scan->node, tonode, 1, 0); -+ -+ if (!go) { -+ scan->stop = 1; -+ jput(tonode); -+ } -+ -+ return go; -+} -+ -+/* Set the current scan->node, refcount it, increment count by the @add_count (number to -+ count, e.g., skipped unallocated nodes), deref previous current, and copy the current -+ parent coordinate. */ -+int -+scan_set_current(flush_scan * scan, jnode * node, unsigned add_count, -+ const coord_t * parent) -+{ -+ /* Release the old references, take the new reference. */ -+ done_load_count(&scan->node_load); -+ -+ if (scan->node != NULL) { -+ jput(scan->node); -+ } -+ scan->node = node; -+ scan->count += add_count; -+ -+ /* This next stmt is somewhat inefficient. The reiser4_scan_extent() code could -+ delay this update step until it finishes and update the parent_coord only once. -+ It did that before, but there was a bug and this was the easiest way to make it -+ correct. */ -+ if (parent != NULL) { -+ coord_dup(&scan->parent_coord, parent); -+ } -+ -+ /* Failure may happen at the incr_load_count call, but the caller can assume the reference -+ is safely taken. */ -+ return incr_load_count_jnode(&scan->node_load, node); -+} -+ -+/* Return true if scanning in the leftward direction. */ -+int reiser4_scanning_left(flush_scan * scan) -+{ -+ return scan->direction == LEFT_SIDE; -+} -+ -+/* Performs leftward scanning starting from either kind of node. Counts the starting -+ node. The right-scan object is passed in for the left-scan in order to copy the parent -+ of an unformatted starting position. This way we avoid searching for the unformatted -+ node's parent when scanning in each direction. If we search for the parent once it is -+ set in both scan objects. The limit parameter tells flush-scan when to stop. -+ -+ Rapid scanning is used only during scan_left, where we are interested in finding the -+ 'leftpoint' where we begin flushing. 
We are interested in stopping at the left child -+ of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The -+ problem is finding a way to flush only those nodes without unallocated children, and it -+ is difficult to solve in the bottom-up flushing algorithm we are currently using. The -+ problem can be solved by scanning left at every level as we go upward, but this would -+ basically bring us back to using a top-down allocation strategy, which we already tried -+ (see BK history from May 2002), and has a different set of problems. The top-down -+ strategy makes avoiding unallocated children easier, but makes it difficult to -+ propertly flush dirty children with clean parents that would otherwise stop the -+ top-down flush, only later to dirty the parent once the children are flushed. So we -+ solve the problem in the bottom-up algorithm with a special case for twigs and leaves -+ only. -+ -+ The first step in solving the problem is this rapid leftward scan. After we determine -+ that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we -+ are no longer interested in the exact count, we are only interested in finding a the -+ best place to start the flush. We could choose one of two possibilities: -+ -+ 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor. -+ This requires checking one leaf per rapid-scan twig -+ -+ 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig -+ to the left. This requires checking possibly all of the in-memory children of each -+ twig during the rapid scan. -+ -+ For now we implement the first policy. 
-+*/ -+static int -+scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit) -+{ -+ int ret = 0; -+ -+ scan->max_count = limit; -+ scan->direction = LEFT_SIDE; -+ -+ ret = scan_set_current(scan, jref(node), 1, NULL); -+ if (ret != 0) { -+ return ret; -+ } -+ -+ ret = scan_common(scan, right); -+ if (ret != 0) { -+ return ret; -+ } -+ -+ /* Before rapid scanning, we need a lock on scan->node so that we can get its -+ parent, only if formatted. */ -+ if (jnode_is_znode(scan->node)) { -+ ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node), -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); -+ } -+ -+ /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */ -+ return ret; -+} -+ -+/* Performs rightward scanning... Does not count the starting node. The limit parameter -+ is described in scan_left. If the starting node is unformatted then the -+ parent_coord was already set during scan_left. The rapid_after parameter is not used -+ during right-scanning. -+ -+ scan_right is only called if the scan_left operation does not count at least -+ FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter is set to -+ the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning -+ scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */ -+static int scan_right(flush_scan * scan, jnode * node, unsigned limit) -+{ -+ int ret; -+ -+ scan->max_count = limit; -+ scan->direction = RIGHT_SIDE; -+ -+ ret = scan_set_current(scan, jref(node), 0, NULL); -+ if (ret != 0) { -+ return ret; -+ } -+ -+ return scan_common(scan, NULL); -+} -+ -+/* Common code to perform left or right scanning. */ -+static int scan_common(flush_scan * scan, flush_scan * other) -+{ -+ int ret; -+ -+ assert("nikita-2376", scan->node != NULL); -+ assert("edward-54", jnode_is_unformatted(scan->node) -+ || jnode_is_znode(scan->node)); -+ -+ /* Special case for starting at an unformatted node. 
Optimization: we only want -+ to search for the parent (which requires a tree traversal) once. Obviously, we -+ shouldn't have to call it once for the left scan and once for the right scan. -+ For this reason, if we search for the parent during scan-left we then duplicate -+ the coord/lock/load into the scan-right object. */ -+ if (jnode_is_unformatted(scan->node)) { -+ ret = scan_unformatted(scan, other); -+ if (ret != 0) -+ return ret; -+ } -+ /* This loop expects to start at a formatted position and performs chaining of -+ formatted regions */ -+ while (!reiser4_scan_finished(scan)) { -+ -+ ret = scan_formatted(scan); -+ if (ret != 0) { -+ return ret; -+ } -+ } -+ -+ return 0; -+} -+ -+static int scan_unformatted(flush_scan * scan, flush_scan * other) -+{ -+ int ret = 0; -+ int try = 0; -+ -+ if (!coord_is_invalid(&scan->parent_coord)) -+ goto scan; -+ -+ /* set parent coord from */ -+ if (!jnode_is_unformatted(scan->node)) { -+ /* formatted position */ -+ -+ lock_handle lock; -+ assert("edward-301", jnode_is_znode(scan->node)); -+ init_lh(&lock); -+ -+ /* -+ * when flush starts from unformatted node, first thing it -+ * does is tree traversal to find formatted parent of starting -+ * node. This parent is then kept lock across scans to the -+ * left and to the right. This means that during scan to the -+ * left we cannot take left-ward lock, because this is -+ * dead-lock prone. So, if we are scanning to the left and -+ * there is already lock held by this thread, -+ * jnode_lock_parent_coord() should use try-lock. -+ */ -+ try = reiser4_scanning_left(scan) -+ && !lock_stack_isclean(get_current_lock_stack()); -+ /* Need the node locked to get the parent lock, We have to -+ take write lock since there is at least one call path -+ where this znode is already write-locked by us. */ -+ ret = -+ longterm_lock_znode(&lock, JZNODE(scan->node), -+ ZNODE_WRITE_LOCK, -+ reiser4_scanning_left(scan) ? 
-+ ZNODE_LOCK_LOPRI : -+ ZNODE_LOCK_HIPRI); -+ if (ret != 0) -+ /* EINVAL or E_DEADLOCK here mean... try again! At this point we've -+ scanned too far and can't back out, just start over. */ -+ return ret; -+ -+ ret = jnode_lock_parent_coord(scan->node, -+ &scan->parent_coord, -+ &scan->parent_lock, -+ &scan->parent_load, -+ ZNODE_WRITE_LOCK, try); -+ -+ /* FIXME(C): check EINVAL, E_DEADLOCK */ -+ done_lh(&lock); -+ if (ret == -E_REPEAT) { -+ scan->stop = 1; -+ return 0; -+ } -+ if (ret) -+ return ret; -+ -+ } else { -+ /* unformatted position */ -+ -+ ret = -+ jnode_lock_parent_coord(scan->node, &scan->parent_coord, -+ &scan->parent_lock, -+ &scan->parent_load, -+ ZNODE_WRITE_LOCK, try); -+ -+ if (IS_CBKERR(ret)) -+ return ret; -+ -+ if (ret == CBK_COORD_NOTFOUND) -+ /* FIXME(C): check EINVAL, E_DEADLOCK */ -+ return ret; -+ -+ /* parent was found */ -+ assert("jmacd-8661", other != NULL); -+ /* Duplicate the reference into the other flush_scan. */ -+ coord_dup(&other->parent_coord, &scan->parent_coord); -+ copy_lh(&other->parent_lock, &scan->parent_lock); -+ copy_load_count(&other->parent_load, &scan->parent_load); -+ } -+ scan: -+ return scan_by_coord(scan); -+} -+ -+/* Performs left- or rightward scanning starting from a formatted node. Follow left -+ pointers under tree lock as long as: -+ -+ - node->left/right is non-NULL -+ - node->left/right is connected, dirty -+ - node->left/right belongs to the same atom -+ - scan has not reached maximum count -+*/ -+static int scan_formatted(flush_scan * scan) -+{ -+ int ret; -+ znode *neighbor = NULL; -+ -+ assert("jmacd-1401", !reiser4_scan_finished(scan)); -+ -+ do { -+ znode *node = JZNODE(scan->node); -+ -+ /* Node should be connected, but if not stop the scan. */ -+ if (!znode_is_connected(node)) { -+ scan->stop = 1; -+ break; -+ } -+ -+ /* Lock the tree, check-for and reference the next sibling. 
*/ -+ read_lock_tree(znode_get_tree(node)); -+ -+ /* It may be that a node is inserted or removed between a node and its -+ left sibling while the tree lock is released, but the flush-scan count -+ does not need to be precise. Thus, we release the tree lock as soon as -+ we get the neighboring node. */ -+ neighbor = -+ reiser4_scanning_left(scan) ? node->left : node->right; -+ if (neighbor != NULL) { -+ zref(neighbor); -+ } -+ -+ read_unlock_tree(znode_get_tree(node)); -+ -+ /* If neighbor is NULL at the leaf level, need to check for an unformatted -+ sibling using the parent--break in any case. */ -+ if (neighbor == NULL) { -+ break; -+ } -+ -+ /* Check the condition for going left, break if it is not met. This also -+ releases (jputs) the neighbor if false. */ -+ if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) { -+ break; -+ } -+ -+ /* Advance the flush_scan state to the left, repeat. */ -+ ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL); -+ if (ret != 0) { -+ return ret; -+ } -+ -+ } while (!reiser4_scan_finished(scan)); -+ -+ /* If neighbor is NULL then we reached the end of a formatted region, or else the -+ sibling is out of memory, now check for an extent to the left (as long as -+ LEAF_LEVEL). */ -+ if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL -+ || reiser4_scan_finished(scan)) { -+ scan->stop = 1; -+ return 0; -+ } -+ /* Otherwise, calls scan_by_coord for the right(left)most item of the -+ left(right) neighbor on the parent level, then possibly continue. */ -+ -+ coord_init_invalid(&scan->parent_coord, NULL); -+ return scan_unformatted(scan, NULL); -+} -+ -+/* NOTE-EDWARD: -+ This scans adjacent items of the same type and calls scan flush plugin for each one. -+ Performs left(right)ward scanning starting from a (possibly) unformatted node. If we start -+ from unformatted node, then we continue only if the next neighbor is also unformatted. 
-+ When called from scan_formatted, we skip first iteration (to make sure that -+ right(left)most item of the left(right) neighbor on the parent level is of the same -+ type and set appropriate coord). */ -+static int scan_by_coord(flush_scan * scan) -+{ -+ int ret = 0; -+ int scan_this_coord; -+ lock_handle next_lock; -+ load_count next_load; -+ coord_t next_coord; -+ jnode *child; -+ item_plugin *iplug; -+ -+ init_lh(&next_lock); -+ init_load_count(&next_load); -+ scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0); -+ -+ /* set initial item id */ -+ iplug = item_plugin_by_coord(&scan->parent_coord); -+ -+ for (; !reiser4_scan_finished(scan); scan_this_coord = 1) { -+ if (scan_this_coord) { -+ /* Here we expect that unit is scannable. it would not be so due -+ * to race with extent->tail conversion. */ -+ if (iplug->f.scan == NULL) { -+ scan->stop = 1; -+ ret = -E_REPEAT; -+ /* skip the check at the end. */ -+ goto race; -+ } -+ -+ ret = iplug->f.scan(scan); -+ if (ret != 0) -+ goto exit; -+ -+ if (reiser4_scan_finished(scan)) { -+ checkchild(scan); -+ break; -+ } -+ } else { -+ /* the same race against truncate as above is possible -+ * here, it seems */ -+ -+ /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan -+ the first coordinate. */ -+ assert("jmacd-1231", -+ item_is_internal(&scan->parent_coord)); -+ } -+ -+ if (iplug->f.utmost_child == NULL -+ || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) { -+ /* stop this coord and continue on parrent level */ -+ ret = -+ scan_set_current(scan, -+ ZJNODE(zref -+ (scan->parent_coord.node)), -+ 1, NULL); -+ if (ret != 0) -+ goto exit; -+ break; -+ } -+ -+ /* Either way, the invariant is that scan->parent_coord is set to the -+ parent of scan->node. Now get the next unit. */ -+ coord_dup(&next_coord, &scan->parent_coord); -+ coord_sideof_unit(&next_coord, scan->direction); -+ -+ /* If off-the-end of the twig, try the next twig. 
*/ -+ if (coord_is_after_sideof_unit(&next_coord, scan->direction)) { -+ /* We take the write lock because we may start flushing from this -+ * coordinate. */ -+ ret = -+ neighbor_in_slum(next_coord.node, &next_lock, -+ scan->direction, ZNODE_WRITE_LOCK, -+ 1 /* check dirty */ ); -+ if (ret == -E_NO_NEIGHBOR) { -+ scan->stop = 1; -+ ret = 0; -+ break; -+ } -+ -+ if (ret != 0) { -+ goto exit; -+ } -+ -+ ret = incr_load_count_znode(&next_load, next_lock.node); -+ if (ret != 0) { -+ goto exit; -+ } -+ -+ coord_init_sideof_unit(&next_coord, next_lock.node, -+ sideof_reverse(scan->direction)); -+ } -+ -+ iplug = item_plugin_by_coord(&next_coord); -+ -+ /* Get the next child. */ -+ ret = -+ iplug->f.utmost_child(&next_coord, -+ sideof_reverse(scan->direction), -+ &child); -+ if (ret != 0) -+ goto exit; -+ /* If the next child is not in memory, or, item_utmost_child -+ failed (due to race with unlink, most probably), stop -+ here. */ -+ if (child == NULL || IS_ERR(child)) { -+ scan->stop = 1; -+ checkchild(scan); -+ break; -+ } -+ -+ assert("nikita-2374", jnode_is_unformatted(child) -+ || jnode_is_znode(child)); -+ -+ /* See if it is dirty, part of the same atom. */ -+ if (!reiser4_scan_goto(scan, child)) { -+ checkchild(scan); -+ break; -+ } -+ -+ /* If so, make this child current. */ -+ ret = scan_set_current(scan, child, 1, &next_coord); -+ if (ret != 0) -+ goto exit; -+ -+ /* Now continue. If formatted we release the parent lock and return, then -+ proceed. */ -+ if (jnode_is_znode(child)) -+ break; -+ -+ /* Otherwise, repeat the above loop with next_coord. 
*/ -+ if (next_load.node != NULL) { -+ done_lh(&scan->parent_lock); -+ move_lh(&scan->parent_lock, &next_lock); -+ move_load_count(&scan->parent_load, &next_load); -+ } -+ } -+ -+ assert("jmacd-6233", -+ reiser4_scan_finished(scan) || jnode_is_znode(scan->node)); -+ exit: -+ checkchild(scan); -+ race: /* skip the above check */ -+ if (jnode_is_znode(scan->node)) { -+ done_lh(&scan->parent_lock); -+ done_load_count(&scan->parent_load); -+ } -+ -+ done_load_count(&next_load); -+ done_lh(&next_lock); -+ return ret; -+} -+ -+/* FLUSH POS HELPERS */ -+ -+/* Initialize the fields of a flush_position. */ -+static void pos_init(flush_pos_t * pos) -+{ -+ memset(pos, 0, sizeof *pos); -+ -+ pos->state = POS_INVALID; -+ coord_init_invalid(&pos->coord, NULL); -+ init_lh(&pos->lock); -+ init_load_count(&pos->load); -+ -+ reiser4_blocknr_hint_init(&pos->preceder); -+} -+ -+/* The flush loop inside squalloc periodically checks pos_valid to -+ determine when "enough flushing" has been performed. This will return true until one -+ of the following conditions is met: -+ -+ 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush" -+ parameter, meaning we have flushed as many blocks as the kernel requested. When -+ flushing to commit, this parameter is NULL. -+ -+ 2. pos_stop() is called because squalloc discovers that the "next" node in the -+ flush order is either non-existant, not dirty, or not in the same atom. -+*/ -+ -+static int pos_valid(flush_pos_t * pos) -+{ -+ return pos->state != POS_INVALID; -+} -+ -+/* Release any resources of a flush_position. Called when jnode_flush finishes. */ -+static void pos_done(flush_pos_t * pos) -+{ -+ pos_stop(pos); -+ reiser4_blocknr_hint_done(&pos->preceder); -+ if (convert_data(pos)) -+ free_convert_data(pos); -+} -+ -+/* Reset the point and parent. Called during flush subroutines to terminate the -+ squalloc loop. 
*/ -+static int pos_stop(flush_pos_t * pos) -+{ -+ pos->state = POS_INVALID; -+ done_lh(&pos->lock); -+ done_load_count(&pos->load); -+ coord_init_invalid(&pos->coord, NULL); -+ -+ if (pos->child) { -+ jput(pos->child); -+ pos->child = NULL; -+ } -+ -+ return 0; -+} -+ -+/* Return the flush_position's block allocator hint. */ -+reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos) -+{ -+ return &pos->preceder; -+} -+ -+flush_queue_t * reiser4_pos_fq(flush_pos_t * pos) -+{ -+ return pos->fq; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 90 -+ LocalWords: preceder -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/flush.h linux-2.6.20/fs/reiser4/flush.h ---- linux-2.6.20.orig/fs/reiser4/flush.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/flush.h 2007-05-06 14:50:43.718981974 +0400 -@@ -0,0 +1,274 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* DECLARATIONS: */ -+ -+#if !defined(__REISER4_FLUSH_H__) -+#define __REISER4_FLUSH_H__ -+ -+#include "plugin/cluster.h" -+ -+/* The flush_scan data structure maintains the state of an in-progress flush-scan on a -+ single level of the tree. A flush-scan is used for counting the number of adjacent -+ nodes to flush, which is used to determine whether we should relocate, and it is also -+ used to find a starting point for flush. A flush-scan object can scan in both right -+ and left directions via the scan_left() and scan_right() interfaces. The -+ right- and left-variations are similar but perform different functions. When scanning -+ left we (optionally perform rapid scanning and then) longterm-lock the endpoint node. -+ When scanning right we are simply counting the number of adjacent, dirty nodes. */ -+struct flush_scan { -+ -+ /* The current number of nodes scanned on this level. 
*/ -+ unsigned count; -+ -+ /* There may be a maximum number of nodes for a scan on any single level. When -+ going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */ -+ unsigned max_count; -+ -+ /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */ -+ sideof direction; -+ -+ /* Initially @stop is set to false then set true once some condition stops the -+ search (e.g., we found a clean node before reaching max_count or we found a -+ node belonging to another atom). */ -+ int stop; -+ -+ /* The current scan position. If @node is non-NULL then its reference count has -+ been incremented to reflect this reference. */ -+ jnode *node; -+ -+ /* A handle for zload/zrelse of current scan position node. */ -+ load_count node_load; -+ -+ /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the -+ node is locked using this lock handle. The endpoint needs to be locked for -+ transfer to the flush_position object after scanning finishes. */ -+ lock_handle node_lock; -+ -+ /* When the position is unformatted, its parent, coordinate, and parent -+ zload/zrelse handle. */ -+ lock_handle parent_lock; -+ coord_t parent_coord; -+ load_count parent_load; -+ -+ /* The block allocator preceder hint. Sometimes flush_scan determines what the -+ preceder is and if so it sets it here, after which it is copied into the -+ flush_position. Otherwise, the preceder is computed later. 
*/ -+ reiser4_block_nr preceder_blk; -+}; -+ -+typedef struct convert_item_info { -+ dc_item_stat d_cur; /* disk cluster state of the current item */ -+ dc_item_stat d_next; /* disk cluster state of the next slum item */ -+ struct inode *inode; -+ flow_t flow; -+} convert_item_info_t; -+ -+typedef struct convert_info { -+ int count; /* for squalloc terminating */ -+ reiser4_cluster_t clust; /* transform cluster */ -+ item_plugin *iplug; /* current item plugin */ -+ convert_item_info_t *itm; /* current item info */ -+} convert_info_t; -+ -+typedef enum flush_position_state { -+ POS_INVALID, /* Invalid or stopped pos, do not continue slum -+ * processing */ -+ POS_ON_LEAF, /* pos points to already prepped, locked formatted node at -+ * leaf level */ -+ POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used -+ * to traverse unformatted nodes */ -+ POS_TO_LEAF, /* pos is being moved to leaf level */ -+ POS_TO_TWIG, /* pos is being moved to twig level */ -+ POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is after -+ * rightmost unit of the current twig */ -+ POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */ -+} flushpos_state_t; -+ -+/* An encapsulation of the current flush point and all the parameters that are passed -+ through the entire squeeze-and-allocate stage of the flush routine. A single -+ flush_position object is constructed after left- and right-scanning finishes. */ -+struct flush_position { -+ flushpos_state_t state; -+ -+ coord_t coord; /* coord to traverse unformatted nodes */ -+ lock_handle lock; /* current lock we hold */ -+ load_count load; /* load status for current locked formatted node */ -+ -+ jnode *child; /* for passing a reference to unformatted child -+ * across pos state changes */ -+ -+ reiser4_blocknr_hint preceder; /* The flush 'hint' state. */ -+ int leaf_relocate; /* True if enough leaf-level nodes were -+ * found to suggest a relocate policy. 
*/ -+ int alloc_cnt; /* The number of nodes allocated during squeeze and allococate. */ -+ int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */ -+ flush_queue_t *fq; -+ long *nr_written; /* number of nodes submitted to disk */ -+ int flags; /* a copy of jnode_flush flags argument */ -+ -+ znode *prev_twig; /* previous parent pointer value, used to catch -+ * processing of new twig node */ -+ convert_info_t *sq; /* convert info */ -+ -+ unsigned long pos_in_unit; /* for extents only. Position -+ within an extent unit of first -+ jnode of slum */ -+ long nr_to_write; /* number of unformatted nodes to handle on flush */ -+}; -+ -+static inline int item_convert_count(flush_pos_t * pos) -+{ -+ return pos->sq->count; -+} -+static inline void inc_item_convert_count(flush_pos_t * pos) -+{ -+ pos->sq->count++; -+} -+static inline void set_item_convert_count(flush_pos_t * pos, int count) -+{ -+ pos->sq->count = count; -+} -+static inline item_plugin *item_convert_plug(flush_pos_t * pos) -+{ -+ return pos->sq->iplug; -+} -+ -+static inline convert_info_t *convert_data(flush_pos_t * pos) -+{ -+ return pos->sq; -+} -+ -+static inline convert_item_info_t *item_convert_data(flush_pos_t * pos) -+{ -+ assert("edward-955", convert_data(pos)); -+ return pos->sq->itm; -+} -+ -+static inline tfm_cluster_t *tfm_cluster_sq(flush_pos_t * pos) -+{ -+ return &pos->sq->clust.tc; -+} -+ -+static inline tfm_stream_t *tfm_stream_sq(flush_pos_t * pos, tfm_stream_id id) -+{ -+ assert("edward-854", pos->sq != NULL); -+ return tfm_stream(tfm_cluster_sq(pos), id); -+} -+ -+static inline int chaining_data_present(flush_pos_t * pos) -+{ -+ return convert_data(pos) && item_convert_data(pos); -+} -+ -+/* Returns true if next node contains next item of the disk cluster -+ so item convert data should be moved to the right slum neighbor. 
-+*/ -+static inline int should_chain_next_node(flush_pos_t * pos) -+{ -+ int result = 0; -+ -+ assert("edward-1007", chaining_data_present(pos)); -+ -+ switch (item_convert_data(pos)->d_next) { -+ case DC_CHAINED_ITEM: -+ result = 1; -+ break; -+ case DC_AFTER_CLUSTER: -+ break; -+ default: -+ impossible("edward-1009", "bad state of next slum item"); -+ } -+ return result; -+} -+ -+/* update item state in a disk cluster to assign conversion mode */ -+static inline void -+move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ ) -+{ -+ -+ assert("edward-1010", chaining_data_present(pos)); -+ -+ if (this_node == 0) { -+ /* next item is on the right neighbor */ -+ assert("edward-1011", -+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM || -+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM); -+ assert("edward-1012", -+ item_convert_data(pos)->d_next == DC_CHAINED_ITEM); -+ -+ item_convert_data(pos)->d_cur = DC_CHAINED_ITEM; -+ item_convert_data(pos)->d_next = DC_INVALID_STATE; -+ } else { -+ /* next item is on the same node */ -+ assert("edward-1013", -+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM || -+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM); -+ assert("edward-1227", -+ item_convert_data(pos)->d_next == DC_AFTER_CLUSTER || -+ item_convert_data(pos)->d_next == DC_INVALID_STATE); -+ -+ item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER; -+ item_convert_data(pos)->d_next = DC_INVALID_STATE; -+ } -+} -+ -+static inline int should_convert_node(flush_pos_t * pos, znode * node) -+{ -+ return znode_convertible(node); -+} -+ -+/* true if there is attached convert item info */ -+static inline int should_convert_next_node(flush_pos_t * pos, znode * node) -+{ -+ return convert_data(pos) && item_convert_data(pos); -+} -+ -+#define SQUALLOC_THRESHOLD 256 -+ -+static inline int should_terminate_squalloc(flush_pos_t * pos) -+{ -+ return convert_data(pos) && -+ !item_convert_data(pos) && -+ item_convert_count(pos) >= SQUALLOC_THRESHOLD; -+} -+ -+void 
free_convert_data(flush_pos_t * pos); -+/* used in extent.c */ -+int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size, -+ const coord_t * parent); -+int reiser4_scan_finished(flush_scan * scan); -+int reiser4_scanning_left(flush_scan * scan); -+int reiser4_scan_goto(flush_scan * scan, jnode * tonode); -+txn_atom *atom_locked_by_fq(flush_queue_t * fq); -+int reiser4_alloc_extent(flush_pos_t *flush_pos); -+squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *, -+ reiser4_key *stop_key); -+extern int reiser4_init_fqs(void); -+extern void reiser4_done_fqs(void); -+ -+#if REISER4_DEBUG -+ -+extern void reiser4_check_fq(const txn_atom *atom); -+extern atomic_t flush_cnt; -+ -+#define check_preceder(blk) \ -+assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb())); -+extern void check_pos(flush_pos_t * pos); -+#else -+#define check_preceder(b) noop -+#define check_pos(pos) noop -+#endif -+ -+/* __REISER4_FLUSH_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 90 -+ LocalWords: preceder -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/flush_queue.c linux-2.6.20/fs/reiser4/flush_queue.c ---- linux-2.6.20.orig/fs/reiser4/flush_queue.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/flush_queue.c 2007-05-06 14:50:43.718981974 +0400 -@@ -0,0 +1,680 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "debug.h" -+#include "super.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "page_cache.h" -+#include "wander.h" -+#include "vfs_ops.h" -+#include "writeout.h" -+#include "flush.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+/* A flush queue object is an accumulator for keeping jnodes prepared -+ by the jnode_flush() function for writing to disk. 
Those "queued" jnodes are -+ kept on the flush queue until memory pressure or atom commit asks -+ flush queues to write some or all from their jnodes. */ -+ -+/* -+ LOCKING: -+ -+ fq->guard spin lock protects fq->atom pointer and nothing else. fq->prepped -+ list protected by atom spin lock. fq->prepped list uses the following -+ locking: -+ -+ two ways to protect fq->prepped list for read-only list traversal: -+ -+ 1. atom spin-lock atom. -+ 2. fq is IN_USE, atom->nr_running_queues increased. -+ -+ and one for list modification: -+ -+ 1. atom is spin-locked and one condition is true: fq is IN_USE or -+ atom->nr_running_queues == 0. -+ -+ The deadlock-safe order for flush queues and atoms is: first lock atom, then -+ lock flush queue, then lock jnode. -+*/ -+ -+#define fq_in_use(fq) ((fq)->state & FQ_IN_USE) -+#define fq_ready(fq) (!fq_in_use(fq)) -+ -+#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0) -+#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0) -+ -+/* get lock on atom from locked flush queue object */ -+static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq) -+{ -+ /* This code is similar to jnode_get_atom(), look at it for the -+ * explanation. 
*/ -+ txn_atom *atom; -+ -+ assert_spin_locked(&(fq->guard)); -+ -+ while (1) { -+ atom = fq->atom; -+ if (atom == NULL) -+ break; -+ -+ if (spin_trylock_atom(atom)) -+ break; -+ -+ atomic_inc(&atom->refcount); -+ spin_unlock(&(fq->guard)); -+ spin_lock_atom(atom); -+ spin_lock(&(fq->guard)); -+ -+ if (fq->atom == atom) { -+ atomic_dec(&atom->refcount); -+ break; -+ } -+ -+ spin_unlock(&(fq->guard)); -+ atom_dec_and_unlock(atom); -+ spin_lock(&(fq->guard)); -+ } -+ -+ return atom; -+} -+ -+txn_atom *atom_locked_by_fq(flush_queue_t * fq) -+{ -+ txn_atom *atom; -+ -+ spin_lock(&(fq->guard)); -+ atom = atom_locked_by_fq_nolock(fq); -+ spin_unlock(&(fq->guard)); -+ return atom; -+} -+ -+static void init_fq(flush_queue_t * fq) -+{ -+ memset(fq, 0, sizeof *fq); -+ -+ atomic_set(&fq->nr_submitted, 0); -+ -+ INIT_LIST_HEAD(ATOM_FQ_LIST(fq)); -+ -+ init_waitqueue_head(&fq->wait); -+ spin_lock_init(&fq->guard); -+} -+ -+/* slab for flush queues */ -+static struct kmem_cache *fq_slab; -+ -+/** -+ * reiser4_init_fqs - create flush queue cache -+ * -+ * Initializes slab cache of flush queues. It is part of reiser4 module -+ * initialization. -+ */ -+int reiser4_init_fqs(void) -+{ -+ fq_slab = kmem_cache_create("fq", -+ sizeof(flush_queue_t), -+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL); -+ if (fq_slab == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+/** -+ * reiser4_done_fqs - delete flush queue cache -+ * -+ * This is called on reiser4 module unloading or system shutdown. 
-+ */ -+void reiser4_done_fqs(void) -+{ -+ destroy_reiser4_cache(&fq_slab); -+} -+ -+/* create new flush queue object */ -+static flush_queue_t *create_fq(gfp_t gfp) -+{ -+ flush_queue_t *fq; -+ -+ fq = kmem_cache_alloc(fq_slab, gfp); -+ if (fq) -+ init_fq(fq); -+ -+ return fq; -+} -+ -+/* adjust atom's and flush queue's counters of queued nodes */ -+static void count_enqueued_node(flush_queue_t * fq) -+{ -+ ON_DEBUG(fq->atom->num_queued++); -+} -+ -+static void count_dequeued_node(flush_queue_t * fq) -+{ -+ assert("zam-993", fq->atom->num_queued > 0); -+ ON_DEBUG(fq->atom->num_queued--); -+} -+ -+/* attach flush queue object to the atom */ -+static void attach_fq(txn_atom *atom, flush_queue_t *fq) -+{ -+ assert_spin_locked(&(atom->alock)); -+ list_add(&fq->alink, &atom->flush_queues); -+ fq->atom = atom; -+ ON_DEBUG(atom->nr_flush_queues++); -+} -+ -+static void detach_fq(flush_queue_t * fq) -+{ -+ assert_spin_locked(&(fq->atom->alock)); -+ -+ spin_lock(&(fq->guard)); -+ list_del_init(&fq->alink); -+ assert("vs-1456", fq->atom->nr_flush_queues > 0); -+ ON_DEBUG(fq->atom->nr_flush_queues--); -+ fq->atom = NULL; -+ spin_unlock(&(fq->guard)); -+} -+ -+/* destroy flush queue object */ -+static void done_fq(flush_queue_t * fq) -+{ -+ assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq))); -+ assert("zam-766", atomic_read(&fq->nr_submitted) == 0); -+ -+ kmem_cache_free(fq_slab, fq); -+} -+ -+/* */ -+static void mark_jnode_queued(flush_queue_t * fq, jnode * node) -+{ -+ JF_SET(node, JNODE_FLUSH_QUEUED); -+ count_enqueued_node(fq); -+} -+ -+/* Putting jnode into the flush queue. Both atom and jnode should be -+ spin-locked. 
*/ -+void queue_jnode(flush_queue_t * fq, jnode * node) -+{ -+ assert_spin_locked(&(node->guard)); -+ assert("zam-713", node->atom != NULL); -+ assert_spin_locked(&(node->atom->alock)); -+ assert("zam-716", fq->atom != NULL); -+ assert("zam-717", fq->atom == node->atom); -+ assert("zam-907", fq_in_use(fq)); -+ -+ assert("zam-714", JF_ISSET(node, JNODE_DIRTY)); -+ assert("zam-826", JF_ISSET(node, JNODE_RELOC)); -+ assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); -+ assert("vs-1481", NODE_LIST(node) != FQ_LIST); -+ -+ mark_jnode_queued(fq, node); -+ list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq)); -+ -+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node), -+ FQ_LIST, 1)); -+} -+ -+/* repeatable process for waiting io completion on a flush queue object */ -+static int wait_io(flush_queue_t * fq, int *nr_io_errors) -+{ -+ assert("zam-738", fq->atom != NULL); -+ assert_spin_locked(&(fq->atom->alock)); -+ assert("zam-736", fq_in_use(fq)); -+ assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq))); -+ -+ if (atomic_read(&fq->nr_submitted) != 0) { -+ struct super_block *super; -+ -+ spin_unlock_atom(fq->atom); -+ -+ assert("nikita-3013", reiser4_schedulable()); -+ -+ super = reiser4_get_current_sb(); -+ -+ /* FIXME: this is instead of blk_run_queues() */ -+ blk_run_address_space(reiser4_get_super_fake(super)->i_mapping); -+ -+ if (!(super->s_flags & MS_RDONLY)) -+ wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0); -+ -+ /* Ask the caller to re-acquire the locks and call this -+ function again. Note: this technique is commonly used in -+ the txnmgr code. 
*/ -+ return -E_REPEAT; -+ } -+ -+ *nr_io_errors += atomic_read(&fq->nr_errors); -+ return 0; -+} -+ -+/* wait on I/O completion, re-submit dirty nodes to write */ -+static int finish_fq(flush_queue_t * fq, int *nr_io_errors) -+{ -+ int ret; -+ txn_atom *atom = fq->atom; -+ -+ assert("zam-801", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ assert("zam-762", fq_in_use(fq)); -+ -+ ret = wait_io(fq, nr_io_errors); -+ if (ret) -+ return ret; -+ -+ detach_fq(fq); -+ done_fq(fq); -+ -+ reiser4_atom_send_event(atom); -+ -+ return 0; -+} -+ -+/* wait for all i/o for given atom to be completed, actually do one iteration -+ on that and return -E_REPEAT if there more iterations needed */ -+static int finish_all_fq(txn_atom * atom, int *nr_io_errors) -+{ -+ flush_queue_t *fq; -+ -+ assert_spin_locked(&(atom->alock)); -+ -+ if (list_empty_careful(&atom->flush_queues)) -+ return 0; -+ -+ list_for_each_entry(fq, &atom->flush_queues, alink) { -+ if (fq_ready(fq)) { -+ int ret; -+ -+ mark_fq_in_use(fq); -+ assert("vs-1247", fq->owner == NULL); -+ ON_DEBUG(fq->owner = current); -+ ret = finish_fq(fq, nr_io_errors); -+ -+ if (*nr_io_errors) -+ reiser4_handle_error(); -+ -+ if (ret) { -+ reiser4_fq_put(fq); -+ return ret; -+ } -+ -+ spin_unlock_atom(atom); -+ -+ return -E_REPEAT; -+ } -+ } -+ -+ /* All flush queues are in use; atom remains locked */ -+ return -EBUSY; -+} -+ -+/* wait all i/o for current atom */ -+int current_atom_finish_all_fq(void) -+{ -+ txn_atom *atom; -+ int nr_io_errors = 0; -+ int ret = 0; -+ -+ do { -+ while (1) { -+ atom = get_current_atom_locked(); -+ ret = finish_all_fq(atom, &nr_io_errors); -+ if (ret != -EBUSY) -+ break; -+ reiser4_atom_wait_event(atom); -+ } -+ } while (ret == -E_REPEAT); -+ -+ /* we do not need locked atom after this function finishes, SUCCESS or -+ -EBUSY are two return codes when atom remains locked after -+ finish_all_fq */ -+ if (!ret) -+ spin_unlock_atom(atom); -+ -+ assert_spin_not_locked(&(atom->alock)); -+ -+ if (ret) 
-+ return ret; -+ -+ if (nr_io_errors) -+ return RETERR(-EIO); -+ -+ return 0; -+} -+ -+/* change node->atom field for all jnode from given list */ -+static void -+scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom) -+{ -+ jnode *cur; -+ -+ list_for_each_entry(cur, list, capture_link) { -+ spin_lock_jnode(cur); -+ cur->atom = atom; -+ spin_unlock_jnode(cur); -+ } -+} -+ -+/* support for atom fusion operation */ -+void reiser4_fuse_fq(txn_atom *to, txn_atom *from) -+{ -+ flush_queue_t *fq; -+ -+ assert_spin_locked(&(to->alock)); -+ assert_spin_locked(&(from->alock)); -+ -+ list_for_each_entry(fq, &from->flush_queues, alink) { -+ scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to); -+ spin_lock(&(fq->guard)); -+ fq->atom = to; -+ spin_unlock(&(fq->guard)); -+ } -+ -+ list_splice_init(&from->flush_queues, to->flush_queues.prev); -+ -+#if REISER4_DEBUG -+ to->num_queued += from->num_queued; -+ to->nr_flush_queues += from->nr_flush_queues; -+ from->nr_flush_queues = 0; -+#endif -+} -+ -+#if REISER4_DEBUG -+int atom_fq_parts_are_clean(txn_atom * atom) -+{ -+ assert("zam-915", atom != NULL); -+ return list_empty_careful(&atom->flush_queues); -+} -+#endif -+/* Bio i/o completion routine for reiser4 write operations. */ -+static int -+end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG, -+ int err) -+{ -+ int i; -+ int nr_errors = 0; -+ flush_queue_t *fq; -+ -+ assert("zam-958", bio->bi_rw & WRITE); -+ -+ /* i/o op. is not fully completed */ -+ if (bio->bi_size != 0) -+ return 1; -+ -+ if (err == -EOPNOTSUPP) -+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); -+ -+ /* we expect that bio->private is set to NULL or fq object which is used -+ * for synchronization and error counting. */ -+ fq = bio->bi_private; -+ /* Check all elements of io_vec for correct write completion. 
*/ -+ for (i = 0; i < bio->bi_vcnt; i += 1) { -+ struct page *pg = bio->bi_io_vec[i].bv_page; -+ -+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { -+ SetPageError(pg); -+ nr_errors++; -+ } -+ -+ { -+ /* jnode WRITEBACK ("write is in progress bit") is -+ * atomically cleared here. */ -+ jnode *node; -+ -+ assert("zam-736", pg != NULL); -+ assert("zam-736", PagePrivate(pg)); -+ node = jprivate(pg); -+ -+ JF_CLR(node, JNODE_WRITEBACK); -+ } -+ -+ end_page_writeback(pg); -+ page_cache_release(pg); -+ } -+ -+ if (fq) { -+ /* count i/o error in fq object */ -+ atomic_add(nr_errors, &fq->nr_errors); -+ -+ /* If all write requests registered in this "fq" are done we up -+ * the waiter. */ -+ if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted)) -+ wake_up(&fq->wait); -+ } -+ -+ bio_put(bio); -+ return 0; -+} -+ -+/* Count I/O requests which will be submitted by @bio in given flush queues -+ @fq */ -+void add_fq_to_bio(flush_queue_t * fq, struct bio *bio) -+{ -+ bio->bi_private = fq; -+ bio->bi_end_io = end_io_handler; -+ -+ if (fq) -+ atomic_add(bio->bi_vcnt, &fq->nr_submitted); -+} -+ -+/* Move all queued nodes out from @fq->prepped list. 
*/ -+static void release_prepped_list(flush_queue_t * fq) -+{ -+ txn_atom *atom; -+ -+ assert("zam-904", fq_in_use(fq)); -+ atom = atom_locked_by_fq(fq); -+ -+ while (!list_empty(ATOM_FQ_LIST(fq))) { -+ jnode *cur; -+ -+ cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link); -+ list_del_init(&cur->capture_link); -+ -+ count_dequeued_node(fq); -+ spin_lock_jnode(cur); -+ assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR)); -+ assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC)); -+ assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED)); -+ JF_CLR(cur, JNODE_FLUSH_QUEUED); -+ -+ if (JF_ISSET(cur, JNODE_DIRTY)) { -+ list_add_tail(&cur->capture_link, -+ ATOM_DIRTY_LIST(atom, jnode_get_level(cur))); -+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST, -+ DIRTY_LIST, 1)); -+ } else { -+ list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom)); -+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST, -+ CLEAN_LIST, 1)); -+ } -+ -+ spin_unlock_jnode(cur); -+ } -+ -+ if (--atom->nr_running_queues == 0) -+ reiser4_atom_send_event(atom); -+ -+ spin_unlock_atom(atom); -+} -+ -+/* Submit write requests for nodes on the already filled flush queue @fq. -+ -+ @fq: flush queue object which contains jnodes we can (and will) write. -+ @return: number of submitted blocks (>=0) if success, otherwise -- an error -+ code (<0). */ -+int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags) -+{ -+ int ret; -+ txn_atom *atom; -+ -+ while (1) { -+ atom = atom_locked_by_fq(fq); -+ assert("zam-924", atom); -+ /* do not write fq in parallel. */ -+ if (atom->nr_running_queues == 0 -+ || !(flags & WRITEOUT_SINGLE_STREAM)) -+ break; -+ reiser4_atom_wait_event(atom); -+ } -+ -+ atom->nr_running_queues++; -+ spin_unlock_atom(atom); -+ -+ ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags); -+ release_prepped_list(fq); -+ -+ return ret; -+} -+ -+/* Getting flush queue object for exclusive use by one thread. May require -+ several iterations which is indicated by -E_REPEAT return code. 
-+ -+ This function does not contain code for obtaining an atom lock because an -+ atom lock is obtained by different ways in different parts of reiser4, -+ usually it is current atom, but we need a possibility for getting fq for the -+ atom of given jnode. */ -+static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp) -+{ -+ flush_queue_t *fq; -+ -+ assert_spin_locked(&(atom->alock)); -+ -+ fq = list_entry(atom->flush_queues.next, flush_queue_t, alink); -+ while (&atom->flush_queues != &fq->alink) { -+ spin_lock(&(fq->guard)); -+ -+ if (fq_ready(fq)) { -+ mark_fq_in_use(fq); -+ assert("vs-1246", fq->owner == NULL); -+ ON_DEBUG(fq->owner = current); -+ spin_unlock(&(fq->guard)); -+ -+ if (*new_fq) -+ done_fq(*new_fq); -+ -+ *new_fq = fq; -+ -+ return 0; -+ } -+ -+ spin_unlock(&(fq->guard)); -+ -+ fq = list_entry(fq->alink.next, flush_queue_t, alink); -+ } -+ -+ /* Use previously allocated fq object */ -+ if (*new_fq) { -+ mark_fq_in_use(*new_fq); -+ assert("vs-1248", (*new_fq)->owner == 0); -+ ON_DEBUG((*new_fq)->owner = current); -+ attach_fq(atom, *new_fq); -+ -+ return 0; -+ } -+ -+ spin_unlock_atom(atom); -+ -+ *new_fq = create_fq(gfp); -+ -+ if (*new_fq == NULL) -+ return RETERR(-ENOMEM); -+ -+ return RETERR(-E_REPEAT); -+} -+ -+int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq) -+{ -+ return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get()); -+} -+ -+/* A wrapper around reiser4_fq_by_atom for getting a flush queue -+ object for current atom, if success fq->atom remains locked. 
*/ -+flush_queue_t *get_fq_for_current_atom(void) -+{ -+ flush_queue_t *fq = NULL; -+ txn_atom *atom; -+ int ret; -+ -+ do { -+ atom = get_current_atom_locked(); -+ ret = reiser4_fq_by_atom(atom, &fq); -+ } while (ret == -E_REPEAT); -+ -+ if (ret) -+ return ERR_PTR(ret); -+ return fq; -+} -+ -+/* Releasing flush queue object after exclusive use */ -+void reiser4_fq_put_nolock(flush_queue_t *fq) -+{ -+ assert("zam-747", fq->atom != NULL); -+ assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq))); -+ mark_fq_ready(fq); -+ assert("vs-1245", fq->owner == current); -+ ON_DEBUG(fq->owner = NULL); -+} -+ -+void reiser4_fq_put(flush_queue_t * fq) -+{ -+ txn_atom *atom; -+ -+ spin_lock(&(fq->guard)); -+ atom = atom_locked_by_fq_nolock(fq); -+ -+ assert("zam-746", atom != NULL); -+ -+ reiser4_fq_put_nolock(fq); -+ reiser4_atom_send_event(atom); -+ -+ spin_unlock(&(fq->guard)); -+ spin_unlock_atom(atom); -+} -+ -+/* A part of atom object initialization related to the embedded flush queue -+ list head */ -+ -+void init_atom_fq_parts(txn_atom *atom) -+{ -+ INIT_LIST_HEAD(&atom->flush_queues); -+} -+ -+#if REISER4_DEBUG -+ -+void reiser4_check_fq(const txn_atom *atom) -+{ -+ /* check number of nodes on all atom's flush queues */ -+ flush_queue_t *fq; -+ int count; -+ struct list_head *pos; -+ -+ count = 0; -+ list_for_each_entry(fq, &atom->flush_queues, alink) { -+ spin_lock(&(fq->guard)); -+ /* calculate number of jnodes on fq' list of prepped jnodes */ -+ list_for_each(pos, ATOM_FQ_LIST(fq)) -+ count++; -+ spin_unlock(&(fq->guard)); -+ } -+ if (count != atom->fq) -+ warning("", "fq counter %d, real %d\n", atom->fq, count); -+ -+} -+ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/forward.h linux-2.6.20/fs/reiser4/forward.h ---- linux-2.6.20.orig/fs/reiser4/forward.h 1970-01-01 03:00:00.000000000 
+0300 -+++ linux-2.6.20/fs/reiser4/forward.h 2007-05-06 14:50:43.718981974 +0400 -@@ -0,0 +1,256 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Forward declarations. Thank you Kernighan. */ -+ -+#if !defined( __REISER4_FORWARD_H__ ) -+#define __REISER4_FORWARD_H__ -+ -+#include -+#include -+ -+typedef struct zlock zlock; -+typedef struct lock_stack lock_stack; -+typedef struct lock_handle lock_handle; -+typedef struct znode znode; -+typedef struct flow flow_t; -+typedef struct coord coord_t; -+typedef struct tree_access_pointer tap_t; -+typedef struct item_coord item_coord; -+typedef struct shift_params shift_params; -+typedef struct reiser4_object_create_data reiser4_object_create_data; -+typedef union reiser4_plugin reiser4_plugin; -+typedef __u16 reiser4_plugin_id; -+typedef __u64 reiser4_plugin_groups; -+typedef struct item_plugin item_plugin; -+typedef struct jnode_plugin jnode_plugin; -+typedef struct reiser4_item_data reiser4_item_data; -+typedef union reiser4_key reiser4_key; -+typedef struct reiser4_tree reiser4_tree; -+typedef struct carry_cut_data carry_cut_data; -+typedef struct carry_kill_data carry_kill_data; -+typedef struct carry_tree_op carry_tree_op; -+typedef struct carry_tree_node carry_tree_node; -+typedef struct carry_plugin_info carry_plugin_info; -+typedef struct reiser4_journal reiser4_journal; -+typedef struct txn_atom txn_atom; -+typedef struct txn_handle txn_handle; -+typedef struct txn_mgr txn_mgr; -+typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc; -+typedef struct reiser4_context reiser4_context; -+typedef struct carry_level carry_level; -+typedef struct blocknr_set_entry blocknr_set_entry; -+/* super_block->s_fs_info points to this */ -+typedef struct reiser4_super_info_data reiser4_super_info_data; -+/* next two objects are fields of reiser4_super_info_data */ -+typedef struct reiser4_oid_allocator reiser4_oid_allocator; -+typedef struct reiser4_space_allocator 
reiser4_space_allocator; -+ -+typedef struct flush_scan flush_scan; -+typedef struct flush_position flush_pos_t; -+ -+typedef unsigned short pos_in_node_t; -+#define MAX_POS_IN_NODE 65535 -+ -+typedef struct jnode jnode; -+typedef struct reiser4_blocknr_hint reiser4_blocknr_hint; -+ -+typedef struct uf_coord uf_coord_t; -+typedef struct hint hint_t; -+ -+typedef struct ktxnmgrd_context ktxnmgrd_context; -+ -+typedef struct reiser4_xattr_plugin reiser4_xattr_plugin; -+ -+struct inode; -+struct page; -+struct file; -+struct dentry; -+struct super_block; -+ -+/* return values of coord_by_key(). cbk == coord_by_key */ -+typedef enum { -+ CBK_COORD_FOUND = 0, -+ CBK_COORD_NOTFOUND = -ENOENT, -+} lookup_result; -+ -+/* results of lookup with directory file */ -+typedef enum { -+ FILE_NAME_FOUND = 0, -+ FILE_NAME_NOTFOUND = -ENOENT, -+ FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */ -+ FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */ -+} file_lookup_result; -+ -+/* behaviors of lookup. If coord we are looking for is actually in a tree, -+ both coincide. */ -+typedef enum { -+ /* search exactly for the coord with key given */ -+ FIND_EXACT, -+ /* search for coord with the maximal key not greater than one -+ given */ -+ FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */ -+} lookup_bias; -+ -+typedef enum { -+ /* number of leaf level of the tree -+ The fake root has (tree_level=0). */ -+ LEAF_LEVEL = 1, -+ -+ /* number of level one above leaf level of the tree. -+ -+ It is supposed that internal tree used by reiser4 to store file -+ system data and meta data will have height 2 initially (when -+ created by mkfs). -+ */ -+ TWIG_LEVEL = 2, -+} tree_level; -+ -+/* The "real" maximum ztree height is the 0-origin size of any per-level -+ array, since the zero'th level is not used. 
*/ -+#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL) -+ -+/* enumeration of possible mutual position of item and coord. This enum is -+ return type of ->is_in_item() item plugin method which see. */ -+typedef enum { -+ /* coord is on the left of an item */ -+ IP_ON_THE_LEFT, -+ /* coord is inside item */ -+ IP_INSIDE, -+ /* coord is inside item, but to the right of the rightmost unit of -+ this item */ -+ IP_RIGHT_EDGE, -+ /* coord is on the right of an item */ -+ IP_ON_THE_RIGHT -+} interposition; -+ -+/* type of lock to acquire on znode before returning it to caller */ -+typedef enum { -+ ZNODE_NO_LOCK = 0, -+ ZNODE_READ_LOCK = 1, -+ ZNODE_WRITE_LOCK = 2, -+} znode_lock_mode; -+ -+/* type of lock request */ -+typedef enum { -+ ZNODE_LOCK_LOPRI = 0, -+ ZNODE_LOCK_HIPRI = (1 << 0), -+ -+ /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep -+ waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately -+ return the value -E_REPEAT. */ -+ ZNODE_LOCK_NONBLOCK = (1 << 1), -+ /* An option for longterm_lock_znode which prevents atom fusion */ -+ ZNODE_LOCK_DONT_FUSE = (1 << 2) -+} znode_lock_request; -+ -+typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op; -+ -+/* used to specify direction of shift. 
These must be -1 and 1 */ -+typedef enum { -+ SHIFT_LEFT = 1, -+ SHIFT_RIGHT = -1 -+} shift_direction; -+ -+typedef enum { -+ LEFT_SIDE, -+ RIGHT_SIDE -+} sideof; -+ -+#define round_up( value, order ) \ -+ ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \ -+ ~( ( order ) - 1 ) ) ) -+ -+/* values returned by squalloc_right_neighbor and its auxiliary functions */ -+typedef enum { -+ /* unit of internal item is moved */ -+ SUBTREE_MOVED = 0, -+ /* nothing else can be squeezed into left neighbor */ -+ SQUEEZE_TARGET_FULL = 1, -+ /* all content of node is squeezed into its left neighbor */ -+ SQUEEZE_SOURCE_EMPTY = 2, -+ /* one more item is copied (this is only returned by -+ allocate_and_copy_extent to squalloc_twig)) */ -+ SQUEEZE_CONTINUE = 3 -+} squeeze_result; -+ -+/* Do not change items ids. If you do - there will be format change */ -+typedef enum { -+ STATIC_STAT_DATA_ID = 0x0, -+ SIMPLE_DIR_ENTRY_ID = 0x1, -+ COMPOUND_DIR_ID = 0x2, -+ NODE_POINTER_ID = 0x3, -+ EXTENT_POINTER_ID = 0x5, -+ FORMATTING_ID = 0x6, -+ CTAIL_ID = 0x7, -+ BLACK_BOX_ID = 0x8, -+ LAST_ITEM_ID = 0x9 -+} item_id; -+ -+/* Flags passed to jnode_flush() to allow it to distinguish default settings based on -+ whether commit() was called or VM memory pressure was applied. */ -+typedef enum { -+ /* submit flush queue to disk at jnode_flush completion */ -+ JNODE_FLUSH_WRITE_BLOCKS = 1, -+ -+ /* flush is called for commit */ -+ JNODE_FLUSH_COMMIT = 2, -+ /* not implemented */ -+ JNODE_FLUSH_MEMORY_FORMATTED = 4, -+ -+ /* not implemented */ -+ JNODE_FLUSH_MEMORY_UNFORMATTED = 8, -+} jnode_flush_flags; -+ -+/* Flags to insert/paste carry operations. Currently they only used in -+ flushing code, but in future, they can be used to optimize for repetitive -+ accesses. 
*/ -+typedef enum { -+ /* carry is not allowed to shift data to the left when trying to find -+ free space */ -+ COPI_DONT_SHIFT_LEFT = (1 << 0), -+ /* carry is not allowed to shift data to the right when trying to find -+ free space */ -+ COPI_DONT_SHIFT_RIGHT = (1 << 1), -+ /* carry is not allowed to allocate new node(s) when trying to find -+ free space */ -+ COPI_DONT_ALLOCATE = (1 << 2), -+ /* try to load left neighbor if its not in a cache */ -+ COPI_LOAD_LEFT = (1 << 3), -+ /* try to load right neighbor if its not in a cache */ -+ COPI_LOAD_RIGHT = (1 << 4), -+ /* shift insertion point to the left neighbor */ -+ COPI_GO_LEFT = (1 << 5), -+ /* shift insertion point to the right neighbor */ -+ COPI_GO_RIGHT = (1 << 6), -+ /* try to step back into original node if insertion into new node -+ fails after shifting data there. */ -+ COPI_STEP_BACK = (1 << 7) -+} cop_insert_flag; -+ -+typedef enum { -+ SAFE_UNLINK, /* safe-link for unlink */ -+ SAFE_TRUNCATE /* safe-link for truncate */ -+} reiser4_safe_link_t; -+ -+/* this is to show on which list of atom jnode is */ -+typedef enum { -+ NOT_CAPTURED, -+ DIRTY_LIST, -+ CLEAN_LIST, -+ FQ_LIST, -+ WB_LIST, -+ OVRWR_LIST -+} atom_list; -+ -+/* __REISER4_FORWARD_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/fsdata.c linux-2.6.20/fs/reiser4/fsdata.c ---- linux-2.6.20.orig/fs/reiser4/fsdata.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/fsdata.c 2007-05-06 14:50:43.722983224 +0400 -@@ -0,0 +1,803 @@ -+/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "fsdata.h" -+#include "inode.h" -+ -+/* cache or dir_cursors */ -+static struct kmem_cache *d_cursor_cache; -+static struct shrinker *d_cursor_shrinker; -+ -+/* list of unused cursors */ -+static LIST_HEAD(cursor_cache); -+ -+/* number of cursors in list of ununsed cursors */ -+static unsigned long d_cursor_unused = 0; -+ -+/* spinlock protecting manipulations with dir_cursor's hash table and lists */ -+DEFINE_SPINLOCK(d_lock); -+ -+static reiser4_file_fsdata *create_fsdata(struct file *file); -+static int file_is_stateless(struct file *file); -+static void free_fsdata(reiser4_file_fsdata *fsdata); -+static void kill_cursor(dir_cursor *); -+ -+/** -+ * d_cursor_shrink - shrink callback for cache of dir_cursor-s -+ * @nr: number of objects to free -+ * @mask: GFP mask -+ * -+ * Shrinks d_cursor_cache. Scan LRU list of unused cursors, freeing requested -+ * number. Return number of still freeable cursors. -+ */ -+static int d_cursor_shrink(int nr, gfp_t mask) -+{ -+ if (nr != 0) { -+ dir_cursor *scan; -+ int killed; -+ -+ killed = 0; -+ spin_lock(&d_lock); -+ while (!list_empty(&cursor_cache)) { -+ scan = list_entry(cursor_cache.next, dir_cursor, alist); -+ assert("nikita-3567", scan->ref == 0); -+ kill_cursor(scan); -+ ++killed; -+ --nr; -+ if (nr == 0) -+ break; -+ } -+ spin_unlock(&d_lock); -+ } -+ return d_cursor_unused; -+} -+ -+/** -+ * reiser4_init_d_cursor - create d_cursor cache -+ * -+ * Initializes slab cache of d_cursors. 
It is part of reiser4 module -+ * initialization. -+ */ -+int reiser4_init_d_cursor(void) -+{ -+ d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0, -+ SLAB_HWCACHE_ALIGN, NULL, NULL); -+ if (d_cursor_cache == NULL) -+ return RETERR(-ENOMEM); -+ -+ /* -+ * actually, d_cursors are "priceless", because there is no way to -+ * recover information stored in them. On the other hand, we don't -+ * want to consume all kernel memory by them. As a compromise, just -+ * assign higher "seeks" value to d_cursor cache, so that it will be -+ * shrunk only if system is really tight on memory. -+ */ -+ d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3, -+ d_cursor_shrink); -+ if (d_cursor_shrinker == NULL) { -+ destroy_reiser4_cache(&d_cursor_cache); -+ d_cursor_cache = NULL; -+ return RETERR(-ENOMEM); -+ } -+ return 0; -+} -+ -+/** -+ * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void reiser4_done_d_cursor(void) -+{ -+ BUG_ON(d_cursor_shrinker == NULL); -+ remove_shrinker(d_cursor_shrinker); -+ d_cursor_shrinker = NULL; -+ -+ destroy_reiser4_cache(&d_cursor_cache); -+} -+ -+#define D_CURSOR_TABLE_SIZE (256) -+ -+static inline unsigned long -+d_cursor_hash(d_cursor_hash_table *table, const d_cursor_key *key) -+{ -+ assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE)); -+ return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1); -+} -+ -+static inline int d_cursor_eq(const d_cursor_key *k1, const d_cursor_key *k2) -+{ -+ return k1->cid == k2->cid && k1->oid == k2->oid; -+} -+ -+/* -+ * define functions to manipulate reiser4 super block's hash table of -+ * dir_cursors -+ */ -+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get()) -+#define KFREE(ptr, size) kfree(ptr) -+TYPE_SAFE_HASH_DEFINE(d_cursor, -+ dir_cursor, -+ d_cursor_key, key, hash, d_cursor_hash, d_cursor_eq); -+#undef KFREE -+#undef KMALLOC -+ -+/** -+ * reiser4_init_super_d_info - 
initialize per-super-block d_cursor resources -+ * @super: super block to initialize -+ * -+ * Initializes per-super-block d_cursor's hash table and radix tree. It is part -+ * of mount. -+ */ -+int reiser4_init_super_d_info(struct super_block *super) -+{ -+ d_cursor_info *p; -+ -+ p = &get_super_private(super)->d_info; -+ -+ INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get()); -+ return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE); -+} -+ -+/** -+ * reiser4_done_super_d_info - release per-super-block d_cursor resources -+ * @super: super block being umounted -+ * -+ * It is called on umount. Kills all directory cursors attached to suoer block. -+ */ -+void reiser4_done_super_d_info(struct super_block *super) -+{ -+ d_cursor_info *d_info; -+ dir_cursor *cursor, *next; -+ -+ d_info = &get_super_private(super)->d_info; -+ for_all_in_htable(&d_info->table, d_cursor, cursor, next) -+ kill_cursor(cursor); -+ -+ BUG_ON(d_info->tree.rnode != NULL); -+ d_cursor_hash_done(&d_info->table); -+} -+ -+/** -+ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it -+ * @cursor: cursor to free -+ * -+ * Removes reiser4_file_fsdata attached to @cursor from readdir list of -+ * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from from -+ * indices, hash table, list of unused cursors and frees it. -+ */ -+static void kill_cursor(dir_cursor *cursor) -+{ -+ unsigned long index; -+ -+ assert("nikita-3566", cursor->ref == 0); -+ assert("nikita-3572", cursor->fsdata != NULL); -+ -+ index = (unsigned long)cursor->key.oid; -+ list_del_init(&cursor->fsdata->dir.linkage); -+ free_fsdata(cursor->fsdata); -+ cursor->fsdata = NULL; -+ -+ if (list_empty_careful(&cursor->list)) -+ /* this is last cursor for a file. Kill radix-tree entry */ -+ radix_tree_delete(&cursor->info->tree, index); -+ else { -+ void **slot; -+ -+ /* -+ * there are other cursors for the same oid. 
-+ */ -+ -+ /* -+ * if radix tree point to the cursor being removed, re-target -+ * radix tree slot to the next cursor in the (non-empty as was -+ * checked above) element of the circular list of all cursors -+ * for this oid. -+ */ -+ slot = radix_tree_lookup_slot(&cursor->info->tree, index); -+ assert("nikita-3571", *slot != NULL); -+ if (*slot == cursor) -+ *slot = list_entry(cursor->list.next, dir_cursor, list); -+ /* remove cursor from circular list */ -+ list_del_init(&cursor->list); -+ } -+ /* remove cursor from the list of unused cursors */ -+ list_del_init(&cursor->alist); -+ /* remove cursor from the hash table */ -+ d_cursor_hash_remove(&cursor->info->table, cursor); -+ /* and free it */ -+ kmem_cache_free(d_cursor_cache, cursor); -+ --d_cursor_unused; -+} -+ -+/* possible actions that can be performed on all cursors for the given file */ -+enum cursor_action { -+ /* -+ * load all detached state: this is called when stat-data is loaded -+ * from the disk to recover information about all pending readdirs -+ */ -+ CURSOR_LOAD, -+ /* -+ * detach all state from inode, leaving it in the cache. This is called -+ * when inode is removed form the memory by memory pressure -+ */ -+ CURSOR_DISPOSE, -+ /* -+ * detach cursors from the inode, and free them. This is called when -+ * inode is destroyed -+ */ -+ CURSOR_KILL -+}; -+ -+/* -+ * return d_cursor data for the file system @inode is in. -+ */ -+static inline d_cursor_info *d_info(struct inode *inode) -+{ -+ return &get_super_private(inode->i_sb)->d_info; -+} -+ -+/* -+ * lookup d_cursor in the per-super-block radix tree. -+ */ -+static inline dir_cursor *lookup(d_cursor_info * info, unsigned long index) -+{ -+ return (dir_cursor *) radix_tree_lookup(&info->tree, index); -+} -+ -+/* -+ * attach @cursor to the radix tree. There may be multiple cursors for the -+ * same oid, they are chained into circular list. 
-+ */ -+static void bind_cursor(dir_cursor * cursor, unsigned long index) -+{ -+ dir_cursor *head; -+ -+ head = lookup(cursor->info, index); -+ if (head == NULL) { -+ /* this is the first cursor for this index */ -+ INIT_LIST_HEAD(&cursor->list); -+ radix_tree_insert(&cursor->info->tree, index, cursor); -+ } else { -+ /* some cursor already exists. Chain ours */ -+ list_add(&cursor->list, &head->list); -+ } -+} -+ -+/* -+ * detach fsdata (if detachable) from file descriptor, and put cursor on the -+ * "unused" list. Called when file descriptor is not longer in active use. -+ */ -+static void clean_fsdata(struct file *file) -+{ -+ dir_cursor *cursor; -+ reiser4_file_fsdata *fsdata; -+ -+ assert("nikita-3570", file_is_stateless(file)); -+ -+ fsdata = (reiser4_file_fsdata *) file->private_data; -+ if (fsdata != NULL) { -+ cursor = fsdata->cursor; -+ if (cursor != NULL) { -+ spin_lock(&d_lock); -+ --cursor->ref; -+ if (cursor->ref == 0) { -+ list_add_tail(&cursor->alist, &cursor_cache); -+ ++d_cursor_unused; -+ } -+ spin_unlock(&d_lock); -+ file->private_data = NULL; -+ } -+ } -+} -+ -+/* -+ * global counter used to generate "client ids". These ids are encoded into -+ * high bits of fpos. -+ */ -+static __u32 cid_counter = 0; -+#define CID_SHIFT (20) -+#define CID_MASK (0xfffffull) -+ -+static void free_file_fsdata_nolock(struct file *); -+ -+/** -+ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table -+ * @cursor: -+ * @file: -+ * @inode: -+ * -+ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to -+ * reiser4 super block's hash table and radix tree. -+ add detachable readdir -+ * state to the @f -+ */ -+static int insert_cursor(dir_cursor *cursor, struct file *file, -+ struct inode *inode) -+{ -+ int result; -+ reiser4_file_fsdata *fsdata; -+ -+ memset(cursor, 0, sizeof *cursor); -+ -+ /* this is either first call to readdir, or rewind. Anyway, create new -+ * cursor. 
*/ -+ fsdata = create_fsdata(NULL); -+ if (fsdata != NULL) { -+ result = radix_tree_preload(reiser4_ctx_gfp_mask_get()); -+ if (result == 0) { -+ d_cursor_info *info; -+ oid_t oid; -+ -+ info = d_info(inode); -+ oid = get_inode_oid(inode); -+ /* cid occupies higher 12 bits of f->f_pos. Don't -+ * allow it to become negative: this confuses -+ * nfsd_readdir() */ -+ cursor->key.cid = (++cid_counter) & 0x7ff; -+ cursor->key.oid = oid; -+ cursor->fsdata = fsdata; -+ cursor->info = info; -+ cursor->ref = 1; -+ -+ spin_lock_inode(inode); -+ /* install cursor as @f's private_data, discarding old -+ * one if necessary */ -+#if REISER4_DEBUG -+ if (file->private_data) -+ warning("", "file has fsdata already"); -+#endif -+ clean_fsdata(file); -+ free_file_fsdata_nolock(file); -+ file->private_data = fsdata; -+ fsdata->cursor = cursor; -+ spin_unlock_inode(inode); -+ spin_lock(&d_lock); -+ /* insert cursor into hash table */ -+ d_cursor_hash_insert(&info->table, cursor); -+ /* and chain it into radix-tree */ -+ bind_cursor(cursor, (unsigned long)oid); -+ spin_unlock(&d_lock); -+ radix_tree_preload_end(); -+ file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT; -+ } -+ } else -+ result = RETERR(-ENOMEM); -+ return result; -+} -+ -+/** -+ * process_cursors - do action on each cursor attached to inode -+ * @inode: -+ * @act: action to do -+ * -+ * Finds all cursors of @inode in reiser4's super block radix tree of cursors -+ * and performs action specified by @act on each of cursors. 
-+ */ -+static void process_cursors(struct inode *inode, enum cursor_action act) -+{ -+ oid_t oid; -+ dir_cursor *start; -+ struct list_head *head; -+ reiser4_context *ctx; -+ d_cursor_info *info; -+ -+ /* this can be called by -+ * -+ * kswapd->...->prune_icache->..reiser4_destroy_inode -+ * -+ * without reiser4_context -+ */ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ warning("vs-23", "failed to init context"); -+ return; -+ } -+ -+ assert("nikita-3558", inode != NULL); -+ -+ info = d_info(inode); -+ oid = get_inode_oid(inode); -+ spin_lock_inode(inode); -+ head = get_readdir_list(inode); -+ spin_lock(&d_lock); -+ /* find any cursor for this oid: reference to it is hanging of radix -+ * tree */ -+ start = lookup(info, (unsigned long)oid); -+ if (start != NULL) { -+ dir_cursor *scan; -+ reiser4_file_fsdata *fsdata; -+ -+ /* process circular list of cursors for this oid */ -+ scan = start; -+ do { -+ dir_cursor *next; -+ -+ next = list_entry(scan->list.next, dir_cursor, list); -+ fsdata = scan->fsdata; -+ assert("nikita-3557", fsdata != NULL); -+ if (scan->key.oid == oid) { -+ switch (act) { -+ case CURSOR_DISPOSE: -+ list_del_init(&fsdata->dir.linkage); -+ break; -+ case CURSOR_LOAD: -+ list_add(&fsdata->dir.linkage, head); -+ break; -+ case CURSOR_KILL: -+ kill_cursor(scan); -+ break; -+ } -+ } -+ if (scan == next) -+ /* last cursor was just killed */ -+ break; -+ scan = next; -+ } while (scan != start); -+ } -+ spin_unlock(&d_lock); -+ /* check that we killed 'em all */ -+ assert("nikita-3568", -+ ergo(act == CURSOR_KILL, -+ list_empty_careful(get_readdir_list(inode)))); -+ assert("nikita-3569", -+ ergo(act == CURSOR_KILL, lookup(info, oid) == NULL)); -+ spin_unlock_inode(inode); -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_dispose_cursors - removes cursors from inode's list -+ * @inode: inode to dispose cursors of -+ * -+ * For each of cursors corresponding to @inode - removes reiser4_file_fsdata -+ * attached to cursor from 
inode's readdir list. This is called when inode is -+ * removed from the memory by memory pressure. -+ */ -+void reiser4_dispose_cursors(struct inode *inode) -+{ -+ process_cursors(inode, CURSOR_DISPOSE); -+} -+ -+/** -+ * reiser4_load_cursors - attach cursors to inode -+ * @inode: inode to load cursors to -+ * -+ * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata -+ * attached to cursor to inode's readdir list. This is done when inode is -+ * loaded into memory. -+ */ -+void reiser4_load_cursors(struct inode *inode) -+{ -+ process_cursors(inode, CURSOR_LOAD); -+} -+ -+/** -+ * reiser4_kill_cursors - kill all inode cursors -+ * @inode: inode to kill cursors of -+ * -+ * Frees all cursors for this inode. This is called when inode is destroyed. -+ */ -+void reiser4_kill_cursors(struct inode *inode) -+{ -+ process_cursors(inode, CURSOR_KILL); -+} -+ -+/** -+ * file_is_stateless - -+ * @file: -+ * -+ * true, if file descriptor @f is created by NFS server by "demand" to serve -+ * one file system operation. This means that there may be "detached state" -+ * for underlying inode. -+ */ -+static int file_is_stateless(struct file *file) -+{ -+ return reiser4_get_dentry_fsdata(file->f_dentry)->stateless; -+} -+ -+/** -+ * reiser4_get_dir_fpos - -+ * @dir: -+ * -+ * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but -+ * in the case of stateless directory operation (readdir-over-nfs), client id -+ * was encoded in the high bits of cookie and should me masked off. -+ */ -+loff_t reiser4_get_dir_fpos(struct file *dir) -+{ -+ if (file_is_stateless(dir)) -+ return dir->f_pos & CID_MASK; -+ else -+ return dir->f_pos; -+} -+ -+/** -+ * reiser4_attach_fsdata - try to attach fsdata -+ * @file: -+ * @inode: -+ * -+ * Finds or creates cursor for readdir-over-nfs. 
-+ */ -+int reiser4_attach_fsdata(struct file *file, struct inode *inode) -+{ -+ loff_t pos; -+ int result; -+ dir_cursor *cursor; -+ -+ /* -+ * we are serialized by inode->i_mutex -+ */ -+ if (!file_is_stateless(file)) -+ return 0; -+ -+ pos = file->f_pos; -+ result = 0; -+ if (pos == 0) { -+ /* -+ * first call to readdir (or rewind to the beginning of -+ * directory) -+ */ -+ cursor = kmem_cache_alloc(d_cursor_cache, -+ reiser4_ctx_gfp_mask_get()); -+ if (cursor != NULL) -+ result = insert_cursor(cursor, file, inode); -+ else -+ result = RETERR(-ENOMEM); -+ } else { -+ /* try to find existing cursor */ -+ d_cursor_key key; -+ -+ key.cid = pos >> CID_SHIFT; -+ key.oid = get_inode_oid(inode); -+ spin_lock(&d_lock); -+ cursor = d_cursor_hash_find(&d_info(inode)->table, &key); -+ if (cursor != NULL) { -+ /* cursor was found */ -+ if (cursor->ref == 0) { -+ /* move it from unused list */ -+ list_del_init(&cursor->alist); -+ --d_cursor_unused; -+ } -+ ++cursor->ref; -+ } -+ spin_unlock(&d_lock); -+ if (cursor != NULL) { -+ spin_lock_inode(inode); -+ assert("nikita-3556", cursor->fsdata->back == NULL); -+ clean_fsdata(file); -+ free_file_fsdata_nolock(file); -+ file->private_data = cursor->fsdata; -+ spin_unlock_inode(inode); -+ } -+ } -+ return result; -+} -+ -+/** -+ * reiser4_detach_fsdata - ??? -+ * @file: -+ * -+ * detach fsdata, if necessary -+ */ -+void reiser4_detach_fsdata(struct file *file) -+{ -+ struct inode *inode; -+ -+ if (!file_is_stateless(file)) -+ return; -+ -+ inode = file->f_dentry->d_inode; -+ spin_lock_inode(inode); -+ clean_fsdata(file); -+ spin_unlock_inode(inode); -+} -+ -+/* slab for reiser4_dentry_fsdata */ -+static struct kmem_cache *dentry_fsdata_cache; -+ -+/** -+ * reiser4_init_dentry_fsdata - create cache of dentry_fsdata -+ * -+ * Initializes slab cache of structures attached to denty->d_fsdata. It is -+ * part of reiser4 module initialization. 
-+ */ -+int reiser4_init_dentry_fsdata(void) -+{ -+ dentry_fsdata_cache = kmem_cache_create("dentry_fsdata", -+ sizeof(reiser4_dentry_fsdata), -+ 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, NULL, -+ NULL); -+ if (dentry_fsdata_cache == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+/** -+ * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void reiser4_done_dentry_fsdata(void) -+{ -+ destroy_reiser4_cache(&dentry_fsdata_cache); -+} -+ -+/** -+ * reiser4_get_dentry_fsdata - get fs-specific dentry data -+ * @dentry: queried dentry -+ * -+ * Allocates if necessary and returns per-dentry data that we attach to each -+ * dentry. -+ */ -+reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry) -+{ -+ assert("nikita-1365", dentry != NULL); -+ -+ if (dentry->d_fsdata == NULL) { -+ dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache, -+ reiser4_ctx_gfp_mask_get()); -+ if (dentry->d_fsdata == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ memset(dentry->d_fsdata, 0, sizeof(reiser4_dentry_fsdata)); -+ } -+ return dentry->d_fsdata; -+} -+ -+/** -+ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata -+ * @dentry: dentry to free fsdata of -+ * -+ * Detaches and frees fs-specific dentry data -+ */ -+void reiser4_free_dentry_fsdata(struct dentry *dentry) -+{ -+ if (dentry->d_fsdata != NULL) { -+ kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata); -+ dentry->d_fsdata = NULL; -+ } -+} -+ -+/* slab for reiser4_file_fsdata */ -+static struct kmem_cache *file_fsdata_cache; -+ -+/** -+ * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata -+ * -+ * Initializes slab cache of structures attached to file->private_data. It is -+ * part of reiser4 module initialization. 
-+ */ -+int reiser4_init_file_fsdata(void) -+{ -+ file_fsdata_cache = kmem_cache_create("file_fsdata", -+ sizeof(reiser4_file_fsdata), -+ 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, NULL, NULL); -+ if (file_fsdata_cache == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+/** -+ * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void reiser4_done_file_fsdata(void) -+{ -+ destroy_reiser4_cache(&file_fsdata_cache); -+} -+ -+/** -+ * create_fsdata - allocate and initialize reiser4_file_fsdata -+ * @file: what to create file_fsdata for, may be NULL -+ * -+ * Allocates and initializes reiser4_file_fsdata structure. -+ */ -+static reiser4_file_fsdata *create_fsdata(struct file *file) -+{ -+ reiser4_file_fsdata *fsdata; -+ -+ fsdata = kmem_cache_alloc(file_fsdata_cache, -+ reiser4_ctx_gfp_mask_get()); -+ if (fsdata != NULL) { -+ memset(fsdata, 0, sizeof *fsdata); -+ fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024; -+ fsdata->back = file; -+ INIT_LIST_HEAD(&fsdata->dir.linkage); -+ } -+ return fsdata; -+} -+ -+/** -+ * free_fsdata - free reiser4_file_fsdata -+ * @fsdata: object to free -+ * -+ * Dual to create_fsdata(). Free reiser4_file_fsdata. -+ */ -+static void free_fsdata(reiser4_file_fsdata *fsdata) -+{ -+ BUG_ON(fsdata == NULL); -+ kmem_cache_free(file_fsdata_cache, fsdata); -+} -+ -+/** -+ * reiser4_get_file_fsdata - get fs-specific file data -+ * @file: queried file -+ * -+ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches -+ * to @file. 
-+ */ -+reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file) -+{ -+ assert("nikita-1603", file != NULL); -+ -+ if (file->private_data == NULL) { -+ reiser4_file_fsdata *fsdata; -+ struct inode *inode; -+ -+ fsdata = create_fsdata(file); -+ if (fsdata == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ inode = file->f_dentry->d_inode; -+ spin_lock_inode(inode); -+ if (file->private_data == NULL) { -+ file->private_data = fsdata; -+ fsdata = NULL; -+ } -+ spin_unlock_inode(inode); -+ if (fsdata != NULL) -+ /* other thread initialized ->fsdata */ -+ kmem_cache_free(file_fsdata_cache, fsdata); -+ } -+ assert("nikita-2665", file->private_data != NULL); -+ return file->private_data; -+} -+ -+/** -+ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata -+ * @file: -+ * -+ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from -+ * readdir list, frees if it is not linked to d_cursor object. -+ */ -+static void free_file_fsdata_nolock(struct file *file) -+{ -+ reiser4_file_fsdata *fsdata; -+ -+ assert("", spin_inode_is_locked(file->f_dentry->d_inode)); -+ fsdata = file->private_data; -+ if (fsdata != NULL) { -+ list_del_init(&fsdata->dir.linkage); -+ if (fsdata->cursor == NULL) -+ free_fsdata(fsdata); -+ } -+ file->private_data = NULL; -+} -+ -+/** -+ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata -+ * @file: -+ * -+ * Spinlocks inode and calls free_file_fsdata_nolock to do the work. 
-+ */ -+void reiser4_free_file_fsdata(struct file *file) -+{ -+ spin_lock_inode(file->f_dentry->d_inode); -+ free_file_fsdata_nolock(file); -+ spin_unlock_inode(file->f_dentry->d_inode); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/fsdata.h linux-2.6.20/fs/reiser4/fsdata.h ---- linux-2.6.20.orig/fs/reiser4/fsdata.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/fsdata.h 2007-05-06 14:50:43.722983224 +0400 -@@ -0,0 +1,207 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#if !defined( __REISER4_FSDATA_H__ ) -+#define __REISER4_FSDATA_H__ -+ -+#include "debug.h" -+#include "kassign.h" -+#include "seal.h" -+#include "type_safe_hash.h" -+#include "plugin/file/file.h" -+#include "readahead.h" -+ -+/* -+ * comment about reiser4_dentry_fsdata -+ * -+ * -+ */ -+ -+/* -+ * locking: fields of per file descriptor readdir_pos and ->f_pos are -+ * protected by ->i_mutex on inode. Under this lock following invariant -+ * holds: -+ * -+ * file descriptor is "looking" at the entry_no-th directory entry from -+ * the beginning of directory. This entry has key dir_entry_key and is -+ * pos-th entry with duplicate-key sequence. -+ * -+ */ -+ -+/* logical position within directory */ -+typedef struct { -+ /* key of directory entry (actually, part of a key sufficient to -+ identify directory entry) */ -+ de_id dir_entry_key; -+ /* ordinal number of directory entry among all entries with the same -+ key. (Starting from 0.) 
*/ -+ unsigned pos; -+} dir_pos; -+ -+typedef struct { -+ /* f_pos corresponding to this readdir position */ -+ __u64 fpos; -+ /* logical position within directory */ -+ dir_pos position; -+ /* logical number of directory entry within -+ directory */ -+ __u64 entry_no; -+} readdir_pos; -+ -+/* -+ * this is used to speed up lookups for directory entry: on initial call to -+ * ->lookup() seal and coord of directory entry (if found, that is) are stored -+ * in struct dentry and reused later to avoid tree traversals. -+ */ -+typedef struct de_location { -+ /* seal covering directory entry */ -+ seal_t entry_seal; -+ /* coord of directory entry */ -+ coord_t entry_coord; -+ /* ordinal number of directory entry among all entries with the same -+ key. (Starting from 0.) */ -+ int pos; -+} de_location; -+ -+/** -+ * reiser4_dentry_fsdata - reiser4-specific data attached to dentries -+ * -+ * This is allocated dynamically and released in d_op->d_release() -+ * -+ * Currently it only contains cached location (hint) of directory entry, but -+ * it is expected that other information will be accumulated here. -+ */ -+typedef struct reiser4_dentry_fsdata { -+ /* -+ * here will go fields filled by ->lookup() to speedup next -+ * create/unlink, like blocknr of znode with stat-data, or key of -+ * stat-data. -+ */ -+ de_location dec; -+ int stateless; /* created through reiser4_decode_fh, needs special -+ * treatment in readdir. 
*/ -+} reiser4_dentry_fsdata; -+ -+extern int reiser4_init_dentry_fsdata(void); -+extern void reiser4_done_dentry_fsdata(void); -+extern reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *); -+extern void reiser4_free_dentry_fsdata(struct dentry *dentry); -+ -+/** -+ * reiser4_file_fsdata - reiser4-specific data attached to file->private_data -+ * -+ * This is allocated dynamically and released in inode->i_fop->release -+ */ -+typedef struct reiser4_file_fsdata { -+ /* -+ * pointer back to the struct file which this reiser4_file_fsdata is -+ * part of -+ */ -+ struct file *back; -+ /* detached cursor for stateless readdir. */ -+ struct dir_cursor *cursor; -+ /* -+ * We need both directory and regular file parts here, because there -+ * are file system objects that are files and directories. -+ */ -+ struct { -+ /* -+ * position in directory. It is updated each time directory is -+ * modified -+ */ -+ readdir_pos readdir; -+ /* head of this list is reiser4_inode->lists.readdir_list */ -+ struct list_head linkage; -+ } dir; -+ /* hints to speed up operations with regular files: read and write. */ -+ struct { -+ hint_t hint; -+ } reg; -+ struct reiser4_file_ra_state ra1; -+ -+} reiser4_file_fsdata; -+ -+extern int reiser4_init_file_fsdata(void); -+extern void reiser4_done_file_fsdata(void); -+extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *); -+extern void reiser4_free_file_fsdata(struct file *); -+ -+/* -+ * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are -+ * used to address problem reiser4 has with readdir accesses via NFS. See -+ * plugin/file_ops_readdir.c for more details. 
-+ */ -+typedef struct { -+ __u16 cid; -+ __u64 oid; -+} d_cursor_key; -+ -+/* -+ * define structures d_cursor_hash_table d_cursor_hash_link which are used to -+ * maintain hash table of dir_cursor-s in reiser4's super block -+ */ -+typedef struct dir_cursor dir_cursor; -+TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor); -+ -+typedef struct d_cursor_info d_cursor_info; -+ -+struct dir_cursor { -+ int ref; -+ reiser4_file_fsdata *fsdata; -+ -+ /* link to reiser4 super block hash table of cursors */ -+ d_cursor_hash_link hash; -+ -+ /* -+ * this is to link cursors to reiser4 super block's radix tree of -+ * cursors if there are more than one cursor of the same objectid -+ */ -+ struct list_head list; -+ d_cursor_key key; -+ d_cursor_info *info; -+ /* list of unused cursors */ -+ struct list_head alist; -+}; -+ -+extern int reiser4_init_d_cursor(void); -+extern void reiser4_done_d_cursor(void); -+ -+extern int reiser4_init_super_d_info(struct super_block *); -+extern void reiser4_done_super_d_info(struct super_block *); -+ -+extern loff_t reiser4_get_dir_fpos(struct file *); -+extern int reiser4_attach_fsdata(struct file *, struct inode *); -+extern void reiser4_detach_fsdata(struct file *); -+ -+/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for -+ more details */ -+void reiser4_dispose_cursors(struct inode *inode); -+void reiser4_load_cursors(struct inode *inode); -+void reiser4_kill_cursors(struct inode *inode); -+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de, -+ int offset, int adj); -+ -+/* -+ * this structure is embedded to reise4_super_info_data. It maintains d_cursors -+ * (detached readdir state). See plugin/file_ops_readdir.c for more details. 
-+ */ -+struct d_cursor_info { -+ d_cursor_hash_table table; -+ struct radix_tree_root tree; -+}; -+ -+/* spinlock protecting readdir cursors */ -+extern spinlock_t d_lock; -+ -+/* __REISER4_FSDATA_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/init_super.c linux-2.6.20/fs/reiser4/init_super.c ---- linux-2.6.20.orig/fs/reiser4/init_super.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/init_super.c 2007-05-06 14:50:43.722983224 +0400 -@@ -0,0 +1,750 @@ -+/* Copyright by Hans Reiser, 2003 */ -+ -+#include "super.h" -+#include "inode.h" -+#include "plugin/plugin_set.h" -+ -+#include -+ -+/** -+ * init_fs_info - allocate reiser4 specific super block -+ * @super: super block of filesystem -+ * -+ * Allocates and initialize reiser4_super_info_data, attaches it to -+ * super->s_fs_info, initializes structures maintaining d_cursor-s. -+ */ -+int reiser4_init_fs_info(struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = kmalloc(sizeof(reiser4_super_info_data), -+ reiser4_ctx_gfp_mask_get()); -+ if (!sbinfo) -+ return RETERR(-ENOMEM); -+ -+ super->s_fs_info = sbinfo; -+ super->s_op = NULL; -+ memset(sbinfo, 0, sizeof(*sbinfo)); -+ -+ ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes)); -+ ON_DEBUG(spin_lock_init(&sbinfo->all_guard)); -+ -+ mutex_init(&sbinfo->delete_mutex); -+ spin_lock_init(&(sbinfo->guard)); -+ -+ /* initialize per-super-block d_cursor resources */ -+ reiser4_init_super_d_info(super); -+ -+ return 0; -+} -+ -+/** -+ * reiser4_done_fs_info - free reiser4 specific super block -+ * @super: super block of filesystem -+ * -+ * Performs some sanity checks, releases structures maintaining d_cursor-s, -+ * frees reiser4_super_info_data. 
-+ */ -+void reiser4_done_fs_info(struct super_block *super) -+{ -+ assert("zam-990", super->s_fs_info != NULL); -+ -+ /* release per-super-block d_cursor resources */ -+ reiser4_done_super_d_info(super); -+ -+ /* make sure that there are not jnodes already */ -+ assert("", list_empty(&get_super_private(super)->all_jnodes)); -+ assert("", get_current_context()->trans->atom == NULL); -+ reiser4_check_block_counters(super); -+ kfree(super->s_fs_info); -+ super->s_fs_info = NULL; -+} -+ -+/* type of option parseable by parse_option() */ -+typedef enum { -+ /* value of option is arbitrary string */ -+ OPT_STRING, -+ -+ /* -+ * option specifies bit in a bitmask. When option is set - bit in -+ * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush, -+ * dont_load_bitmap, atomic_write. -+ */ -+ OPT_BIT, -+ -+ /* -+ * value of option should conform to sprintf() format. Examples are -+ * tmgr.atom_max_size=N, tmgr.atom_max_age=N -+ */ -+ OPT_FORMAT, -+ -+ /* -+ * option can take one of predefined values. Example is onerror=panic or -+ * onerror=remount-ro -+ */ -+ OPT_ONEOF, -+} opt_type_t; -+ -+typedef struct opt_bitmask_bit { -+ const char *bit_name; -+ int bit_nr; -+} opt_bitmask_bit; -+ -+/* description of option parseable by parse_option() */ -+typedef struct opt_desc { -+ /* option name. -+ -+ parsed portion of string has a form "name=value". 
-+ */ -+ const char *name; -+ /* type of option */ -+ opt_type_t type; -+ union { -+ /* where to store value of string option (type == OPT_STRING) */ -+ char **string; -+ /* description of bits for bit option (type == OPT_BIT) */ -+ struct { -+ int nr; -+ void *addr; -+ } bit; -+ /* description of format and targets for format option (type -+ == OPT_FORMAT) */ -+ struct { -+ const char *format; -+ int nr_args; -+ void *arg1; -+ void *arg2; -+ void *arg3; -+ void *arg4; -+ } f; -+ struct { -+ int *result; -+ const char *list[10]; -+ } oneof; -+ struct { -+ void *addr; -+ int nr_bits; -+ opt_bitmask_bit *bits; -+ } bitmask; -+ } u; -+} opt_desc_t; -+ -+/** -+ * parse_option - parse one option -+ * @opt_strin: starting point of parsing -+ * @opt: option description -+ * -+ * foo=bar, -+ * ^ ^ ^ -+ * | | +-- replaced to '\0' -+ * | +-- val_start -+ * +-- opt_string -+ * Figures out option type and handles option correspondingly. -+ */ -+static int parse_option(char *opt_string, opt_desc_t *opt) -+{ -+ char *val_start; -+ int result; -+ const char *err_msg; -+ -+ /* NOTE-NIKITA think about using lib/cmdline.c functions here. 
*/ -+ -+ val_start = strchr(opt_string, '='); -+ if (val_start != NULL) { -+ *val_start = '\0'; /* cut the option name off at '=' */ -+ ++val_start; -+ } -+ -+ err_msg = NULL; -+ result = 0; -+ switch (opt->type) { -+ case OPT_STRING: -+ if (val_start == NULL) { -+ err_msg = "String arg missing"; -+ result = RETERR(-EINVAL); -+ } else -+ *opt->u.string = val_start; /* points into opt_string, nothing is copied */ -+ break; -+ case OPT_BIT: -+ if (val_start != NULL) -+ err_msg = "Value ignored"; /* NOTE(review): warning only - result stays 0 and the bit is NOT set on this path */ -+ else -+ set_bit(opt->u.bit.nr, opt->u.bit.addr); -+ break; -+ case OPT_FORMAT: -+ if (val_start == NULL) { -+ err_msg = "Formatted arg missing"; -+ result = RETERR(-EINVAL); -+ break; -+ } -+ if (sscanf(val_start, opt->u.f.format, -+ opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3, -+ opt->u.f.arg4) != opt->u.f.nr_args) { /* all four targets are always passed; exactly nr_args conversions must succeed */ -+ err_msg = "Wrong conversion"; -+ result = RETERR(-EINVAL); -+ } -+ break; -+ case OPT_ONEOF: -+ { -+ int i = 0; -+ -+ if (val_start == NULL) { -+ err_msg = "Value is missing"; -+ result = RETERR(-EINVAL); -+ break; -+ } -+ err_msg = "Wrong option value"; /* assume failure until a list entry matches */ -+ result = RETERR(-EINVAL); -+ while (opt->u.oneof.list[i]) { -+ if (!strcmp(opt->u.oneof.list[i], val_start)) { -+ result = 0; -+ err_msg = NULL; -+ *opt->u.oneof.result = i; -+ break; -+ } -+ i++; -+ } -+ break; -+ } -+ default: -+ wrong_return_value("nikita-2100", "opt -> type"); -+ break; -+ } -+ if (err_msg != NULL) { -+ warning("nikita-2496", "%s when parsing option \"%s%s%s\"", -+ err_msg, opt->name, val_start ? "=" : "", -+ val_start ? : ""); /* GNU "x ? : y" extension: yields val_start itself when it is non-NULL */ -+ } -+ return result; -+} -+ -+/** -+ * parse_options - parse reiser4 mount options -+ * @opt_string: starting point -+ * @opts: array of option description -+ * @nr_opts: number of elements in @opts -+ * -+ * Parses comma separated list of reiser4 mount options. 
-+ */ -+static int parse_options(char *opt_string, opt_desc_t *opts, int nr_opts) -+{ -+ int result; -+ -+ result = 0; -+ /* stop at the first option that fails to parse */ -+ while ((result == 0) && opt_string && *opt_string) { -+ int j; -+ char *next; -+ -+ /* split the current option off the rest of the list */ -+ next = strchr(opt_string, ','); -+ if (next != NULL) { -+ *next = '\0'; -+ ++next; -+ } -+ for (j = 0; j < nr_opts; ++j) { -+ size_t len = strlen(opts[j].name); -+ -+ /* -+ * the whole option name has to match: a registered name -+ * that is merely a prefix of the token ("bsdgroupsXYZ") -+ * must not be accepted. After the name only the end of -+ * the token or '=' may follow. -+ */ -+ if (!strncmp(opt_string, opts[j].name, len) && -+ (opt_string[len] == '\0' || opt_string[len] == '=')) { -+ result = parse_option(opt_string, &opts[j]); -+ break; -+ } -+ } -+ if (j == nr_opts) { -+ warning("nikita-2307", "Unrecognized option: \"%s\"", -+ opt_string); -+ /* traditionally, -EINVAL is returned on wrong mount -+ option */ -+ result = RETERR(-EINVAL); -+ } -+ opt_string = next; -+ } -+ return result; -+} -+ -+/* initializer of an OPT_FORMAT option that scans a single value into @addr */ -+#define NUM_OPT( label, fmt, addr ) \ -+ { \ -+ .name = ( label ), \ -+ .type = OPT_FORMAT, \ -+ .u = { \ -+ .f = { \ -+ .format = ( fmt ), \ -+ .nr_args = 1, \ -+ .arg1 = ( addr ), \ -+ .arg2 = NULL, \ -+ .arg3 = NULL, \ -+ .arg4 = NULL \ -+ } \ -+ } \ -+ } -+ -+/* option named after a field of sbinfo; expects a local variable "sbinfo" */ -+#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field ) -+ -+/* initializer of an OPT_BIT option setting bit @bitnr in sbinfo->fs_flags */ -+#define BIT_OPT(label, bitnr) \ -+ { \ -+ .name = label, \ -+ .type = OPT_BIT, \ -+ .u = { \ -+ .bit = { \ -+ .nr = bitnr, \ -+ .addr = &sbinfo->fs_flags \ -+ } \ -+ } \ -+ } -+ -+#define MAX_NR_OPTIONS (30) -+ -+/** -+ * reiser4_init_super_data - initialize reiser4 private super block -+ * @super: super block to initialize -+ * @opt_string: list of reiser4 mount options -+ * -+ * Sets various reiser4 parameters to default values. Parses mount options and -+ * overwrites default settings. 
-+ */ -+int reiser4_init_super_data(struct super_block *super, char *opt_string) -+{ -+ int result; -+ opt_desc_t *opts, *p; -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ -+ /* initialize super, export, dentry operations */ -+ sbinfo->ops.super = reiser4_super_operations; -+ sbinfo->ops.export = reiser4_export_operations; -+ sbinfo->ops.dentry = reiser4_dentry_operations; -+ super->s_op = &sbinfo->ops.super; -+ super->s_export_op = &sbinfo->ops.export; -+ -+ /* initialize transaction manager parameters to default values */ -+ sbinfo->tmgr.atom_max_size = totalram_pages / 4; -+ /* kept in seconds (hence / HZ) while options are parsed; multiplied by -+ HZ again after parse_options() below */ -+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ; -+ sbinfo->tmgr.atom_min_size = 256; -+ sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS; -+ -+ /* initialize cbk cache parameter */ -+ sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS; -+ -+ /* initialize flush parameters */ -+ sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD; -+ sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE; -+ sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD; -+ sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES; -+ -+ sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE; -+ -+ /* preliminary tree initializations */ -+ sbinfo->tree.super = super; -+ sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS; -+ sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS; -+ sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS; -+ sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS; -+ rwlock_init(&(sbinfo->tree.tree_lock)); -+ spin_lock_init(&(sbinfo->tree.epoch_lock)); -+ -+ /* initialize default readahead params */ -+ sbinfo->ra_params.max = num_physpages / 4; -+ sbinfo->ra_params.flags = 0; -+ -+ /* allocate memory for structure describing reiser4 mount options */ -+ opts = kmalloc(sizeof(opt_desc_t) * MAX_NR_OPTIONS, -+ reiser4_ctx_gfp_mask_get()); -+ if (opts == NULL) -+ return RETERR(-ENOMEM); -+ -+ /* initialize structure describing reiser4 mount options 
*/ -+ p = opts; -+ -+ /* -+ * opts[] has MAX_NR_OPTIONS slots, so the array is already full when p -+ * reaches opts + MAX_NR_OPTIONS: the check has to be ">=", otherwise one -+ * description is written past the end before the check fires. The -+ * "break" leaves the do-while of PUSH_OPT and thus skips the store. -+ */ -+#if REISER4_DEBUG -+# define OPT_ARRAY_CHECK if ((p) >= (opts) + MAX_NR_OPTIONS) { \ -+ warning ("zam-1046", "opt array is overloaded"); break; \ -+ } -+#else -+# define OPT_ARRAY_CHECK noop -+#endif -+ -+#define PUSH_OPT(...) \ -+do { \ -+ opt_desc_t o = __VA_ARGS__; \ -+ OPT_ARRAY_CHECK; \ -+ *p ++ = o; \ -+} while (0) -+ -+#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format)) -+#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit)) -+ -+ /* -+ * tmgr.atom_max_size=N -+ * Atoms containing more than N blocks will be forced to commit. N is -+ * decimal. -+ */ -+ PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u"); -+ /* -+ * tmgr.atom_max_age=N -+ * Atoms older than N seconds will be forced to commit. N is decimal. -+ */ -+ PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u"); -+ /* -+ * tmgr.atom_min_size=N -+ * In committing an atom to free dirty pages, force the atom less than -+ * N in size to fuse with another one. -+ */ -+ PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u"); -+ /* -+ * tmgr.atom_max_flushers=N -+ * limit of concurrent flushers for one atom. 0 means no limit. -+ */ -+ PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u"); -+ /* -+ * tree.cbk_cache.nr_slots=N -+ * Number of slots in the cbk cache. -+ */ -+ PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u"); -+ /* -+ * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty -+ * leaf-level blocks it will force them to be relocated. -+ */ -+ PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u"); -+ /* -+ * If flush can find a block allocation closer than at most -+ * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that -+ * position. -+ */ -+ PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u"); -+ /* -+ * If we have written this much or more blocks before encountering busy -+ * jnode in flush list - abort flushing hoping that next time we get -+ * called this jnode will be clean already, and we will save some -+ * seeks. 
-+ */ -+ PUSH_SB_FIELD_OPT(flush.written_threshold, "%u"); -+ /* The maximum number of nodes to scan left on a level during flush. */ -+ PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u"); -+ /* preferred IO size */ -+ PUSH_SB_FIELD_OPT(optimal_io_size, "%u"); -+ /* carry flags used for insertion of new nodes */ -+ PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u"); -+ /* carry flags used for insertion of new extents */ -+ PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u"); -+ /* carry flags used for paste operations */ -+ PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u"); -+ /* carry flags used for insert operations */ -+ PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u"); -+ -+#ifdef CONFIG_REISER4_BADBLOCKS -+ /* -+ * Alternative master superblock location in case its original -+ * location is not writeable/accessible. This is offset in BYTES. -+ */ -+ PUSH_SB_FIELD_OPT(altsuper, "%lu"); -+#endif -+ -+ /* turn on BSD-style gid assignment */ -+ PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID); -+ /* turn on 32 bit times */ -+ PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES); -+ /* -+ * Don't load all bitmap blocks at mount time, it is useful for -+ * machines with tiny RAM and large disks. -+ */ -+ PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP); -+ /* disable transaction commits during write() */ -+ PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE); -+ /* disable use of write barriers in the reiser4 log writer. 
*/ -+ PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER); -+ -+ PUSH_OPT( -+ { -+ /* -+ * tree traversal readahead parameters: -+ * -o readahead=MAXNUM:FLAGS -+ * MAXNUM - max number of nodes to request readahead for: -1UL -+ * will set it to max_sane_readahead() -+ * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS, -+ * CONTINUE_ON_PRESENT -+ */ -+ .name = "readahead", -+ .type = OPT_FORMAT, -+ .u = { -+ .f = { -+ .format = "%u:%u", -+ .nr_args = 2, -+ .arg1 = &sbinfo->ra_params.max, -+ .arg2 = &sbinfo->ra_params.flags, -+ .arg3 = NULL, -+ .arg4 = NULL -+ } -+ } -+ } -+ ); -+ -+ /* What to do in case of fs error */ -+ PUSH_OPT( -+ { -+ .name = "onerror", -+ .type = OPT_ONEOF, -+ .u = { -+ .oneof = { -+ .result = &sbinfo->onerror, -+ .list = { -+ "panic", "remount-ro", NULL -+ }, -+ } -+ } -+ } -+ ); -+ -+ /* modify default settings to values set by mount options */ -+ result = parse_options(opt_string, opts, p - opts); -+ /* the descriptions are only needed while parsing */ -+ kfree(opts); -+ if (result != 0) -+ return result; -+ -+ /* correct settings to sanity values */ -+ sbinfo->tmgr.atom_max_age *= HZ; -+ if (sbinfo->tmgr.atom_max_age <= 0) -+ /* overflow */ -+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE; -+ -+ /* round optimal io size down to a multiple of (1 << VFS_BLKSIZE_BITS) -+ bytes; a value smaller than that becomes 0 and is rejected */ -+ sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS; -+ sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS; -+ if (sbinfo->optimal_io_size == 0) { -+ warning("nikita-2497", "optimal_io_size is too small"); -+ return RETERR(-EINVAL); -+ } -+ return result; -+} -+ -+/** -+ * reiser4_init_read_super - read reiser4 master super block -+ * @super: super block to fill -+ * @silent: if 0 - print warnings -+ * -+ * Reads reiser4 master super block either from predefined location or from -+ * location specified by altsuper mount option, initializes disk format plugin. 
-+ */ -+int reiser4_init_read_super(struct super_block *super, int silent) -+{ -+ struct buffer_head *super_bh; -+ struct reiser4_master_sb *master_sb; -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ unsigned long blocksize; -+ -+ read_super_block: -+#ifdef CONFIG_REISER4_BADBLOCKS -+ if (sbinfo->altsuper) -+ /* -+ * read reiser4 master super block at position specified by -+ * mount option -+ */ -+ super_bh = sb_bread(super, -+ (sector_t)(sbinfo->altsuper / super->s_blocksize)); -+ else -+#endif -+ /* read reiser4 master super block at byte offset -+ REISER4_MAGIC_OFFSET (the 16-th 4096 block) */ -+ super_bh = sb_bread(super, -+ (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize)); -+ if (!super_bh) -+ return RETERR(-EIO); -+ -+ master_sb = (struct reiser4_master_sb *)super_bh->b_data; -+ /* check reiser4 magic string */ -+ if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING, -+ sizeof(REISER4_SUPER_MAGIC_STRING))) { -+ /* reiser4 master super block contains filesystem blocksize */ -+ blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize)); -+ -+ if (blocksize != PAGE_CACHE_SIZE) { -+ /* -+ * currently reiser4's blocksize must be equal to -+ * pagesize -+ */ -+ if (!silent) -+ warning("nikita-2609", -+ "%s: wrong block size %ld\n", super->s_id, -+ blocksize); -+ brelse(super_bh); -+ return RETERR(-EINVAL); -+ } -+ if (blocksize != super->s_blocksize) { -+ /* -+ * filesystem uses different blocksize. 
Reread master -+ * super block with correct blocksize -+ */ -+ brelse(super_bh); -+ if (!sb_set_blocksize(super, (int)blocksize)) -+ return RETERR(-EINVAL); -+ goto read_super_block; /* the block number is recomputed from the new s_blocksize */ -+ } -+ -+ sbinfo->df_plug = -+ disk_format_plugin_by_id( -+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id))); -+ if (sbinfo->df_plug == NULL) { -+ if (!silent) -+ warning("nikita-26091", -+ "%s: unknown disk format plugin %d\n", -+ super->s_id, -+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id))); -+ brelse(super_bh); -+ return RETERR(-EINVAL); -+ } -+ sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap)); -+ brelse(super_bh); -+ return 0; -+ } -+ -+ /* there is no reiser4 on the device */ -+ if (!silent) -+ warning("nikita-2608", -+ "%s: wrong master super block magic", super->s_id); -+ brelse(super_bh); -+ return RETERR(-EINVAL); -+} -+ -+static struct { -+ reiser4_plugin_type type; -+ reiser4_plugin_id id; -+} default_plugins[PSET_LAST] = { /* (plugin type, plugin id) pair for each plugin-set member, indexed by pset_member */ -+ [PSET_FILE] = { -+ .type = REISER4_FILE_PLUGIN_TYPE, -+ .id = UNIX_FILE_PLUGIN_ID -+ }, -+ [PSET_DIR] = { -+ .type = REISER4_DIR_PLUGIN_TYPE, -+ .id = HASHED_DIR_PLUGIN_ID -+ }, -+ [PSET_HASH] = { -+ .type = REISER4_HASH_PLUGIN_TYPE, -+ .id = R5_HASH_ID -+ }, -+ [PSET_FIBRATION] = { -+ .type = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_DOT_O -+ }, -+ [PSET_PERM] = { -+ .type = REISER4_PERM_PLUGIN_TYPE, -+ .id = NULL_PERM_ID -+ }, -+ [PSET_FORMATTING] = { -+ .type = REISER4_FORMATTING_PLUGIN_TYPE, -+ .id = SMALL_FILE_FORMATTING_ID -+ }, -+ [PSET_SD] = { -+ .type = REISER4_ITEM_PLUGIN_TYPE, -+ .id = STATIC_STAT_DATA_ID -+ }, -+ [PSET_DIR_ITEM] = { -+ .type = REISER4_ITEM_PLUGIN_TYPE, -+ .id = COMPOUND_DIR_ID -+ }, -+ [PSET_CIPHER] = { -+ .type = REISER4_CIPHER_PLUGIN_TYPE, -+ .id = NONE_CIPHER_ID -+ }, -+ [PSET_DIGEST] = { -+ .type = REISER4_DIGEST_PLUGIN_TYPE, -+ .id = SHA256_32_DIGEST_ID -+ }, -+ [PSET_COMPRESSION] = { -+ .type = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .id = LZO1_COMPRESSION_ID -+ }, -+ 
[PSET_COMPRESSION_MODE] = { -+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = CONVX_COMPRESSION_MODE_ID -+ }, -+ [PSET_CLUSTER] = { -+ .type = REISER4_CLUSTER_PLUGIN_TYPE, -+ .id = CLUSTER_64K_ID -+ }, -+ [PSET_CREATE] = { -+ .type = REISER4_FILE_PLUGIN_TYPE, -+ .id = UNIX_FILE_PLUGIN_ID -+ } -+}; -+ -+/* access to default plugin table: looks up the default plugin of member @memb -+ with plugin_by_id() */ -+reiser4_plugin *get_default_plugin(pset_member memb) -+{ -+ return plugin_by_id(default_plugins[memb].type, -+ default_plugins[memb].id); -+} -+ -+/** -+ * reiser4_init_root_inode - obtain inode of root directory -+ * @super: super block of filesystem -+ * -+ * Obtains inode of root directory (reading it from disk), initializes its -+ * plugin set if it was not initialized. -+ * -+ * Returns 0 on success or a negative error code. -+ */ -+int reiser4_init_root_inode(struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ struct inode *inode; -+ int result = 0; -+ -+ inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0); -+ if (IS_ERR(inode)) -+ return RETERR(PTR_ERR(inode)); -+ -+ super->s_root = d_alloc_root(inode); -+ if (!super->s_root) { -+ iput(inode); -+ return RETERR(-ENOMEM); -+ } -+ -+ super->s_root->d_op = &sbinfo->ops.dentry; -+ -+ if (!is_inode_loaded(inode)) { -+ pset_member memb; -+ plugin_set *pset; -+ -+ pset = reiser4_inode_data(inode)->pset; -+ for (memb = 0; memb < PSET_LAST; ++memb) { -+ -+ if (aset_get(pset, memb) != NULL) -+ continue; -+ -+ result = grab_plugin_pset(inode, NULL, memb); -+ if (result != 0) -+ break; -+ -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); /* a member was filled in: stat-data length is no longer known */ -+ } -+ -+ if (result == 0) { -+ if (REISER4_DEBUG) { -+ for (memb = 0; memb < PSET_LAST; ++memb) -+ assert("nikita-3500", -+ aset_get(pset, memb) != NULL); -+ } -+ } else -+ warning("nikita-3448", "Cannot set plugins of root: %i", -+ result); -+ reiser4_iget_complete(inode); -+ -+ /* As the default pset kept in the root dir may have been changed -+ (length is unknown), call update_sd. 
*/ -+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) { -+ result = reiser4_grab_space( -+ inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+ -+ if (result == 0) -+ result = reiser4_update_sd(inode); -+ -+ all_grabbed2free(); -+ } -+ } -+ -+ super->s_maxbytes = MAX_LFS_FILESIZE; -+ return result; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/inode.c linux-2.6.20/fs/reiser4/inode.c ---- linux-2.6.20.orig/fs/reiser4/inode.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/inode.c 2007-05-06 14:50:43.726984474 +0400 -@@ -0,0 +1,709 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Inode specific operations. */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "kassign.h" -+#include "coord.h" -+#include "seal.h" -+#include "dscale.h" -+#include "plugin/item/item.h" -+#include "plugin/security/perm.h" -+#include "plugin/plugin.h" -+#include "plugin/object.h" -+#include "znode.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include <linux/fs.h> /* for struct super_block, address_space */ -+ -+/* return reiser4 internal tree which inode belongs to */ -+/* Audited by: green(2002.06.17) */ -+reiser4_tree *reiser4_tree_by_inode(const struct inode *inode /* inode queried */ ) -+{ -+ assert("nikita-256", inode != NULL); -+ assert("nikita-257", inode->i_sb != NULL); -+ return reiser4_get_tree(inode->i_sb); -+} -+ -+/* return pointer to the word holding reiser4-specific inode flags */ -+static inline unsigned long *inode_flags(const struct inode *const inode) -+{ -+ assert("nikita-2842", inode != NULL); -+ return &reiser4_inode_data(inode)->flags; -+} -+ -+/* set reiser4-specific flag @f in @inode */ -+void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f) -+{ -+ 
assert("nikita-2248", inode != NULL); -+ set_bit((int)f, inode_flags(inode)); -+} -+ -+/* clear reiser4-specific flag @f in @inode */ -+void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f) -+{ -+ assert("nikita-2250", inode != NULL); -+ clear_bit((int)f, inode_flags(inode)); -+} -+ -+/* true if reiser4-specific flag @f is set in @inode */ -+int reiser4_inode_get_flag(const struct inode *inode, -+ reiser4_file_plugin_flags f) -+{ -+ assert("nikita-2251", inode != NULL); -+ return test_bit((int)f, inode_flags(inode)); -+} -+ -+/* convert oid to inode number */ -+ino_t oid_to_ino(oid_t oid) -+{ -+ return (ino_t) oid; -+} -+ -+/* convert oid to user visible inode number */ -+ino_t oid_to_uino(oid_t oid) -+{ -+ /* reiser4 object is uniquely identified by oid which is 64 bit -+ quantity. Kernel in-memory inode is indexed (in the hash table) by -+ 32 bit i_ino field, but this is not a problem, because there is a -+ way to further distinguish inodes with identical inode numbers -+ (find_actor supplied to iget()). -+ -+ But user space expects unique 32 bit inode number. Obviously this -+ is impossible. Work-around is to somehow hash oid into user visible -+ inode number. -+ */ -+ oid_t max_ino = (ino_t) ~ 0; -+ -+ if (REISER4_INO_IS_OID || (oid <= max_ino)) -+ return oid; -+ else -+ /* this is remotely similar to algorithm used to find next pid -+ to use for process: after wrap-around start from some -+ offset rather than from 0. Idea is that there are some long -+ living objects with which we don't want to collide. -+ */ -+ return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1)); -+} -+ -+/* check that "inode" is on reiser4 file-system */ -+int is_reiser4_inode(const struct inode *inode /* inode queried */ ) -+{ -+ return inode != NULL && is_reiser4_super(inode->i_sb); -+} -+ -+/* Maximal length of a name that can be stored in directory @inode. -+ -+ This is used in check during file creation and lookup. 
*/ -+int reiser4_max_filename_len(const struct inode *inode /* inode queried */ ) -+{ -+ assert("nikita-287", is_reiser4_inode(inode)); -+ assert("nikita-1710", inode_dir_item_plugin(inode)); -+ if (inode_dir_item_plugin(inode)->s.dir.max_name_len) -+ return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode); -+ else -+ return 255; /* default when the directory item plugin has no max_name_len method */ -+} -+ -+#if REISER4_USE_COLLISION_LIMIT -+/* Maximal number of hash collisions for this directory. */ -+int max_hash_collisions(const struct inode *dir /* inode queried */ ) -+{ -+ assert("nikita-1711", dir != NULL); -+ return reiser4_inode_data(dir)->plugin.max_collisions; -+} -+#endif /* REISER4_USE_COLLISION_LIMIT */ -+ -+/* Install file, inode, and address_space operations on @inode, depending on -+ its mode. Returns 0, or -EINVAL (after marking the inode bad) when the -+ file type in ->i_mode is not recognised. */ -+int setup_inode_ops(struct inode *inode /* inode to initialize */ , -+ reiser4_object_create_data * data /* parameters to create -+ * object */ ) -+{ -+ reiser4_super_info_data *sinfo; -+ file_plugin *fplug; -+ dir_plugin *dplug; -+ -+ fplug = inode_file_plugin(inode); -+ dplug = inode_dir_plugin(inode); -+ -+ sinfo = get_super_private(inode->i_sb); /* NOTE(review): sinfo is not used again in this function */ -+ -+ switch (inode->i_mode & S_IFMT) { -+ case S_IFSOCK: -+ case S_IFBLK: -+ case S_IFCHR: -+ case S_IFIFO: -+ { -+ dev_t rdev; /* to keep gcc happy */ -+ -+ assert("vs-46", fplug != NULL); -+ /* ugly hack with rdev: without @data the device number is taken -+ from (and cleared in) the inode itself */ -+ if (data == NULL) { -+ rdev = inode->i_rdev; -+ inode->i_rdev = 0; -+ } else -+ rdev = data->rdev; -+ inode->i_blocks = 0; -+ assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID); -+ inode->i_op = &file_plugins[fplug->h.id].inode_ops; -+ /* initialize inode->i_fop and inode->i_rdev for block and char -+ devices */ -+ init_special_inode(inode, inode->i_mode, rdev); -+ /* all address space operations are null */ -+ inode->i_mapping->a_ops = -+ &file_plugins[fplug->h.id].as_ops; -+ break; -+ } -+ case S_IFLNK: -+ assert("vs-46", fplug != NULL); -+ assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID); -+ inode->i_op = &file_plugins[fplug->h.id].inode_ops; 
-+ inode->i_fop = NULL; -+ /* all address space operations are null */ -+ inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops; -+ break; -+ case S_IFDIR: -+ assert("vs-46", dplug != NULL); -+ assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID || -+ dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID)); -+ inode->i_op = &dir_plugins[dplug->h.id].inode_ops; -+ inode->i_fop = &dir_plugins[dplug->h.id].file_ops; -+ inode->i_mapping->a_ops = &dir_plugins[dplug->h.id].as_ops; -+ break; -+ case S_IFREG: -+ assert("vs-46", fplug != NULL); -+ assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID || -+ fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ inode->i_op = &file_plugins[fplug->h.id].inode_ops; -+ inode->i_fop = &file_plugins[fplug->h.id].file_ops; -+ inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops; -+ break; -+ default: -+ warning("nikita-291", "wrong file mode: %o for %llu", -+ inode->i_mode, -+ (unsigned long long)get_inode_oid(inode)); -+ reiser4_make_bad_inode(inode); -+ return RETERR(-EINVAL); -+ } -+ return 0; -+} -+ -+/* Initialize inode from disk data. Called with inode locked. -+ Return inode locked. 
*/ -+static int init_inode(struct inode *inode /* inode to initialise */ , -+ coord_t * coord /* coord of stat data */ ) -+{ -+ int result; -+ item_plugin *iplug; -+ void *body; -+ int length; -+ reiser4_inode *state; -+ -+ assert("nikita-292", coord != NULL); -+ assert("nikita-293", inode != NULL); -+ -+ coord_clear_iplug(coord); -+ result = zload(coord->node); /* paired with zrelse() at the end */ -+ if (result) -+ return result; -+ iplug = item_plugin_by_coord(coord); -+ body = item_body_by_coord(coord); -+ length = item_length_by_coord(coord); -+ -+ assert("nikita-295", iplug != NULL); -+ assert("nikita-296", body != NULL); -+ assert("nikita-297", length > 0); -+ -+ /* inode is under I_LOCK now */ -+ -+ state = reiser4_inode_data(inode); -+ /* call stat-data plugin method to load sd content into inode */ -+ result = iplug->s.sd.init_inode(inode, body, length); -+ set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug)); /* done even when loading the stat data failed */ -+ if (result == 0) { -+ result = setup_inode_ops(inode, NULL); -+ if (result == 0 && inode->i_sb->s_root && -+ inode->i_sb->s_root->d_inode) /* finish_pset() is skipped while the root dentry or its inode is not set up */ -+ result = finish_pset(inode); -+ } -+ zrelse(coord->node); -+ return result; -+} -+ -+/* read `inode' from the disk. This is what was previously in -+ reiserfs_read_inode2(). -+ -+ Must be called with inode locked. Return inode still locked. 
-+*/ -+static int read_inode(struct inode *inode /* inode to read from disk */ , -+ const reiser4_key * key /* key of stat data */ , -+ int silent) -+{ -+ int result; -+ lock_handle lh; -+ reiser4_inode *info; -+ coord_t coord; -+ -+ assert("nikita-298", inode != NULL); -+ assert("nikita-1945", !is_inode_loaded(inode)); -+ -+ info = reiser4_inode_data(inode); -+ assert("nikita-300", info->locality_id != 0); -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ /* locate stat-data in a tree and return znode locked */ -+ result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent); -+ assert("nikita-301", !is_inode_loaded(inode)); -+ if (result == 0) { -+ /* use stat-data plugin to load sd into inode. */ -+ result = init_inode(inode, &coord); -+ if (result == 0) { -+ /* initialize stat-data seal */ -+ spin_lock_inode(inode); -+ reiser4_seal_init(&info->sd_seal, &coord, key); -+ info->sd_coord = coord; -+ spin_unlock_inode(inode); -+ -+ /* call file plugin's method to initialize plugin -+ * specific part of inode */ -+ if (inode_file_plugin(inode)->init_inode_data) -+ inode_file_plugin(inode)->init_inode_data(inode, -+ NULL, -+ 0); -+ /* load detached directory cursors for stateless -+ * directory readers (NFS). */ -+ reiser4_load_cursors(inode); -+ -+ /* Check the opened inode for consistency. */ -+ result = -+ get_super_private(inode->i_sb)->df_plug-> -+ check_open(inode); -+ } -+ } -+ /* lookup_sd() doesn't release coord because we want znode -+ stay read-locked while stat-data fields are accessed in -+ init_inode() */ -+ done_lh(&lh); -+ -+ if (result != 0) -+ reiser4_make_bad_inode(inode); -+ return result; -+} -+ -+/* initialise new reiser4 inode being inserted into hash table. 
*/ -+static int init_locked_inode(struct inode *inode /* new inode */ , -+ void *opaque /* key of stat data passed to the -+ * iget5_locked as cookie */ ) -+{ -+ reiser4_key *key; -+ -+ assert("nikita-1995", inode != NULL); -+ assert("nikita-1996", opaque != NULL); -+ key = opaque; -+ set_inode_oid(inode, get_key_objectid(key)); -+ reiser4_inode_data(inode)->locality_id = get_key_locality(key); -+ return 0; -+} -+ -+/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked(). -+ -+ This function is called by iget5_locked() to distinguish reiser4 inodes -+ having the same inode numbers. Such inodes can only exist due to some error -+ condition. One of them should be bad. Inodes with identical inode numbers -+ (objectids) are distinguished by their packing locality. -+ -+*/ -+static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to -+ * check */ , -+ void *opaque /* "cookie" passed to -+ * iget5_locked(). This is stat data -+ * key */ ) -+{ -+ reiser4_key *key; -+ -+ key = opaque; -+ return -+ /* oid is unique, so first term is enough, actually. */ -+ get_inode_oid(inode) == get_key_objectid(key) && -+ /* -+ * also, locality should be checked, but locality is stored in -+ * the reiser4-specific part of the inode, and actor can be -+ * called against arbitrary inode that happened to be in this -+ * hash chain. Hence we first have to check that this is -+ * reiser4 inode at least. is_reiser4_inode() is probably too -+ * early to call, as inode may have ->i_op not yet -+ * initialised. -+ */ -+ is_reiser4_super(inode->i_sb) && -+ /* -+ * usually objectid is unique, but pseudo files use counter to -+ * generate objectid. All pseudo files are placed into special -+ * (otherwise unused) locality. 
-+ */ -+ reiser4_inode_data(inode)->locality_id == get_key_locality(key); -+} -+ -+/* hook for kmem_cache_create: initialises the ->loading mutex */ -+void loading_init_once(reiser4_inode * info) -+{ -+ mutex_init(&info->loading); -+} -+ -+/* for reiser4_alloc_inode: checks that ->loading is not held */ -+void loading_alloc(reiser4_inode * info) -+{ -+ assert("vs-1717", !mutex_is_locked(&info->loading)); -+} -+ -+/* for reiser4_destroy: checks that ->loading is not held */ -+void loading_destroy(reiser4_inode * info) -+{ -+ assert("vs-1717a", !mutex_is_locked(&info->loading)); -+} -+ -+static void loading_begin(reiser4_inode * info) -+{ -+ mutex_lock(&info->loading); -+} -+ -+static void loading_end(reiser4_inode * info) -+{ -+ mutex_unlock(&info->loading); -+} -+ -+/** -+ * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary -+ * @super: super block of filesystem -+ * @key: key of inode's stat-data -+ * @silent: passed on unchanged to read_inode() -+ * -+ * This is our helper function a la iget(). This is called by -+ * lookup_common() and reiser4_read_super(). Return inode locked or error -+ * encountered (as ERR_PTR()). -+ */ -+struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key, -+ int silent) -+{ -+ struct inode *inode; -+ int result; -+ reiser4_inode *info; -+ -+ assert("nikita-302", super != NULL); -+ assert("nikita-303", key != NULL); -+ -+ result = 0; -+ -+ /* call iget5_locked(). Our ->read_inode() is dummy, so this will either -+ find inode in cache or return uninitialised inode */ -+ inode = iget5_locked(super, -+ (unsigned long)get_key_objectid(key), -+ reiser4_inode_find_actor, -+ init_locked_inode, (reiser4_key *) key); -+ if (inode == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ if (is_bad_inode(inode)) { -+ warning("nikita-304", "Bad inode found"); -+ reiser4_print_key("key", key); -+ iput(inode); -+ return ERR_PTR(RETERR(-EIO)); -+ } -+ -+ info = reiser4_inode_data(inode); -+ -+ /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully -+ loaded and initialized inode from just allocated inode. 
If -+ REISER4_LOADED bit is not set, reiser4_iget() completes loading under -+ info->loading. The place in reiser4 which uses not initialized inode -+ is the reiser4 repacker, see repacker-related functions in -+ plugin/item/extent.c */ -+ if (!is_inode_loaded(inode)) { -+ loading_begin(info); -+ if (!is_inode_loaded(inode)) { /* re-checked now that info->loading is held */ -+ /* locking: iget5_locked returns locked inode */ -+ assert("nikita-1941", !is_inode_loaded(inode)); -+ assert("nikita-1949", -+ reiser4_inode_find_actor(inode, -+ (reiser4_key *) key)); -+ /* now, inode has objectid as ->i_ino and locality in -+ reiser4-specific part. This is enough for -+ read_inode() to read stat data from the disk */ -+ result = read_inode(inode, key, silent); /* info->loading stays held on this path */ -+ } else -+ loading_end(info); -+ } -+ -+ if (inode->i_state & I_NEW) -+ unlock_new_inode(inode); -+ -+ if (is_bad_inode(inode)) { -+ assert("vs-1717", result != 0); -+ loading_end(info); /* release info->loading taken by loading_begin() above */ -+ iput(inode); -+ inode = ERR_PTR(result); -+ } else if (REISER4_DEBUG) { -+ reiser4_key found_key; -+ -+ assert("vs-1717", result == 0); -+ build_sd_key(inode, &found_key); -+ if (!keyeq(&found_key, key)) { -+ warning("nikita-305", "Wrong key in sd"); -+ reiser4_print_key("sought for", key); -+ reiser4_print_key("found", &found_key); -+ } -+ if (inode->i_nlink == 0) { -+ warning("nikita-3559", "Unlinked inode found: %llu\n", -+ (unsigned long long)get_inode_oid(inode)); -+ } -+ } -+ return inode; -+} -+ -+/* reiser4_iget() may return not fully initialized inode, this function should -+ * be called after one completes reiser4 inode initializing. 
*/ -+void reiser4_iget_complete(struct inode *inode) -+{ -+ assert("zam-988", is_reiser4_inode(inode)); -+ -+ if (!is_inode_loaded(inode)) { -+ reiser4_inode_set_flag(inode, REISER4_LOADED); -+ loading_end(reiser4_inode_data(inode)); -+ } -+} -+ -+void reiser4_make_bad_inode(struct inode *inode) -+{ -+ assert("nikita-1934", inode != NULL); -+ -+ /* clear LOADED bit */ -+ reiser4_inode_clr_flag(inode, REISER4_LOADED); -+ make_bad_inode(inode); -+ return; -+} -+ -+file_plugin *inode_file_plugin(const struct inode * inode) -+{ -+ assert("nikita-1997", inode != NULL); -+ return reiser4_inode_data(inode)->pset->file; -+} -+ -+dir_plugin *inode_dir_plugin(const struct inode * inode) -+{ -+ assert("nikita-1998", inode != NULL); -+ return reiser4_inode_data(inode)->pset->dir; -+} -+ -+formatting_plugin *inode_formatting_plugin(const struct inode * inode) -+{ -+ assert("nikita-2000", inode != NULL); -+ return reiser4_inode_data(inode)->pset->formatting; -+} -+ -+hash_plugin *inode_hash_plugin(const struct inode * inode) -+{ -+ assert("nikita-2001", inode != NULL); -+ return reiser4_inode_data(inode)->pset->hash; -+} -+ -+fibration_plugin *inode_fibration_plugin(const struct inode * inode) -+{ -+ assert("nikita-2001", inode != NULL); -+ return reiser4_inode_data(inode)->pset->fibration; -+} -+ -+cipher_plugin *inode_cipher_plugin(const struct inode * inode) -+{ -+ assert("edward-36", inode != NULL); -+ return reiser4_inode_data(inode)->pset->cipher; -+} -+ -+compression_plugin *inode_compression_plugin(const struct inode * inode) -+{ -+ assert("edward-37", inode != NULL); -+ return reiser4_inode_data(inode)->pset->compression; -+} -+ -+compression_mode_plugin *inode_compression_mode_plugin(const struct inode * -+ inode) -+{ -+ assert("edward-1330", inode != NULL); -+ return reiser4_inode_data(inode)->pset->compression_mode; -+} -+ -+cluster_plugin *inode_cluster_plugin(const struct inode * inode) -+{ -+ assert("edward-1328", inode != NULL); -+ return 
reiser4_inode_data(inode)->pset->cluster; -+} -+ -+file_plugin *inode_create_plugin(const struct inode * inode) -+{ -+ assert("edward-1329", inode != NULL); -+ return reiser4_inode_data(inode)->pset->create; -+} -+ -+digest_plugin *inode_digest_plugin(const struct inode * inode) -+{ -+ assert("edward-86", inode != NULL); -+ return reiser4_inode_data(inode)->pset->digest; -+} -+ -+item_plugin *inode_sd_plugin(const struct inode * inode) -+{ -+ assert("vs-534", inode != NULL); -+ return reiser4_inode_data(inode)->pset->sd; -+} -+ -+item_plugin *inode_dir_item_plugin(const struct inode * inode) -+{ -+ assert("vs-534", inode != NULL); -+ return reiser4_inode_data(inode)->pset->dir_item; -+} -+ -+file_plugin *child_create_plugin(const struct inode * inode) -+{ -+ assert("edward-1329", inode != NULL); -+ return reiser4_inode_data(inode)->hset->create; -+} -+ -+void inode_set_extension(struct inode *inode, sd_ext_bits ext) -+{ -+ reiser4_inode *state; -+ -+ assert("nikita-2716", inode != NULL); -+ assert("nikita-2717", ext < LAST_SD_EXTENSION); -+ assert("nikita-3491", spin_inode_is_locked(inode)); -+ -+ state = reiser4_inode_data(inode); -+ state->extmask |= 1 << ext; -+ /* force re-calculation of stat-data length on next call to -+ update_sd(). */ -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+} -+ -+void inode_clr_extension(struct inode *inode, sd_ext_bits ext) -+{ -+ reiser4_inode *state; -+ -+ assert("vpf-1926", inode != NULL); -+ assert("vpf-1927", ext < LAST_SD_EXTENSION); -+ assert("vpf-1928", spin_inode_is_locked(inode)); -+ -+ state = reiser4_inode_data(inode); -+ state->extmask &= ~(1 << ext); -+ /* force re-calculation of stat-data length on next call to -+ update_sd(). 
*/ -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+} -+ -+void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new) -+{ -+ assert("edward-1287", inode != NULL); -+ if (!dscale_fit(old, new)) -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+ return; -+} -+ -+void inode_check_scale(struct inode *inode, __u64 old, __u64 new) -+{ -+ assert("nikita-2875", inode != NULL); -+ spin_lock_inode(inode); -+ inode_check_scale_nolock(inode, old, new); -+ spin_unlock_inode(inode); -+} -+ -+/* -+ * initialize ->ordering field of inode. This field defines how file stat-data -+ * and body is ordered within a tree with respect to other objects within the -+ * same parent directory. -+ */ -+void -+init_inode_ordering(struct inode *inode, -+ reiser4_object_create_data * crd, int create) -+{ -+ reiser4_key key; -+ -+ if (create) { -+ struct inode *parent; -+ -+ parent = crd->parent; -+ assert("nikita-3224", inode_dir_plugin(parent) != NULL); -+ inode_dir_plugin(parent)->build_entry_key(parent, -+ &crd->dentry->d_name, -+ &key); -+ } else { -+ coord_t *coord; -+ -+ coord = &reiser4_inode_data(inode)->sd_coord; -+ coord_clear_iplug(coord); -+ /* safe to use ->sd_coord, because node is under long term -+ * lock */ -+ WITH_DATA(coord->node, item_key_by_coord(coord, &key)); -+ } -+ -+ set_inode_ordering(inode, get_key_ordering(&key)); -+} -+ -+znode *inode_get_vroot(struct inode *inode) -+{ -+ reiser4_block_nr blk; -+ znode *result; -+ -+ spin_lock_inode(inode); -+ blk = reiser4_inode_data(inode)->vroot; -+ spin_unlock_inode(inode); -+ if (!disk_addr_eq(&UBER_TREE_ADDR, &blk)) -+ result = zlook(reiser4_tree_by_inode(inode), &blk); -+ else -+ result = NULL; -+ return result; -+} -+ -+void inode_set_vroot(struct inode *inode, znode *vroot) -+{ -+ spin_lock_inode(inode); -+ reiser4_inode_data(inode)->vroot = *znode_get_block(vroot); -+ spin_unlock_inode(inode); -+} -+ -+#if REISER4_DEBUG -+ -+void reiser4_inode_invariant(const struct inode *inode) -+{ -+ 
assert("nikita-3077", spin_inode_is_locked(inode)); -+} -+ -+int inode_has_no_jnodes(reiser4_inode * r4_inode) -+{ -+ return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL && -+ r4_inode->nr_jnodes == 0; -+} -+ -+#endif -+ -+/* true if directory is empty (only contains dot and dotdot) */ -+/* FIXME: shouldn't it be dir plugin method? */ -+int is_dir_empty(const struct inode *dir) -+{ -+ assert("nikita-1976", dir != NULL); -+ -+ /* rely on our method to maintain directory i_size being equal to the -+ number of entries. */ -+ return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY); -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/inode.h linux-2.6.20/fs/reiser4/inode.h ---- linux-2.6.20.orig/fs/reiser4/inode.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/inode.h 2007-05-06 14:50:43.726984474 +0400 -@@ -0,0 +1,438 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Inode functions. */ -+ -+#if !defined( __REISER4_INODE_H__ ) -+#define __REISER4_INODE_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "seal.h" -+#include "plugin/plugin.h" -+#include "plugin/file/cryptcompress.h" -+#include "plugin/file/file.h" -+#include "plugin/dir/dir.h" -+#include "plugin/plugin_set.h" -+#include "plugin/security/perm.h" -+#include "vfs_ops.h" -+#include "jnode.h" -+#include "fsdata.h" -+ -+#include /* for __u?? , ino_t */ -+#include /* for struct super_block, struct -+ * rw_semaphore, etc */ -+#include -+#include -+ -+/* reiser4-specific inode flags. They are "transient" and are not -+ supposed to be stored on disk. 
Used to trace "state" of -+ inode -+*/ -+typedef enum { -+ /* this is light-weight inode, inheriting some state from its -+ parent */ -+ REISER4_LIGHT_WEIGHT = 0, -+ /* stat data wasn't yet created */ -+ REISER4_NO_SD = 1, -+ /* internal immutable flag. Currently is only used -+ to avoid race condition during file creation. -+ See comment in create_object(). */ -+ REISER4_IMMUTABLE = 2, -+ /* inode was read from storage */ -+ REISER4_LOADED = 3, -+ /* this bit is set for symlinks. inode->i_private points to target -+ name of symlink. */ -+ REISER4_GENERIC_PTR_USED = 4, -+ /* set if size of stat-data item for this inode is known. If this is -+ * set we can avoid recalculating size of stat-data on each update. */ -+ REISER4_SDLEN_KNOWN = 5, -+ /* reiser4_inode->crypt points to the crypto stat */ -+ REISER4_CRYPTO_STAT_LOADED = 6, -+ /* cryptcompress_inode_data points to the secret key */ -+ REISER4_SECRET_KEY_INSTALLED = 7, -+ /* File (possibly) has pages corresponding to the tail items, that -+ * were created by ->readpage. It is set by mmap_unix_file() and -+ * sendfile_unix_file(). This bit is inspected by write_unix_file and -+ * kill-hook of tail items. It is never cleared once set. This bit is -+ * modified and inspected under i_mutex. */ -+ REISER4_HAS_MMAP = 8, -+ REISER4_PART_MIXED = 9, -+ REISER4_PART_IN_CONV = 10, -+ /* This flag indicates that file plugin conversion is in progress */ -+ REISER4_FILE_CONV_IN_PROGRESS = 11 -+} reiser4_file_plugin_flags; -+ -+/* state associated with each inode. -+ reiser4 inode. -+ -+ NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes -+ be of the same size. File-system allocates inodes by itself through -+ s_op->allocate_inode() method. So, it is possible to adjust size of inode -+ at the time of its creation. 
-+ -+ Invariants involving parts of this data-type: -+ -+ [inode->eflushed] -+ -+*/ -+ -+typedef struct reiser4_inode reiser4_inode; -+/* return pointer to reiser4-specific part of inode */ -+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode -+ /* inode queried */ ); -+ -+#if BITS_PER_LONG == 64 -+ -+#define REISER4_INO_IS_OID (1) -+typedef struct {; -+} oid_hi_t; -+ -+/* BITS_PER_LONG == 64 */ -+#else -+ -+#define REISER4_INO_IS_OID (0) -+typedef __u32 oid_hi_t; -+ -+/* BITS_PER_LONG == 64 */ -+#endif -+ -+struct reiser4_inode { -+ /* spin lock protecting fields of this structure. */ -+ spinlock_t guard; -+ /* main plugin set that control the file -+ (see comments in plugin/plugin_set.c) */ -+ plugin_set *pset; -+ /* plugin set for inheritance -+ (see comments in plugin/plugin_set.c) */ -+ plugin_set *hset; -+ /* high 32 bits of object id */ -+ oid_hi_t oid_hi; -+ /* seal for stat-data */ -+ seal_t sd_seal; -+ /* locality id for this file */ -+ oid_t locality_id; -+#if REISER4_LARGE_KEY -+ __u64 ordering; -+#endif -+ /* coord of stat-data in sealed node */ -+ coord_t sd_coord; -+ /* bit-mask of stat-data extentions used by this file */ -+ __u64 extmask; -+ /* bitmask of non-default plugins for this inode */ -+ __u16 plugin_mask; -+ /* bitmask of set heir plugins for this inode. */ -+ __u16 heir_mask; -+ union { -+ struct list_head readdir_list; -+ struct list_head not_used; -+ } lists; -+ /* per-inode flags. Filled by values of reiser4_file_plugin_flags */ -+ unsigned long flags; -+ union { -+ /* fields specific to unix_file plugin */ -+ unix_file_info_t unix_file_info; -+ /* fields specific to cryptcompress plugin */ -+ cryptcompress_info_t cryptcompress_info; -+ } file_plugin_data; -+ -+ /* this semaphore is to serialize readers and writers of @pset->file -+ * when file plugin conversion is enabled -+ */ -+ struct rw_semaphore conv_sem; -+ -+ /* tree of jnodes. 
Phantom jnodes (ones not attched to any atom) are -+ tagged in that tree by EFLUSH_TAG_ANONYMOUS */ -+ struct radix_tree_root jnodes_tree; -+#if REISER4_DEBUG -+ /* number of unformatted node jnodes of this file in jnode hash table */ -+ unsigned long nr_jnodes; -+#endif -+ -+ /* block number of virtual root for this object. See comment above -+ * fs/reiser4/search.c:handle_vroot() */ -+ reiser4_block_nr vroot; -+ struct mutex loading; -+}; -+ -+void loading_init_once(reiser4_inode *); -+void loading_alloc(reiser4_inode *); -+void loading_destroy(reiser4_inode *); -+ -+typedef struct reiser4_inode_object { -+ /* private part */ -+ reiser4_inode p; -+ /* generic fields not specific to reiser4, but used by VFS */ -+ struct inode vfs_inode; -+} reiser4_inode_object; -+ -+/* return pointer to the reiser4 specific portion of @inode */ -+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode -+ /* inode queried */ ) -+{ -+ assert("nikita-254", inode != NULL); -+ return &container_of(inode, reiser4_inode_object, vfs_inode)->p; -+} -+ -+static inline struct inode *inode_by_reiser4_inode(const reiser4_inode * -+ r4_inode /* inode queried */ -+ ) -+{ -+ return &container_of(r4_inode, reiser4_inode_object, p)->vfs_inode; -+} -+ -+/* -+ * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct -+ * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64 -+ * bits. -+ * -+ * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part -+ * of inode, otherwise whole oid is stored in i_ino. -+ * -+ * Wrappers below ([sg]et_inode_oid()) are used to hide this difference. 
-+ */ -+ -+#define OID_HI_SHIFT (sizeof(ino_t) * 8) -+ -+#if REISER4_INO_IS_OID -+ -+static inline oid_t get_inode_oid(const struct inode *inode) -+{ -+ return inode->i_ino; -+} -+ -+static inline void set_inode_oid(struct inode *inode, oid_t oid) -+{ -+ inode->i_ino = oid; -+} -+ -+/* REISER4_INO_IS_OID */ -+#else -+ -+static inline oid_t get_inode_oid(const struct inode *inode) -+{ -+ return -+ ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) | -+ inode->i_ino; -+} -+ -+static inline void set_inode_oid(struct inode *inode, oid_t oid) -+{ -+ assert("nikita-2519", inode != NULL); -+ inode->i_ino = (ino_t) (oid); -+ reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT; -+ assert("nikita-2521", get_inode_oid(inode) == (oid)); -+} -+ -+/* REISER4_INO_IS_OID */ -+#endif -+ -+static inline oid_t get_inode_locality(const struct inode *inode) -+{ -+ return reiser4_inode_data(inode)->locality_id; -+} -+ -+#if REISER4_LARGE_KEY -+static inline __u64 get_inode_ordering(const struct inode *inode) -+{ -+ return reiser4_inode_data(inode)->ordering; -+} -+ -+static inline void set_inode_ordering(const struct inode *inode, __u64 ordering) -+{ -+ reiser4_inode_data(inode)->ordering = ordering; -+} -+ -+#else -+ -+#define get_inode_ordering(inode) (0) -+#define set_inode_ordering(inode, val) noop -+ -+#endif -+ -+/* return inode in which @uf_info is embedded */ -+static inline struct inode *unix_file_info_to_inode(const unix_file_info_t * -+ uf_info) -+{ -+ return &container_of(uf_info, reiser4_inode_object, -+ p.file_plugin_data.unix_file_info)->vfs_inode; -+} -+ -+extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const)); -+extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const)); -+ -+extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode); -+ -+#if REISER4_DEBUG -+extern void reiser4_inode_invariant(const struct inode *inode); -+extern int inode_has_no_jnodes(reiser4_inode *); -+#else -+#define reiser4_inode_invariant(inode) noop -+#endif -+ 
-+static inline int spin_inode_is_locked(const struct inode *inode) -+{ -+ assert_spin_locked(&reiser4_inode_data(inode)->guard); -+ return 1; -+} -+ -+/** -+ * spin_lock_inode - lock reiser4_inode' embedded spinlock -+ * @inode: inode to lock -+ * -+ * In debug mode it checks that lower priority locks are not held and -+ * increments reiser4_context's lock counters on which lock ordering checking -+ * is based. -+ */ -+static inline void spin_lock_inode(struct inode *inode) -+{ -+ assert("", LOCK_CNT_NIL(spin_locked)); -+ /* check lock ordering */ -+ assert_spin_not_locked(&d_lock); -+ -+ spin_lock(&reiser4_inode_data(inode)->guard); -+ -+ LOCK_CNT_INC(spin_locked_inode); -+ LOCK_CNT_INC(spin_locked); -+ -+ reiser4_inode_invariant(inode); -+} -+ -+/** -+ * spin_unlock_inode - unlock reiser4_inode' embedded spinlock -+ * @inode: inode to unlock -+ * -+ * In debug mode it checks that spinlock is held and decrements -+ * reiser4_context's lock counters on which lock ordering checking is based. 
-+ */ -+static inline void spin_unlock_inode(struct inode *inode) -+{ -+ assert_spin_locked(&reiser4_inode_data(inode)->guard); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ reiser4_inode_invariant(inode); -+ -+ LOCK_CNT_DEC(spin_locked_inode); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&reiser4_inode_data(inode)->guard); -+} -+ -+extern znode *inode_get_vroot(struct inode *inode); -+extern void inode_set_vroot(struct inode *inode, znode * vroot); -+ -+extern int reiser4_max_filename_len(const struct inode *inode); -+extern int max_hash_collisions(const struct inode *dir); -+extern void reiser4_unlock_inode(struct inode *inode); -+extern int is_reiser4_inode(const struct inode *inode); -+extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *); -+extern struct inode *reiser4_iget(struct super_block *super, -+ const reiser4_key * key, int silent); -+extern void reiser4_iget_complete(struct inode *inode); -+extern void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f); -+extern void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f); -+extern int reiser4_inode_get_flag(const struct inode *inode, -+ reiser4_file_plugin_flags f); -+ -+/* has inode been initialized? 
*/ -+static inline int -+is_inode_loaded(const struct inode *inode /* inode queried */ ) -+{ -+ assert("nikita-1120", inode != NULL); -+ return reiser4_inode_get_flag(inode, REISER4_LOADED); -+} -+ -+extern file_plugin *inode_file_plugin(const struct inode *inode); -+extern dir_plugin *inode_dir_plugin(const struct inode *inode); -+extern formatting_plugin *inode_formatting_plugin(const struct inode *inode); -+extern hash_plugin *inode_hash_plugin(const struct inode *inode); -+extern fibration_plugin *inode_fibration_plugin(const struct inode *inode); -+extern cipher_plugin *inode_cipher_plugin(const struct inode *inode); -+extern digest_plugin *inode_digest_plugin(const struct inode *inode); -+extern compression_plugin *inode_compression_plugin(const struct inode *inode); -+extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode -+ *inode); -+extern cluster_plugin *inode_cluster_plugin(const struct inode *inode); -+extern file_plugin *inode_create_plugin(const struct inode *inode); -+extern item_plugin *inode_sd_plugin(const struct inode *inode); -+extern item_plugin *inode_dir_item_plugin(const struct inode *inode); -+extern file_plugin *child_create_plugin(const struct inode *inode); -+ -+extern void reiser4_make_bad_inode(struct inode *inode); -+ -+extern void inode_set_extension(struct inode *inode, sd_ext_bits ext); -+extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext); -+extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new); -+extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new); -+ -+/* -+ * update field @field in inode @i to contain value @value. 
-+ */ -+#define INODE_SET_FIELD(i, field, value) \ -+({ \ -+ struct inode *__i; \ -+ typeof(value) __v; \ -+ \ -+ __i = (i); \ -+ __v = (value); \ -+ inode_check_scale(__i, __i->field, __v); \ -+ __i->field = __v; \ -+}) -+ -+#define INODE_INC_FIELD(i, field) \ -+({ \ -+ struct inode *__i; \ -+ \ -+ __i = (i); \ -+ inode_check_scale(__i, __i->field, __i->field + 1); \ -+ ++ __i->field; \ -+}) -+ -+#define INODE_DEC_FIELD(i, field) \ -+({ \ -+ struct inode *__i; \ -+ \ -+ __i = (i); \ -+ inode_check_scale(__i, __i->field, __i->field - 1); \ -+ -- __i->field; \ -+}) -+ -+/* See comment before reiser4_readdir_common() for description. */ -+static inline struct list_head *get_readdir_list(const struct inode *inode) -+{ -+ return &reiser4_inode_data(inode)->lists.readdir_list; -+} -+ -+extern void init_inode_ordering(struct inode *inode, -+ reiser4_object_create_data * crd, int create); -+ -+static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode) -+{ -+ return &reiser4_inode_data(inode)->jnodes_tree; -+} -+ -+static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode -+ * r4_inode) -+{ -+ return &r4_inode->jnodes_tree; -+} -+ -+#if REISER4_DEBUG -+extern void print_inode(const char *prefix, const struct inode *i); -+#endif -+ -+int is_dir_empty(const struct inode *); -+ -+/* __REISER4_INODE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/ioctl.h linux-2.6.20/fs/reiser4/ioctl.h ---- linux-2.6.20.orig/fs/reiser4/ioctl.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/ioctl.h 2007-05-06 14:50:43.726984474 +0400 -@@ -0,0 +1,41 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#if !defined( __REISER4_IOCTL_H__ ) -+#define __REISER4_IOCTL_H__ -+ -+#include -+ -+/* -+ * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into -+ * extents and fix in this state. This is used by applications that rely on -+ * -+ * . files being block aligned, and -+ * -+ * . files never migrating on disk -+ * -+ * for example, boot loaders (LILO) need this. -+ * -+ * This ioctl should be used as -+ * -+ * result = ioctl(fd, REISER4_IOC_UNPACK); -+ * -+ * File behind fd descriptor will be converted to the extents (if necessary), -+ * and its stat-data will be updated so that it will never be converted back -+ * into tails again. -+ */ -+#define REISER4_IOC_UNPACK _IOW(0xCD,1,long) -+ -+/* __REISER4_IOCTL_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/jnode.c linux-2.6.20/fs/reiser4/jnode.c ---- linux-2.6.20.orig/fs/reiser4/jnode.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/jnode.c 2007-05-06 14:50:43.730985723 +0400 -@@ -0,0 +1,1925 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+/* Jnode manipulation functions. */ -+/* Jnode is entity used to track blocks with data and meta-data in reiser4. -+ -+ In particular, jnodes are used to track transactional information -+ associated with each block. 
Each znode contains jnode as ->zjnode field. -+ -+ Jnode stands for either Josh or Journal node. -+*/ -+ -+/* -+ * Taxonomy. -+ * -+ * Jnode represents block containing data or meta-data. There are jnodes -+ * for: -+ * -+ * unformatted blocks (jnodes proper). There are plans, however to -+ * have a handle per extent unit rather than per each unformatted -+ * block, because there are so many of them. -+ * -+ * For bitmaps. Each bitmap is actually represented by two jnodes--one -+ * for working and another for "commit" data, together forming bnode. -+ * -+ * For io-heads. These are used by log writer. -+ * -+ * For formatted nodes (znode). See comment at the top of znode.c for -+ * details specific to the formatted nodes (znodes). -+ * -+ * Node data. -+ * -+ * Jnode provides access to the data of node it represents. Data are -+ * stored in a page. Page is kept in a page cache. This means, that jnodes -+ * are highly interconnected with page cache and VM internals. -+ * -+ * jnode has a pointer to page (->pg) containing its data. Pointer to data -+ * themselves is cached in ->data field to avoid frequent calls to -+ * page_address(). -+ * -+ * jnode and page are attached to each other by jnode_attach_page(). This -+ * function places pointer to jnode in set_page_private(), sets PG_private -+ * flag and increments page counter. -+ * -+ * Opposite operation is performed by page_clear_jnode(). -+ * -+ * jnode->pg is protected by jnode spin lock, and page->private is -+ * protected by page lock. See comment at the top of page_cache.c for -+ * more. -+ * -+ * page can be detached from jnode for two reasons: -+ * -+ * . jnode is removed from a tree (file is truncated, of formatted -+ * node is removed by balancing). -+ * -+ * . during memory pressure, VM calls ->releasepage() method -+ * (reiser4_releasepage()) to evict page from memory. -+ * -+ * (there, of course, is also umount, but this is special case we are not -+ * concerned with here). 
-+ * -+ * To protect jnode page from eviction, one calls jload() function that -+ * "pins" page in memory (loading it if necessary), increments -+ * jnode->d_count, and kmap()s page. Page is unpinned through call to -+ * jrelse(). -+ * -+ * Jnode life cycle. -+ * -+ * jnode is created, placed in hash table, and, optionally, in per-inode -+ * radix tree. Page can be attached to jnode, pinned, released, etc. -+ * -+ * When jnode is captured into atom its reference counter is -+ * increased. While being part of an atom, jnode can be "early -+ * flushed". This means that as part of flush procedure, jnode is placed -+ * into "relocate set", and its page is submitted to the disk. After io -+ * completes, page can be detached, then loaded again, re-dirtied, etc. -+ * -+ * Thread acquired reference to jnode by calling jref() and releases it by -+ * jput(). When last reference is removed, jnode is still retained in -+ * memory (cached) if it has page attached, _unless_ it is scheduled for -+ * destruction (has JNODE_HEARD_BANSHEE bit set). -+ * -+ * Tree read-write lock was used as "existential" lock for jnodes. That is, -+ * jnode->x_count could be changed from 0 to 1 only under tree write lock, -+ * that is, tree lock protected unreferenced jnodes stored in the hash -+ * table, from recycling. -+ * -+ * This resulted in high contention on tree lock, because jref()/jput() is -+ * frequent operation. To ameliorate this problem, RCU is used: when jput() -+ * is just about to release last reference on jnode it sets JNODE_RIP bit -+ * on it, and then proceed with jnode destruction (removing jnode from hash -+ * table, cbk_cache, detaching page, etc.). All places that change jnode -+ * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and -+ * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by -+ * jnode_rip_check() function), and pretend that nothing was found in hash -+ * table if bit is set. 
-+ * -+ * jput defers actual return of jnode into slab cache to some later time -+ * (by call_rcu()), this guarantees that other threads can safely continue -+ * working with JNODE_RIP-ped jnode. -+ * -+ */ -+ -+#include "reiser4.h" -+#include "debug.h" -+#include "dformat.h" -+#include "jnode.h" -+#include "plugin/plugin_header.h" -+#include "plugin/plugin.h" -+#include "txnmgr.h" -+/*#include "jnode.h"*/ -+#include "znode.h" -+#include "tree.h" -+#include "tree_walk.h" -+#include "super.h" -+#include "inode.h" -+#include "page_cache.h" -+ -+#include /* UML needs this for PAGE_OFFSET */ -+#include -+#include -+#include -+#include -+#include /* for struct address_space */ -+#include /* for inode_lock */ -+ -+static struct kmem_cache *_jnode_slab = NULL; -+ -+static void jnode_set_type(jnode * node, jnode_type type); -+static int jdelete(jnode * node); -+static int jnode_try_drop(jnode * node); -+ -+#if REISER4_DEBUG -+static int jnode_invariant(const jnode * node, int tlocked, int jlocked); -+#endif -+ -+/* true if valid page is attached to jnode */ -+static inline int jnode_is_parsed(jnode * node) -+{ -+ return JF_ISSET(node, JNODE_PARSED); -+} -+ -+/* hash table support */ -+ -+/* compare two jnode keys for equality. Used by hash-table macros */ -+static inline int jnode_key_eq(const jnode_key_t * k1, const jnode_key_t * k2) -+{ -+ assert("nikita-2350", k1 != NULL); -+ assert("nikita-2351", k2 != NULL); -+ -+ return (k1->index == k2->index && k1->objectid == k2->objectid); -+} -+ -+/* Hash jnode by its key (inode plus offset). Used by hash-table macros */ -+static inline __u32 -+jnode_key_hashfn(j_hash_table * table, const jnode_key_t * key) -+{ -+ assert("nikita-2352", key != NULL); -+ assert("nikita-3346", IS_POW(table->_buckets)); -+ -+ /* yes, this is remarkable simply (where not stupid) hash function. 
*/ -+ return (key->objectid + key->index) & (table->_buckets - 1); -+} -+ -+/* The hash table definition */ -+#define KMALLOC(size) reiser4_vmalloc(size) -+#define KFREE(ptr, size) vfree(ptr) -+TYPE_SAFE_HASH_DEFINE(j, jnode, jnode_key_t, key.j, link.j, jnode_key_hashfn, -+ jnode_key_eq); -+#undef KFREE -+#undef KMALLOC -+ -+/* call this to initialise jnode hash table */ -+int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ ) -+{ -+ assert("nikita-2359", tree != NULL); -+ return j_hash_init(&tree->jhash_table, 16384); -+} -+ -+/* call this to destroy jnode hash table. This is called during umount. */ -+int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ ) -+{ -+ j_hash_table *jtable; -+ jnode *node; -+ jnode *next; -+ -+ assert("nikita-2360", tree != NULL); -+ -+ /* -+ * Scan hash table and free all jnodes. -+ */ -+ jtable = &tree->jhash_table; -+ if (jtable->_table) { -+ for_all_in_htable(jtable, j, node, next) { -+ assert("nikita-2361", !atomic_read(&node->x_count)); -+ jdrop(node); -+ } -+ -+ j_hash_done(&tree->jhash_table); -+ } -+ return 0; -+} -+ -+/** -+ * init_jnodes - create jnode cache -+ * -+ * Initializes slab cache jnodes. It is part of reiser4 module initialization. -+ */ -+int init_jnodes(void) -+{ -+ assert("umka-168", _jnode_slab == NULL); -+ -+ _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, NULL, NULL); -+ if (_jnode_slab == NULL) -+ return RETERR(-ENOMEM); -+ -+ return 0; -+} -+ -+/** -+ * done_znodes - delete znode cache -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void done_jnodes(void) -+{ -+ destroy_reiser4_cache(&_jnode_slab); -+} -+ -+/* Initialize a jnode. 
*/ -+void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type) -+{ -+ assert("umka-175", node != NULL); -+ -+ memset(node, 0, sizeof(jnode)); -+ ON_DEBUG(node->magic = JMAGIC); -+ jnode_set_type(node, type); -+ atomic_set(&node->d_count, 0); -+ atomic_set(&node->x_count, 0); -+ spin_lock_init(&node->guard); -+ spin_lock_init(&node->load); -+ node->atom = NULL; -+ node->tree = tree; -+ INIT_LIST_HEAD(&node->capture_link); -+ -+ ASSIGN_NODE_LIST(node, NOT_CAPTURED); -+ -+ INIT_RCU_HEAD(&node->rcu); -+ -+#if REISER4_DEBUG -+ { -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(tree->super); -+ spin_lock_irq(&sbinfo->all_guard); -+ list_add(&node->jnodes, &sbinfo->all_jnodes); -+ spin_unlock_irq(&sbinfo->all_guard); -+ } -+#endif -+} -+ -+#if REISER4_DEBUG -+/* -+ * Remove jnode from ->all_jnodes list. -+ */ -+static void jnode_done(jnode * node, reiser4_tree * tree) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(tree->super); -+ -+ spin_lock_irq(&sbinfo->all_guard); -+ assert("nikita-2422", !list_empty(&node->jnodes)); -+ list_del_init(&node->jnodes); -+ spin_unlock_irq(&sbinfo->all_guard); -+} -+#endif -+ -+/* return already existing jnode of page */ -+jnode *jnode_by_page(struct page *pg) -+{ -+ assert("nikita-2066", pg != NULL); -+ assert("nikita-2400", PageLocked(pg)); -+ assert("nikita-2068", PagePrivate(pg)); -+ assert("nikita-2067", jprivate(pg) != NULL); -+ return jprivate(pg); -+} -+ -+/* exported functions to allocate/free jnode objects outside this file */ -+jnode *jalloc(void) -+{ -+ jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get()); -+ return jal; -+} -+ -+/* return jnode back to the slab allocator */ -+inline void jfree(jnode * node) -+{ -+ assert("zam-449", node != NULL); -+ -+ assert("nikita-2663", (list_empty_careful(&node->capture_link) && -+ NODE_LIST(node) == NOT_CAPTURED)); -+ assert("nikita-3222", list_empty(&node->jnodes)); -+ assert("nikita-3221", jnode_page(node) == 
NULL); -+ -+ /* not yet phash_jnode_destroy(node); */ -+ -+ kmem_cache_free(_jnode_slab, node); -+} -+ -+/* -+ * This function is supplied as RCU callback. It actually frees jnode when -+ * last reference to it is gone. -+ */ -+static void jnode_free_actor(struct rcu_head *head) -+{ -+ jnode *node; -+ jnode_type jtype; -+ -+ node = container_of(head, jnode, rcu); -+ jtype = jnode_get_type(node); -+ -+ ON_DEBUG(jnode_done(node, jnode_get_tree(node))); -+ -+ switch (jtype) { -+ case JNODE_IO_HEAD: -+ case JNODE_BITMAP: -+ case JNODE_UNFORMATTED_BLOCK: -+ jfree(node); -+ break; -+ case JNODE_FORMATTED_BLOCK: -+ zfree(JZNODE(node)); -+ break; -+ case JNODE_INODE: -+ default: -+ wrong_return_value("nikita-3197", "Wrong jnode type"); -+ } -+} -+ -+/* -+ * Free a jnode. Post a callback to be executed later through RCU when all -+ * references to @node are released. -+ */ -+static inline void jnode_free(jnode * node, jnode_type jtype) -+{ -+ if (jtype != JNODE_INODE) { -+ /*assert("nikita-3219", list_empty(&node->rcu.list)); */ -+ call_rcu(&node->rcu, jnode_free_actor); -+ } else -+ jnode_list_remove(node); -+} -+ -+/* allocate new unformatted jnode */ -+static jnode *jnew_unformatted(void) -+{ -+ jnode *jal; -+ -+ jal = jalloc(); -+ if (jal == NULL) -+ return NULL; -+ -+ jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK); -+ jal->key.j.mapping = NULL; -+ jal->key.j.index = (unsigned long)-1; -+ jal->key.j.objectid = 0; -+ return jal; -+} -+ -+/* look for jnode with given mapping and offset within hash table */ -+jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index) -+{ -+ jnode_key_t jkey; -+ jnode *node; -+ -+ assert("nikita-2353", tree != NULL); -+ -+ jkey.objectid = objectid; -+ jkey.index = index; -+ -+ /* -+ * hash table is _not_ protected by any lock during lookups. All we -+ * have to do is to disable preemption to keep RCU happy. 
-+ */ -+ -+ rcu_read_lock(); -+ node = j_hash_find(&tree->jhash_table, &jkey); -+ if (node != NULL) { -+ /* protect @node from recycling */ -+ jref(node); -+ assert("nikita-2955", jnode_invariant(node, 0, 0)); -+ node = jnode_rip_check(tree, node); -+ } -+ rcu_read_unlock(); -+ return node; -+} -+ -+/* per inode radix tree of jnodes is protected by tree's read write spin lock */ -+static jnode *jfind_nolock(struct address_space *mapping, unsigned long index) -+{ -+ assert("vs-1694", mapping->host != NULL); -+ -+ return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index); -+} -+ -+jnode *jfind(struct address_space * mapping, unsigned long index) -+{ -+ reiser4_tree *tree; -+ jnode *node; -+ -+ assert("vs-1694", mapping->host != NULL); -+ tree = reiser4_tree_by_inode(mapping->host); -+ -+ read_lock_tree(tree); -+ node = jfind_nolock(mapping, index); -+ if (node != NULL) -+ jref(node); -+ read_unlock_tree(tree); -+ return node; -+} -+ -+static void inode_attach_jnode(jnode * node) -+{ -+ struct inode *inode; -+ reiser4_inode *info; -+ struct radix_tree_root *rtree; -+ -+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); -+ assert("zam-1043", node->key.j.mapping != NULL); -+ inode = node->key.j.mapping->host; -+ info = reiser4_inode_data(inode); -+ rtree = jnode_tree_by_reiser4_inode(info); -+ if (rtree->rnode == NULL) { -+ /* prevent inode from being pruned when it has jnodes attached -+ to it */ -+ write_lock_irq(&inode->i_data.tree_lock); -+ inode->i_data.nrpages++; -+ write_unlock_irq(&inode->i_data.tree_lock); -+ } -+ assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0)); -+ check_me("zam-1045", -+ !radix_tree_insert(rtree, node->key.j.index, node)); -+ ON_DEBUG(info->nr_jnodes++); -+} -+ -+static void inode_detach_jnode(jnode * node) -+{ -+ struct inode *inode; -+ reiser4_inode *info; -+ struct radix_tree_root *rtree; -+ -+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); -+ assert("zam-1044", node->key.j.mapping != 
NULL); -+ inode = node->key.j.mapping->host; -+ info = reiser4_inode_data(inode); -+ rtree = jnode_tree_by_reiser4_inode(info); -+ -+ assert("zam-1051", info->nr_jnodes != 0); -+ assert("zam-1052", rtree->rnode != NULL); -+ ON_DEBUG(info->nr_jnodes--); -+ -+ /* delete jnode from inode's radix tree of jnodes */ -+ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index)); -+ if (rtree->rnode == NULL) { -+ /* inode can be pruned now */ -+ write_lock_irq(&inode->i_data.tree_lock); -+ inode->i_data.nrpages--; -+ write_unlock_irq(&inode->i_data.tree_lock); -+ } -+} -+ -+/* put jnode into hash table (where they can be found by flush who does not know -+ mapping) and to inode's tree of jnodes (where they can be found (hopefully -+ faster) in places where mapping is known). Currently it is used by -+ fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is -+ created */ -+static void -+hash_unformatted_jnode(jnode * node, struct address_space *mapping, -+ unsigned long index) -+{ -+ j_hash_table *jtable; -+ -+ assert("vs-1446", jnode_is_unformatted(node)); -+ assert("vs-1442", node->key.j.mapping == 0); -+ assert("vs-1443", node->key.j.objectid == 0); -+ assert("vs-1444", node->key.j.index == (unsigned long)-1); -+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); -+ -+ node->key.j.mapping = mapping; -+ node->key.j.objectid = get_inode_oid(mapping->host); -+ node->key.j.index = index; -+ -+ jtable = &jnode_get_tree(node)->jhash_table; -+ -+ /* race with some other thread inserting jnode into the hash table is -+ * impossible, because we keep the page lock. */ -+ /* -+ * following assertion no longer holds because of RCU: it is possible -+ * jnode is in the hash table, but with JNODE_RIP bit set. 
-+ */ -+ /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */ -+ j_hash_insert_rcu(jtable, node); -+ inode_attach_jnode(node); -+} -+ -+static void unhash_unformatted_node_nolock(jnode * node) -+{ -+ assert("vs-1683", node->key.j.mapping != NULL); -+ assert("vs-1684", -+ node->key.j.objectid == -+ get_inode_oid(node->key.j.mapping->host)); -+ -+ /* remove jnode from hash-table */ -+ j_hash_remove_rcu(&node->tree->jhash_table, node); -+ inode_detach_jnode(node); -+ node->key.j.mapping = NULL; -+ node->key.j.index = (unsigned long)-1; -+ node->key.j.objectid = 0; -+ -+} -+ -+/* remove jnode from hash table and from inode's tree of jnodes. This is used in -+ reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes -> -+ reiser4_uncapture_jnode */ -+void unhash_unformatted_jnode(jnode * node) -+{ -+ assert("vs-1445", jnode_is_unformatted(node)); -+ -+ write_lock_tree(node->tree); -+ unhash_unformatted_node_nolock(node); -+ write_unlock_tree(node->tree); -+} -+ -+/* -+ * search hash table for a jnode with given oid and index. If not found, -+ * allocate new jnode, insert it, and also insert into radix tree for the -+ * given inode/mapping. 
-+ */ -+static jnode *find_get_jnode(reiser4_tree * tree, -+ struct address_space *mapping, -+ oid_t oid, unsigned long index) -+{ -+ jnode *result; -+ jnode *shadow; -+ int preload; -+ -+ result = jnew_unformatted(); -+ -+ if (unlikely(result == NULL)) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ preload = radix_tree_preload(reiser4_ctx_gfp_mask_get()); -+ if (preload != 0) -+ return ERR_PTR(preload); -+ -+ write_lock_tree(tree); -+ shadow = jfind_nolock(mapping, index); -+ if (likely(shadow == NULL)) { -+ /* add new jnode to hash table and inode's radix tree of jnodes */ -+ jref(result); -+ hash_unformatted_jnode(result, mapping, index); -+ } else { -+ /* jnode is found in inode's radix tree of jnodes */ -+ jref(shadow); -+ jnode_free(result, JNODE_UNFORMATTED_BLOCK); -+ assert("vs-1498", shadow->key.j.mapping == mapping); -+ result = shadow; -+ } -+ write_unlock_tree(tree); -+ -+ assert("nikita-2955", -+ ergo(result != NULL, jnode_invariant(result, 0, 0))); -+ radix_tree_preload_end(); -+ return result; -+} -+ -+/* jget() (a la zget() but for unformatted nodes). Returns (and possibly -+ creates) jnode corresponding to page @pg. jnode is attached to page and -+ inserted into jnode hash-table. */ -+static jnode *do_jget(reiser4_tree * tree, struct page *pg) -+{ -+ /* -+ * There are two ways to create jnode: starting with pre-existing page -+ * and without page. -+ * -+ * When page already exists, jnode is created -+ * (jnode_of_page()->do_jget()) under page lock. This is done in -+ * ->writepage(), or when capturing anonymous page dirtied through -+ * mmap. -+ * -+ * Jnode without page is created by index_extent_jnode(). 
-+ * -+ */ -+ -+ jnode *result; -+ oid_t oid = get_inode_oid(pg->mapping->host); -+ -+ assert("umka-176", pg != NULL); -+ assert("nikita-2394", PageLocked(pg)); -+ -+ result = jprivate(pg); -+ if (likely(result != NULL)) -+ return jref(result); -+ -+ tree = reiser4_tree_by_page(pg); -+ -+ /* check hash-table first */ -+ result = jfind(pg->mapping, pg->index); -+ if (unlikely(result != NULL)) { -+ spin_lock_jnode(result); -+ jnode_attach_page(result, pg); -+ spin_unlock_jnode(result); -+ result->key.j.mapping = pg->mapping; -+ return result; -+ } -+ -+ /* since page is locked, jnode should be allocated with GFP_NOFS flag */ -+ reiser4_ctx_gfp_mask_force(GFP_NOFS); -+ result = find_get_jnode(tree, pg->mapping, oid, pg->index); -+ if (unlikely(IS_ERR(result))) -+ return result; -+ /* attach jnode to page */ -+ spin_lock_jnode(result); -+ jnode_attach_page(result, pg); -+ spin_unlock_jnode(result); -+ return result; -+} -+ -+/* -+ * return jnode for @pg, creating it if necessary. -+ */ -+jnode *jnode_of_page(struct page * pg) -+{ -+ jnode *result; -+ -+ assert("umka-176", pg != NULL); -+ assert("nikita-2394", PageLocked(pg)); -+ -+ result = do_jget(reiser4_tree_by_page(pg), pg); -+ -+ if (REISER4_DEBUG && !IS_ERR(result)) { -+ assert("nikita-3210", result == jprivate(pg)); -+ assert("nikita-2046", jnode_page(jprivate(pg)) == pg); -+ if (jnode_is_unformatted(jprivate(pg))) { -+ assert("nikita-2364", -+ jprivate(pg)->key.j.index == pg->index); -+ assert("nikita-2367", -+ jprivate(pg)->key.j.mapping == pg->mapping); -+ assert("nikita-2365", -+ jprivate(pg)->key.j.objectid == -+ get_inode_oid(pg->mapping->host)); -+ assert("vs-1200", -+ jprivate(pg)->key.j.objectid == -+ pg->mapping->host->i_ino); -+ assert("nikita-2356", -+ jnode_is_unformatted(jnode_by_page(pg))); -+ } -+ assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0)); -+ } -+ return result; -+} -+ -+/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the -+ * page.*/ -+void 
jnode_attach_page(jnode * node, struct page *pg) -+{ -+ assert("nikita-2060", node != NULL); -+ assert("nikita-2061", pg != NULL); -+ -+ assert("nikita-2050", jprivate(pg) == 0ul); -+ assert("nikita-2393", !PagePrivate(pg)); -+ assert("vs-1741", node->pg == NULL); -+ -+ assert("nikita-2396", PageLocked(pg)); -+ assert_spin_locked(&(node->guard)); -+ -+ page_cache_get(pg); -+ set_page_private(pg, (unsigned long)node); -+ node->pg = pg; -+ SetPagePrivate(pg); -+} -+ -+/* Dual to jnode_attach_page: break a binding between page and jnode */ -+void page_clear_jnode(struct page *page, jnode * node) -+{ -+ assert("nikita-2424", page != NULL); -+ assert("nikita-2425", PageLocked(page)); -+ assert("nikita-2426", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ assert("nikita-2428", PagePrivate(page)); -+ -+ assert("nikita-3551", !PageWriteback(page)); -+ -+ JF_CLR(node, JNODE_PARSED); -+ set_page_private(page, 0ul); -+ ClearPagePrivate(page); -+ node->pg = NULL; -+ page_cache_release(page); -+} -+ -+#if 0 -+/* it is only used in one place to handle error */ -+void -+page_detach_jnode(struct page *page, struct address_space *mapping, -+ unsigned long index) -+{ -+ assert("nikita-2395", page != NULL); -+ -+ lock_page(page); -+ if ((page->mapping == mapping) && (page->index == index) -+ && PagePrivate(page)) { -+ jnode *node; -+ -+ node = jprivate(page); -+ spin_lock_jnode(node); -+ page_clear_jnode(page, node); -+ spin_unlock_jnode(node); -+ } -+ unlock_page(page); -+} -+#endif /* 0 */ -+ -+/* return @node page locked. -+ -+ Locking ordering requires that one first takes page lock and afterwards -+ spin lock on node attached to this page. Sometimes it is necessary to go in -+ the opposite direction. This is done through standard trylock-and-release -+ loop. 
-+*/ -+static struct page *jnode_lock_page(jnode * node) -+{ -+ struct page *page; -+ -+ assert("nikita-2052", node != NULL); -+ assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode)); -+ -+ while (1) { -+ -+ spin_lock_jnode(node); -+ page = jnode_page(node); -+ if (page == NULL) { -+ break; -+ } -+ -+ /* no need to page_cache_get( page ) here, because page cannot -+ be evicted from memory without detaching it from jnode and -+ this requires spin lock on jnode that we already hold. -+ */ -+ if (!TestSetPageLocked(page)) { -+ /* We won a lock on jnode page, proceed. */ -+ break; -+ } -+ -+ /* Page is locked by someone else. */ -+ page_cache_get(page); -+ spin_unlock_jnode(node); -+ wait_on_page_locked(page); -+ /* it is possible that page was detached from jnode and -+ returned to the free pool, or re-assigned while we were -+ waiting on locked bit. This will be rechecked on the next -+ loop iteration. -+ */ -+ page_cache_release(page); -+ -+ /* try again */ -+ } -+ return page; -+} -+ -+/* -+ * is JNODE_PARSED bit is not set, call ->parse() method of jnode, to verify -+ * validness of jnode content. -+ */ -+static inline int jparse(jnode * node) -+{ -+ int result; -+ -+ assert("nikita-2466", node != NULL); -+ -+ spin_lock_jnode(node); -+ if (likely(!jnode_is_parsed(node))) { -+ result = jnode_ops(node)->parse(node); -+ if (likely(result == 0)) -+ JF_SET(node, JNODE_PARSED); -+ } else -+ result = 0; -+ spin_unlock_jnode(node); -+ return result; -+} -+ -+/* Lock a page attached to jnode, create and attach page to jnode if it had no -+ * one. 
*/ -+static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags) -+{ -+ struct page *page; -+ -+ spin_lock_jnode(node); -+ page = jnode_page(node); -+ -+ if (page == NULL) { -+ spin_unlock_jnode(node); -+ page = find_or_create_page(jnode_get_mapping(node), -+ jnode_get_index(node), gfp_flags); -+ if (page == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ } else { -+ if (!TestSetPageLocked(page)) { -+ spin_unlock_jnode(node); -+ return page; -+ } -+ page_cache_get(page); -+ spin_unlock_jnode(node); -+ lock_page(page); -+ assert("nikita-3134", page->mapping == jnode_get_mapping(node)); -+ } -+ -+ spin_lock_jnode(node); -+ if (!jnode_page(node)) -+ jnode_attach_page(node, page); -+ spin_unlock_jnode(node); -+ -+ page_cache_release(page); -+ assert("zam-894", jnode_page(node) == page); -+ return page; -+} -+ -+/* Start read operation for jnode's page if page is not up-to-date. */ -+static int jnode_start_read(jnode * node, struct page *page) -+{ -+ assert("zam-893", PageLocked(page)); -+ -+ if (PageUptodate(page)) { -+ unlock_page(page); -+ return 0; -+ } -+ return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get()); -+} -+ -+#if REISER4_DEBUG -+static void check_jload(jnode * node, struct page *page) -+{ -+ if (jnode_is_znode(node)) { -+ node40_header *nh; -+ znode *z; -+ -+ z = JZNODE(node); -+ if (znode_is_any_locked(z)) { -+ nh = (node40_header *) kmap(page); -+ /* this only works for node40-only file systems. For -+ * debugging. */ -+ assert("nikita-3253", -+ z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items))); -+ kunmap(page); -+ } -+ assert("nikita-3565", znode_invariant(z)); -+ } -+} -+#else -+#define check_jload(node, page) noop -+#endif -+ -+/* prefetch jnode to speed up next call to jload. Call this when you are going -+ * to call jload() shortly. This will bring appropriate portion of jnode into -+ * CPU cache. 
*/ -+void jload_prefetch(jnode * node) -+{ -+ prefetchw(&node->x_count); -+} -+ -+/* load jnode's data into memory */ -+int jload_gfp(jnode * node /* node to load */ , -+ gfp_t gfp_flags /* allocation flags */ , -+ int do_kmap /* true if page should be kmapped */ ) -+{ -+ struct page *page; -+ int result = 0; -+ int parsed; -+ -+ assert("nikita-3010", reiser4_schedulable()); -+ -+ prefetchw(&node->pg); -+ -+ /* taking d-reference implies taking x-reference. */ -+ jref(node); -+ -+ /* -+ * acquiring d-reference to @jnode and check for JNODE_PARSED bit -+ * should be atomic, otherwise there is a race against -+ * reiser4_releasepage(). -+ */ -+ spin_lock(&(node->load)); -+ add_d_ref(node); -+ parsed = jnode_is_parsed(node); -+ spin_unlock(&(node->load)); -+ -+ if (unlikely(!parsed)) { -+ page = jnode_get_page_locked(node, gfp_flags); -+ if (unlikely(IS_ERR(page))) { -+ result = PTR_ERR(page); -+ goto failed; -+ } -+ -+ result = jnode_start_read(node, page); -+ if (unlikely(result != 0)) -+ goto failed; -+ -+ wait_on_page_locked(page); -+ if (unlikely(!PageUptodate(page))) { -+ result = RETERR(-EIO); -+ goto failed; -+ } -+ -+ if (do_kmap) -+ node->data = kmap(page); -+ -+ result = jparse(node); -+ if (unlikely(result != 0)) { -+ if (do_kmap) -+ kunmap(page); -+ goto failed; -+ } -+ check_jload(node, page); -+ } else { -+ page = jnode_page(node); -+ check_jload(node, page); -+ if (do_kmap) -+ node->data = kmap(page); -+ } -+ -+ if (!is_writeout_mode()) -+ /* We do not mark pages active if jload is called as a part of -+ * jnode_flush() or reiser4_write_logs(). Both jnode_flush() -+ * and write_logs() add no value to cached data, there is no -+ * sense to mark pages as active when they go to disk, it just -+ * confuses vm scanning routines because clean page could be -+ * moved out from inactive list as a result of this -+ * mark_page_accessed() call. 
*/ -+ mark_page_accessed(page); -+ -+ return 0; -+ -+ failed: -+ jrelse_tail(node); -+ return result; -+ -+} -+ -+/* start asynchronous reading for given jnode's page. */ -+int jstartio(jnode * node) -+{ -+ struct page *page; -+ -+ page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get()); -+ if (IS_ERR(page)) -+ return PTR_ERR(page); -+ -+ return jnode_start_read(node, page); -+} -+ -+/* Initialize a node by calling appropriate plugin instead of reading -+ * node from disk as in jload(). */ -+int jinit_new(jnode * node, gfp_t gfp_flags) -+{ -+ struct page *page; -+ int result; -+ -+ jref(node); -+ add_d_ref(node); -+ -+ page = jnode_get_page_locked(node, gfp_flags); -+ if (IS_ERR(page)) { -+ result = PTR_ERR(page); -+ goto failed; -+ } -+ -+ SetPageUptodate(page); -+ unlock_page(page); -+ -+ node->data = kmap(page); -+ -+ if (!jnode_is_parsed(node)) { -+ jnode_plugin *jplug = jnode_ops(node); -+ spin_lock_jnode(node); -+ result = jplug->init(node); -+ spin_unlock_jnode(node); -+ if (result) { -+ kunmap(page); -+ goto failed; -+ } -+ JF_SET(node, JNODE_PARSED); -+ } -+ -+ return 0; -+ -+ failed: -+ jrelse(node); -+ return result; -+} -+ -+/* release a reference to jnode acquired by jload(), decrement ->d_count */ -+void jrelse_tail(jnode * node /* jnode to release references to */ ) -+{ -+ assert("nikita-489", atomic_read(&node->d_count) > 0); -+ atomic_dec(&node->d_count); -+ /* release reference acquired in jload_gfp() or jinit_new() */ -+ jput(node); -+ if (jnode_is_unformatted(node) || jnode_is_znode(node)) -+ LOCK_CNT_DEC(d_refs); -+} -+ -+/* drop reference to node data. When last reference is dropped, data are -+ unloaded. 
*/ -+void jrelse(jnode * node /* jnode to release references to */ ) -+{ -+ struct page *page; -+ -+ assert("nikita-487", node != NULL); -+ assert_spin_not_locked(&(node->guard)); -+ -+ page = jnode_page(node); -+ if (likely(page != NULL)) { -+ /* -+ * it is safe not to lock jnode here, because at this point -+ * @node->d_count is greater than zero (if jrelse() is used -+ * correctly, that is). JNODE_PARSED may be not set yet, if, -+ * for example, we got here as a result of error handling path -+ * in jload(). Anyway, page cannot be detached by -+ * reiser4_releasepage(). truncate will invalidate page -+ * regardless, but this should not be a problem. -+ */ -+ kunmap(page); -+ } -+ jrelse_tail(node); -+} -+ -+/* called from jput() to wait for io completion */ -+static void jnode_finish_io(jnode * node) -+{ -+ struct page *page; -+ -+ assert("nikita-2922", node != NULL); -+ -+ spin_lock_jnode(node); -+ page = jnode_page(node); -+ if (page != NULL) { -+ page_cache_get(page); -+ spin_unlock_jnode(node); -+ wait_on_page_writeback(page); -+ page_cache_release(page); -+ } else -+ spin_unlock_jnode(node); -+} -+ -+/* -+ * This is called by jput() when last reference to jnode is released. This is -+ * separate function, because we want fast path of jput() to be inline and, -+ * therefore, small. -+ */ -+void jput_final(jnode * node) -+{ -+ int r_i_p; -+ -+ /* A fast check for keeping node in cache. We always keep node in cache -+ * if its page is present and node was not marked for deletion */ -+ if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) { -+ rcu_read_unlock(); -+ return; -+ } -+ assert("edward-1432", node->page_count == 0); -+ -+ r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP); -+ /* -+ * if r_i_p is true, we were first to set JNODE_RIP on this node. In -+ * this case it is safe to access node after unlock. 
-+ */ -+ rcu_read_unlock(); -+ if (r_i_p) { -+ jnode_finish_io(node); -+ if (JF_ISSET(node, JNODE_HEARD_BANSHEE)) -+ /* node is removed from the tree. */ -+ jdelete(node); -+ else -+ jnode_try_drop(node); -+ } -+ /* if !r_i_p some other thread is already killing it */ -+} -+ -+int jwait_io(jnode * node, int rw) -+{ -+ struct page *page; -+ int result; -+ -+ assert("zam-447", node != NULL); -+ assert("zam-448", jnode_page(node) != NULL); -+ -+ page = jnode_page(node); -+ -+ result = 0; -+ if (rw == READ) { -+ wait_on_page_locked(page); -+ } else { -+ assert("nikita-2227", rw == WRITE); -+ wait_on_page_writeback(page); -+ } -+ if (PageError(page)) -+ result = RETERR(-EIO); -+ -+ return result; -+} -+ -+/* -+ * jnode types and plugins. -+ * -+ * jnode by itself is a "base type". There are several different jnode -+ * flavors, called "jnode types" (see jnode_type for a list). Sometimes code -+ * has to do different things based on jnode type. In the standard reiser4 way -+ * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin). -+ * -+ * Functions below deal with jnode types and define methods of jnode plugin. -+ * -+ */ -+ -+/* set jnode type. This is done during jnode initialization. */ -+static void jnode_set_type(jnode * node, jnode_type type) -+{ -+ static unsigned long type_to_mask[] = { -+ [JNODE_UNFORMATTED_BLOCK] = 1, -+ [JNODE_FORMATTED_BLOCK] = 0, -+ [JNODE_BITMAP] = 2, -+ [JNODE_IO_HEAD] = 6, -+ [JNODE_INODE] = 4 -+ }; -+ -+ assert("zam-647", type < LAST_JNODE_TYPE); -+ assert("nikita-2815", !jnode_is_loaded(node)); -+ assert("nikita-3386", node->state == 0); -+ -+ node->state |= (type_to_mask[type] << JNODE_TYPE_1); -+} -+ -+/* ->init() method of jnode plugin for jnodes that don't require plugin -+ * specific initialization. */ -+static int init_noinit(jnode * node UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* ->parse() method of jnode plugin for jnodes that don't require plugin -+ * specific pasring. 
*/ -+static int parse_noparse(jnode * node UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* ->mapping() method for unformatted jnode */ -+struct address_space *mapping_jnode(const jnode * node) -+{ -+ struct address_space *map; -+ -+ assert("nikita-2713", node != NULL); -+ -+ /* mapping is stored in jnode */ -+ -+ map = node->key.j.mapping; -+ assert("nikita-2714", map != NULL); -+ assert("nikita-2897", is_reiser4_inode(map->host)); -+ assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid); -+ return map; -+} -+ -+/* ->index() method for unformatted jnodes */ -+unsigned long index_jnode(const jnode * node) -+{ -+ /* index is stored in jnode */ -+ return node->key.j.index; -+} -+ -+/* ->remove() method for unformatted jnodes */ -+static inline void remove_jnode(jnode * node, reiser4_tree * tree) -+{ -+ /* remove jnode from hash table and radix tree */ -+ if (node->key.j.mapping) -+ unhash_unformatted_node_nolock(node); -+} -+ -+/* ->mapping() method for znodes */ -+static struct address_space *mapping_znode(const jnode * node) -+{ -+ /* all znodes belong to fake inode */ -+ return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping; -+} -+ -+/* ->index() method for znodes */ -+static unsigned long index_znode(const jnode * node) -+{ -+ unsigned long addr; -+ assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode)); -+ -+ /* index of znode is just its address (shifted) */ -+ addr = (unsigned long)node; -+ return (addr - PAGE_OFFSET) >> znode_shift_order; -+} -+ -+/* ->mapping() method for bitmap jnode */ -+static struct address_space *mapping_bitmap(const jnode * node) -+{ -+ /* all bitmap blocks belong to special bitmap inode */ -+ return get_super_private(jnode_get_tree(node)->super)->bitmap-> -+ i_mapping; -+} -+ -+/* ->index() method for jnodes that are indexed by address */ -+static unsigned long index_is_address(const jnode * node) -+{ -+ unsigned long ind; -+ -+ ind = (unsigned long)node; -+ return ind - PAGE_OFFSET; -+} -+ -+/* 
resolve race with jput */ -+jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node) -+{ -+ /* -+ * This is used as part of RCU-based jnode handling. -+ * -+ * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work -+ * with unreferenced jnodes (ones with ->x_count == 0). Hash table is -+ * not protected during this, so concurrent thread may execute -+ * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be -+ * freed in jput_final(). To avoid such races, jput_final() sets -+ * JNODE_RIP on jnode (under tree lock). All places that work with -+ * unreferenced jnodes call this function. It checks for JNODE_RIP bit -+ * (first without taking tree lock), and if this bit is set, released -+ * reference acquired by the current thread and returns NULL. -+ * -+ * As a result, if jnode is being concurrently freed, NULL is returned -+ * and caller should pretend that jnode wasn't found in the first -+ * place. -+ * -+ * Otherwise it's safe to release "rcu-read-lock" and continue with -+ * jnode. 
-+ */ -+ if (unlikely(JF_ISSET(node, JNODE_RIP))) { -+ read_lock_tree(tree); -+ if (JF_ISSET(node, JNODE_RIP)) { -+ dec_x_ref(node); -+ node = NULL; -+ } -+ read_unlock_tree(tree); -+ } -+ return node; -+} -+ -+reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key) -+{ -+ struct inode *inode; -+ item_plugin *iplug; -+ loff_t off; -+ -+ assert("nikita-3092", node != NULL); -+ assert("nikita-3093", key != NULL); -+ assert("nikita-3094", jnode_is_unformatted(node)); -+ -+ off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT; -+ inode = mapping_jnode(node)->host; -+ -+ if (node->parent_item_id != 0) -+ iplug = item_plugin_by_id(node->parent_item_id); -+ else -+ iplug = NULL; -+ -+ if (iplug != NULL && iplug->f.key_by_offset) -+ iplug->f.key_by_offset(inode, off, key); -+ else { -+ file_plugin *fplug; -+ -+ fplug = inode_file_plugin(inode); -+ assert("zam-1007", fplug != NULL); -+ assert("zam-1008", fplug->key_by_inode != NULL); -+ -+ fplug->key_by_inode(inode, off, key); -+ } -+ -+ return key; -+} -+ -+/* ->parse() method for formatted nodes */ -+static int parse_znode(jnode * node) -+{ -+ return zparse(JZNODE(node)); -+} -+ -+/* ->delete() method for formatted nodes */ -+static void delete_znode(jnode * node, reiser4_tree * tree) -+{ -+ znode *z; -+ -+ assert_rw_write_locked(&(tree->tree_lock)); -+ assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ -+ z = JZNODE(node); -+ assert("vs-899", z->c_count == 0); -+ -+ /* delete znode from sibling list. */ -+ sibling_list_remove(z); -+ -+ znode_remove(z, tree); -+} -+ -+/* ->remove() method for formatted nodes */ -+static int remove_znode(jnode * node, reiser4_tree * tree) -+{ -+ znode *z; -+ -+ assert_rw_write_locked(&(tree->tree_lock)); -+ z = JZNODE(node); -+ -+ if (z->c_count == 0) { -+ /* detach znode from sibling list. */ -+ sibling_list_drop(z); -+ /* this is called with tree spin-lock held, so call -+ znode_remove() directly (rather than znode_lock_remove()). 
*/ -+ znode_remove(z, tree); -+ return 0; -+ } -+ return RETERR(-EBUSY); -+} -+ -+/* ->init() method for formatted nodes */ -+static int init_znode(jnode * node) -+{ -+ znode *z; -+ -+ z = JZNODE(node); -+ /* call node plugin to do actual initialization */ -+ return z->nplug->init(z); -+} -+ -+/* ->clone() method for formatted nodes */ -+static jnode *clone_formatted(jnode * node) -+{ -+ znode *clone; -+ -+ assert("vs-1430", jnode_is_znode(node)); -+ clone = zalloc(reiser4_ctx_gfp_mask_get()); -+ if (clone == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ zinit(clone, NULL, current_tree); -+ jnode_set_block(ZJNODE(clone), jnode_get_block(node)); -+ /* ZJNODE(clone)->key.z is not initialized */ -+ clone->level = JZNODE(node)->level; -+ -+ return ZJNODE(clone); -+} -+ -+/* jplug->clone for unformatted nodes */ -+static jnode *clone_unformatted(jnode * node) -+{ -+ jnode *clone; -+ -+ assert("vs-1431", jnode_is_unformatted(node)); -+ clone = jalloc(); -+ if (clone == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK); -+ jnode_set_block(clone, jnode_get_block(node)); -+ -+ return clone; -+ -+} -+ -+/* -+ * Setup jnode plugin methods for various jnode types. 
-+ */ -+jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = { -+ [JNODE_UNFORMATTED_BLOCK] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_UNFORMATTED_BLOCK, -+ .pops = NULL, -+ .label = "unformatted", -+ .desc = "unformatted node", -+ .linkage = {NULL, NULL} -+ }, -+ .init = init_noinit, -+ .parse = parse_noparse, -+ .mapping = mapping_jnode, -+ .index = index_jnode, -+ .clone = clone_unformatted -+ }, -+ [JNODE_FORMATTED_BLOCK] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_FORMATTED_BLOCK, -+ .pops = NULL, -+ .label = "formatted", -+ .desc = "formatted tree node", -+ .linkage = {NULL, NULL} -+ }, -+ .init = init_znode, -+ .parse = parse_znode, -+ .mapping = mapping_znode, -+ .index = index_znode, -+ .clone = clone_formatted -+ }, -+ [JNODE_BITMAP] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_BITMAP, -+ .pops = NULL, -+ .label = "bitmap", -+ .desc = "bitmap node", -+ .linkage = {NULL, NULL} -+ }, -+ .init = init_noinit, -+ .parse = parse_noparse, -+ .mapping = mapping_bitmap, -+ .index = index_is_address, -+ .clone = NULL -+ }, -+ [JNODE_IO_HEAD] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_IO_HEAD, -+ .pops = NULL, -+ .label = "io head", -+ .desc = "io head", -+ .linkage = {NULL, NULL} -+ }, -+ .init = init_noinit, -+ .parse = parse_noparse, -+ .mapping = mapping_bitmap, -+ .index = index_is_address, -+ .clone = NULL -+ }, -+ [JNODE_INODE] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_INODE, -+ .pops = NULL, -+ .label = "inode", -+ .desc = "inode's builtin jnode", -+ .linkage = {NULL, NULL} -+ }, -+ .init = NULL, -+ .parse = NULL, -+ .mapping = NULL, -+ .index = NULL, -+ .clone = NULL -+ } -+}; -+ -+/* -+ * jnode destruction. -+ * -+ * Thread may use a jnode after it acquired a reference to it. References are -+ * counted in ->x_count field. Reference protects jnode from being -+ * recycled. 
This is different from protecting jnode data (that are stored in -+ * jnode page) from being evicted from memory. Data are protected by jload() -+ * and released by jrelse(). -+ * -+ * If thread already possesses a reference to the jnode it can acquire another -+ * one through jref(). Initial reference is obtained (usually) by locating -+ * jnode in some indexing structure that depends on jnode type: formatted -+ * nodes are kept in global hash table, where they are indexed by block -+ * number, and also in the cbk cache. Unformatted jnodes are also kept in hash -+ * table, which is indexed by oid and offset within file, and in per-inode -+ * radix tree. -+ * -+ * Reference to jnode is released by jput(). If last reference is released, -+ * jput_final() is called. This function determines whether jnode has to be -+ * deleted (this happens when corresponding node is removed from the file -+ * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it -+ * should be just "removed" (deleted from memory). -+ * -+ * Jnode destruction is signally delicate dance because of locking and RCU. -+ */ -+ -+/* -+ * Returns true if jnode cannot be removed right now. This check is called -+ * under tree lock. If it returns true, jnode is irrevocably committed to be -+ * deleted/removed. -+ */ -+static inline int jnode_is_busy(const jnode * node, jnode_type jtype) -+{ -+ /* if other thread managed to acquire a reference to this jnode, don't -+ * free it. */ -+ if (atomic_read(&node->x_count) > 0) -+ return 1; -+ /* also, don't free znode that has children in memory */ -+ if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0) -+ return 1; -+ return 0; -+} -+ -+/* -+ * this is called as part of removing jnode. Based on jnode type, call -+ * corresponding function that removes jnode from indices and returns it back -+ * to the appropriate slab (through RCU). 
-+ */ -+static inline void -+jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree) -+{ -+ switch (jtype) { -+ case JNODE_UNFORMATTED_BLOCK: -+ remove_jnode(node, tree); -+ break; -+ case JNODE_IO_HEAD: -+ case JNODE_BITMAP: -+ break; -+ case JNODE_INODE: -+ break; -+ case JNODE_FORMATTED_BLOCK: -+ remove_znode(node, tree); -+ break; -+ default: -+ wrong_return_value("nikita-3196", "Wrong jnode type"); -+ } -+} -+ -+/* -+ * this is called as part of deleting jnode. Based on jnode type, call -+ * corresponding function that removes jnode from indices and returns it back -+ * to the appropriate slab (through RCU). -+ * -+ * This differs from jnode_remove() only for formatted nodes---for them -+ * sibling list handling is different for removal and deletion. -+ */ -+static inline void -+jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG) -+{ -+ switch (jtype) { -+ case JNODE_UNFORMATTED_BLOCK: -+ remove_jnode(node, tree); -+ break; -+ case JNODE_IO_HEAD: -+ case JNODE_BITMAP: -+ break; -+ case JNODE_FORMATTED_BLOCK: -+ delete_znode(node, tree); -+ break; -+ case JNODE_INODE: -+ default: -+ wrong_return_value("nikita-3195", "Wrong jnode type"); -+ } -+} -+ -+#if REISER4_DEBUG -+/* -+ * remove jnode from the debugging list of all jnodes hanging off super-block. -+ */ -+void jnode_list_remove(jnode * node) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(jnode_get_tree(node)->super); -+ -+ spin_lock_irq(&sbinfo->all_guard); -+ assert("nikita-2422", !list_empty(&node->jnodes)); -+ list_del_init(&node->jnodes); -+ spin_unlock_irq(&sbinfo->all_guard); -+} -+#endif -+ -+/* -+ * this is called by jput_final() to remove jnode when last reference to it is -+ * released. 
-+ */ -+static int jnode_try_drop(jnode * node) -+{ -+ int result; -+ reiser4_tree *tree; -+ jnode_type jtype; -+ -+ assert("nikita-2491", node != NULL); -+ assert("nikita-2583", JF_ISSET(node, JNODE_RIP)); -+ -+ tree = jnode_get_tree(node); -+ jtype = jnode_get_type(node); -+ -+ spin_lock_jnode(node); -+ write_lock_tree(tree); -+ /* -+ * if jnode has a page---leave it alone. Memory pressure will -+ * eventually kill page and jnode. -+ */ -+ if (jnode_page(node) != NULL) { -+ write_unlock_tree(tree); -+ spin_unlock_jnode(node); -+ JF_CLR(node, JNODE_RIP); -+ return RETERR(-EBUSY); -+ } -+ -+ /* re-check ->x_count under tree lock. */ -+ result = jnode_is_busy(node, jtype); -+ if (result == 0) { -+ assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ assert("jmacd-511/b", atomic_read(&node->d_count) == 0); -+ -+ spin_unlock_jnode(node); -+ /* no page and no references---despatch him. */ -+ jnode_remove(node, jtype, tree); -+ write_unlock_tree(tree); -+ jnode_free(node, jtype); -+ } else { -+ /* busy check failed: reference was acquired by concurrent -+ * thread. */ -+ write_unlock_tree(tree); -+ spin_unlock_jnode(node); -+ JF_CLR(node, JNODE_RIP); -+ } -+ return result; -+} -+ -+/* jdelete() -- Delete jnode from the tree and file system */ -+static int jdelete(jnode * node /* jnode to finish with */ ) -+{ -+ struct page *page; -+ int result; -+ reiser4_tree *tree; -+ jnode_type jtype; -+ -+ assert("nikita-467", node != NULL); -+ assert("nikita-2531", JF_ISSET(node, JNODE_RIP)); -+ -+ jtype = jnode_get_type(node); -+ -+ page = jnode_lock_page(node); -+ assert_spin_locked(&(node->guard)); -+ -+ tree = jnode_get_tree(node); -+ -+ write_lock_tree(tree); -+ /* re-check ->x_count under tree lock. 
*/ -+ result = jnode_is_busy(node, jtype); -+ if (likely(!result)) { -+ assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ assert("jmacd-511", atomic_read(&node->d_count) == 0); -+ -+ /* detach page */ -+ if (page != NULL) { -+ /* -+ * FIXME this is racy against jnode_extent_write(). -+ */ -+ page_clear_jnode(page, node); -+ } -+ spin_unlock_jnode(node); -+ /* goodbye */ -+ jnode_delete(node, jtype, tree); -+ write_unlock_tree(tree); -+ jnode_free(node, jtype); -+ /* @node is no longer valid pointer */ -+ if (page != NULL) -+ reiser4_drop_page(page); -+ } else { -+ /* busy check failed: reference was acquired by concurrent -+ * thread. */ -+ JF_CLR(node, JNODE_RIP); -+ write_unlock_tree(tree); -+ spin_unlock_jnode(node); -+ if (page != NULL) -+ unlock_page(page); -+ } -+ return result; -+} -+ -+/* drop jnode on the floor. -+ -+ Return value: -+ -+ -EBUSY: failed to drop jnode, because there are still references to it -+ -+ 0: successfully dropped jnode -+ -+*/ -+static int jdrop_in_tree(jnode * node, reiser4_tree * tree) -+{ -+ struct page *page; -+ jnode_type jtype; -+ int result; -+ -+ assert("zam-602", node != NULL); -+ assert_rw_not_read_locked(&(tree->tree_lock)); -+ assert_rw_not_write_locked(&(tree->tree_lock)); -+ assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ -+ jtype = jnode_get_type(node); -+ -+ page = jnode_lock_page(node); -+ assert_spin_locked(&(node->guard)); -+ -+ write_lock_tree(tree); -+ -+ /* re-check ->x_count under tree lock. 
*/ -+ result = jnode_is_busy(node, jtype); -+ if (!result) { -+ assert("nikita-2488", page == jnode_page(node)); -+ assert("nikita-2533", atomic_read(&node->d_count) == 0); -+ if (page != NULL) { -+ assert("nikita-2126", !PageDirty(page)); -+ assert("nikita-2127", PageUptodate(page)); -+ assert("nikita-2181", PageLocked(page)); -+ page_clear_jnode(page, node); -+ } -+ spin_unlock_jnode(node); -+ jnode_remove(node, jtype, tree); -+ write_unlock_tree(tree); -+ jnode_free(node, jtype); -+ if (page != NULL) { -+ reiser4_drop_page(page); -+ } -+ } else { -+ /* busy check failed: reference was acquired by concurrent -+ * thread. */ -+ JF_CLR(node, JNODE_RIP); -+ write_unlock_tree(tree); -+ spin_unlock_jnode(node); -+ if (page != NULL) -+ unlock_page(page); -+ } -+ return result; -+} -+ -+/* This function frees jnode "if possible". In particular, [dcx]_count has to -+ be 0 (where applicable). */ -+void jdrop(jnode * node) -+{ -+ jdrop_in_tree(node, jnode_get_tree(node)); -+} -+ -+/* IO head jnode implementation; The io heads are simple j-nodes with limited -+ functionality (these j-nodes are not in any hash table) just for reading -+ from and writing to disk. 
*/
-+
-+/*
-+ * Allocate an IO-head jnode for @block.
-+ *
-+ * On success the new jnode is initialised as JNODE_IO_HEAD in current_tree,
-+ * its block number is set to *@block and one reference is taken with jref().
-+ * Returns NULL if jalloc() fails.
-+ */
-+jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
-+{
-+	jnode *jal = jalloc();
-+
-+	/*
-+	 * jref() asserts that its argument is non-NULL and then increments
-+	 * ->x_count through it, so it must never see a failed allocation.
-+	 */
-+	if (jal == NULL)
-+		return NULL;
-+
-+	jnode_init(jal, current_tree, JNODE_IO_HEAD);
-+	jnode_set_block(jal, block);
-+	jref(jal);
-+
-+	return jal;
-+}
-+
-+/*
-+ * Release an IO-head jnode: drop one reference with jput(), then call
-+ * jdrop() on it. @node must be of type JNODE_IO_HEAD.
-+ */
-+void reiser4_drop_io_head(jnode * node)
-+{
-+	assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
-+
-+	jput(node);
-+	jdrop(node);
-+}
-+
-+/*
-+ * protect jnode data from reiser4_releasepage(): takes an extra reference
-+ * on the page attached to @node. @node must have a page.
-+ */
-+void pin_jnode_data(jnode * node)
-+{
-+	assert("zam-671", jnode_page(node) != NULL);
-+	page_cache_get(jnode_page(node));
-+}
-+
-+/*
-+ * make jnode data free-able again: releases a reference on the page
-+ * attached to @node. @node must have a page.
-+ */
-+void unpin_jnode_data(jnode * node)
-+{
-+	assert("zam-672", jnode_page(node) != NULL);
-+	page_cache_release(jnode_page(node));
-+}
-+
-+/* return the mapping of @node as reported by its jnode plugin's ->mapping() */
-+struct address_space *jnode_get_mapping(const jnode * node)
-+{
-+	assert("nikita-3162", node != NULL);
-+	return jnode_ops(node)->mapping(node);
-+}
-+
-+#if REISER4_DEBUG
-+/* debugging aid: jnode invariant */
-+int jnode_invariant_f(const jnode * node, char const **msg)
-+{
-+#define _ergo(ant, con) 						\
-+	((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
-+#define _check(exp) ((*msg) = #exp, (exp))
-+
-+	return _check(node != NULL) &&
-+	    /* [jnode-queued] */
-+	    /* only relocated node can be queued, except that when znode
-+	     * is being deleted, its JNODE_RELOC bit is cleared */
-+	    _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
-+		  JF_ISSET(node, JNODE_RELOC) ||
-+		  JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
-+	    _check(node->jnodes.prev != NULL) &&
-+	    _check(node->jnodes.next != NULL) &&
-+	    /* [jnode-dirty] invariant */
-+	    /* dirty inode is part of atom */
-+	    _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
-+	    /* [jnode-oid] invariant */
-+	    /* for unformatted node ->objectid and ->mapping fields are
-+	     * consistent */
-+	    _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
-+		  node->key.j.objectid ==
-+		  get_inode_oid(node->key.j.mapping->host)) &&
-+	    /* [jnode-atom-valid] invariant */
-+	    /*
node atom has valid state */ -+ _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) && -+ /* [jnode-page-binding] invariant */ -+ /* if node points to page, it points back to node */ -+ _ergo(node->pg != NULL, jprivate(node->pg) == node) && -+ /* [jnode-refs] invariant */ -+ /* only referenced jnode can be loaded */ -+ _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count)); -+ -+} -+ -+static const char *jnode_type_name(jnode_type type) -+{ -+ switch (type) { -+ case JNODE_UNFORMATTED_BLOCK: -+ return "unformatted"; -+ case JNODE_FORMATTED_BLOCK: -+ return "formatted"; -+ case JNODE_BITMAP: -+ return "bitmap"; -+ case JNODE_IO_HEAD: -+ return "io head"; -+ case JNODE_INODE: -+ return "inode"; -+ case LAST_JNODE_TYPE: -+ return "last"; -+ default:{ -+ static char unknown[30]; -+ -+ sprintf(unknown, "unknown %i", type); -+ return unknown; -+ } -+ } -+} -+ -+#define jnode_state_name( node, flag ) \ -+ ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" ) -+ -+/* debugging aid: output human readable information about @node */ -+static void info_jnode(const char *prefix /* prefix to print */ , -+ const jnode * node /* node to print */ ) -+{ -+ assert("umka-068", prefix != NULL); -+ -+ if (node == NULL) { -+ printk("%s: null\n", prefix); -+ return; -+ } -+ -+ printk -+ ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i," -+ " block: %s, d_count: %d, x_count: %d, " -+ "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node, -+ node->state, -+ jnode_state_name(node, JNODE_PARSED), -+ jnode_state_name(node, JNODE_HEARD_BANSHEE), -+ jnode_state_name(node, JNODE_LEFT_CONNECTED), -+ jnode_state_name(node, JNODE_RIGHT_CONNECTED), -+ jnode_state_name(node, JNODE_ORPHAN), -+ jnode_state_name(node, JNODE_CREATED), -+ jnode_state_name(node, JNODE_RELOC), -+ jnode_state_name(node, JNODE_OVRWR), -+ jnode_state_name(node, JNODE_DIRTY), -+ jnode_state_name(node, JNODE_IS_DYING), -+ jnode_state_name(node, JNODE_RIP), -+ 
jnode_state_name(node, JNODE_MISSED_IN_CAPTURE), -+ jnode_state_name(node, JNODE_WRITEBACK), -+ jnode_state_name(node, JNODE_NEW), -+ jnode_state_name(node, JNODE_DKSET), -+ jnode_state_name(node, JNODE_REPACK), -+ jnode_state_name(node, JNODE_CLUSTER_PAGE), -+ jnode_get_level(node), sprint_address(jnode_get_block(node)), -+ atomic_read(&node->d_count), atomic_read(&node->x_count), -+ jnode_page(node), node->atom, 0, 0, -+ jnode_type_name(jnode_get_type(node))); -+ if (jnode_is_unformatted(node)) { -+ printk("inode: %llu, index: %lu, ", -+ node->key.j.objectid, node->key.j.index); -+ } -+} -+ -+/* debugging aid: check znode invariant and panic if it doesn't hold */ -+static int jnode_invariant(const jnode * node, int tlocked, int jlocked) -+{ -+ char const *failed_msg; -+ int result; -+ reiser4_tree *tree; -+ -+ tree = jnode_get_tree(node); -+ -+ assert("umka-063312", node != NULL); -+ assert("umka-064321", tree != NULL); -+ -+ if (!jlocked && !tlocked) -+ spin_lock_jnode((jnode *) node); -+ if (!tlocked) -+ read_lock_tree(jnode_get_tree(node)); -+ result = jnode_invariant_f(node, &failed_msg); -+ if (!result) { -+ info_jnode("corrupted node", node); -+ warning("jmacd-555", "Condition %s failed", failed_msg); -+ } -+ if (!tlocked) -+ read_unlock_tree(jnode_get_tree(node)); -+ if (!jlocked && !tlocked) -+ spin_unlock_jnode((jnode *) node); -+ return result; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/jnode.h linux-2.6.20/fs/reiser4/jnode.h ---- linux-2.6.20.orig/fs/reiser4/jnode.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/jnode.h 2007-05-06 14:50:43.734986973 +0400 -@@ -0,0 +1,705 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Declaration of jnode. See jnode.c for details. 
*/ -+ -+#ifndef __JNODE_H__ -+#define __JNODE_H__ -+ -+#include "forward.h" -+#include "type_safe_hash.h" -+#include "txnmgr.h" -+#include "key.h" -+#include "debug.h" -+#include "dformat.h" -+#include "page_cache.h" -+#include "context.h" -+ -+#include "plugin/plugin.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* declare hash table of jnodes (jnodes proper, that is, unformatted -+ nodes) */ -+TYPE_SAFE_HASH_DECLARE(j, jnode); -+ -+/* declare hash table of znodes */ -+TYPE_SAFE_HASH_DECLARE(z, znode); -+ -+typedef struct { -+ __u64 objectid; -+ unsigned long index; -+ struct address_space *mapping; -+} jnode_key_t; -+ -+/* -+ Jnode is the "base class" of other nodes in reiser4. It is also happens to -+ be exactly the node we use for unformatted tree nodes. -+ -+ Jnode provides following basic functionality: -+ -+ . reference counting and indexing. -+ -+ . integration with page cache. Jnode has ->pg reference to which page can -+ be attached. -+ -+ . interface to transaction manager. It is jnode that is kept in transaction -+ manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this -+ means, there should be special type of jnode for inode.) -+ -+ Locking: -+ -+ Spin lock: the following fields are protected by the per-jnode spin lock: -+ -+ ->state -+ ->atom -+ ->capture_link -+ -+ Following fields are protected by the global tree lock: -+ -+ ->link -+ ->key.z (content of ->key.z is only changed in znode_rehash()) -+ ->key.j -+ -+ Atomic counters -+ -+ ->x_count -+ ->d_count -+ -+ ->pg, and ->data are protected by spin lock for unused jnode and are -+ immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable() -+ is false). -+ -+ ->tree is immutable after creation -+ -+ Unclear -+ -+ ->blocknr: should be under jnode spin-lock, but current interface is based -+ on passing of block address. 
-+ -+ If you ever need to spin lock two nodes at once, do this in "natural" -+ memory order: lock znode with lower address first. (See lock_two_nodes().) -+ -+ Invariants involving this data-type: -+ -+ [jnode-dirty] -+ [jnode-refs] -+ [jnode-oid] -+ [jnode-queued] -+ [jnode-atom-valid] -+ [jnode-page-binding] -+*/ -+ -+struct jnode { -+#if REISER4_DEBUG -+#define JMAGIC 0x52654973 /* "ReIs" */ -+ int magic; -+#endif -+ /* FIRST CACHE LINE (16 bytes): data used by jload */ -+ -+ /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */ -+ /* 0 */ unsigned long state; -+ -+ /* lock, protecting jnode's fields. */ -+ /* 4 */ spinlock_t load; -+ -+ /* counter of references to jnode itself. Increased on jref(). -+ Decreased on jput(). -+ */ -+ /* 8 */ atomic_t x_count; -+ -+ /* counter of references to jnode's data. Pin data page(s) in -+ memory while this is greater than 0. Increased on jload(). -+ Decreased on jrelse(). -+ */ -+ /* 12 */ atomic_t d_count; -+ -+ /* SECOND CACHE LINE: data used by hash table lookups */ -+ -+ /* 16 */ union { -+ /* znodes are hashed by block number */ -+ reiser4_block_nr z; -+ /* unformatted nodes are hashed by mapping plus offset */ -+ jnode_key_t j; -+ } key; -+ -+ /* THIRD CACHE LINE */ -+ -+ /* 32 */ union { -+ /* pointers to maintain hash-table */ -+ z_hash_link z; -+ j_hash_link j; -+ } link; -+ -+ /* pointer to jnode page. */ -+ /* 36 */ struct page *pg; -+ /* pointer to node itself. 
This is page_address(node->pg) when page is -+ attached to the jnode -+ */ -+ /* 40 */ void *data; -+ -+ /* 44 */ reiser4_tree *tree; -+ -+ /* FOURTH CACHE LINE: atom related fields */ -+ -+ /* 48 */ spinlock_t guard; -+ -+ /* atom the block is in, if any */ -+ /* 52 */ txn_atom *atom; -+ -+ /* capture list */ -+ /* 56 */ struct list_head capture_link; -+ -+ /* FIFTH CACHE LINE */ -+ -+ /* 64 */ struct rcu_head rcu; -+ /* crosses cache line */ -+ -+ /* SIXTH CACHE LINE */ -+ -+ /* the real blocknr (where io is going to/from) */ -+ /* 80 */ reiser4_block_nr blocknr; -+ /* Parent item type, unformatted and CRC need it for offset => key conversion. */ -+ /* NOTE: this parent_item_id looks like jnode type. */ -+ /* 88 */ reiser4_plugin_id parent_item_id; -+ /* 92 */ -+#if REISER4_DEBUG -+ /* number of pages referenced by the jnode (meaningful while capturing of -+ page clusters) */ -+ int page_count; -+ /* list of all jnodes for debugging purposes. */ -+ struct list_head jnodes; -+ /* how many times this jnode was written in one transaction */ -+ int written; -+ /* this indicates which atom's list the jnode is on */ -+ atom_list list; -+#endif -+} __attribute__ ((aligned(16))); -+ -+/* -+ * jnode types. Enumeration of existing jnode types. -+ */ -+typedef enum { -+ JNODE_UNFORMATTED_BLOCK, /* unformatted block */ -+ JNODE_FORMATTED_BLOCK, /* formatted block, znode */ -+ JNODE_BITMAP, /* bitmap */ -+ JNODE_IO_HEAD, /* jnode representing a block in the -+ * wandering log */ -+ JNODE_INODE, /* jnode embedded into inode */ -+ LAST_JNODE_TYPE -+} jnode_type; -+ -+/* jnode states */ -+typedef enum { -+ /* jnode's page is loaded and data checked */ -+ JNODE_PARSED = 0, -+ /* node was deleted, not all locks on it were released. This -+ node is empty and is going to be removed from the tree -+ shortly. 
*/ -+ JNODE_HEARD_BANSHEE = 1, -+ /* left sibling pointer is valid */ -+ JNODE_LEFT_CONNECTED = 2, -+ /* right sibling pointer is valid */ -+ JNODE_RIGHT_CONNECTED = 3, -+ -+ /* znode was just created and doesn't yet have a pointer from -+ its parent */ -+ JNODE_ORPHAN = 4, -+ -+ /* this node was created by its transaction and has not been assigned -+ a block address. */ -+ JNODE_CREATED = 5, -+ -+ /* this node is currently relocated */ -+ JNODE_RELOC = 6, -+ /* this node is currently wandered */ -+ JNODE_OVRWR = 7, -+ -+ /* this znode has been modified */ -+ JNODE_DIRTY = 8, -+ -+ /* znode lock is being invalidated */ -+ JNODE_IS_DYING = 9, -+ -+ /* THIS PLACE IS INTENTIONALLY LEFT BLANK */ -+ -+ /* jnode is queued for flushing. */ -+ JNODE_FLUSH_QUEUED = 12, -+ -+ /* In the following bits jnode type is encoded. */ -+ JNODE_TYPE_1 = 13, -+ JNODE_TYPE_2 = 14, -+ JNODE_TYPE_3 = 15, -+ -+ /* jnode is being destroyed */ -+ JNODE_RIP = 16, -+ -+ /* znode was not captured during locking (it might so be because -+ ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */ -+ JNODE_MISSED_IN_CAPTURE = 17, -+ -+ /* write is in progress */ -+ JNODE_WRITEBACK = 18, -+ -+ /* FIXME: now it is used by crypto-compress plugin only */ -+ JNODE_NEW = 19, -+ -+ /* delimiting keys are already set for this znode. */ -+ JNODE_DKSET = 20, -+ -+ /* when this bit is set page and jnode can not be disconnected */ -+ JNODE_WRITE_PREPARED = 21, -+ -+ JNODE_CLUSTER_PAGE = 22, -+ /* Jnode is marked for repacking, that means the reiser4 flush and the -+ * block allocator should process this node special way */ -+ JNODE_REPACK = 23, -+ /* node should be converted by flush in squalloc phase */ -+ JNODE_CONVERTIBLE = 24, -+ /* -+ * When jnode is dirtied for the first time in given transaction, -+ * do_jnode_make_dirty() checks whether this jnode can possible became -+ * member of overwrite set. If so, this bit is set, and one block is -+ * reserved in the ->flush_reserved space of atom. 
-+ * -+ * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when -+ * -+ * (1) flush decides that we want this block to go into relocate -+ * set after all. -+ * -+ * (2) wandering log is allocated (by log writer) -+ * -+ * (3) extent is allocated -+ * -+ */ -+ JNODE_FLUSH_RESERVED = 29 -+} reiser4_jnode_state; -+ -+/* Macros for accessing the jnode state. */ -+ -+static inline void JF_CLR(jnode * j, int f) -+{ -+ assert("unknown-1", j->magic == JMAGIC); -+ clear_bit(f, &j->state); -+} -+static inline int JF_ISSET(const jnode * j, int f) -+{ -+ assert("unknown-2", j->magic == JMAGIC); -+ return test_bit(f, &((jnode *) j)->state); -+} -+static inline void JF_SET(jnode * j, int f) -+{ -+ assert("unknown-3", j->magic == JMAGIC); -+ set_bit(f, &j->state); -+} -+ -+static inline int JF_TEST_AND_SET(jnode * j, int f) -+{ -+ assert("unknown-4", j->magic == JMAGIC); -+ return test_and_set_bit(f, &j->state); -+} -+ -+static inline void spin_lock_jnode(jnode *node) -+{ -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(rw_locked_tree) && -+ LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_LT(spin_locked_jnode, 2))); -+ -+ spin_lock(&(node->guard)); -+ -+ LOCK_CNT_INC(spin_locked_jnode); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void spin_unlock_jnode(jnode *node) -+{ -+ assert_spin_locked(&(node->guard)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_jnode); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&(node->guard)); -+} -+ -+static inline int jnode_is_in_deleteset(const jnode * node) -+{ -+ return JF_ISSET(node, JNODE_RELOC); -+} -+ -+extern int init_jnodes(void); -+extern void done_jnodes(void); -+ -+/* Jnode routines */ -+extern jnode *jalloc(void); -+extern void jfree(jnode * node) NONNULL; -+extern jnode *jclone(jnode *); -+extern jnode 
*jlookup(reiser4_tree * tree, -+ oid_t objectid, unsigned long ind) NONNULL; -+extern jnode *jfind(struct address_space *, unsigned long index) NONNULL; -+extern jnode *jnode_by_page(struct page *pg) NONNULL; -+extern jnode *jnode_of_page(struct page *pg) NONNULL; -+void jnode_attach_page(jnode * node, struct page *pg); -+ -+void unhash_unformatted_jnode(jnode *); -+extern jnode *page_next_jnode(jnode * node) NONNULL; -+extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL; -+extern void jnode_make_dirty(jnode * node) NONNULL; -+extern void jnode_make_clean(jnode * node) NONNULL; -+extern void jnode_make_wander_nolock(jnode * node) NONNULL; -+extern void jnode_make_wander(jnode *) NONNULL; -+extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL; -+extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL; -+extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL; -+ -+/** -+ * jnode_get_block -+ * @node: jnode to query -+ * -+ */ -+static inline const reiser4_block_nr *jnode_get_block(const jnode *node) -+{ -+ assert("nikita-528", node != NULL); -+ -+ return &node->blocknr; -+} -+ -+/** -+ * jnode_set_block -+ * @node: jnode to update -+ * @blocknr: new block nr -+ */ -+static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr) -+{ -+ assert("nikita-2020", node != NULL); -+ assert("umka-055", blocknr != NULL); -+ node->blocknr = *blocknr; -+} -+ -+ -+/* block number for IO. Usually this is the same as jnode_get_block(), unless -+ * jnode was emergency flushed---then block number chosen by eflush is -+ * used. */ -+static inline const reiser4_block_nr *jnode_get_io_block(jnode * node) -+{ -+ assert("nikita-2768", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ -+ return jnode_get_block(node); -+} -+ -+/* Jnode flush interface. 
*/ -+extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos); -+extern flush_queue_t *reiser4_pos_fq(flush_pos_t * pos); -+ -+/* FIXME-VS: these are used in plugin/item/extent.c */ -+ -+/* does extent_get_block have to be called */ -+#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED) -+#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED) -+ -+/* the node should be converted during flush squalloc phase */ -+#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE) -+#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE) -+ -+/* Macros to convert from jnode to znode, znode to jnode. These are macros -+ because C doesn't allow overloading of const prototypes. */ -+#define ZJNODE(x) (& (x) -> zjnode) -+#define JZNODE(x) \ -+({ \ -+ typeof (x) __tmp_x; \ -+ \ -+ __tmp_x = (x); \ -+ assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \ -+ (znode*) __tmp_x; \ -+}) -+ -+extern int jnodes_tree_init(reiser4_tree * tree); -+extern int jnodes_tree_done(reiser4_tree * tree); -+ -+#if REISER4_DEBUG -+ -+extern int znode_is_any_locked(const znode * node); -+extern void jnode_list_remove(jnode * node); -+ -+#else -+ -+#define jnode_list_remove(node) noop -+ -+#endif -+ -+int znode_is_root(const znode * node) NONNULL; -+ -+/* bump reference counter on @node */ -+static inline void add_x_ref(jnode * node /* node to increase x_count of */ ) -+{ -+ assert("nikita-1911", node != NULL); -+ -+ atomic_inc(&node->x_count); -+ LOCK_CNT_INC(x_refs); -+} -+ -+static inline void dec_x_ref(jnode * node) -+{ -+ assert("nikita-3215", node != NULL); -+ assert("nikita-3216", atomic_read(&node->x_count) > 0); -+ -+ atomic_dec(&node->x_count); -+ assert("nikita-3217", LOCK_CNT_GTZ(x_refs)); -+ LOCK_CNT_DEC(x_refs); -+} -+ -+/* jref() - increase counter of references to jnode/znode (x_count) */ -+static inline jnode *jref(jnode * node) -+{ -+ assert("jmacd-508", (node != NULL) && !IS_ERR(node)); -+ add_x_ref(node); -+ return node; -+} -+ -+/* get the page of 
jnode */ -+static inline struct page *jnode_page(const jnode * node) -+{ -+ return node->pg; -+} -+ -+/* return pointer to jnode data */ -+static inline char *jdata(const jnode * node) -+{ -+ assert("nikita-1415", node != NULL); -+ assert("nikita-3198", jnode_page(node) != NULL); -+ return node->data; -+} -+ -+static inline int jnode_is_loaded(const jnode * node) -+{ -+ assert("zam-506", node != NULL); -+ return atomic_read(&node->d_count) > 0; -+} -+ -+extern void page_clear_jnode(struct page *page, jnode * node) NONNULL; -+ -+static inline void jnode_set_reloc(jnode * node) -+{ -+ assert("nikita-2431", node != NULL); -+ assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR)); -+ JF_SET(node, JNODE_RELOC); -+} -+ -+/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */ -+ -+extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL; -+ -+static inline int jload(jnode *node) -+{ -+ return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1); -+} -+ -+extern int jinit_new(jnode *, gfp_t) NONNULL; -+extern int jstartio(jnode *) NONNULL; -+ -+extern void jdrop(jnode *) NONNULL; -+extern int jwait_io(jnode *, int rw) NONNULL; -+ -+void jload_prefetch(jnode *); -+ -+extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL; -+extern void reiser4_drop_io_head(jnode * node) NONNULL; -+ -+static inline reiser4_tree *jnode_get_tree(const jnode * node) -+{ -+ assert("nikita-2691", node != NULL); -+ return node->tree; -+} -+ -+extern void pin_jnode_data(jnode *); -+extern void unpin_jnode_data(jnode *); -+ -+static inline jnode_type jnode_get_type(const jnode * node) -+{ -+ static const unsigned long state_mask = -+ (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3); -+ -+ static jnode_type mask_to_type[] = { -+ /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */ -+ -+ /* 000 */ -+ [0] = JNODE_FORMATTED_BLOCK, -+ /* 001 */ -+ [1] = JNODE_UNFORMATTED_BLOCK, -+ /* 010 */ -+ [2] = JNODE_BITMAP, -+ /* 011 */ -+ [3] = LAST_JNODE_TYPE, 
/*invalid */ -+ /* 100 */ -+ [4] = JNODE_INODE, -+ /* 101 */ -+ [5] = LAST_JNODE_TYPE, -+ /* 110 */ -+ [6] = JNODE_IO_HEAD, -+ /* 111 */ -+ [7] = LAST_JNODE_TYPE, /* invalid */ -+ }; -+ -+ return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1]; -+} -+ -+/* returns true if node is a znode */ -+static inline int jnode_is_znode(const jnode * node) -+{ -+ return jnode_get_type(node) == JNODE_FORMATTED_BLOCK; -+} -+ -+static inline int jnode_is_flushprepped(jnode * node) -+{ -+ assert("jmacd-78212", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) || -+ JF_ISSET(node, JNODE_OVRWR); -+} -+ -+/* Return true if @node has already been processed by the squeeze and allocate -+ process. This implies the block address has been finalized for the -+ duration of this atom (or it is clean and will remain in place). If this -+ returns true you may use the block number as a hint. */ -+static inline int jnode_check_flushprepped(jnode * node) -+{ -+ int result; -+ -+ /* It must be clean or relocated or wandered. New allocations are set to relocate. 
*/ -+ spin_lock_jnode(node); -+ result = jnode_is_flushprepped(node); -+ spin_unlock_jnode(node); -+ return result; -+} -+ -+/* returns true if node is unformatted */ -+static inline int jnode_is_unformatted(const jnode * node) -+{ -+ assert("jmacd-0123", node != NULL); -+ return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK; -+} -+ -+/* returns true if node represents a cluster cache page */ -+static inline int jnode_is_cluster_page(const jnode * node) -+{ -+ assert("edward-50", node != NULL); -+ return (JF_ISSET(node, JNODE_CLUSTER_PAGE)); -+} -+ -+/* returns true is node is builtin inode's jnode */ -+static inline int jnode_is_inode(const jnode * node) -+{ -+ assert("vs-1240", node != NULL); -+ return jnode_get_type(node) == JNODE_INODE; -+} -+ -+static inline jnode_plugin *jnode_ops_of(const jnode_type type) -+{ -+ assert("nikita-2367", type < LAST_JNODE_TYPE); -+ return jnode_plugin_by_id((reiser4_plugin_id) type); -+} -+ -+static inline jnode_plugin *jnode_ops(const jnode * node) -+{ -+ assert("nikita-2366", node != NULL); -+ -+ return jnode_ops_of(jnode_get_type(node)); -+} -+ -+/* Get the index of a block. */ -+static inline unsigned long jnode_get_index(jnode * node) -+{ -+ return jnode_ops(node)->index(node); -+} -+ -+/* return true if "node" is the root */ -+static inline int jnode_is_root(const jnode * node) -+{ -+ return jnode_is_znode(node) && znode_is_root(JZNODE(node)); -+} -+ -+extern struct address_space *mapping_jnode(const jnode * node); -+extern unsigned long index_jnode(const jnode * node); -+ -+static inline void jput(jnode * node); -+extern void jput_final(jnode * node); -+ -+/* bump data counter on @node */ -+static inline void add_d_ref(jnode * node /* node to increase d_count of */ ) -+{ -+ assert("nikita-1962", node != NULL); -+ -+ atomic_inc(&node->d_count); -+ if (jnode_is_unformatted(node) || jnode_is_znode(node)) -+ LOCK_CNT_INC(d_refs); -+} -+ -+/* jput() - decrement x_count reference counter on znode. 
-+ -+ Count may drop to 0, jnode stays in cache until memory pressure causes the -+ eviction of its page. The c_count variable also ensures that children are -+ pressured out of memory before the parent. The jnode remains hashed as -+ long as the VM allows its page to stay in memory. -+*/ -+static inline void jput(jnode * node) -+{ -+ assert("jmacd-509", node != NULL); -+ assert("jmacd-510", atomic_read(&node->x_count) > 0); -+ assert("zam-926", reiser4_schedulable()); -+ LOCK_CNT_DEC(x_refs); -+ -+ rcu_read_lock(); -+ /* -+ * we don't need any kind of lock here--jput_final() uses RCU. -+ */ -+ if (unlikely(atomic_dec_and_test(&node->x_count))) { -+ jput_final(node); -+ } else -+ rcu_read_unlock(); -+ assert("nikita-3473", reiser4_schedulable()); -+} -+ -+extern void jrelse(jnode * node); -+extern void jrelse_tail(jnode * node); -+ -+extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node); -+ -+/* resolve race with jput */ -+static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node) -+{ -+ if (unlikely(JF_ISSET(node, JNODE_RIP))) -+ node = jnode_rip_sync(tree, node); -+ return node; -+} -+ -+extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key); -+ -+#if REISER4_DEBUG -+extern int jnode_invariant_f(const jnode *node, char const **msg); -+#endif -+ -+extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE]; -+ -+/* __JNODE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/kassign.c linux-2.6.20/fs/reiser4/kassign.c ---- linux-2.6.20.orig/fs/reiser4/kassign.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/kassign.c 2007-05-06 14:50:43.734986973 +0400 -@@ -0,0 +1,661 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Key assignment policy implementation */ -+ -+/* -+ * In reiser4 every piece of file system data and meta-data has a key. Keys -+ * are used to store information in and retrieve it from reiser4 internal -+ * tree. In addition to this, keys define _ordering_ of all file system -+ * information: things having close keys are placed into the same or -+ * neighboring (in the tree order) nodes of the tree. As our block allocator -+ * tries to respect tree order (see flush.c), keys also define order in which -+ * things are laid out on the disk, and hence, affect performance directly. -+ * -+ * Obviously, assignment of keys to data and meta-data should be consistent -+ * across whole file system. Algorithm that calculates a key for a given piece -+ * of data or meta-data is referred to as "key assignment". -+ * -+ * Key assignment is too expensive to be implemented as a plugin (that is, -+ * with an ability to support different key assignment schemas in the same -+ * compiled kernel image). As a compromise, all key-assignment functions and -+ * data-structures are collected in this single file, so that modifications to -+ * key assignment algorithm can be localized. Additional changes may be -+ * required in key.[ch]. -+ * -+ * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one -+ * may guess, there is "Plan B" too. -+ * -+ */ -+ -+/* -+ * Additional complication with key assignment implementation is a requirement -+ * to support different key length. 
-+ */ -+ -+/* -+ * KEY ASSIGNMENT: PLAN A, LONG KEYS. -+ * -+ * DIRECTORY ITEMS -+ * -+ * | 60 | 4 | 7 |1| 56 | 64 | 64 | -+ * +--------------+---+---+-+-------------+------------------+-----------------+ -+ * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash | -+ * +--------------+---+---+-+-------------+------------------+-----------------+ -+ * | | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * dirid objectid of directory this item is for -+ * -+ * F fibration, see fs/reiser4/plugin/fibration.[ch] -+ * -+ * H 1 if last 8 bytes of the key contain hash, -+ * 0 if last 8 bytes of the key contain prefix-3 -+ * -+ * prefix-1 first 7 characters of file name. -+ * Padded by zeroes if name is not long enough. -+ * -+ * prefix-2 next 8 characters of the file name. -+ * -+ * prefix-3 next 8 characters of the file name. -+ * -+ * hash hash of the rest of file name (i.e., portion of file -+ * name not included into prefix-1 and prefix-2). -+ * -+ * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded -+ * in the key. Such file names are called "short". They are distinguished by H -+ * bit set 0 in the key. -+ * -+ * Other file names are "long". For long name, H bit is 1, and first 15 (== 7 -+ * + 8) characters are encoded in prefix-1 and prefix-2 portions of the -+ * key. Last 8 bytes of the key are occupied by hash of the remaining -+ * characters of the name. -+ * -+ * This key assignment reaches following important goals: -+ * -+ * (1) directory entries are sorted in approximately lexicographical -+ * order. -+ * -+ * (2) collisions (when multiple directory items have the same key), while -+ * principally unavoidable in a tree with fixed length keys, are rare. 
-+ * -+ * STAT DATA -+ * -+ * | 60 | 4 | 64 | 4 | 60 | 64 | -+ * +--------------+---+-----------------+---+--------------+-----------------+ -+ * | locality id | 1 | ordering | 0 | objectid | 0 | -+ * +--------------+---+-----------------+---+--------------+-----------------+ -+ * | | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * locality id object id of a directory where first name was created for -+ * the object -+ * -+ * ordering copy of second 8-byte portion of the key of directory -+ * entry for the first name of this object. Ordering has a form -+ * { -+ * fibration :7; -+ * h :1; -+ * prefix1 :56; -+ * } -+ * see description of key for directory entry above. -+ * -+ * objectid object id for this object -+ * -+ * This key assignment policy is designed to keep stat-data in the same order -+ * as corresponding directory items, thus speeding up readdir/stat types of -+ * workload. -+ * -+ * FILE BODY -+ * -+ * | 60 | 4 | 64 | 4 | 60 | 64 | -+ * +--------------+---+-----------------+---+--------------+-----------------+ -+ * | locality id | 4 | ordering | 0 | objectid | offset | -+ * +--------------+---+-----------------+---+--------------+-----------------+ -+ * | | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * locality id object id of a directory where first name was created for -+ * the object -+ * -+ * ordering the same as in the key of stat-data for this object -+ * -+ * objectid object id for this object -+ * -+ * offset logical offset from the beginning of this file. -+ * Measured in bytes. -+ * -+ * -+ * KEY ASSIGNMENT: PLAN A, SHORT KEYS. 
-+ * -+ * DIRECTORY ITEMS -+ * -+ * | 60 | 4 | 7 |1| 56 | 64 | -+ * +--------------+---+---+-+-------------+-----------------+ -+ * | dirid | 0 | F |H| prefix-1 | prefix-2/hash | -+ * +--------------+---+---+-+-------------+-----------------+ -+ * | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * dirid objectid of directory this item is for -+ * -+ * F fibration, see fs/reiser4/plugin/fibration.[ch] -+ * -+ * H 1 if last 8 bytes of the key contain hash, -+ * 0 if last 8 bytes of the key contain prefix-2 -+ * -+ * prefix-1 first 7 characters of file name. -+ * Padded by zeroes if name is not long enough. -+ * -+ * prefix-2 next 8 characters of the file name. -+ * -+ * hash hash of the rest of file name (i.e., portion of file -+ * name not included into prefix-1). -+ * -+ * File names shorter than 15 (== 7 + 8) characters are completely encoded in -+ * the key. Such file names are called "short". They are distinguished by H -+ * bit set in the key. -+ * -+ * Other file names are "long". For long name, H bit is 0, and first 7 -+ * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the -+ * key are occupied by hash of the remaining characters of the name. 
-+ * -+ * STAT DATA -+ * -+ * | 60 | 4 | 4 | 60 | 64 | -+ * +--------------+---+---+--------------+-----------------+ -+ * | locality id | 1 | 0 | objectid | 0 | -+ * +--------------+---+---+--------------+-----------------+ -+ * | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * locality id object id of a directory where first name was created for -+ * the object -+ * -+ * objectid object id for this object -+ * -+ * FILE BODY -+ * -+ * | 60 | 4 | 4 | 60 | 64 | -+ * +--------------+---+---+--------------+-----------------+ -+ * | locality id | 4 | 0 | objectid | offset | -+ * +--------------+---+---+--------------+-----------------+ -+ * | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * locality id object id of a directory where first name was created for -+ * the object -+ * -+ * objectid object id for this object -+ * -+ * offset logical offset from the beginning of this file. -+ * Measured in bytes. -+ * -+ * -+ */ -+ -+#include "debug.h" -+#include "key.h" -+#include "kassign.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "super.h" -+#include "dscale.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block, etc */ -+ -+/* bitmask for H bit (see comment at the beginning of this file */ -+static const __u64 longname_mark = 0x0100000000000000ull; -+/* bitmask for F and H portions of the key. */ -+static const __u64 fibration_mask = 0xff00000000000000ull; -+ -+/* return true if name is not completely encoded in @key */ -+int is_longname_key(const reiser4_key * key) -+{ -+ __u64 highpart; -+ -+ assert("nikita-2863", key != NULL); -+ if (get_key_type(key) != KEY_FILE_NAME_MINOR) -+ reiser4_print_key("oops", key); -+ assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR); -+ -+ if (REISER4_LARGE_KEY) -+ highpart = get_key_ordering(key); -+ else -+ highpart = get_key_objectid(key); -+ -+ return (highpart & longname_mark) ? 
1 : 0; -+} -+ -+/* return true if @name is too long to be completely encoded in the key */ -+int is_longname(const char *name UNUSED_ARG, int len) -+{ -+ if (REISER4_LARGE_KEY) -+ return len > 23; -+ else -+ return len > 15; -+} -+ -+/* code ascii string into __u64. -+ -+ Put characters of @name into result (@str) one after another starting -+ from @start_idx-th highest (arithmetically) byte. This produces -+ endian-safe encoding. memcpy(2) will not do. -+ -+*/ -+static __u64 pack_string(const char *name /* string to encode */ , -+ int start_idx /* highest byte in result from -+ * which to start encoding */ ) -+{ -+ unsigned i; -+ __u64 str; -+ -+ str = 0; -+ for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) { -+ str <<= 8; -+ str |= (unsigned char)name[i]; -+ } -+ str <<= (sizeof str - i - start_idx) << 3; -+ return str; -+} -+ -+/* opposite to pack_string(). Takes value produced by pack_string(), restores -+ * string encoded in it and stores result in @buf */ -+char * reiser4_unpack_string(__u64 value, char *buf) -+{ -+ do { -+ *buf = value >> (64 - 8); -+ if (*buf) -+ ++buf; -+ value <<= 8; -+ } while (value != 0); -+ *buf = 0; -+ return buf; -+} -+ -+/* obtain name encoded in @key and store it in @buf */ -+char *extract_name_from_key(const reiser4_key * key, char *buf) -+{ -+ char *c; -+ -+ assert("nikita-2868", !is_longname_key(key)); -+ -+ c = buf; -+ if (REISER4_LARGE_KEY) { -+ c = reiser4_unpack_string(get_key_ordering(key) & -+ ~fibration_mask, c); -+ c = reiser4_unpack_string(get_key_fulloid(key), c); -+ } else -+ c = reiser4_unpack_string(get_key_fulloid(key) & -+ ~fibration_mask, c); -+ reiser4_unpack_string(get_key_offset(key), c); -+ return buf; -+} -+ -+/** -+ * complete_entry_key - calculate entry key by name -+ * @dir: directory where entry is (or will be) in -+ * @name: name to calculate key of -+ * @len: lenth of name -+ * @result: place to store result in -+ * -+ * Sets fields of entry key @result which depend on file name. 
-+ * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering, -+ * objectid and offset. Otherwise, objectid and offset are set. -+ */ -+void complete_entry_key(const struct inode *dir, const char *name, -+ int len, reiser4_key *result) -+{ -+#if REISER4_LARGE_KEY -+ __u64 ordering; -+ __u64 objectid; -+ __u64 offset; -+ -+ assert("nikita-1139", dir != NULL); -+ assert("nikita-1142", result != NULL); -+ assert("nikita-2867", strlen(name) == len); -+ -+ /* -+ * key allocation algorithm for directory entries in case of large -+ * keys: -+ * -+ * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7 -+ * characters into ordering field of key, next 8 charactes (if any) -+ * into objectid field of key and next 8 ones (of any) into offset -+ * field of key -+ * -+ * If file name is longer than 23 characters, put first 7 characters -+ * into key's ordering, next 8 to objectid and hash of remaining -+ * characters into offset field. -+ * -+ * To distinguish above cases, in latter set up unused high bit in -+ * ordering field. -+ */ -+ -+ /* [0-6] characters to ordering */ -+ ordering = pack_string(name, 1); -+ if (len > 7) { -+ /* [7-14] characters to objectid */ -+ objectid = pack_string(name + 7, 0); -+ if (len > 15) { -+ if (len <= 23) { -+ /* [15-23] characters to offset */ -+ offset = pack_string(name + 15, 0); -+ } else { -+ /* note in a key the fact that offset contains hash. */ -+ ordering |= longname_mark; -+ -+ /* offset is the hash of the file name's tail. 
*/ -+ offset = inode_hash_plugin(dir)->hash(name + 15, -+ len - 15); -+ } -+ } else { -+ offset = 0ull; -+ } -+ } else { -+ objectid = 0ull; -+ offset = 0ull; -+ } -+ -+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL); -+ ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len); -+ -+ set_key_ordering(result, ordering); -+ set_key_fulloid(result, objectid); -+ set_key_offset(result, offset); -+ return; -+ -+#else -+ __u64 objectid; -+ __u64 offset; -+ -+ assert("nikita-1139", dir != NULL); -+ assert("nikita-1142", result != NULL); -+ assert("nikita-2867", strlen(name) == len); -+ -+ /* -+ * key allocation algorithm for directory entries in case of not large -+ * keys: -+ * -+ * If name is not longer than 7 + 8 = 15 characters, put first 7 -+ * characters into objectid field of key, next 8 charactes (if any) -+ * into offset field of key -+ * -+ * If file name is longer than 15 characters, put first 7 characters -+ * into key's objectid, and hash of remaining characters into offset -+ * field. -+ * -+ * To distinguish above cases, in latter set up unused high bit in -+ * objectid field. -+ */ -+ -+ /* [0-6] characters to objectid */ -+ objectid = pack_string(name, 1); -+ if (len > 7) { -+ if (len <= 15) { -+ /* [7-14] characters to offset */ -+ offset = pack_string(name + 7, 0); -+ } else { -+ /* note in a key the fact that offset contains hash. */ -+ objectid |= longname_mark; -+ -+ /* offset is the hash of the file name. */ -+ offset = inode_hash_plugin(dir)->hash(name + 7, -+ len - 7); -+ } -+ } else -+ offset = 0ull; -+ -+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL); -+ objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len); -+ -+ set_key_fulloid(result, objectid); -+ set_key_offset(result, offset); -+ return; -+#endif /* ! REISER4_LARGE_KEY */ -+} -+ -+/* true, if @key is the key of "." 
*/ -+int is_dot_key(const reiser4_key * key /* key to check */ ) -+{ -+ assert("nikita-1717", key != NULL); -+ assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR); -+ return -+ (get_key_ordering(key) == 0ull) && -+ (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull); -+} -+ -+/* build key for stat-data. -+ -+ return key of stat-data of this object. This should became sd plugin -+ method in the future. For now, let it be here. -+ -+*/ -+reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ , -+ reiser4_key * result /* resulting key of @target -+ stat-data */ ) -+{ -+ assert("nikita-261", result != NULL); -+ -+ reiser4_key_init(result); -+ set_key_locality(result, reiser4_inode_data(target)->locality_id); -+ set_key_ordering(result, get_inode_ordering(target)); -+ set_key_objectid(result, get_inode_oid(target)); -+ set_key_type(result, KEY_SD_MINOR); -+ set_key_offset(result, (__u64) 0); -+ return result; -+} -+ -+/* encode part of key into &obj_key_id -+ -+ This encodes into @id part of @key sufficient to restore @key later, -+ given that latter is key of object (key of stat-data). -+ -+ See &obj_key_id -+*/ -+int build_obj_key_id(const reiser4_key * key /* key to encode */ , -+ obj_key_id * id /* id where key is encoded in */ ) -+{ -+ assert("nikita-1151", key != NULL); -+ assert("nikita-1152", id != NULL); -+ -+ memcpy(id, key, sizeof *id); -+ return 0; -+} -+ -+/* encode reference to @obj in @id. -+ -+ This is like build_obj_key_id() above, but takes inode as parameter. */ -+int build_inode_key_id(const struct inode *obj /* object to build key of */ , -+ obj_key_id * id /* result */ ) -+{ -+ reiser4_key sdkey; -+ -+ assert("nikita-1166", obj != NULL); -+ assert("nikita-1167", id != NULL); -+ -+ build_sd_key(obj, &sdkey); -+ build_obj_key_id(&sdkey, id); -+ return 0; -+} -+ -+/* decode @id back into @key -+ -+ Restore key of object stat-data from @id. This is dual to -+ build_obj_key_id() above. 
-+*/ -+int extract_key_from_id(const obj_key_id * id /* object key id to extract key -+ * from */ , -+ reiser4_key * key /* result */ ) -+{ -+ assert("nikita-1153", id != NULL); -+ assert("nikita-1154", key != NULL); -+ -+ reiser4_key_init(key); -+ memcpy(key, id, sizeof *id); -+ return 0; -+} -+ -+/* extract objectid of directory from key of directory entry within said -+ directory. -+ */ -+oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of -+ * directory -+ * entry */ ) -+{ -+ assert("nikita-1314", de_key != NULL); -+ return get_key_locality(de_key); -+} -+ -+/* encode into @id key of directory entry. -+ -+ Encode into @id information sufficient to later distinguish directory -+ entries within the same directory. This is not whole key, because all -+ directory entries within directory item share locality which is equal -+ to objectid of their directory. -+ -+*/ -+int build_de_id(const struct inode *dir /* inode of directory */ , -+ const struct qstr *name /* name to be given to @obj by -+ * directory entry being -+ * constructed */ , -+ de_id * id /* short key of directory entry */ ) -+{ -+ reiser4_key key; -+ -+ assert("nikita-1290", dir != NULL); -+ assert("nikita-1292", id != NULL); -+ -+ /* NOTE-NIKITA this is suboptimal. */ -+ inode_dir_plugin(dir)->build_entry_key(dir, name, &key); -+ return build_de_id_by_key(&key, id); -+} -+ -+/* encode into @id key of directory entry. -+ -+ Encode into @id information sufficient to later distinguish directory -+ entries within the same directory. This is not whole key, because all -+ directory entries within directory item share locality which is equal -+ to objectid of their directory. -+ -+*/ -+int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory -+ * entry */ , -+ de_id * id /* short key of directory entry */ ) -+{ -+ memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id); -+ return 0; -+} -+ -+/* restore from @id key of directory entry. 
-+ -+ Function dual to build_de_id(): given @id and locality, build full -+ key of directory entry within directory item. -+ -+*/ -+int extract_key_from_de_id(const oid_t locality /* locality of directory -+ * entry */ , -+ const de_id * id /* directory entry id */ , -+ reiser4_key * key /* result */ ) -+{ -+ /* no need to initialise key here: all fields are overwritten */ -+ memcpy(((__u64 *) key) + 1, id, sizeof *id); -+ set_key_locality(key, locality); -+ set_key_type(key, KEY_FILE_NAME_MINOR); -+ return 0; -+} -+ -+/* compare two &de_id's */ -+cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ , -+ const de_id * id2 /* second &de_id to compare */ ) -+{ -+ /* NOTE-NIKITA ugly implementation */ -+ reiser4_key k1; -+ reiser4_key k2; -+ -+ extract_key_from_de_id((oid_t) 0, id1, &k1); -+ extract_key_from_de_id((oid_t) 0, id2, &k2); -+ return keycmp(&k1, &k2); -+} -+ -+/* compare &de_id with key */ -+cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ , -+ const reiser4_key * key /* key to compare */ ) -+{ -+ cmp_t result; -+ reiser4_key *k1; -+ -+ k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]); -+ result = KEY_DIFF_EL(k1, key, 1); -+ if (result == EQUAL_TO) { -+ result = KEY_DIFF_EL(k1, key, 2); -+ if (REISER4_LARGE_KEY && result == EQUAL_TO) { -+ result = KEY_DIFF_EL(k1, key, 3); -+ } -+ } -+ return result; -+} -+ -+/* -+ * return number of bytes necessary to encode @inode identity. -+ */ -+int inode_onwire_size(const struct inode *inode) -+{ -+ int result; -+ -+ result = dscale_bytes(get_inode_oid(inode)); -+ result += dscale_bytes(get_inode_locality(inode)); -+ -+ /* -+ * ordering is large (it usually has highest bits set), so it makes -+ * little sense to dscale it. 
-+ */ -+ if (REISER4_LARGE_KEY) -+ result += sizeof(get_inode_ordering(inode)); -+ return result; -+} -+ -+/* -+ * encode @inode identity at @start -+ */ -+char *build_inode_onwire(const struct inode *inode, char *start) -+{ -+ start += dscale_write(start, get_inode_locality(inode)); -+ start += dscale_write(start, get_inode_oid(inode)); -+ -+ if (REISER4_LARGE_KEY) { -+ put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start); -+ start += sizeof(get_inode_ordering(inode)); -+ } -+ return start; -+} -+ -+/* -+ * extract key that was previously encoded by build_inode_onwire() at @addr -+ */ -+char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id) -+{ -+ __u64 val; -+ -+ addr += dscale_read(addr, &val); -+ val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR; -+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality); -+ addr += dscale_read(addr, &val); -+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid); -+#if REISER4_LARGE_KEY -+ memcpy(&key_id->ordering, addr, sizeof key_id->ordering); -+ addr += sizeof key_id->ordering; -+#endif -+ return addr; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/kassign.h linux-2.6.20/fs/reiser4/kassign.h ---- linux-2.6.20.orig/fs/reiser4/kassign.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/kassign.h 2007-05-06 14:50:43.734986973 +0400 -@@ -0,0 +1,110 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Key assignment policy interface. See kassign.c for details. */ -+ -+#if !defined( __KASSIGN_H__ ) -+#define __KASSIGN_H__ -+ -+#include "forward.h" -+#include "key.h" -+#include "dformat.h" -+ -+#include /* for __u?? 
*/ -+#include /* for struct super_block, etc */ -+#include /* for struct qstr */ -+ -+/* key assignment functions */ -+ -+/* Information from which key of file stat-data can be uniquely -+ restored. This depends on key assignment policy for -+ stat-data. Currently it's enough to store object id and locality id -+ (60+60==120) bits, because minor packing locality and offset of -+ stat-data key are always known constants: KEY_SD_MINOR and 0 -+ respectively. For simplicity 4 bits are wasted in each id, and just -+ two 64 bit integers are stored. -+ -+ This field has to be byte-aligned, because we don't want to waste -+ space in directory entries. There is another side of a coin of -+ course: we waste CPU and bus bandwidth in stead, by copying data back -+ and forth. -+ -+ Next optimization: &obj_key_id is mainly used to address stat data from -+ directory entries. Under the assumption that majority of files only have -+ only name (one hard link) from *the* parent directory it seems reasonable -+ to only store objectid of stat data and take its locality from key of -+ directory item. -+ -+ This requires some flag to be added to the &obj_key_id to distinguish -+ between these two cases. Remaining bits in flag byte are then asking to be -+ used to store file type. -+ -+ This optimization requires changes in directory item handling code. -+ -+*/ -+typedef struct obj_key_id { -+ d8 locality[sizeof(__u64)]; -+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)]; -+ ) -+ d8 objectid[sizeof(__u64)]; -+} -+obj_key_id; -+ -+/* Information sufficient to uniquely identify directory entry within -+ compressed directory item. -+ -+ For alignment issues see &obj_key_id above. 
-+*/ -+typedef struct de_id { -+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];) -+ d8 objectid[sizeof(__u64)]; -+ d8 offset[sizeof(__u64)]; -+} -+de_id; -+ -+extern int inode_onwire_size(const struct inode *obj); -+extern char *build_inode_onwire(const struct inode *obj, char *area); -+extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id); -+ -+extern int build_inode_key_id(const struct inode *obj, obj_key_id * id); -+extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key); -+extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id); -+extern oid_t extract_dir_id_from_key(const reiser4_key * de_key); -+extern int build_de_id(const struct inode *dir, const struct qstr *name, -+ de_id * id); -+extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id); -+extern int extract_key_from_de_id(const oid_t locality, const de_id * id, -+ reiser4_key * key); -+extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2); -+extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key); -+ -+extern int build_readdir_key_common(struct file *dir, reiser4_key * result); -+extern void build_entry_key_common(const struct inode *dir, -+ const struct qstr *name, -+ reiser4_key * result); -+extern void build_entry_key_stable_entry(const struct inode *dir, -+ const struct qstr *name, -+ reiser4_key * result); -+extern int is_dot_key(const reiser4_key * key); -+extern reiser4_key *build_sd_key(const struct inode *target, -+ reiser4_key * result); -+ -+extern int is_longname_key(const reiser4_key * key); -+extern int is_longname(const char *name, int len); -+extern char *extract_name_from_key(const reiser4_key * key, char *buf); -+extern char *reiser4_unpack_string(__u64 value, char *buf); -+extern void complete_entry_key(const struct inode *dir, const char *name, -+ int len, reiser4_key *result); -+ -+/* __KASSIGN_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/Kconfig linux-2.6.20/fs/reiser4/Kconfig ---- linux-2.6.20.orig/fs/reiser4/Kconfig 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/Kconfig 2007-05-06 14:50:43.734986973 +0400 -@@ -0,0 +1,32 @@ -+config REISER4_FS -+ tristate "Reiser4 (EXPERIMENTAL)" -+ depends on EXPERIMENTAL -+ select ZLIB_INFLATE -+ select ZLIB_DEFLATE -+ select CRYPTO -+ help -+ Reiser4 is a filesystem that performs all filesystem operations -+ as atomic transactions, which means that it either performs a -+ write, or it does not, and in the event of a crash it does not -+ partially perform it or corrupt it. -+ -+ It stores files in dancing trees, which are like balanced trees but -+ faster. It packs small files together so that they share blocks -+ without wasting space. This means you can use it to store really -+ small files. It also means that it saves you disk space. It avoids -+ hassling you with anachronisms like having a maximum number of -+ inodes, and wasting space if you use less than that number. -+ -+ Reiser4 is a distinct filesystem type from reiserfs (V3). -+ It's therefore not possible to use reiserfs file systems -+ with reiser4. -+ -+ To learn more about reiser4, go to http://www.namesys.com -+ -+config REISER4_DEBUG -+ bool "Enable reiser4 debug mode" -+ depends on REISER4_FS -+ help -+ Don't use this unless you are debugging reiser4. -+ -+ If unsure, say N. -diff -urN linux-2.6.20.orig/fs/reiser4/key.c linux-2.6.20/fs/reiser4/key.c ---- linux-2.6.20.orig/fs/reiser4/key.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/key.c 2007-05-06 14:50:43.734986973 +0400 -@@ -0,0 +1,137 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Key manipulations. 
*/ -+ -+#include "debug.h" -+#include "key.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include /* for __u?? */ -+ -+/* Minimal possible key: all components are zero. It is presumed that this is -+ independent of key scheme. */ -+static const reiser4_key MINIMAL_KEY = { -+ .el = { -+ 0ull, -+ ON_LARGE_KEY(0ull,) -+ 0ull, -+ 0ull -+ } -+}; -+ -+/* Maximal possible key: all components are ~0. It is presumed that this is -+ independent of key scheme. */ -+static const reiser4_key MAXIMAL_KEY = { -+ .el = { -+ __constant_cpu_to_le64(~0ull), -+ ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),) -+ __constant_cpu_to_le64(~0ull), -+ __constant_cpu_to_le64(~0ull) -+ } -+}; -+ -+/* Initialize key. */ -+void reiser4_key_init(reiser4_key * key /* key to init */ ) -+{ -+ assert("nikita-1169", key != NULL); -+ memset(key, 0, sizeof *key); -+} -+ -+/* minimal possible key in the tree. Return pointer to the static storage. */ -+const reiser4_key *reiser4_min_key(void) -+{ -+ return &MINIMAL_KEY; -+} -+ -+/* maximum possible key in the tree. Return pointer to the static storage. 
*/ -+const reiser4_key *reiser4_max_key(void) -+{ -+ return &MAXIMAL_KEY; -+} -+ -+#if REISER4_DEBUG -+/* debugging aid: print symbolic name of key type */ -+static const char *type_name(unsigned int key_type /* key type */ ) -+{ -+ switch (key_type) { -+ case KEY_FILE_NAME_MINOR: -+ return "file name"; -+ case KEY_SD_MINOR: -+ return "stat data"; -+ case KEY_ATTR_NAME_MINOR: -+ return "attr name"; -+ case KEY_ATTR_BODY_MINOR: -+ return "attr body"; -+ case KEY_BODY_MINOR: -+ return "file body"; -+ default: -+ return "unknown"; -+ } -+} -+ -+/* debugging aid: print human readable information about key */ -+void reiser4_print_key(const char *prefix /* prefix to print */ , -+ const reiser4_key * key /* key to print */ ) -+{ -+ /* turn bold on */ -+ /* printf ("\033[1m"); */ -+ if (key == NULL) -+ printk("%s: null key\n", prefix); -+ else { -+ if (REISER4_LARGE_KEY) -+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix, -+ get_key_locality(key), -+ get_key_type(key), -+ get_key_ordering(key), -+ get_key_band(key), -+ get_key_objectid(key), get_key_offset(key)); -+ else -+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix, -+ get_key_locality(key), -+ get_key_type(key), -+ get_key_band(key), -+ get_key_objectid(key), get_key_offset(key)); -+ /* -+ * if this is a key of directory entry, try to decode part of -+ * a name stored in the key, and output it. -+ */ -+ if (get_key_type(key) == KEY_FILE_NAME_MINOR) { -+ char buf[DE_NAME_BUF_LEN]; -+ char *c; -+ -+ c = buf; -+ c = reiser4_unpack_string(get_key_ordering(key), c); -+ reiser4_unpack_string(get_key_fulloid(key), c); -+ printk("[%s", buf); -+ if (is_longname_key(key)) -+ /* -+ * only part of the name is stored in the key. -+ */ -+ printk("...]\n"); -+ else { -+ /* -+ * whole name is stored in the key. 
-+ */ -+ reiser4_unpack_string(get_key_offset(key), buf); -+ printk("%s]\n", buf); -+ } -+ } else { -+ printk("[%s]\n", type_name(get_key_type(key))); -+ } -+ } -+ /* turn bold off */ -+ /* printf ("\033[m\017"); */ -+} -+ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/key.h linux-2.6.20/fs/reiser4/key.h ---- linux-2.6.20.orig/fs/reiser4/key.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/key.h 2007-05-06 14:50:43.738988223 +0400 -@@ -0,0 +1,384 @@ -+/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Declarations of key-related data-structures and operations on keys. */ -+ -+#if !defined( __REISER4_KEY_H__ ) -+#define __REISER4_KEY_H__ -+ -+#include "dformat.h" -+#include "forward.h" -+#include "debug.h" -+ -+#include /* for __u?? */ -+ -+/* Operations on keys in reiser4 tree */ -+ -+/* No access to any of these fields shall be done except via a -+ wrapping macro/function, and that wrapping macro/function shall -+ convert to little endian order. Compare keys will consider cpu byte order. */ -+ -+/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below -+ which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files -+ within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong -+ approach, and whether there should be no difference at all. For current usage patterns this choice is probably the -+ right one. 
*/ -+ -+/* possible values for minor packing locality (4 bits required) */ -+typedef enum { -+ /* file name */ -+ KEY_FILE_NAME_MINOR = 0, -+ /* stat-data */ -+ KEY_SD_MINOR = 1, -+ /* file attribute name */ -+ KEY_ATTR_NAME_MINOR = 2, -+ /* file attribute value */ -+ KEY_ATTR_BODY_MINOR = 3, -+ /* file body (tail or extent) */ -+ KEY_BODY_MINOR = 4, -+} key_minor_locality; -+ -+/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key. -+ Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space, -+ and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to -+ segregate extents from tails, it is better to give them distinct minor packing localities rather than changing -+ block_alloc.c to check the node type when deciding where to allocate the node. -+ -+ The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it -+ should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our -+ current implementation tails have a different minor packing locality from extents, and no files have both extents and -+ tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now.... -+*/ -+ -+/* Arbitrary major packing localities can be assigned to objects using -+ the reiser4(filenameA/..packing<=some_number) system call. -+ -+ In reiser4, the creat() syscall creates a directory -+ -+ whose default flow (that which is referred to if the directory is -+ read as a file) is the traditional unix file body. -+ -+ whose directory plugin is the 'filedir' -+ -+ whose major packing locality is that of the parent of the object created. -+ -+ The static_stat item is a particular commonly used directory -+ compression (the one for normal unix files). 
-+ -+ The filedir plugin checks to see if the static_stat item exists. -+ There is a unique key for static_stat. If yes, then it uses the -+ static_stat item for all of the values that it contains. The -+ static_stat item contains a flag for each stat it contains which -+ indicates whether one should look outside the static_stat item for its -+ contents. -+*/ -+ -+/* offset of fields in reiser4_key. Value of each element of this enum -+ is index within key (thought as array of __u64's) where this field -+ is. */ -+typedef enum { -+ /* major "locale", aka dirid. Sits in 1st element */ -+ KEY_LOCALITY_INDEX = 0, -+ /* minor "locale", aka item type. Sits in 1st element */ -+ KEY_TYPE_INDEX = 0, -+ ON_LARGE_KEY(KEY_ORDERING_INDEX,) -+ /* "object band". Sits in 2nd element */ -+ KEY_BAND_INDEX, -+ /* objectid. Sits in 2nd element */ -+ KEY_OBJECTID_INDEX = KEY_BAND_INDEX, -+ /* full objectid. Sits in 2nd element */ -+ KEY_FULLOID_INDEX = KEY_BAND_INDEX, -+ /* Offset. Sits in 3rd element */ -+ KEY_OFFSET_INDEX, -+ /* Name hash. Sits in 3rd element */ -+ KEY_HASH_INDEX = KEY_OFFSET_INDEX, -+ KEY_CACHELINE_END = KEY_OFFSET_INDEX, -+ KEY_LAST_INDEX -+} reiser4_key_field_index; -+ -+/* key in reiser4 internal "balanced" tree. It is just array of three -+ 64bit integers in disk byte order (little-endian by default). This -+ array is actually indexed by reiser4_key_field. Each __u64 within -+ this array is called "element". Logical key component encoded within -+ elements are called "fields". -+ -+ We declare this as union with second component dummy to suppress -+ inconvenient array<->pointer casts implied in C. */ -+union reiser4_key { -+ __le64 el[KEY_LAST_INDEX]; -+ int pad; -+}; -+ -+/* bitmasks showing where within reiser4_key particular key is stored. 
*/ -+/* major locality occupies higher 60 bits of the first element */ -+#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull -+ -+/* minor locality occupies lower 4 bits of the first element */ -+#define KEY_TYPE_MASK 0xfull -+ -+/* controversial band occupies higher 4 bits of the 2nd element */ -+#define KEY_BAND_MASK 0xf000000000000000ull -+ -+/* objectid occupies lower 60 bits of the 2nd element */ -+#define KEY_OBJECTID_MASK 0x0fffffffffffffffull -+ -+/* full 64bit objectid*/ -+#define KEY_FULLOID_MASK 0xffffffffffffffffull -+ -+/* offset is just 3rd L.M.Nt itself */ -+#define KEY_OFFSET_MASK 0xffffffffffffffffull -+ -+/* ordering is whole second element */ -+#define KEY_ORDERING_MASK 0xffffffffffffffffull -+ -+/* how many bits key element should be shifted to left to get particular field */ -+typedef enum { -+ KEY_LOCALITY_SHIFT = 4, -+ KEY_TYPE_SHIFT = 0, -+ KEY_BAND_SHIFT = 60, -+ KEY_OBJECTID_SHIFT = 0, -+ KEY_FULLOID_SHIFT = 0, -+ KEY_OFFSET_SHIFT = 0, -+ KEY_ORDERING_SHIFT = 0, -+} reiser4_key_field_shift; -+ -+static inline __u64 -+get_key_el(const reiser4_key * key, reiser4_key_field_index off) -+{ -+ assert("nikita-753", key != NULL); -+ assert("nikita-754", off < KEY_LAST_INDEX); -+ return le64_to_cpu(get_unaligned(&key->el[off])); -+} -+ -+static inline void -+set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value) -+{ -+ assert("nikita-755", key != NULL); -+ assert("nikita-756", off < KEY_LAST_INDEX); -+ put_unaligned(cpu_to_le64(value), &key->el[off]); -+} -+ -+/* macro to define getter and setter functions for field F with type T */ -+#define DEFINE_KEY_FIELD( L, U, T ) \ -+static inline T get_key_ ## L ( const reiser4_key *key ) \ -+{ \ -+ assert( "nikita-750", key != NULL ); \ -+ return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \ -+ KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \ -+} \ -+ \ -+static inline void set_key_ ## L ( reiser4_key *key, T loc ) \ -+{ \ -+ __u64 el; \ -+ \ -+ assert( "nikita-752", key != NULL ); \ -+ \ 
-+ el = get_key_el( key, KEY_ ## U ## _INDEX ); \ -+ /* clear field bits in the key */ \ -+ el &= ~KEY_ ## U ## _MASK; \ -+ /* actually it should be \ -+ \ -+ el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \ -+ \ -+ but we trust user to never pass values that wouldn't fit \ -+ into field. Clearing extra bits is one operation, but this \ -+ function is time-critical. \ -+ But check this in assertion. */ \ -+ assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \ -+ ~KEY_ ## U ## _MASK ) == 0 ); \ -+ el |= ( loc << KEY_ ## U ## _SHIFT ); \ -+ set_key_el( key, KEY_ ## U ## _INDEX, el ); \ -+} -+ -+typedef __u64 oid_t; -+ -+/* define get_key_locality(), set_key_locality() */ -+DEFINE_KEY_FIELD(locality, LOCALITY, oid_t); -+/* define get_key_type(), set_key_type() */ -+DEFINE_KEY_FIELD(type, TYPE, key_minor_locality); -+/* define get_key_band(), set_key_band() */ -+DEFINE_KEY_FIELD(band, BAND, __u64); -+/* define get_key_objectid(), set_key_objectid() */ -+DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t); -+/* define get_key_fulloid(), set_key_fulloid() */ -+DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t); -+/* define get_key_offset(), set_key_offset() */ -+DEFINE_KEY_FIELD(offset, OFFSET, __u64); -+#if (REISER4_LARGE_KEY) -+/* define get_key_ordering(), set_key_ordering() */ -+DEFINE_KEY_FIELD(ordering, ORDERING, __u64); -+#else -+static inline __u64 get_key_ordering(const reiser4_key * key) -+{ -+ return 0; -+} -+ -+static inline void set_key_ordering(reiser4_key * key, __u64 val) -+{ -+} -+#endif -+ -+/* key comparison result */ -+typedef enum { LESS_THAN = -1, /* if first key is less than second */ -+ EQUAL_TO = 0, /* if keys are equal */ -+ GREATER_THAN = +1 /* if first key is greater than second */ -+} cmp_t; -+ -+void reiser4_key_init(reiser4_key * key); -+ -+/* minimal possible key in the tree. Return pointer to the static storage. 
*/ -+extern const reiser4_key *reiser4_min_key(void); -+extern const reiser4_key *reiser4_max_key(void); -+ -+/* helper macro for keycmp() */ -+#define KEY_DIFF(k1, k2, field) \ -+({ \ -+ typeof (get_key_ ## field (k1)) f1; \ -+ typeof (get_key_ ## field (k2)) f2; \ -+ \ -+ f1 = get_key_ ## field (k1); \ -+ f2 = get_key_ ## field (k2); \ -+ \ -+ (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \ -+}) -+ -+/* helper macro for keycmp() */ -+#define KEY_DIFF_EL(k1, k2, off) \ -+({ \ -+ __u64 e1; \ -+ __u64 e2; \ -+ \ -+ e1 = get_key_el(k1, off); \ -+ e2 = get_key_el(k2, off); \ -+ \ -+ (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \ -+}) -+ -+/* compare `k1' and `k2'. This function is a heart of "key allocation -+ policy". All you need to implement new policy is to add yet another -+ clause here. */ -+static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2 /* second key to compare */ ) -+{ -+ cmp_t result; -+ -+ /* -+ * This function is the heart of reiser4 tree-routines. Key comparison -+ * is among most heavily used operations in the file system. -+ */ -+ -+ assert("nikita-439", k1 != NULL); -+ assert("nikita-440", k2 != NULL); -+ -+ /* there is no actual branch here: condition is compile time constant -+ * and constant folding and propagation ensures that only one branch -+ * is actually compiled in. */ -+ -+ if (REISER4_PLANA_KEY_ALLOCATION) { -+ /* if physical order of fields in a key is identical -+ with logical order, we can implement key comparison -+ as three 64bit comparisons. */ -+ /* logical order of fields in plan-a: -+ locality->type->objectid->offset. 
*/ -+ /* compare locality and type at once */ -+ result = KEY_DIFF_EL(k1, k2, 0); -+ if (result == EQUAL_TO) { -+ /* compare objectid (and band if it's there) */ -+ result = KEY_DIFF_EL(k1, k2, 1); -+ /* compare offset */ -+ if (result == EQUAL_TO) { -+ result = KEY_DIFF_EL(k1, k2, 2); -+ if (REISER4_LARGE_KEY && result == EQUAL_TO) { -+ result = KEY_DIFF_EL(k1, k2, 3); -+ } -+ } -+ } -+ } else if (REISER4_3_5_KEY_ALLOCATION) { -+ result = KEY_DIFF(k1, k2, locality); -+ if (result == EQUAL_TO) { -+ result = KEY_DIFF(k1, k2, objectid); -+ if (result == EQUAL_TO) { -+ result = KEY_DIFF(k1, k2, type); -+ if (result == EQUAL_TO) -+ result = KEY_DIFF(k1, k2, offset); -+ } -+ } -+ } else -+ impossible("nikita-441", "Unknown key allocation scheme!"); -+ return result; -+} -+ -+/* true if @k1 equals @k2 */ -+static inline int keyeq(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2 /* second key to compare */ ) -+{ -+ assert("nikita-1879", k1 != NULL); -+ assert("nikita-1880", k2 != NULL); -+ return !memcmp(k1, k2, sizeof *k1); -+} -+ -+/* true if @k1 is less than @k2 */ -+static inline int keylt(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2 /* second key to compare */ ) -+{ -+ assert("nikita-1952", k1 != NULL); -+ assert("nikita-1953", k2 != NULL); -+ return keycmp(k1, k2) == LESS_THAN; -+} -+ -+/* true if @k1 is less than or equal to @k2 */ -+static inline int keyle(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2 /* second key to compare */ ) -+{ -+ assert("nikita-1954", k1 != NULL); -+ assert("nikita-1955", k2 != NULL); -+ return keycmp(k1, k2) != GREATER_THAN; -+} -+ -+/* true if @k1 is greater than @k2 */ -+static inline int keygt(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2 /* second key to compare */ ) -+{ -+ assert("nikita-1959", k1 != NULL); -+ assert("nikita-1960", k2 != NULL); -+ return keycmp(k1, k2) == GREATER_THAN; -+} -+ -+/* true if @k1 
is greater than or equal to @k2 */ -+static inline int keyge(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2 /* second key to compare */ ) -+{ -+ assert("nikita-1956", k1 != NULL); -+ assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched -+ * November 3: Laika */ -+ return keycmp(k1, k2) != LESS_THAN; -+} -+ -+static inline void prefetchkey(reiser4_key * key) -+{ -+ prefetch(key); -+ prefetch(&key->el[KEY_CACHELINE_END]); -+} -+ -+/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) = -+ 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */ -+/* size of a buffer suitable to hold human readable key representation */ -+#define KEY_BUF_LEN (80) -+ -+#if REISER4_DEBUG -+extern void reiser4_print_key(const char *prefix, const reiser4_key * key); -+#else -+#define reiser4_print_key(p,k) noop -+#endif -+ -+/* __FS_REISERFS_KEY_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/ktxnmgrd.c linux-2.6.20/fs/reiser4/ktxnmgrd.c ---- linux-2.6.20.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/ktxnmgrd.c 2007-05-06 14:50:43.738988223 +0400 -@@ -0,0 +1,215 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* Transaction manager daemon. */ -+ -+/* -+ * ktxnmgrd is a kernel daemon responsible for committing transactions. It is -+ * needed/important for the following reasons: -+ * -+ * 1. in reiser4 atom is not committed immediately when last transaction -+ * handle closes, unless atom is either too old or too large (see -+ * atom_should_commit()). This is done to avoid committing too frequently. -+ * because: -+ * -+ * 2. sometimes we don't want to commit atom when closing last transaction -+ * handle even if it is old and fat enough. 
For example, because we are at -+ * this point under directory semaphore, and committing would stall all -+ * accesses to this directory. -+ * -+ * ktxnmgrd binds its time sleeping on condition variable. When is awakes -+ * either due to (tunable) timeout or because it was explicitly woken up by -+ * call to ktxnmgrd_kick(), it scans list of all atoms and commits ones -+ * eligible. -+ * -+ */ -+ -+#include "debug.h" -+#include "txnmgr.h" -+#include "tree.h" -+#include "ktxnmgrd.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include /* for struct task_struct */ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static int scan_mgr(struct super_block *); -+ -+/* -+ * change current->comm so that ps, top, and friends will see changed -+ * state. This serves no useful purpose whatsoever, but also costs nothing. May -+ * be it will make lonely system administrator feeling less alone at 3 A.M. -+ */ -+#define set_comm( state ) \ -+ snprintf( current -> comm, sizeof( current -> comm ), \ -+ "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) ) -+ -+/** -+ * ktxnmgrd - kernel txnmgr daemon -+ * @arg: pointer to super block -+ * -+ * The background transaction manager daemon, started as a kernel thread during -+ * reiser4 initialization. -+ */ -+static int ktxnmgrd(void *arg) -+{ -+ struct super_block *super; -+ ktxnmgrd_context *ctx; -+ txn_mgr *mgr; -+ int done = 0; -+ -+ super = arg; -+ mgr = &get_super_private(super)->tmgr; -+ -+ /* -+ * do_fork() just copies task_struct into the new thread. ->fs_context -+ * shouldn't be copied of course. This shouldn't be a problem for the -+ * rest of the code though. 
-+ */ -+ current->journal_info = NULL; -+ ctx = mgr->daemon; -+ while (1) { -+ try_to_freeze(); -+ set_comm("wait"); -+ { -+ DEFINE_WAIT(__wait); -+ -+ prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE); -+ if (kthread_should_stop()) { -+ done = 1; -+ } else -+ schedule_timeout(ctx->timeout); -+ finish_wait(&ctx->wait, &__wait); -+ } -+ if (done) -+ break; -+ set_comm("run"); -+ spin_lock(&ctx->guard); -+ /* -+ * wait timed out or ktxnmgrd was woken up by explicit request -+ * to commit something. Scan list of atoms in txnmgr and look -+ * for too old atoms. -+ */ -+ do { -+ ctx->rescan = 0; -+ scan_mgr(super); -+ spin_lock(&ctx->guard); -+ if (ctx->rescan) { -+ /* -+ * the list could be modified while ctx -+ * spinlock was released, we have to repeat -+ * scanning from the beginning -+ */ -+ break; -+ } -+ } while (ctx->rescan); -+ spin_unlock(&ctx->guard); -+ } -+ return 0; -+} -+ -+#undef set_comm -+ -+/** -+ * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon -+ * @super: pointer to super block -+ * -+ * Allocates and initializes ktxnmgrd_context, attaches it to transaction -+ * manager. Starts kernel txnmgr daemon. This is called on mount. 
-+ */ -+int reiser4_init_ktxnmgrd(struct super_block *super) -+{ -+ txn_mgr *mgr; -+ ktxnmgrd_context *ctx; -+ -+ mgr = &get_super_private(super)->tmgr; -+ -+ assert("zam-1014", mgr->daemon == NULL); -+ -+ ctx = kmalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get()); -+ if (ctx == NULL) -+ return RETERR(-ENOMEM); -+ -+ assert("nikita-2442", ctx != NULL); -+ -+ memset(ctx, 0, sizeof *ctx); -+ init_waitqueue_head(&ctx->wait); -+ -+ /*kcond_init(&ctx->startup);*/ -+ spin_lock_init(&ctx->guard); -+ ctx->timeout = REISER4_TXNMGR_TIMEOUT; -+ ctx->rescan = 1; -+ mgr->daemon = ctx; -+ -+ ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd"); -+ if (IS_ERR(ctx->tsk)) { -+ int ret = PTR_ERR(ctx->tsk); -+ mgr->daemon = NULL; -+ kfree(ctx); -+ return RETERR(ret); -+ } -+ return 0; -+} -+ -+void ktxnmgrd_kick(txn_mgr *mgr) -+{ -+ assert("nikita-3234", mgr != NULL); -+ assert("nikita-3235", mgr->daemon != NULL); -+ wake_up(&mgr->daemon->wait); -+} -+ -+int is_current_ktxnmgrd(void) -+{ -+ return (get_current_super_private()->tmgr.daemon->tsk == current); -+} -+ -+/** -+ * scan_mgr - commit atoms which are to be committed -+ * @super: super block to commit atoms of -+ * -+ * Commits old atoms. -+ */ -+static int scan_mgr(struct super_block *super) -+{ -+ int ret; -+ reiser4_context ctx; -+ -+ init_stack_context(&ctx, super); -+ -+ ret = commit_some_atoms(&get_super_private(super)->tmgr); -+ -+ reiser4_exit_context(&ctx); -+ return ret; -+} -+ -+/** -+ * reiser4_done_ktxnmgrd - stop kernel thread and frees ktxnmgrd context -+ * @mgr: -+ * -+ * This is called on umount. 
Stops ktxnmgrd and free t -+ */ -+void reiser4_done_ktxnmgrd(struct super_block *super) -+{ -+ txn_mgr *mgr; -+ -+ mgr = &get_super_private(super)->tmgr; -+ assert("zam-1012", mgr->daemon != NULL); -+ -+ kthread_stop(mgr->daemon->tsk); -+ kfree(mgr->daemon); -+ mgr->daemon = NULL; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/ktxnmgrd.h linux-2.6.20/fs/reiser4/ktxnmgrd.h ---- linux-2.6.20.orig/fs/reiser4/ktxnmgrd.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/ktxnmgrd.h 2007-05-06 14:50:43.738988223 +0400 -@@ -0,0 +1,52 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Transaction manager daemon. See ktxnmgrd.c for comments. */ -+ -+#ifndef __KTXNMGRD_H__ -+#define __KTXNMGRD_H__ -+ -+#include "txnmgr.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include /* for struct task_struct */ -+ -+/* in this structure all data necessary to start up, shut down and communicate -+ * with ktxnmgrd are kept. */ -+struct ktxnmgrd_context { -+ /* wait queue head on which ktxnmgrd sleeps */ -+ wait_queue_head_t wait; -+ /* spin lock protecting all fields of this structure */ -+ spinlock_t guard; -+ /* timeout of sleeping on ->wait */ -+ signed long timeout; -+ /* kernel thread running ktxnmgrd */ -+ struct task_struct *tsk; -+ /* list of all file systems served by this ktxnmgrd */ -+ struct list_head queue; -+ /* should ktxnmgrd repeat scanning of atoms? */ -+ unsigned int rescan:1; -+}; -+ -+extern int reiser4_init_ktxnmgrd(struct super_block *); -+extern void reiser4_done_ktxnmgrd(struct super_block *); -+ -+extern void ktxnmgrd_kick(txn_mgr * mgr); -+extern int is_current_ktxnmgrd(void); -+ -+/* __KTXNMGRD_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/lock.c linux-2.6.20/fs/reiser4/lock.c ---- linux-2.6.20.orig/fs/reiser4/lock.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/lock.c 2007-05-06 14:50:43.742989473 +0400 -@@ -0,0 +1,1232 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Traditional deadlock avoidance is achieved by acquiring all locks in a single -+ order. V4 balances the tree from the bottom up, and searches the tree from -+ the top down, and that is really the way we want it, so tradition won't work -+ for us. -+ -+ Instead we have two lock orderings, a high priority lock ordering, and a low -+ priority lock ordering. Each node in the tree has a lock in its znode. -+ -+ Suppose we have a set of processes which lock (R/W) tree nodes. Each process -+ has a set (maybe empty) of already locked nodes ("process locked set"). Each -+ process may have a pending lock request to a node locked by another process. -+ Note: we lock and unlock, but do not transfer locks: it is possible -+ transferring locks instead would save some bus locking.... -+ -+ Deadlock occurs when we have a loop constructed from process locked sets and -+ lock request vectors. -+ -+ NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in -+ memory is extended with "znodes" with which we connect nodes with their left -+ and right neighbors using sibling pointers stored in the znodes. When we -+ perform balancing operations we often go from left to right and from right to -+ left. 
-+ -+ +-P1-+ +-P3-+ -+ |+--+| V1 |+--+| -+ ||N1|| -------> ||N3|| -+ |+--+| |+--+| -+ +----+ +----+ -+ ^ | -+ |V2 |V3 -+ | v -+ +---------P2---------+ -+ |+--+ +--+| -+ ||N2| -------- |N4|| -+ |+--+ +--+| -+ +--------------------+ -+ -+ We solve this by ensuring that only low priority processes lock in top to -+ bottom order and from right to left, and high priority processes lock from -+ bottom to top and left to right. -+ -+ ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and -+ kill those damn busy loops. -+ ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom -+ stage) cannot be ordered that way. There are no rules what nodes can belong -+ to the atom and what nodes cannot. We cannot define what is right or left -+ direction, what is top or bottom. We can take immediate parent or side -+ neighbor of one node, but nobody guarantees that, say, left neighbor node is -+ not a far right neighbor for other nodes from the same atom. It breaks -+ deadlock avoidance rules and hi-low priority locking cannot be applied for -+ atom locks. -+ -+ How does it help to avoid deadlocks ? -+ -+ Suppose we have a deadlock with n processes. Processes from one priority -+ class never deadlock because they take locks in one consistent -+ order. -+ -+ So, any possible deadlock loop must have low priority as well as high -+ priority processes. There are no other lock priority levels except low and -+ high. We know that any deadlock loop contains at least one node locked by a -+ low priority process and requested by a high priority process. If this -+ situation is caught and resolved it is sufficient to avoid deadlocks. -+ -+ V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION. -+ -+ The deadlock prevention algorithm is based on comparing -+ priorities of node owners (processes which keep znode locked) and -+ requesters (processes which want to acquire a lock on znode). 
We -+ implement a scheme where low-priority owners yield locks to -+ high-priority requesters. We created a signal passing system that -+ is used to ask low-priority processes to yield one or more locked -+ znodes. -+ -+ The condition when a znode needs to change its owners is described by the -+ following formula: -+ -+ ############################################# -+ # # -+ # (number of high-priority requesters) > 0 # -+ # AND # -+ # (number of high-priority owners) == 0 # -+ # # -+ ############################################# -+ -+ Note that a low-priority process delays node releasing if another -+ high-priority process owns this node. So, slightly more strictly speaking, -+ to have a deadlock capable cycle you must have a loop in which a high -+ priority process is waiting on a low priority process to yield a node, which -+ is slightly different from saying a high priority process is waiting on a -+ node owned by a low priority process. -+ -+ It is enough to avoid deadlocks if we prevent any low-priority process from -+ falling asleep if its locked set contains a node which satisfies the -+ deadlock condition. -+ -+ That condition is implicitly or explicitly checked in all places where new -+ high-priority requests may be added or removed from node request queue or -+ high-priority process takes or releases a lock on node. The main -+ goal of these checks is to never lose the moment when node becomes "has -+ wrong owners" and send "must-yield-this-lock" signals to its low-pri owners -+ at that time. -+ -+ The information about received signals is stored in the per-process -+ structure (lock stack) and analyzed before a low-priority process goes to -+ sleep but after a "fast" attempt to lock a node fails. Any signal wakes -+ the sleeping process up and forces it to re-check lock status and received -+ signal info. If "must-yield-this-lock" signals were received the locking -+ primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
-+ -+ V4 LOCKING DRAWBACKS -+ -+ If we have already balanced on one level, and we are propagating our changes -+ upward to a higher level, it could be very messy to surrender all locks on -+ the lower level because we put so much computational work into it, and -+ reverting them to their state before they were locked might be very complex. -+ We also don't want to acquire all locks before performing balancing because -+ that would either be almost as much work as the balancing, or it would be -+ too conservative and lock too much. We want balancing to be done only at -+ high priority. Yet, we might want to go to the left one node and use some -+ of its empty space... So we make one attempt at getting the node to the left -+ using try_lock, and if it fails we do without it, because we didn't really -+ need it, it was only a nice to have. -+ -+ LOCK STRUCTURES DESCRIPTION -+ -+ The following data structures are used in the reiser4 locking -+ implementation: -+ -+ All fields related to long-term locking are stored in znode->lock. -+ -+ The lock stack is a per thread object. It owns all znodes locked by the -+ thread. One znode may be locked by several threads in case of read lock or -+ one znode may be write locked by one thread several times. The special link -+ objects (lock handles) support n<->m relation between znodes and lock -+ owners. -+ -+ -+ -+ +---------+ +---------+ -+ | LS1 | | LS2 | -+ +---------+ +---------+ -+ ^ ^ -+ |---------------+ +----------+ -+ v v v v -+ +---------+ +---------+ +---------+ +---------+ -+ | LH1 | | LH2 | | LH3 | | LH4 | -+ +---------+ +---------+ +---------+ +---------+ -+ ^ ^ ^ ^ -+ | +------------+ | -+ v v v -+ +---------+ +---------+ +---------+ -+ | Z1 | | Z2 | | Z3 | -+ +---------+ +---------+ +---------+ -+ -+ Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. 
The -+ picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and -+ LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode -+ Z1 is locked by only one thread, znode has only one lock handle LH1 on its -+ list, similar situation is for Z3 which is locked by the thread 2 only. Z2 -+ is locked (for read) twice by different threads and two lock handles are on -+ its list. Each lock handle represents a single relation of a locking of a -+ znode by a thread. Locking of a znode is an establishing of a locking -+ relation between the lock stack and the znode by adding of a new lock handle -+ to a list of lock handles, the lock stack. The lock stack links all lock -+ handles for all znodes locked by the lock stack. The znode list groups all -+ lock handles for all lock stacks which locked the znode. -+ -+ Yet another relation may exist between znode and lock owners. If lock -+ procedure cannot immediately take lock on an object it adds the lock owner -+ to the special `requestors' list belonging to the znode. That list represents a -+ queue of pending lock requests. Because one lock owner may request -+ only one lock object at a time, it is a 1->n relation between lock objects -+ and a lock owner implemented as it is described above. Full information -+ (priority, pointers to lock and link objects) about each lock request is -+ stored in lock owner structure in `request' field. -+ -+ SHORT_TERM LOCKING -+ -+ This is a list of primitive operations over lock stacks / lock handles / -+ znodes and locking descriptions for them. -+ -+ 1. locking / unlocking which is done by two list insertions/deletions, one -+ to/from znode's list of lock handles, another one is to/from lock stack's -+ list of lock handles. The first insertion is protected by -+ znode->lock.guard spinlock. The list owned by the lock stack can be -+ modified only by the thread that owns the lock stack and nobody else can -+ modify/read it.
There is nothing to be protected by a spinlock or -+ something else. -+ -+ 2. adding/removing a lock request to/from znode requesters list. The rule is -+ that znode->lock.guard spinlock should be taken for this. -+ -+ 3. we can traverse list of lock handles and use references to lock stacks that -+ locked the given znode if znode->lock.guard spinlock is taken. -+ -+ 4. If a lock stack is associated with a znode as a lock requestor or lock -+ owner its existence is guaranteed by znode->lock.guard spinlock. Some of its -+ (lock stack's) fields should be protected from being accessed in parallel -+ by two or more threads. Please look at lock_stack structure definition -+ for information on how those fields are protected. */ -+ -+/* Znode lock and capturing intertwining. */ -+/* In current implementation we capture formatted nodes before locking -+ them. Take a look at longterm_lock_znode(): the reiser4_try_capture() request -+ precedes locking requests. The longterm_lock_znode function unconditionally -+ captures znode before even checking the locking conditions. -+ -+ Another variant is to capture znode after locking it. It was not tested, but -+ at least one deadlock condition is supposed to be there. One thread has -+ locked a znode (Node-1) and calls reiser4_try_capture() for it. -+ reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state. -+ Second thread is a flushing thread, its current atom is the atom Node-1 -+ belongs to. Second thread wants to lock Node-1 and sleeps because Node-1 -+ is locked by the first thread. The described situation is a deadlock.
*/ -+ -+#include "debug.h" -+#include "txnmgr.h" -+#include "znode.h" -+#include "jnode.h" -+#include "tree.h" -+#include "plugin/node/node.h" -+#include "super.h" -+ -+#include -+ -+#if REISER4_DEBUG -+static int request_is_deadlock_safe(znode *, znode_lock_mode, -+ znode_lock_request); -+#endif -+ -+/* Returns a lock owner associated with current thread */ -+lock_stack *get_current_lock_stack(void) -+{ -+ return &get_current_context()->stack; -+} -+ -+/* Wakes up all low priority owners informing them about possible deadlock */ -+static void wake_up_all_lopri_owners(znode * node) -+{ -+ lock_handle *handle; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ list_for_each_entry(handle, &node->lock.owners, owners_link) { -+ assert("nikita-1832", handle->node == node); -+ /* count this signal in owner->nr_signaled */ -+ if (!handle->signaled) { -+ handle->signaled = 1; -+ atomic_inc(&handle->owner->nr_signaled); -+ /* Wake up a single process */ -+ reiser4_wake_up(handle->owner); -+ } -+ } -+} -+ -+/* Adds a lock to a lock owner, which means creating a link to the lock and -+ putting the link into the two lists all links are on (the doubly linked list -+ that forms the lock_stack, and the doubly linked list of links attached -+ to a lock. 
-+*/ -+static inline void -+link_object(lock_handle * handle, lock_stack * owner, znode * node) -+{ -+ assert("jmacd-810", handle->owner == NULL); -+ assert_spin_locked(&(node->lock.guard)); -+ -+ handle->owner = owner; -+ handle->node = node; -+ -+ assert("reiser4-4", -+ ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0)); -+ -+ /* add lock handle to the end of lock_stack's list of locks */ -+ list_add_tail(&handle->locks_link, &owner->locks); -+ ON_DEBUG(owner->nr_locks++); -+ reiser4_ctx_gfp_mask_set(); -+ -+ /* add lock handle to the head of znode's list of owners */ -+ list_add(&handle->owners_link, &node->lock.owners); -+ handle->signaled = 0; -+} -+ -+/* Breaks a relation between a lock and its owner */ -+static inline void unlink_object(lock_handle * handle) -+{ -+ assert("zam-354", handle->owner != NULL); -+ assert("nikita-1608", handle->node != NULL); -+ assert_spin_locked(&(handle->node->lock.guard)); -+ assert("nikita-1829", handle->owner == get_current_lock_stack()); -+ assert("reiser4-5", handle->owner->nr_locks > 0); -+ -+ /* remove lock handle from lock_stack's list of locks */ -+ list_del(&handle->locks_link); -+ ON_DEBUG(handle->owner->nr_locks--); -+ reiser4_ctx_gfp_mask_set(); -+ assert("reiser4-6", -+ ergo(list_empty_careful(&handle->owner->locks), -+ handle->owner->nr_locks == 0)); -+ /* remove lock handle from znode's list of owners */ -+ list_del(&handle->owners_link); -+ /* indicates that lock handle is free now */ -+ handle->node = NULL; -+#if REISER4_DEBUG -+ INIT_LIST_HEAD(&handle->locks_link); -+ INIT_LIST_HEAD(&handle->owners_link); -+ handle->owner = NULL; -+#endif -+} -+ -+/* Actually locks an object knowing that we are able to do this */ -+static void lock_object(lock_stack * owner) -+{ -+ lock_request *request; -+ znode *node; -+ -+ request = &owner->request; -+ node = request->node; -+ assert_spin_locked(&(node->lock.guard)); -+ if (request->mode == ZNODE_READ_LOCK) { -+ node->lock.nr_readers++; -+ } else { -+ /* check 
that we don't switched from read to write lock */ -+ assert("nikita-1840", node->lock.nr_readers <= 0); -+ /* We allow recursive locking; a node can be locked several -+ times for write by same process */ -+ node->lock.nr_readers--; -+ } -+ -+ link_object(request->handle, owner, node); -+ -+ if (owner->curpri) { -+ node->lock.nr_hipri_owners++; -+ } -+} -+ -+/* Check for recursive write locking */ -+static int recursive(lock_stack * owner) -+{ -+ int ret; -+ znode *node; -+ lock_handle *lh; -+ -+ node = owner->request.node; -+ -+ /* Owners list is not empty for a locked node */ -+ assert("zam-314", !list_empty_careful(&node->lock.owners)); -+ assert("nikita-1841", owner == get_current_lock_stack()); -+ assert_spin_locked(&(node->lock.guard)); -+ -+ lh = list_entry(node->lock.owners.next, lock_handle, owners_link); -+ ret = (lh->owner == owner); -+ -+ /* Recursive read locking should be done usual way */ -+ assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK); -+ /* mixing of read/write locks is not allowed */ -+ assert("zam-341", !ret || znode_is_wlocked(node)); -+ -+ return ret; -+} -+ -+#if REISER4_DEBUG -+/* Returns true if the lock is held by the calling thread. */ -+int znode_is_any_locked(const znode * node) -+{ -+ lock_handle *handle; -+ lock_stack *stack; -+ int ret; -+ -+ if (!znode_is_locked(node)) { -+ return 0; -+ } -+ -+ stack = get_current_lock_stack(); -+ -+ spin_lock_stack(stack); -+ -+ ret = 0; -+ -+ list_for_each_entry(handle, &stack->locks, locks_link) { -+ if (handle->node == node) { -+ ret = 1; -+ break; -+ } -+ } -+ -+ spin_unlock_stack(stack); -+ -+ return ret; -+} -+ -+#endif -+ -+/* Returns true if a write lock is held by the calling thread. 
*/ -+int znode_is_write_locked(const znode * node) -+{ -+ lock_stack *stack; -+ lock_handle *handle; -+ -+ assert("jmacd-8765", node != NULL); -+ -+ if (!znode_is_wlocked(node)) { -+ return 0; -+ } -+ -+ stack = get_current_lock_stack(); -+ -+ /* -+ * When znode is write locked, all owner handles point to the same lock -+ * stack. Get pointer to lock stack from the first lock handle from -+ * znode's owner list -+ */ -+ handle = list_entry(node->lock.owners.next, lock_handle, owners_link); -+ -+ return (handle->owner == stack); -+} -+ -+/* This "deadlock" condition is the essential part of reiser4 locking -+ implementation. This condition is checked explicitly by calling -+ check_deadlock_condition() or implicitly in all places where znode lock -+ state (set of owners and request queue) is changed. Locking code is -+ designed to use this condition to trigger procedure of passing object from -+ low priority owner(s) to high priority one(s). -+ -+ The procedure results in passing an event (setting lock_handle->signaled -+ flag) and counting this event in nr_signaled field of owner's lock stack -+ object and wakeup owner's process. -+*/ -+static inline int check_deadlock_condition(znode * node) -+{ -+ assert_spin_locked(&(node->lock.guard)); -+ return node->lock.nr_hipri_requests > 0 -+ && node->lock.nr_hipri_owners == 0; -+} -+ -+static int check_livelock_condition(znode * node, znode_lock_mode mode) -+{ -+ zlock * lock = &node->lock; -+ -+ return mode == ZNODE_READ_LOCK && -+ lock -> nr_readers >= 0 && lock->nr_hipri_write_requests > 0; -+} -+ -+/* checks lock/request compatibility */ -+static int can_lock_object(lock_stack * owner) -+{ -+ znode *node = owner->request.node; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ -+ /* See if the node is disconnected. 
*/ -+ if (unlikely(ZF_ISSET(node, JNODE_IS_DYING))) -+ return RETERR(-EINVAL); -+ -+ /* Do not ever try to take a lock if we are going in low priority -+ direction and a node have a high priority request without high -+ priority owners. */ -+ if (unlikely(!owner->curpri && check_deadlock_condition(node))) -+ return RETERR(-E_REPEAT); -+ if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode))) -+ return RETERR(-E_REPEAT); -+ if (unlikely(!is_lock_compatible(node, owner->request.mode))) -+ return RETERR(-E_REPEAT); -+ return 0; -+} -+ -+/* Setting of a high priority to the process. It clears "signaled" flags -+ because znode locked by high-priority process can't satisfy our "deadlock -+ condition". */ -+static void set_high_priority(lock_stack * owner) -+{ -+ assert("nikita-1846", owner == get_current_lock_stack()); -+ /* Do nothing if current priority is already high */ -+ if (!owner->curpri) { -+ /* We don't need locking for owner->locks list, because, this -+ * function is only called with the lock stack of the current -+ * thread, and no other thread can play with owner->locks list -+ * and/or change ->node pointers of lock handles in this list. -+ * -+ * (Interrupts also are not involved.) -+ */ -+ lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link); -+ while (&owner->locks != &item->locks_link) { -+ znode *node = item->node; -+ -+ spin_lock_zlock(&node->lock); -+ -+ node->lock.nr_hipri_owners++; -+ -+ /* we can safely set signaled to zero, because -+ previous statement (nr_hipri_owners ++) guarantees -+ that signaled will be never set again. */ -+ item->signaled = 0; -+ spin_unlock_zlock(&node->lock); -+ -+ item = list_entry(item->locks_link.next, lock_handle, locks_link); -+ } -+ owner->curpri = 1; -+ atomic_set(&owner->nr_signaled, 0); -+ } -+} -+ -+/* Sets a low priority to the process. 
*/ -+static void set_low_priority(lock_stack * owner) -+{ -+ assert("nikita-3075", owner == get_current_lock_stack()); -+ /* Do nothing if current priority is already low */ -+ if (owner->curpri) { -+ /* scan all locks (lock handles) held by @owner, which is -+ actually current thread, and check whether we are reaching -+ deadlock possibility anywhere. -+ */ -+ lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link); -+ while (&owner->locks != &handle->locks_link) { -+ znode *node = handle->node; -+ spin_lock_zlock(&node->lock); -+ /* this thread just was hipri owner of @node, so -+ nr_hipri_owners has to be greater than zero. */ -+ assert("nikita-1835", node->lock.nr_hipri_owners > 0); -+ node->lock.nr_hipri_owners--; -+ /* If we have deadlock condition, adjust a nr_signaled -+ field. It is enough to set "signaled" flag only for -+ current process, other low-pri owners will be -+ signaled and waken up after current process unlocks -+ this object and any high-priority requestor takes -+ control. 
*/ -+ if (check_deadlock_condition(node) -+ && !handle->signaled) { -+ handle->signaled = 1; -+ atomic_inc(&owner->nr_signaled); -+ } -+ spin_unlock_zlock(&node->lock); -+ handle = list_entry(handle->locks_link.next, lock_handle, locks_link); -+ } -+ owner->curpri = 0; -+ } -+} -+ -+static void remove_lock_request(lock_stack * requestor) -+{ -+ zlock * lock = &requestor->request.node->lock; -+ -+ if (requestor->curpri) { -+ assert("nikita-1838", lock->nr_hipri_requests > 0); -+ lock->nr_hipri_requests--; -+ if (requestor->request.mode == ZNODE_WRITE_LOCK) -+ lock->nr_hipri_write_requests --; -+ } -+ list_del(&requestor->requestors_link); -+} -+ -+static void invalidate_all_lock_requests(znode * node) -+{ -+ lock_stack *requestor, *tmp; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ -+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) { -+ remove_lock_request(requestor); -+ requestor->request.ret_code = -EINVAL; -+ reiser4_wake_up(requestor); -+ requestor->request.mode = ZNODE_NO_LOCK; -+ } -+} -+ -+static void dispatch_lock_requests(znode * node) -+{ -+ lock_stack *requestor, *tmp; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ -+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) { -+ if (znode_is_write_locked(node)) -+ break; -+ if (!can_lock_object(requestor)) { -+ lock_object(requestor); -+ remove_lock_request(requestor); -+ requestor->request.ret_code = 0; -+ reiser4_wake_up(requestor); -+ requestor->request.mode = ZNODE_NO_LOCK; -+ } -+ } -+} -+ -+/* release long-term lock, acquired by longterm_lock_znode() */ -+void longterm_unlock_znode(lock_handle * handle) -+{ -+ znode *node = handle->node; -+ lock_stack *oldowner = handle->owner; -+ int hipri; -+ int readers; -+ int rdelta; -+ int youdie; -+ -+ /* -+ * this is time-critical and highly optimized code. Modify carefully. 
-+ */ -+ -+ assert("jmacd-1021", handle != NULL); -+ assert("jmacd-1022", handle->owner != NULL); -+ assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode)); -+ -+ assert("zam-130", oldowner == get_current_lock_stack()); -+ -+ LOCK_CNT_DEC(long_term_locked_znode); -+ -+ /* -+ * to minimize amount of operations performed under lock, pre-compute -+ * all variables used within critical section. This makes code -+ * obscure. -+ */ -+ -+ /* was this lock of hi or lo priority */ -+ hipri = oldowner->curpri ? 1 : 0; -+ /* number of readers */ -+ readers = node->lock.nr_readers; -+ /* +1 if write lock, -1 if read lock */ -+ rdelta = (readers > 0) ? -1 : +1; -+ /* true if node is to die and write lock is released */ -+ youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0); -+ -+ spin_lock_zlock(&node->lock); -+ -+ assert("zam-101", znode_is_locked(node)); -+ -+ /* Adjust a number of high priority owners of this lock */ -+ assert("nikita-1836", node->lock.nr_hipri_owners >= hipri); -+ node->lock.nr_hipri_owners -= hipri; -+ -+ /* Handle znode deallocation on last write-lock release. */ -+ if (znode_is_wlocked_once(node)) { -+ if (youdie) { -+ forget_znode(handle); -+ assert("nikita-2191", znode_invariant(node)); -+ zput(node); -+ return; -+ } -+ } -+ -+ if (handle->signaled) -+ atomic_dec(&oldowner->nr_signaled); -+ -+ /* Unlocking means owner<->object link deletion */ -+ unlink_object(handle); -+ -+ /* This is enough to be sure whether an object is completely -+ unlocked. */ -+ node->lock.nr_readers += rdelta; -+ -+ /* If the node is locked it must have an owners list. Likewise, if -+ the node is unlocked it must have an empty owners list. 
*/ -+ assert("zam-319", equi(znode_is_locked(node), -+ !list_empty_careful(&node->lock.owners))); -+ -+#if REISER4_DEBUG -+ if (!znode_is_locked(node)) -+ ++node->times_locked; -+#endif -+ -+ /* If there are pending lock requests we wake up a requestor */ -+ if (!znode_is_wlocked(node)) -+ dispatch_lock_requests(node); -+ if (check_deadlock_condition(node)) -+ wake_up_all_lopri_owners(node); -+ spin_unlock_zlock(&node->lock); -+ -+ /* minus one reference from handle->node */ -+ assert("nikita-2190", znode_invariant(node)); -+ ON_DEBUG(check_lock_data()); -+ ON_DEBUG(check_lock_node_data(node)); -+ zput(node); -+} -+ -+/* final portion of longterm-lock */ -+static int -+lock_tail(lock_stack * owner, int ok, znode_lock_mode mode) -+{ -+ znode *node = owner->request.node; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ -+ /* If we broke with (ok == 0) it means we can_lock, now do it. */ -+ if (ok == 0) { -+ lock_object(owner); -+ owner->request.mode = 0; -+ /* count a reference from lockhandle->node -+ -+ znode was already referenced at the entry to this function, -+ hence taking spin-lock here is not necessary (see comment -+ in the zref()). -+ */ -+ zref(node); -+ -+ LOCK_CNT_INC(long_term_locked_znode); -+ } -+ spin_unlock_zlock(&node->lock); -+ ON_DEBUG(check_lock_data()); -+ ON_DEBUG(check_lock_node_data(node)); -+ return ok; -+} -+ -+/* -+ * version of longterm_znode_lock() optimized for the most common case: read -+ * lock without any special flags. This is the kind of lock that any tree -+ * traversal takes on the root node of the tree, which is very frequent. 
-+ */ -+static int longterm_lock_tryfast(lock_stack * owner) -+{ -+ int result; -+ znode *node; -+ zlock *lock; -+ -+ node = owner->request.node; -+ lock = &node->lock; -+ -+ assert("nikita-3340", reiser4_schedulable()); -+ assert("nikita-3341", request_is_deadlock_safe(node, -+ ZNODE_READ_LOCK, -+ ZNODE_LOCK_LOPRI)); -+ spin_lock_zlock(lock); -+ result = can_lock_object(owner); -+ spin_unlock_zlock(lock); -+ -+ if (likely(result != -EINVAL)) { -+ spin_lock_znode(node); -+ result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0); -+ spin_unlock_znode(node); -+ spin_lock_zlock(lock); -+ if (unlikely(result != 0)) { -+ owner->request.mode = 0; -+ } else { -+ result = can_lock_object(owner); -+ if (unlikely(result == -E_REPEAT)) { -+ /* fall back to longterm_lock_znode() */ -+ spin_unlock_zlock(lock); -+ return 1; -+ } -+ } -+ return lock_tail(owner, result, ZNODE_READ_LOCK); -+ } else -+ return 1; -+} -+ -+/* locks given lock object */ -+int longterm_lock_znode( -+ /* local link object (allocated by lock owner thread, usually on its own -+ * stack) */ -+ lock_handle * handle, -+ /* znode we want to lock. */ -+ znode * node, -+ /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */ -+ znode_lock_mode mode, -+ /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */ -+ znode_lock_request request) { -+ int ret; -+ int hipri = (request & ZNODE_LOCK_HIPRI) != 0; -+ int non_blocking = 0; -+ int has_atom; -+ txn_capture cap_flags; -+ zlock *lock; -+ txn_handle *txnh; -+ tree_level level; -+ -+ /* Get current process context */ -+ lock_stack *owner = get_current_lock_stack(); -+ -+ /* Check that the lock handle is initialized and isn't already being -+ * used. 
*/ -+ assert("jmacd-808", handle->owner == NULL); -+ assert("nikita-3026", reiser4_schedulable()); -+ assert("nikita-3219", request_is_deadlock_safe(node, mode, request)); -+ assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0); -+ /* long term locks are not allowed in the VM contexts (->writepage(), -+ * prune_{d,i}cache()). -+ * -+ * FIXME this doesn't work due to unused-dentry-with-unlinked-inode -+ * bug caused by d_splice_alias() only working for directories. -+ */ -+ assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0)); -+ assert ("zam-1055", mode != ZNODE_NO_LOCK); -+ -+ cap_flags = 0; -+ if (request & ZNODE_LOCK_NONBLOCK) { -+ cap_flags |= TXN_CAPTURE_NONBLOCKING; -+ non_blocking = 1; -+ } -+ -+ if (request & ZNODE_LOCK_DONT_FUSE) -+ cap_flags |= TXN_CAPTURE_DONT_FUSE; -+ -+ /* If we are changing our process priority we must adjust a number -+ of high priority owners for each znode that we already lock */ -+ if (hipri) { -+ set_high_priority(owner); -+ } else { -+ set_low_priority(owner); -+ } -+ -+ level = znode_get_level(node); -+ -+ /* Fill request structure with our values. */ -+ owner->request.mode = mode; -+ owner->request.handle = handle; -+ owner->request.node = node; -+ -+ txnh = get_current_context()->trans; -+ lock = &node->lock; -+ -+ if (mode == ZNODE_READ_LOCK && request == 0) { -+ ret = longterm_lock_tryfast(owner); -+ if (ret <= 0) -+ return ret; -+ } -+ -+ has_atom = (txnh->atom != NULL); -+ -+ /* Synchronize on node's zlock guard lock. */ -+ spin_lock_zlock(lock); -+ -+ if (znode_is_locked(node) && -+ mode == ZNODE_WRITE_LOCK && recursive(owner)) -+ return lock_tail(owner, 0, mode); -+ -+ for (;;) { -+ /* Check the lock's availability: if it is unavaiable we get -+ E_REPEAT, 0 indicates "can_lock", otherwise the node is -+ invalid. */ -+ ret = can_lock_object(owner); -+ -+ if (unlikely(ret == -EINVAL)) { -+ /* @node is dying. Leave it alone. 
*/ -+ break; -+ } -+ -+ if (unlikely(ret == -E_REPEAT && non_blocking)) { -+ /* either locking of @node by the current thread will -+ * lead to the deadlock, or lock modes are -+ * incompatible. */ -+ break; -+ } -+ -+ assert("nikita-1844", (ret == 0) -+ || ((ret == -E_REPEAT) && !non_blocking)); -+ /* If we can get the lock... Try to capture first before -+ taking the lock. */ -+ -+ /* first handle commonest case where node and txnh are already -+ * in the same atom. */ -+ /* safe to do without taking locks, because: -+ * -+ * 1. read of aligned word is atomic with respect to writes to -+ * this word -+ * -+ * 2. false negatives are handled in reiser4_try_capture(). -+ * -+ * 3. false positives are impossible. -+ * -+ * PROOF: left as an exercise to the curious reader. -+ * -+ * Just kidding. Here is one: -+ * -+ * At the time T0 txnh->atom is stored in txnh_atom. -+ * -+ * At the time T1 node->atom is stored in node_atom. -+ * -+ * At the time T2 we observe that -+ * -+ * txnh_atom != NULL && node_atom == txnh_atom. -+ * -+ * Imagine that at this moment we acquire node and txnh spin -+ * lock in this order. Suppose that under spin lock we have -+ * -+ * node->atom != txnh->atom, (S1) -+ * -+ * at the time T3. -+ * -+ * txnh->atom != NULL still, because txnh is open by the -+ * current thread. -+ * -+ * Suppose node->atom == NULL, that is, node was un-captured -+ * between T1, and T3. But un-capturing of formatted node is -+ * always preceded by the call to reiser4_invalidate_lock(), -+ * which marks znode as JNODE_IS_DYING under zlock spin -+ * lock. Contradiction, because can_lock_object() above checks -+ * for JNODE_IS_DYING. Hence, node->atom != NULL at T3. -+ * -+ * Suppose that node->atom != node_atom, that is, atom, node -+ * belongs to was fused into another atom: node_atom was fused -+ * into node->atom. 
Atom of txnh was equal to node_atom at T2, -+ * which means that under spin lock, txnh->atom == node->atom, -+ * because txnh->atom can only follow fusion -+ * chain. Contradicts S1. -+ * -+ * The same for hypothesis txnh->atom != txnh_atom. Hence, -+ * node->atom == node_atom == txnh_atom == txnh->atom. Again -+ * contradicts S1. Hence S1 is false. QED. -+ * -+ */ -+ -+ if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) { -+ ; -+ } else { -+ /* -+ * unlock zlock spin lock here. It is possible for -+ * longterm_unlock_znode() to sneak in here, but there -+ * is no harm: reiser4_invalidate_lock() will mark znode -+ * as JNODE_IS_DYING and this will be noted by -+ * can_lock_object() below. -+ */ -+ spin_unlock_zlock(lock); -+ spin_lock_znode(node); -+ ret = reiser4_try_capture(ZJNODE(node), mode, cap_flags); -+ spin_unlock_znode(node); -+ spin_lock_zlock(lock); -+ if (unlikely(ret != 0)) { -+ /* In the failure case, the txnmgr releases -+ the znode's lock (or in some cases, it was -+ released a while ago). There's no need to -+ reacquire it so we should return here, -+ avoid releasing the lock. */ -+ owner->request.mode = 0; -+ break; -+ } -+ -+ /* Check the lock's availability again -- this is -+ because under some circumstances the capture code -+ has to release and reacquire the znode spinlock. */ -+ ret = can_lock_object(owner); -+ } -+ -+ /* This time, a return of (ret == 0) means we can lock, so we -+ should break out of the loop. */ -+ if (likely(ret != -E_REPEAT || non_blocking)) -+ break; -+ -+ /* Lock is unavailable, we have to wait. 
*/ -+ ret = reiser4_prepare_to_sleep(owner); -+ if (unlikely(ret != 0)) -+ break; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ if (hipri) { -+ /* If we are going in high priority direction then -+ increase high priority requests counter for the -+ node */ -+ lock->nr_hipri_requests++; -+ if (mode == ZNODE_WRITE_LOCK) -+ lock->nr_hipri_write_requests ++; -+ /* If there are no high priority owners for a node, -+ then immediately wake up low priority owners, so -+ they can detect possible deadlock */ -+ if (lock->nr_hipri_owners == 0) -+ wake_up_all_lopri_owners(node); -+ } -+ list_add_tail(&owner->requestors_link, &lock->requestors); -+ -+ /* Ok, here we have prepared a lock request, so unlock -+ a znode ... */ -+ spin_unlock_zlock(lock); -+ /* ... and sleep */ -+ reiser4_go_to_sleep(owner); -+ if (owner->request.mode == ZNODE_NO_LOCK) -+ goto request_is_done; -+ spin_lock_zlock(lock); -+ if (owner->request.mode == ZNODE_NO_LOCK) { -+ spin_unlock_zlock(lock); -+ request_is_done: -+ if (owner->request.ret_code == 0) { -+ LOCK_CNT_INC(long_term_locked_znode); -+ zref(node); -+ } -+ return owner->request.ret_code; -+ } -+ remove_lock_request(owner); -+ } -+ -+ return lock_tail(owner, ret, mode); -+} -+ -+/* lock object invalidation means changing of lock object state to `INVALID' -+ and waiting for all other processes to cancel theirs lock requests. */ -+void reiser4_invalidate_lock(lock_handle * handle /* path to lock -+ * owner and lock -+ * object is being -+ * invalidated. 
*/ ) -+{ -+ znode *node = handle->node; -+ lock_stack *owner = handle->owner; -+ -+ assert("zam-325", owner == get_current_lock_stack()); -+ assert("zam-103", znode_is_write_locked(node)); -+ assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED)); -+ assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED)); -+ assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ assert("nikita-3097", znode_is_wlocked_once(node)); -+ assert_spin_locked(&(node->lock.guard)); -+ -+ if (handle->signaled) -+ atomic_dec(&owner->nr_signaled); -+ -+ ZF_SET(node, JNODE_IS_DYING); -+ unlink_object(handle); -+ node->lock.nr_readers = 0; -+ -+ invalidate_all_lock_requests(node); -+ spin_unlock_zlock(&node->lock); -+} -+ -+/* Initializes lock_stack. */ -+void init_lock_stack(lock_stack * owner /* pointer to -+ * allocated -+ * structure. */ ) -+{ -+ INIT_LIST_HEAD(&owner->locks); -+ INIT_LIST_HEAD(&owner->requestors_link); -+ spin_lock_init(&owner->sguard); -+ owner->curpri = 1; -+ init_waitqueue_head(&owner->wait); -+} -+ -+/* Initializes lock object. */ -+void reiser4_init_lock(zlock * lock /* pointer on allocated -+ * uninitialized lock object -+ * structure. */ ) -+{ -+ memset(lock, 0, sizeof(zlock)); -+ spin_lock_init(&lock->guard); -+ INIT_LIST_HEAD(&lock->requestors); -+ INIT_LIST_HEAD(&lock->owners); -+} -+ -+/* Transfer a lock handle (presumably so that variables can be moved between stack and -+ heap locations). */ -+static void -+move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old) -+{ -+ znode *node = old->node; -+ lock_stack *owner = old->owner; -+ int signaled; -+ -+ /* locks_list, modified by link_object() is not protected by -+ anything. This is valid because only current thread ever modifies -+ locks_list of its lock_stack. 
-+ */ -+ assert("nikita-1827", owner == get_current_lock_stack()); -+ assert("nikita-1831", new->owner == NULL); -+ -+ spin_lock_zlock(&node->lock); -+ -+ signaled = old->signaled; -+ if (unlink_old) { -+ unlink_object(old); -+ } else { -+ if (node->lock.nr_readers > 0) { -+ node->lock.nr_readers += 1; -+ } else { -+ node->lock.nr_readers -= 1; -+ } -+ if (signaled) { -+ atomic_inc(&owner->nr_signaled); -+ } -+ if (owner->curpri) { -+ node->lock.nr_hipri_owners += 1; -+ } -+ LOCK_CNT_INC(long_term_locked_znode); -+ -+ zref(node); -+ } -+ link_object(new, owner, node); -+ new->signaled = signaled; -+ -+ spin_unlock_zlock(&node->lock); -+} -+ -+void move_lh(lock_handle * new, lock_handle * old) -+{ -+ move_lh_internal(new, old, /*unlink_old */ 1); -+} -+ -+void copy_lh(lock_handle * new, lock_handle * old) -+{ -+ move_lh_internal(new, old, /*unlink_old */ 0); -+} -+ -+/* after getting -E_DEADLOCK we unlock znodes until this function returns false */ -+int reiser4_check_deadlock(void) -+{ -+ lock_stack *owner = get_current_lock_stack(); -+ return atomic_read(&owner->nr_signaled) != 0; -+} -+ -+/* Before going to sleep we re-check "release lock" requests which might come from threads with hi-pri lock -+ priorities. 
*/ -+int reiser4_prepare_to_sleep(lock_stack * owner) -+{ -+ assert("nikita-1847", owner == get_current_lock_stack()); -+ -+ /* We return -E_DEADLOCK if one or more "give me the lock" messages are -+ * counted in nr_signaled */ -+ if (unlikely(atomic_read(&owner->nr_signaled) != 0)) { -+ assert("zam-959", !owner->curpri); -+ return RETERR(-E_DEADLOCK); -+ } -+ return 0; -+} -+ -+/* Wakes up a single thread */ -+void __reiser4_wake_up(lock_stack * owner) -+{ -+ atomic_set(&owner->wakeup, 1); -+ wake_up(&owner->wait); -+} -+ -+/* Puts a thread to sleep */ -+void reiser4_go_to_sleep(lock_stack * owner) -+{ -+ /* Well, we might sleep here, so holding of any spinlocks is no-no */ -+ assert("nikita-3027", reiser4_schedulable()); -+ -+ wait_event(owner->wait, atomic_read(&owner->wakeup)); -+ atomic_set(&owner->wakeup, 0); -+} -+ -+int lock_stack_isclean(lock_stack * owner) -+{ -+ if (list_empty_careful(&owner->locks)) { -+ assert("zam-353", atomic_read(&owner->nr_signaled) == 0); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+#if REISER4_DEBUG -+ -+/* -+ * debugging functions -+ */ -+ -+static void list_check(struct list_head *head) -+{ -+ struct list_head *pos; -+ -+ list_for_each(pos, head) -+ assert("", (pos->prev != NULL && pos->next != NULL && -+ pos->prev->next == pos && pos->next->prev == pos)); -+} -+ -+/* check consistency of locking data-structures hanging of the @stack */ -+static void check_lock_stack(lock_stack * stack) -+{ -+ spin_lock_stack(stack); -+ /* check that stack->locks is not corrupted */ -+ list_check(&stack->locks); -+ spin_unlock_stack(stack); -+} -+ -+/* check consistency of locking data structures */ -+void check_lock_data(void) -+{ -+ check_lock_stack(&get_current_context()->stack); -+} -+ -+/* check consistency of locking data structures for @node */ -+void check_lock_node_data(znode * node) -+{ -+ spin_lock_zlock(&node->lock); -+ list_check(&node->lock.owners); -+ list_check(&node->lock.requestors); -+ spin_unlock_zlock(&node->lock); -+} -+ 
-+/* check that given lock request is dead lock safe. This check is, of course, -+ * not exhaustive. */ -+static int -+request_is_deadlock_safe(znode * node, znode_lock_mode mode, -+ znode_lock_request request) -+{ -+ lock_stack *owner; -+ -+ owner = get_current_lock_stack(); -+ /* -+ * check that hipri lock request is not issued when there are locked -+ * nodes at the higher levels. -+ */ -+ if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) && -+ znode_get_level(node) != 0) { -+ lock_handle *item; -+ -+ list_for_each_entry(item, &owner->locks, locks_link) { -+ znode *other; -+ -+ other = item->node; -+ -+ if (znode_get_level(other) == 0) -+ continue; -+ if (znode_get_level(other) > znode_get_level(node)) -+ return 0; -+ } -+ } -+ return 1; -+} -+ -+#endif -+ -+/* return pointer to static storage with name of lock_mode. For -+ debugging */ -+const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ ) -+{ -+ if (lock == ZNODE_READ_LOCK) -+ return "read"; -+ else if (lock == ZNODE_WRITE_LOCK) -+ return "write"; -+ else { -+ static char buf[30]; -+ -+ sprintf(buf, "unknown: %i", lock); -+ return buf; -+ } -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 79 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/lock.h linux-2.6.20/fs/reiser4/lock.h ---- linux-2.6.20.orig/fs/reiser4/lock.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/lock.h 2007-05-06 14:50:43.742989473 +0400 -@@ -0,0 +1,249 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Long term locking data structures. See lock.c for details. 
*/ -+ -+#ifndef __LOCK_H__ -+#define __LOCK_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/node/node.h" -+#include "txnmgr.h" -+#include "readahead.h" -+ -+#include -+#include -+#include /* for PAGE_CACHE_SIZE */ -+#include -+#include -+ -+/* Per-znode lock object */ -+struct zlock { -+ spinlock_t guard; -+ /* The number of readers if positive; the number of recursively taken -+ write locks if negative. Protected by zlock spin lock. */ -+ int nr_readers; -+ /* A number of processes (lock_stacks) that have this object -+ locked with high priority */ -+ unsigned nr_hipri_owners; -+ /* A number of attempts to lock znode in high priority direction */ -+ unsigned nr_hipri_requests; -+ /* A linked list of lock_handle objects that contains pointers -+ for all lock_stacks which have this lock object locked */ -+ unsigned nr_hipri_write_requests; -+ struct list_head owners; -+ /* A linked list of lock_stacks that wait for this lock */ -+ struct list_head requestors; -+}; -+ -+static inline void spin_lock_zlock(zlock *lock) -+{ -+ /* check that zlock is not locked */ -+ assert("", LOCK_CNT_NIL(spin_locked_zlock)); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", LOCK_CNT_NIL(spin_locked_stack)); -+ -+ spin_lock(&lock->guard); -+ -+ LOCK_CNT_INC(spin_locked_zlock); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void spin_unlock_zlock(zlock *lock) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_zlock); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&lock->guard); -+} -+ -+#define lock_is_locked(lock) ((lock)->nr_readers != 0) -+#define lock_is_rlocked(lock) ((lock)->nr_readers > 0) -+#define lock_is_wlocked(lock) ((lock)->nr_readers < 0) -+#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1) -+#define lock_can_be_rlocked(lock) ((lock)->nr_readers >=0) 
-+#define lock_mode_compatible(lock, mode) \ -+ (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \ -+ ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock))) -+ -+/* Since we have R/W znode locks we need additional bidirectional `link' -+ objects to implement n<->m relationship between lock owners and lock -+ objects. We call them `lock handles'. -+ -+ Locking: see lock.c/"SHORT-TERM LOCKING" -+*/ -+struct lock_handle { -+ /* This flag indicates that a signal to yield a lock was passed to -+ lock owner and counted in owner->nr_signalled -+ -+ Locking: this is accessed under spin lock on ->node. -+ */ -+ int signaled; -+ /* A link to owner of a lock */ -+ lock_stack *owner; -+ /* A link to znode locked */ -+ znode *node; -+ /* A list of all locks for a process */ -+ struct list_head locks_link; -+ /* A list of all owners for a znode */ -+ struct list_head owners_link; -+}; -+ -+typedef struct lock_request { -+ /* A pointer to uninitialized link object */ -+ lock_handle *handle; -+ /* A pointer to the object we want to lock */ -+ znode *node; -+ /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */ -+ znode_lock_mode mode; -+ /* how dispatch_lock_requests() returns lock request result code */ -+ int ret_code; -+} lock_request; -+ -+/* A lock stack structure for accumulating locks owned by a process */ -+struct lock_stack { -+ /* A guard lock protecting a lock stack */ -+ spinlock_t sguard; -+ /* number of znodes which were requested by high priority processes */ -+ atomic_t nr_signaled; -+ /* Current priority of a process -+ -+ This is only accessed by the current thread and thus requires no -+ locking. -+ */ -+ int curpri; -+ /* A list of all locks owned by this process. Elements can be added to -+ * this list only by the current thread. ->node pointers in this list -+ * can be only changed by the current thread. 
*/ -+ struct list_head locks; -+ /* When lock_stack waits for the lock, it puts itself on double-linked -+ requestors list of that lock */ -+ struct list_head requestors_link; -+ /* Current lock request info. -+ -+ This is only accessed by the current thread and thus requires no -+ locking. -+ */ -+ lock_request request; -+ /* the following two fields are the lock stack's -+ * synchronization object to use with the standard linux/wait.h -+ * interface. See reiser4_go_to_sleep and __reiser4_wake_up for -+ * usage details. */ -+ wait_queue_head_t wait; -+ atomic_t wakeup; -+#if REISER4_DEBUG -+ int nr_locks; /* number of lock handles in the above list */ -+#endif -+}; -+ -+/* -+ User-visible znode locking functions -+*/ -+ -+extern int longterm_lock_znode(lock_handle * handle, -+ znode * node, -+ znode_lock_mode mode, -+ znode_lock_request request); -+ -+extern void longterm_unlock_znode(lock_handle * handle); -+ -+extern int reiser4_check_deadlock(void); -+ -+extern lock_stack *get_current_lock_stack(void); -+ -+extern void init_lock_stack(lock_stack * owner); -+extern void reiser4_init_lock(zlock * lock); -+ -+static inline void init_lh(lock_handle *lh) -+{ -+#if REISER4_DEBUG -+ memset(lh, 0, sizeof *lh); -+ INIT_LIST_HEAD(&lh->locks_link); -+ INIT_LIST_HEAD(&lh->owners_link); -+#else -+ lh->node = NULL; -+#endif -+} -+ -+static inline void done_lh(lock_handle *lh) -+{ -+ assert("zam-342", lh != NULL); -+ if (lh->node != NULL) -+ longterm_unlock_znode(lh); -+} -+ -+extern void move_lh(lock_handle * new, lock_handle * old); -+extern void copy_lh(lock_handle * new, lock_handle * old); -+ -+extern int reiser4_prepare_to_sleep(lock_stack * owner); -+extern void reiser4_go_to_sleep(lock_stack * owner); -+extern void __reiser4_wake_up(lock_stack * owner); -+ -+extern int lock_stack_isclean(lock_stack * owner); -+ -+/* zlock object state check macros: only used in assertions. Both forms imply that the -+ lock is held by the current thread. 
*/ -+extern int znode_is_write_locked(const znode *); -+extern void reiser4_invalidate_lock(lock_handle *); -+ -+/* lock ordering is: first take zlock spin lock, then lock stack spin lock */ -+#define spin_ordering_pred_stack(stack) \ -+ (LOCK_CNT_NIL(spin_locked_stack) && \ -+ LOCK_CNT_NIL(spin_locked_txnmgr) && \ -+ LOCK_CNT_NIL(spin_locked_inode) && \ -+ LOCK_CNT_NIL(rw_locked_cbk_cache) && \ -+ LOCK_CNT_NIL(spin_locked_super_eflush) ) -+ -+static inline void spin_lock_stack(lock_stack *stack) -+{ -+ assert("", spin_ordering_pred_stack(stack)); -+ spin_lock(&(stack->sguard)); -+ LOCK_CNT_INC(spin_locked_stack); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void spin_unlock_stack(lock_stack *stack) -+{ -+ assert_spin_locked(&(stack->sguard)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ LOCK_CNT_DEC(spin_locked_stack); -+ LOCK_CNT_DEC(spin_locked); -+ spin_unlock(&(stack->sguard)); -+} -+ -+static inline void reiser4_wake_up(lock_stack * owner) -+{ -+ spin_lock_stack(owner); -+ __reiser4_wake_up(owner); -+ spin_unlock_stack(owner); -+} -+ -+const char *lock_mode_name(znode_lock_mode lock); -+ -+#if REISER4_DEBUG -+extern void check_lock_data(void); -+extern void check_lock_node_data(znode * node); -+#else -+#define check_lock_data() noop -+#define check_lock_node_data() noop -+#endif -+ -+/* __LOCK_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/Makefile linux-2.6.20/fs/reiser4/Makefile ---- linux-2.6.20.orig/fs/reiser4/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/Makefile 2007-05-06 14:50:43.742989473 +0400 -@@ -0,0 +1,99 @@ -+# -+# reiser4/Makefile -+# -+ -+obj-$(CONFIG_REISER4_FS) += reiser4.o -+ -+reiser4-y := \ -+ debug.o \ -+ jnode.o \ -+ znode.o \ -+ key.o \ -+ pool.o \ -+ tree_mod.o \ -+ estimate.o \ -+ carry.o \ -+ carry_ops.o \ -+ lock.o \ -+ tree.o \ -+ context.o \ -+ tap.o \ -+ coord.o \ -+ block_alloc.o \ -+ txnmgr.o \ -+ kassign.o \ -+ flush.o \ -+ wander.o \ -+ eottl.o \ -+ search.o \ -+ page_cache.o \ -+ seal.o \ -+ dscale.o \ -+ flush_queue.o \ -+ ktxnmgrd.o \ -+ blocknrset.o \ -+ super.o \ -+ super_ops.o \ -+ fsdata.o \ -+ export_ops.o \ -+ oid.o \ -+ tree_walk.o \ -+ inode.o \ -+ vfs_ops.o \ -+ as_ops.o \ -+ entd.o\ -+ readahead.o \ -+ status_flags.o \ -+ init_super.o \ -+ safe_link.o \ -+ \ -+ plugin/plugin.o \ -+ plugin/plugin_set.o \ -+ plugin/node/node.o \ -+ plugin/object.o \ -+ plugin/cluster.o \ -+ plugin/inode_ops.o \ -+ plugin/inode_ops_rename.o \ -+ plugin/file_ops.o \ -+ plugin/file_ops_readdir.o \ -+ plugin/file_plugin_common.o \ -+ plugin/file/file.o \ -+ plugin/file/tail_conversion.o \ -+ plugin/file/file_conversion.o \ -+ plugin/file/symlink.o \ -+ plugin/file/cryptcompress.o \ -+ plugin/dir_plugin_common.o \ -+ plugin/dir/hashed_dir.o \ -+ plugin/dir/seekable_dir.o \ -+ plugin/node/node40.o \ -+ \ -+ plugin/crypto/cipher.o \ -+ plugin/crypto/digest.o \ -+ \ -+ plugin/compress/minilzo.o \ -+ plugin/compress/compress.o \ -+ plugin/compress/compress_mode.o \ -+ \ -+ plugin/item/static_stat.o \ -+ plugin/item/sde.o \ -+ plugin/item/cde.o \ -+ plugin/item/blackbox.o \ -+ plugin/item/internal.o \ -+ plugin/item/tail.o \ -+ plugin/item/ctail.o \ -+ plugin/item/extent.o \ -+ 
plugin/item/extent_item_ops.o \ -+ plugin/item/extent_file_ops.o \ -+ plugin/item/extent_flush_ops.o \ -+ \ -+ plugin/hash.o \ -+ plugin/fibration.o \ -+ plugin/tail_policy.o \ -+ plugin/item/item.o \ -+ \ -+ plugin/security/perm.o \ -+ plugin/space/bitmap.o \ -+ \ -+ plugin/disk_format/disk_format40.o \ -+ plugin/disk_format/disk_format.o -+ -diff -urN linux-2.6.20.orig/fs/reiser4/oid.c linux-2.6.20/fs/reiser4/oid.c ---- linux-2.6.20.orig/fs/reiser4/oid.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/oid.c 2007-05-06 14:50:43.742989473 +0400 -@@ -0,0 +1,141 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "debug.h" -+#include "super.h" -+#include "txnmgr.h" -+ -+/* we used to have oid allocation plugin. It was removed because it -+ was recognized as providing unneeded level of abstraction. If one -+ ever will find it useful - look at yet_unneeded_abstractions/oid -+*/ -+ -+/* -+ * initialize in-memory data for oid allocator at @super. @nr_files and @next -+ * are provided by disk format plugin that reads them from the disk during -+ * mount. -+ */ -+int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(super); -+ -+ sbinfo->next_to_use = next; -+ sbinfo->oids_in_use = nr_files; -+ return 0; -+} -+ -+/* -+ * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator -+ * runs out of oids. -+ */ -+oid_t oid_allocate(struct super_block * super) -+{ -+ reiser4_super_info_data *sbinfo; -+ oid_t oid; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) { -+ oid = sbinfo->next_to_use++; -+ sbinfo->oids_in_use++; -+ } else -+ oid = ABSOLUTE_MAX_OID; -+ spin_unlock_reiser4_super(sbinfo); -+ return oid; -+} -+ -+/* -+ * Tell oid allocator that @oid is now free. 
-+ */ -+int oid_release(struct super_block *super, oid_t oid UNUSED_ARG) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ sbinfo->oids_in_use--; -+ spin_unlock_reiser4_super(sbinfo); -+ return 0; -+} -+ -+/* -+ * return next @oid that would be allocated (i.e., returned by oid_allocate()) -+ * without actually allocating it. This is used by disk format plugin to save -+ * oid allocator state on the disk. -+ */ -+oid_t oid_next(const struct super_block * super) -+{ -+ reiser4_super_info_data *sbinfo; -+ oid_t oid; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ oid = sbinfo->next_to_use; -+ spin_unlock_reiser4_super(sbinfo); -+ return oid; -+} -+ -+/* -+ * returns number of currently used oids. This is used by statfs(2) to report -+ * number of "inodes" and by disk format plugin to save oid allocator state on -+ * the disk. -+ */ -+long oids_used(const struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo; -+ oid_t used; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ used = sbinfo->oids_in_use; -+ spin_unlock_reiser4_super(sbinfo); -+ if (used < (__u64) ((long)~0) >> 1) -+ return (long)used; -+ else -+ return (long)-1; -+} -+ -+/* -+ * Count oid as allocated in atom. This is done after call to oid_allocate() -+ * at the point when we are irrevocably committed to creation of the new file -+ * (i.e., when oid allocation cannot be any longer rolled back due to some -+ * error). -+ */ -+void oid_count_allocated(void) -+{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked(); -+ atom->nr_objects_created++; -+ spin_unlock_atom(atom); -+} -+ -+/* -+ * Count oid as free in atom. This is done after call to oid_release() at the -+ * point when we are irrevocably committed to the deletion of the file (i.e., -+ * when oid release cannot be any longer rolled back due to some error). 
-+ */ -+void oid_count_released(void) -+{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked(); -+ atom->nr_objects_deleted++; -+ spin_unlock_atom(atom); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/page_cache.c linux-2.6.20/fs/reiser4/page_cache.c ---- linux-2.6.20.orig/fs/reiser4/page_cache.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/page_cache.c 2007-05-06 14:50:43.742989473 +0400 -@@ -0,0 +1,736 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Memory pressure hooks. Fake inodes handling. */ -+ -+/* GLOSSARY -+ -+ . Formatted and unformatted nodes. -+ Elements of reiser4 balanced tree to store data and metadata. -+ Unformatted nodes are pointed to by extent pointers. Such nodes -+ are used to store data of large objects. Unlike unformatted nodes, -+ formatted ones have associated format described by node4X plugin. -+ -+ . Jnode (or journal node) -+ The in-memory header which is used to track formatted and unformatted -+ nodes, bitmap nodes, etc. In particular, jnodes are used to track -+ transactional information associated with each block(see reiser4/jnode.c -+ for details). -+ -+ . Znode -+ The in-memory header which is used to track formatted nodes. Contains -+ embedded jnode (see reiser4/znode.c for details). -+*/ -+ -+/* We store all file system meta data (and data, of course) in the page cache. -+ -+ What does this mean? In stead of using bread/brelse we create special -+ "fake" inode (one per super block) and store content of formatted nodes -+ into pages bound to this inode in the page cache. In newer kernels bread() -+ already uses inode attached to block device (bd_inode). Advantage of having -+ our own fake inode is that we can install appropriate methods in its -+ address_space operations. 
Such methods are called by VM on memory pressure -+ (or during background page flushing) and we can use them to react -+ appropriately. -+ -+ In initial version we only support one block per page. Support for multiple -+ blocks per page is complicated by relocation. -+ -+ To each page, used by reiser4, jnode is attached. jnode is analogous to -+ buffer head. Difference is that jnode is bound to the page permanently: -+ jnode cannot be removed from memory until its backing page is. -+ -+ jnode contain pointer to page (->pg field) and page contain pointer to -+ jnode in ->private field. Pointer from jnode to page is protected to by -+ jnode's spinlock and pointer from page to jnode is protected by page lock -+ (PG_locked bit). Lock ordering is: first take page lock, then jnode spin -+ lock. To go into reverse direction use jnode_lock_page() function that uses -+ standard try-lock-and-release device. -+ -+ Properties: -+ -+ 1. when jnode-to-page mapping is established (by jnode_attach_page()), page -+ reference counter is increased. -+ -+ 2. when jnode-to-page mapping is destroyed (by page_clear_jnode(), page -+ reference counter is decreased. -+ -+ 3. on jload() reference counter on jnode page is increased, page is -+ kmapped and `referenced'. -+ -+ 4. on jrelse() inverse operations are performed. -+ -+ 5. kmapping/kunmapping of unformatted pages is done by read/write methods. -+ -+ DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting -+ historically.] -+ -+ [In the following discussion, `lock' invariably means long term lock on -+ znode.] (What about page locks?) -+ -+ There is some special class of deadlock possibilities related to memory -+ pressure. Locks acquired by other reiser4 threads are accounted for in -+ deadlock prevention mechanism (lock.c), but when ->vm_writeback() is -+ invoked additional hidden arc is added to the locking graph: thread that -+ tries to allocate memory waits for ->vm_writeback() to finish. 
If this -+ thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock -+ prevention is useless. -+ -+ Another related problem is possibility for ->vm_writeback() to run out of -+ memory itself. This is not a problem for ext2 and friends, because their -+ ->vm_writeback() don't allocate much memory, but reiser4 flush is -+ definitely able to allocate huge amounts of memory. -+ -+ It seems that there is no reliable way to cope with the problems above. In -+ stead it was decided that ->vm_writeback() (as invoked in the kswapd -+ context) wouldn't perform any flushing itself, but rather should just wake -+ up some auxiliary thread dedicated for this purpose (or, the same thread -+ that does periodic commit of old atoms (ktxnmgrd.c)). -+ -+ Details: -+ -+ 1. Page is called `reclaimable' against particular reiser4 mount F if this -+ page can be ultimately released by try_to_free_pages() under presumptions -+ that: -+ -+ a. ->vm_writeback() for F is no-op, and -+ -+ b. none of the threads accessing F are making any progress, and -+ -+ c. other reiser4 mounts obey the same memory reservation protocol as F -+ (described below). -+ -+ For example, clean un-pinned page, or page occupied by ext2 data are -+ reclaimable against any reiser4 mount. -+ -+ When there is more than one reiser4 mount in a system, condition (c) makes -+ reclaim-ability not easily verifiable beyond trivial cases mentioned above. -+ -+ THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE -+ -+ Fake inode is used to bound formatted nodes and each node is indexed within -+ fake inode by its block number. If block size of smaller than page size, it -+ may so happen that block mapped to the page with formatted node is occupied -+ by unformatted node or is unallocated. This lead to some complications, -+ because flushing whole page can lead to an incorrect overwrite of -+ unformatted node that is moreover, can be cached in some other place as -+ part of the file body. 
To avoid this, buffers for unformatted nodes are -+ never marked dirty. Also pages in the fake are never marked dirty. This -+ rules out usage of ->writepage() as memory pressure hook. In stead -+ ->releasepage() is used. -+ -+ Josh is concerned that page->buffer is going to die. This should not pose -+ significant problem though, because we need to add some data structures to -+ the page anyway (jnode) and all necessary book keeping can be put there. -+ -+*/ -+ -+/* Life cycle of pages/nodes. -+ -+ jnode contains reference to page and page contains reference back to -+ jnode. This reference is counted in page ->count. Thus, page bound to jnode -+ cannot be released back into free pool. -+ -+ 1. Formatted nodes. -+ -+ 1. formatted node is represented by znode. When new znode is created its -+ ->pg pointer is NULL initially. -+ -+ 2. when node content is loaded into znode (by call to zload()) for the -+ first time following happens (in call to ->read_node() or -+ ->allocate_node()): -+ -+ 1. new page is added to the page cache. -+ -+ 2. this page is attached to znode and its ->count is increased. -+ -+ 3. page is kmapped. -+ -+ 3. if more calls to zload() follow (without corresponding zrelses), page -+ counter is left intact and in its stead ->d_count is increased in znode. -+ -+ 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero -+ ->release_node() is called and page is kunmapped as result. -+ -+ 5. at some moment node can be captured by a transaction. Its ->x_count -+ is then increased by transaction manager. -+ -+ 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE -+ bit set) following will happen (also see comment at the top of znode.c): -+ -+ 1. when last lock is released, node will be uncaptured from -+ transaction. This released reference that transaction manager acquired -+ at the step 5. -+ -+ 2. 
when last reference is released, zput() detects that node is -+ actually deleted and calls ->delete_node() -+ operation. page_cache_delete_node() implementation detaches jnode from -+ page and releases page. -+ -+ 7. otherwise (node wasn't removed from the tree), last reference to -+ znode will be released after transaction manager committed transaction -+ node was in. This implies squallocing of this node (see -+ flush.c). Nothing special happens at this point. Znode is still in the -+ hash table and page is still attached to it. -+ -+ 8. znode is actually removed from the memory because of the memory -+ pressure, or during umount (znodes_tree_done()). Anyway, znode is -+ removed by the call to zdrop(). At this moment, page is detached from -+ znode and removed from the inode address space. -+ -+*/ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "super.h" -+#include "entd.h" -+#include "page_cache.h" -+#include "ktxnmgrd.h" -+ -+#include -+#include -+#include /* for struct page */ -+#include /* for struct page */ -+#include -+#include -+#include -+#include -+ -+static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp); -+ -+static struct address_space_operations formatted_fake_as_ops; -+ -+static const oid_t fake_ino = 0x1; -+static const oid_t bitmap_ino = 0x2; -+static const oid_t cc_ino = 0x3; -+ -+static void -+init_fake_inode(struct super_block *super, struct inode *fake, -+ struct inode **pfake) -+{ -+ assert("nikita-2168", fake->i_state & I_NEW); -+ fake->i_mapping->a_ops = &formatted_fake_as_ops; -+ *pfake = fake; -+ /* NOTE-NIKITA something else? 
*/ -+ unlock_new_inode(fake); -+} -+ -+/** -+ * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps -+ * @super: super block to init fake inode for -+ * -+ * Initializes fake inode to which formatted nodes are bound in the page cache -+ * and inode for bitmaps. -+ */ -+int reiser4_init_formatted_fake(struct super_block *super) -+{ -+ struct inode *fake; -+ struct inode *bitmap; -+ struct inode *cc; -+ reiser4_super_info_data *sinfo; -+ -+ assert("nikita-1703", super != NULL); -+ -+ sinfo = get_super_private_nocheck(super); -+ fake = iget_locked(super, oid_to_ino(fake_ino)); -+ -+ if (fake != NULL) { -+ init_fake_inode(super, fake, &sinfo->fake); -+ -+ bitmap = iget_locked(super, oid_to_ino(bitmap_ino)); -+ if (bitmap != NULL) { -+ init_fake_inode(super, bitmap, &sinfo->bitmap); -+ -+ cc = iget_locked(super, oid_to_ino(cc_ino)); -+ if (cc != NULL) { -+ init_fake_inode(super, cc, &sinfo->cc); -+ return 0; -+ } else { -+ iput(sinfo->fake); -+ iput(sinfo->bitmap); -+ sinfo->fake = NULL; -+ sinfo->bitmap = NULL; -+ } -+ } else { -+ iput(sinfo->fake); -+ sinfo->fake = NULL; -+ } -+ } -+ return RETERR(-ENOMEM); -+} -+ -+/** -+ * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps -+ * @super: super block to init fake inode for -+ * -+ * Releases inodes which were used as address spaces of bitmap and formatted -+ * nodes. 
-+ */ -+void reiser4_done_formatted_fake(struct super_block *super) -+{ -+ reiser4_super_info_data *sinfo; -+ -+ sinfo = get_super_private_nocheck(super); -+ -+ if (sinfo->fake != NULL) { -+ iput(sinfo->fake); -+ sinfo->fake = NULL; -+ } -+ -+ if (sinfo->bitmap != NULL) { -+ iput(sinfo->bitmap); -+ sinfo->bitmap = NULL; -+ } -+ -+ if (sinfo->cc != NULL) { -+ iput(sinfo->cc); -+ sinfo->cc = NULL; -+ } -+ return; -+} -+ -+void reiser4_wait_page_writeback(struct page *page) -+{ -+ assert("zam-783", PageLocked(page)); -+ -+ do { -+ unlock_page(page); -+ wait_on_page_writeback(page); -+ lock_page(page); -+ } while (PageWriteback(page)); -+} -+ -+/* return tree @page is in */ -+reiser4_tree *reiser4_tree_by_page(const struct page *page /* page to query */ ) -+{ -+ assert("nikita-2461", page != NULL); -+ return &get_super_private(page->mapping->host->i_sb)->tree; -+} -+ -+/* completion handler for single page bio-based read. -+ -+ mpage_end_io_read() would also do. But it's static. -+ -+*/ -+static int -+end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG, -+ int err UNUSED_ARG) -+{ -+ struct page *page; -+ -+ if (bio->bi_size != 0) { -+ warning("nikita-3332", "Truncated single page read: %i", -+ bio->bi_size); -+ return 1; -+ } -+ -+ page = bio->bi_io_vec[0].bv_page; -+ -+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) { -+ SetPageUptodate(page); -+ } else { -+ ClearPageUptodate(page); -+ SetPageError(page); -+ } -+ unlock_page(page); -+ bio_put(bio); -+ return 0; -+} -+ -+/* completion handler for single page bio-based write. -+ -+ mpage_end_io_write() would also do. But it's static. 
-+ -+*/ -+static int -+end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG, -+ int err UNUSED_ARG) -+{ -+ struct page *page; -+ -+ if (bio->bi_size != 0) { -+ warning("nikita-3333", "Truncated single page write: %i", -+ bio->bi_size); -+ return 1; -+ } -+ -+ page = bio->bi_io_vec[0].bv_page; -+ -+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) -+ SetPageError(page); -+ end_page_writeback(page); -+ bio_put(bio); -+ return 0; -+} -+ -+/* ->readpage() method for formatted nodes */ -+static int formatted_readpage(struct file *f UNUSED_ARG, -+ struct page *page /* page to read */ ) -+{ -+ assert("nikita-2412", PagePrivate(page) && jprivate(page)); -+ return reiser4_page_io(page, jprivate(page), READ, -+ reiser4_ctx_gfp_mask_get()); -+} -+ -+/** -+ * reiser4_page_io - submit single-page bio request -+ * @page: page to perform io for -+ * @node: jnode of page -+ * @rw: read or write -+ * @gfp: gfp mask for bio allocation -+ * -+ * Submits single page read or write. -+ */ -+int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp) -+{ -+ struct bio *bio; -+ int result; -+ -+ assert("nikita-2094", page != NULL); -+ assert("nikita-2226", PageLocked(page)); -+ assert("nikita-2634", node != NULL); -+ assert("nikita-2893", rw == READ || rw == WRITE); -+ -+ if (rw) { -+ if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) { -+ unlock_page(page); -+ return 0; -+ } -+ } -+ -+ bio = page_bio(page, node, rw, gfp); -+ if (!IS_ERR(bio)) { -+ if (rw == WRITE) { -+ SetPageWriteback(page); -+ unlock_page(page); -+ } -+ reiser4_submit_bio(rw, bio); -+ result = 0; -+ } else { -+ unlock_page(page); -+ result = PTR_ERR(bio); -+ } -+ -+ return result; -+} -+ -+/* helper function to construct bio for page */ -+static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp) -+{ -+ struct bio *bio; -+ assert("nikita-2092", page != NULL); -+ assert("nikita-2633", node != NULL); -+ -+ /* Simple implementation in the assumption that 
blocksize == pagesize. -+ -+ We only have to submit one block, but submit_bh() will allocate bio -+ anyway, so lets use all the bells-and-whistles of bio code. -+ */ -+ -+ bio = bio_alloc(gfp, 1); -+ if (bio != NULL) { -+ int blksz; -+ struct super_block *super; -+ reiser4_block_nr blocknr; -+ -+ super = page->mapping->host->i_sb; -+ assert("nikita-2029", super != NULL); -+ blksz = super->s_blocksize; -+ assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE); -+ -+ spin_lock_jnode(node); -+ blocknr = *jnode_get_io_block(node); -+ spin_unlock_jnode(node); -+ -+ assert("nikita-2275", blocknr != (reiser4_block_nr) 0); -+ assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr)); -+ -+ bio->bi_bdev = super->s_bdev; -+ /* fill bio->bi_sector before calling bio_add_page(), because -+ * q->merge_bvec_fn may want to inspect it (see -+ * drivers/md/linear.c:linear_mergeable_bvec() for example. */ -+ bio->bi_sector = blocknr * (blksz >> 9); -+ -+ if (!bio_add_page(bio, page, blksz, 0)) { -+ warning("nikita-3452", -+ "Single page bio cannot be constructed"); -+ return ERR_PTR(RETERR(-EINVAL)); -+ } -+ -+ /* bio -> bi_idx is filled by bio_init() */ -+ bio->bi_end_io = (rw == READ) ? -+ end_bio_single_page_read : end_bio_single_page_write; -+ -+ return bio; -+ } else -+ return ERR_PTR(RETERR(-ENOMEM)); -+} -+ -+/* this function is internally called by jnode_make_dirty() */ -+int reiser4_set_page_dirty_internal(struct page *page) -+{ -+ struct address_space *mapping; -+ -+ mapping = page->mapping; -+ BUG_ON(mapping == NULL); -+ -+ if (!TestSetPageDirty(page)) { -+ if (mapping_cap_account_dirty(mapping)) -+ inc_zone_page_state(page, NR_FILE_DIRTY); -+ -+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); -+ } -+ -+ /* znode must be dirty ? 
*/ -+ if (mapping->host == reiser4_get_super_fake(mapping->host->i_sb)) -+ assert("", JF_ISSET(jprivate(page), JNODE_DIRTY)); -+ return 0; -+} -+ -+#if REISER4_DEBUG -+ -+/** -+ * can_hit_entd -+ * -+ * This is used on -+ */ -+static int can_hit_entd(reiser4_context *ctx, struct super_block *s) -+{ -+ if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic) -+ return 1; -+ if (ctx->super != s) -+ return 1; -+ if (get_super_private(s)->entd.tsk == current) -+ return 0; -+ if (!lock_stack_isclean(&ctx->stack)) -+ return 0; -+ if (ctx->trans->atom != NULL) -+ return 0; -+ return 1; -+} -+ -+#endif -+ -+/** -+ * reiser4_writepage - writepage of struct address_space_operations -+ * @page: page to write -+ * @wbc: -+ * -+ * -+ */ -+/* Common memory pressure notification. */ -+int reiser4_writepage(struct page *page, -+ struct writeback_control *wbc) -+{ -+ struct super_block *s; -+ reiser4_context *ctx; -+ -+ assert("vs-828", PageLocked(page)); -+ -+ s = page->mapping->host->i_sb; -+ ctx = get_current_context_check(); -+ -+ assert("", can_hit_entd(ctx, s)); -+ -+ return write_page_by_ent(page, wbc); -+} -+ -+/* ->set_page_dirty() method of formatted address_space */ -+static int formatted_set_page_dirty(struct page *page) -+{ -+ assert("nikita-2173", page != NULL); -+ BUG(); -+ return __set_page_dirty_nobuffers(page); -+} -+ -+/* writepages method of address space operations in reiser4 is used to involve -+ into transactions pages which are dirtied via mmap. Only regular files can -+ have such pages. Fake inode is used to access formatted nodes via page -+ cache. As formatted nodes can never be mmaped, fake inode's writepages has -+ nothing to do */ -+static int -+writepages_fake(struct address_space *mapping, struct writeback_control *wbc) -+{ -+ return 0; -+} -+ -+/* address space operations for the fake inode */ -+static struct address_space_operations formatted_fake_as_ops = { -+ /* Perform a writeback of a single page as a memory-freeing -+ * operation. 
*/ -+ .writepage = reiser4_writepage, -+ /* this is called to read formatted node */ -+ .readpage = formatted_readpage, -+ /* ->sync_page() method of fake inode address space operations. Called -+ from wait_on_page() and lock_page(). -+ -+ This is most annoyingly misnomered method. Actually it is called -+ from wait_on_page_bit() and lock_page() and its purpose is to -+ actually start io by jabbing device drivers. -+ */ -+ .sync_page = block_sync_page, -+ /* Write back some dirty pages from this mapping. Called from sync. -+ called during sync (pdflush) */ -+ .writepages = writepages_fake, -+ /* Set a page dirty */ -+ .set_page_dirty = formatted_set_page_dirty, -+ /* used for read-ahead. Not applicable */ -+ .readpages = NULL, -+ .prepare_write = NULL, -+ .commit_write = NULL, -+ .bmap = NULL, -+ /* called just before page is being detached from inode mapping and -+ removed from memory. Called on truncate, cut/squeeze, and -+ umount. */ -+ .invalidatepage = reiser4_invalidatepage, -+ /* this is called by shrink_cache() so that file system can try to -+ release objects (jnodes, buffers, journal heads) attached to page -+ and, may be made page itself free-able. -+ */ -+ .releasepage = reiser4_releasepage, -+ .direct_IO = NULL -+}; -+ -+/* called just before page is released (no longer used by reiser4). Callers: -+ jdelete() and extent2tail(). 
*/ -+void reiser4_drop_page(struct page *page) -+{ -+ assert("nikita-2181", PageLocked(page)); -+ clear_page_dirty_for_io(page); -+ ClearPageUptodate(page); -+#if defined(PG_skipped) -+ ClearPageSkipped(page); -+#endif -+ unlock_page(page); -+} -+ -+#define JNODE_GANG_SIZE (16) -+ -+/* find all jnodes from range specified and invalidate them */ -+static int -+truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count) -+{ -+ reiser4_inode *info; -+ int truncated_jnodes; -+ reiser4_tree *tree; -+ unsigned long index; -+ unsigned long end; -+ -+ if (inode_file_plugin(inode) == -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) -+ /* No need to get rid of jnodes here: if the single jnode of -+ page cluster did not have page, then it was found and killed -+ before in -+ truncate_page_cluster_cryptcompress()->jput()->jput_final(), -+ otherwise it will be dropped by reiser4_invalidatepage() */ -+ return 0; -+ truncated_jnodes = 0; -+ -+ info = reiser4_inode_data(inode); -+ tree = reiser4_tree_by_inode(inode); -+ -+ index = from; -+ end = from + count; -+ -+ while (1) { -+ jnode *gang[JNODE_GANG_SIZE]; -+ int taken; -+ int i; -+ jnode *node; -+ -+ assert("nikita-3466", index <= end); -+ -+ read_lock_tree(tree); -+ taken = -+ radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info), -+ (void **)gang, index, -+ JNODE_GANG_SIZE); -+ for (i = 0; i < taken; ++i) { -+ node = gang[i]; -+ if (index_jnode(node) < end) -+ jref(node); -+ else -+ gang[i] = NULL; -+ } -+ read_unlock_tree(tree); -+ -+ for (i = 0; i < taken; ++i) { -+ node = gang[i]; -+ if (node != NULL) { -+ index = max(index, index_jnode(node)); -+ spin_lock_jnode(node); -+ assert("edward-1457", node->pg == NULL); -+ /* this is always called after -+ truncate_inode_pages_range(). Therefore, here -+ jnode can not have page. 
New pages can not be -+ created because truncate_jnodes_range goes -+ under exclusive access on file obtained, -+ where as new page creation requires -+ non-exclusive access obtained */ -+ JF_SET(node, JNODE_HEARD_BANSHEE); -+ reiser4_uncapture_jnode(node); -+ unhash_unformatted_jnode(node); -+ truncated_jnodes++; -+ jput(node); -+ } else -+ break; -+ } -+ if (i != taken || taken == 0) -+ break; -+ } -+ return truncated_jnodes; -+} -+ -+/* Truncating files in reiser4: problems and solutions. -+ -+ VFS calls fs's truncate after it has called truncate_inode_pages() -+ to get rid of pages corresponding to part of file being truncated. -+ In reiser4 it may cause existence of unallocated extents which do -+ not have jnodes. Flush code does not expect that. Solution of this -+ problem is straightforward. As vfs's truncate is implemented using -+ setattr operation, it seems reasonable to have ->setattr() that -+ will cut file body. However, flush code also does not expect dirty -+ pages without parent items, so it is impossible to cut all items, -+ then truncate all pages in two steps. We resolve this problem by -+ cutting items one-by-one. Each such fine-grained step performed -+ under longterm znode lock calls at the end ->kill_hook() method of -+ a killed item to remove its binded pages and jnodes. -+ -+ The following function is a common part of mentioned kill hooks. -+ Also, this is called before tail-to-extent conversion (to not manage -+ few copies of the data). 
-+*/ -+void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from, -+ unsigned long count, int even_cows) -+{ -+ loff_t from_bytes, count_bytes; -+ -+ if (count == 0) -+ return; -+ from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT; -+ count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT; -+ -+ unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows); -+ truncate_inode_pages_range(mapping, from_bytes, -+ from_bytes + count_bytes - 1); -+ truncate_jnodes_range(mapping->host, from, count); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/page_cache.h linux-2.6.20/fs/reiser4/page_cache.h ---- linux-2.6.20.orig/fs/reiser4/page_cache.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/page_cache.h 2007-05-06 14:50:43.746990723 +0400 -@@ -0,0 +1,68 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+/* Memory pressure hooks. Fake inodes handling. See page_cache.c. 
*/ -+ -+#if !defined( __REISER4_PAGE_CACHE_H__ ) -+#define __REISER4_PAGE_CACHE_H__ -+ -+#include "forward.h" -+#include "context.h" /* for reiser4_ctx_gfp_mask_get() */ -+ -+#include /* for struct super_block, address_space */ -+#include /* for struct page */ -+#include /* for lock_page() */ -+#include /* for __vmalloc() */ -+ -+extern int reiser4_init_formatted_fake(struct super_block *); -+extern void reiser4_done_formatted_fake(struct super_block *); -+ -+extern reiser4_tree *reiser4_tree_by_page(const struct page *); -+ -+extern int reiser4_set_page_dirty_internal(struct page *); -+ -+#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio)) -+ -+extern void reiser4_wait_page_writeback(struct page *); -+static inline void lock_and_wait_page_writeback(struct page *page) -+{ -+ lock_page(page); -+ if (unlikely(PageWriteback(page))) -+ reiser4_wait_page_writeback(page); -+} -+ -+#define jprivate(page) ((jnode *)page_private(page)) -+ -+extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t); -+extern void reiser4_drop_page(struct page *); -+extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from, -+ unsigned long count, int even_cows); -+extern void capture_reiser4_inodes(struct super_block *, -+ struct writeback_control *); -+static inline void * reiser4_vmalloc (unsigned long size) -+{ -+ return __vmalloc(size, -+ reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM, -+ PAGE_KERNEL); -+} -+ -+#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY -+ -+#if REISER4_DEBUG -+extern void print_page(const char *prefix, struct page *page); -+#else -+#define print_page(prf, p) noop -+#endif -+ -+/* __REISER4_PAGE_CACHE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/cluster.c linux-2.6.20/fs/reiser4/plugin/cluster.c ---- linux-2.6.20.orig/fs/reiser4/plugin/cluster.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/cluster.c 2007-05-06 14:50:43.746990723 +0400 -@@ -0,0 +1,71 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Contains reiser4 cluster plugins (see -+ http://www.namesys.com/cryptcompress_design.html -+ "Concepts of clustering" for details). */ -+ -+#include "plugin_header.h" -+#include "plugin.h" -+#include "../inode.h" -+ -+static int change_cluster(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ assert("edward-1324", inode != NULL); -+ assert("edward-1325", plugin != NULL); -+ assert("edward-1326", is_reiser4_inode(inode)); -+ assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE); -+ -+ /* Can't change the cluster plugin for already existent regular files. */ -+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE)) -+ return RETERR(-EINVAL); -+ -+ /* If matches, nothing to change. 
*/ -+ if (inode_hash_plugin(inode) != NULL && -+ inode_hash_plugin(inode)->h.id == plugin->h.id) -+ return 0; -+ -+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, -+ PSET_CLUSTER, plugin); -+} -+ -+static reiser4_plugin_ops cluster_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = &change_cluster -+}; -+ -+#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \ -+ [CLUSTER_ ## ID ## _ID] = { \ -+ .h = { \ -+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \ -+ .id = CLUSTER_ ## ID ## _ID, \ -+ .pops = &cluster_plugin_ops, \ -+ .label = LABEL, \ -+ .desc = DESC, \ -+ .linkage = {NULL, NULL} \ -+ }, \ -+ .shift = SHIFT \ -+ } -+ -+cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = { -+ SUPPORT_CLUSTER(16, 64K, "64K", "Large"), -+ SUPPORT_CLUSTER(15, 32K, "32K", "Big"), -+ SUPPORT_CLUSTER(14, 16K, "16K", "Average"), -+ SUPPORT_CLUSTER(13, 8K, "8K", "Small"), -+ SUPPORT_CLUSTER(12, 4K, "4K", "Minimal") -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/cluster.h linux-2.6.20/fs/reiser4/plugin/cluster.h ---- linux-2.6.20.orig/fs/reiser4/plugin/cluster.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/cluster.h 2007-05-06 14:50:43.746990723 +0400 -@@ -0,0 +1,343 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* This file contains page/cluster index translators and offset modulators -+ See http://www.namesys.com/cryptcompress_design.html for details */ -+ -+#if !defined( __FS_REISER4_CLUSTER_H__ ) -+#define __FS_REISER4_CLUSTER_H__ -+ -+#include "../inode.h" -+ -+static inline int inode_cluster_shift(struct inode *inode) -+{ -+ assert("edward-92", inode != NULL); -+ assert("edward-93", reiser4_inode_data(inode) != NULL); -+ -+ return inode_cluster_plugin(inode)->shift; -+} -+ -+static inline 
unsigned cluster_nrpages_shift(struct inode *inode) -+{ -+ return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT; -+} -+ -+/* cluster size in page units */ -+static inline unsigned cluster_nrpages(struct inode *inode) -+{ -+ return 1U << cluster_nrpages_shift(inode); -+} -+ -+static inline size_t inode_cluster_size(struct inode *inode) -+{ -+ assert("edward-96", inode != NULL); -+ -+ return 1U << inode_cluster_shift(inode); -+} -+ -+static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode) -+{ -+ return idx >> cluster_nrpages_shift(inode); -+} -+ -+static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode) -+{ -+ return idx << cluster_nrpages_shift(inode); -+} -+ -+static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode) -+{ -+ return clust_to_pg(pg_to_clust(idx, inode), inode); -+} -+ -+static inline pgoff_t off_to_pg(loff_t off) -+{ -+ return (off >> PAGE_CACHE_SHIFT); -+} -+ -+static inline loff_t pg_to_off(pgoff_t idx) -+{ -+ return ((loff_t) (idx) << PAGE_CACHE_SHIFT); -+} -+ -+static inline cloff_t off_to_clust(loff_t off, struct inode *inode) -+{ -+ return off >> inode_cluster_shift(inode); -+} -+ -+static inline loff_t clust_to_off(cloff_t idx, struct inode *inode) -+{ -+ return (loff_t) idx << inode_cluster_shift(inode); -+} -+ -+static inline unsigned long count_to_nr(loff_t count, unsigned shift) -+{ -+ return (count + (1UL << shift) - 1) >> shift; -+} -+ -+/* number of pages occupied by @count bytes */ -+static inline pgoff_t count_to_nrpages(loff_t count) -+{ -+ return count_to_nr(count, PAGE_CACHE_SHIFT); -+} -+ -+/* number of clusters occupied by @count bytes */ -+static inline cloff_t count_to_nrclust(loff_t count, struct inode *inode) -+{ -+ return count_to_nr(count, inode_cluster_shift(inode)); -+} -+ -+/* number of clusters occupied by @count pages */ -+static inline cloff_t pgcount_to_nrclust(pgoff_t count, struct inode *inode) -+{ -+ return count_to_nr(count, cluster_nrpages_shift(inode)); -+} -+ -+static 
inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode) -+{ -+ return clust_to_off(off_to_clust(off, inode), inode); -+} -+ -+static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode) -+{ -+ return clust_to_pg(off_to_clust(off, inode), inode); -+} -+ -+static inline unsigned off_to_pgoff(loff_t off) -+{ -+ return off & (PAGE_CACHE_SIZE - 1); -+} -+ -+static inline unsigned off_to_cloff(loff_t off, struct inode *inode) -+{ -+ return off & ((loff_t) (inode_cluster_size(inode)) - 1); -+} -+ -+static inline unsigned -+pg_to_off_to_cloff(unsigned long idx, struct inode *inode) -+{ -+ return off_to_cloff(pg_to_off(idx), inode); -+} -+ -+/* if @size != 0, returns index of the page -+ which contains the last byte of the file */ -+static inline pgoff_t size_to_pg(loff_t size) -+{ -+ return (size ? off_to_pg(size - 1) : 0); -+} -+ -+/* minimal index of the page which doesn't contain -+ file data */ -+static inline pgoff_t size_to_next_pg(loff_t size) -+{ -+ return (size ? 
off_to_pg(size - 1) + 1 : 0); -+} -+ -+/* how many bytes of file of size @cnt can be contained -+ in page of index @idx */ -+static inline unsigned cnt_to_pgcnt(loff_t cnt, pgoff_t idx) -+{ -+ if (idx > off_to_pg(cnt)) -+ return 0; -+ if (idx < off_to_pg(cnt)) -+ return PAGE_CACHE_SIZE; -+ return off_to_pgoff(cnt); -+} -+ -+/* how many bytes of file of size @cnt can be contained -+ in logical cluster of index @idx */ -+static inline unsigned cnt_to_clcnt(loff_t cnt, cloff_t idx, -+ struct inode *inode) -+{ -+ if (idx > off_to_clust(cnt, inode)) -+ return 0; -+ if (idx < off_to_clust(cnt, inode)) -+ return inode_cluster_size(inode); -+ return off_to_cloff(cnt, inode); -+} -+ -+static inline unsigned -+fsize_to_count(reiser4_cluster_t * clust, struct inode *inode) -+{ -+ assert("edward-288", clust != NULL); -+ assert("edward-289", inode != NULL); -+ -+ return cnt_to_clcnt(inode->i_size, clust->index, inode); -+} -+ -+static inline int -+cluster_is_complete(reiser4_cluster_t * clust, struct inode * inode) -+{ -+ return clust->tc.lsize == inode_cluster_size(inode); -+} -+ -+static inline void reiser4_slide_init(reiser4_slide_t * win) -+{ -+ assert("edward-1084", win != NULL); -+ memset(win, 0, sizeof *win); -+} -+ -+static inline tfm_action -+cluster_get_tfm_act(tfm_cluster_t * tc) -+{ -+ assert("edward-1356", tc != NULL); -+ return tc->act; -+} -+ -+static inline void -+cluster_set_tfm_act(tfm_cluster_t * tc, tfm_action act) -+{ -+ assert("edward-1356", tc != NULL); -+ tc->act = act; -+} -+ -+static inline void -+cluster_init_act (reiser4_cluster_t * clust, tfm_action act, reiser4_slide_t * window){ -+ assert("edward-84", clust != NULL); -+ memset(clust, 0, sizeof *clust); -+ cluster_set_tfm_act(&clust->tc, act); -+ clust->dstat = INVAL_DISK_CLUSTER; -+ clust->win = window; -+} -+ -+static inline void -+cluster_init_read(reiser4_cluster_t * clust, reiser4_slide_t * window) -+{ -+ cluster_init_act (clust, TFMA_READ, window); -+} -+ -+static inline void 
-+cluster_init_write(reiser4_cluster_t * clust, reiser4_slide_t * window) -+{ -+ cluster_init_act (clust, TFMA_WRITE, window); -+} -+ -+static inline int dclust_get_extension_dsize(hint_t * hint) -+{ -+ return hint->ext_coord.extension.ctail.dsize; -+} -+ -+static inline void dclust_set_extension_dsize(hint_t * hint, int dsize) -+{ -+ hint->ext_coord.extension.ctail.dsize = dsize; -+} -+ -+static inline int dclust_get_extension_shift(hint_t * hint) -+{ -+ return hint->ext_coord.extension.ctail.shift; -+} -+ -+static inline int dclust_get_extension_ncount(hint_t * hint) -+{ -+ return hint->ext_coord.extension.ctail.ncount; -+} -+ -+static inline void dclust_inc_extension_ncount(hint_t * hint) -+{ -+ hint->ext_coord.extension.ctail.ncount ++; -+} -+ -+static inline void dclust_init_extension(hint_t * hint) -+{ -+ memset(&hint->ext_coord.extension.ctail, 0, -+ sizeof(hint->ext_coord.extension.ctail)); -+} -+ -+static inline int hint_is_unprepped_dclust(hint_t * hint) -+{ -+ assert("edward-1451", hint_is_valid(hint)); -+ return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT; -+} -+ -+static inline void coord_set_between_clusters(coord_t * coord) -+{ -+#if REISER4_DEBUG -+ int result; -+ result = zload(coord->node); -+ assert("edward-1296", !result); -+#endif -+ if (!coord_is_between_items(coord)) { -+ coord->between = AFTER_ITEM; -+ coord->unit_pos = 0; -+ } -+#if REISER4_DEBUG -+ zrelse(coord->node); -+#endif -+} -+ -+int reiser4_inflate_cluster(reiser4_cluster_t *, struct inode *); -+int find_disk_cluster(reiser4_cluster_t *, struct inode *, int read, -+ znode_lock_mode mode); -+int flush_cluster_pages(reiser4_cluster_t *, jnode *, struct inode *); -+int reiser4_deflate_cluster(reiser4_cluster_t *, struct inode *); -+void truncate_page_cluster_cryptcompress(struct inode *inode, cloff_t start, -+ int even_cows); -+void invalidate_hint_cluster(reiser4_cluster_t * clust); -+void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode, -+ znode_lock_mode 
mode); -+int get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode, -+ znode_lock_mode lock_mode); -+void reset_cluster_params(reiser4_cluster_t * clust); -+int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page, -+ int count); -+int prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust, -+ int capture); -+void reiser4_release_cluster_pages(reiser4_cluster_t *); -+void put_cluster_handle(reiser4_cluster_t * clust); -+int grab_tfm_stream(struct inode *inode, tfm_cluster_t * tc, tfm_stream_id id); -+int tfm_cluster_is_uptodate(tfm_cluster_t * tc); -+void tfm_cluster_set_uptodate(tfm_cluster_t * tc); -+void tfm_cluster_clr_uptodate(tfm_cluster_t * tc); -+ -+/* move cluster handle to the target position -+ specified by the page of index @pgidx -+*/ -+static inline void move_cluster_forward(reiser4_cluster_t * clust, -+ struct inode *inode, -+ pgoff_t pgidx) -+{ -+ assert("edward-1297", clust != NULL); -+ assert("edward-1298", inode != NULL); -+ -+ reset_cluster_params(clust); -+ if (clust->index_valid && -+ /* Hole in the indices. Hint became invalid and can not be -+ used by find_cluster_item() even if seal/node versions -+ will coincide */ -+ pg_to_clust(pgidx, inode) != clust->index + 1) { -+ reiser4_unset_hint(clust->hint); -+ invalidate_hint_cluster(clust); -+ } -+ clust->index = pg_to_clust(pgidx, inode); -+ clust->index_valid = 1; -+} -+ -+static inline int -+alloc_clust_pages(reiser4_cluster_t * clust, struct inode *inode) -+{ -+ assert("edward-791", clust != NULL); -+ assert("edward-792", inode != NULL); -+ clust->pages = -+ kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode), -+ reiser4_ctx_gfp_mask_get()); -+ if (!clust->pages) -+ return -ENOMEM; -+ return 0; -+} -+ -+static inline void free_clust_pages(reiser4_cluster_t * clust) -+{ -+ kfree(clust->pages); -+} -+ -+#endif /* __FS_REISER4_CLUSTER_H__ */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.20/fs/reiser4/plugin/compress/compress.c ---- linux-2.6.20.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/compress/compress.c 2007-05-06 14:50:43.746990723 +0400 -@@ -0,0 +1,381 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* reiser4 compression transform plugins */ -+ -+#include "../../debug.h" -+#include "../../inode.h" -+#include "../plugin.h" -+#include "minilzo.h" -+ -+#include -+#include -+#include -+ -+static int change_compression(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ assert("edward-1316", inode != NULL); -+ assert("edward-1317", plugin != NULL); -+ assert("edward-1318", is_reiser4_inode(inode)); -+ assert("edward-1319", -+ plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE); -+ -+ /* cannot change compression plugin of already existing regular object */ -+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE)) -+ return RETERR(-EINVAL); -+ -+ /* If matches, nothing to change. 
*/ -+ if (inode_hash_plugin(inode) != NULL && -+ inode_hash_plugin(inode)->h.id == plugin->h.id) -+ return 0; -+ -+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, -+ PSET_COMPRESSION, plugin); -+} -+ -+static reiser4_plugin_ops compression_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = &change_compression -+}; -+ -+/******************************************************************************/ -+/* gzip1 compression */ -+/******************************************************************************/ -+ -+#define GZIP1_DEF_LEVEL Z_BEST_SPEED -+#define GZIP1_DEF_WINBITS 15 -+#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL -+ -+static int gzip1_init(void) -+{ -+ int ret = -EINVAL; -+#if REISER4_ZLIB -+ ret = 0; -+#endif -+ if (ret == -EINVAL) -+ warning("edward-1337", "Zlib not compiled into kernel"); -+ return ret; -+} -+ -+static int gzip1_overrun(unsigned src_len UNUSED_ARG) -+{ -+ return 0; -+} -+ -+static coa_t gzip1_alloc(tfm_action act) -+{ -+ coa_t coa = NULL; -+#if REISER4_ZLIB -+ int ret = 0; -+ switch (act) { -+ case TFMA_WRITE: /* compress */ -+ coa = reiser4_vmalloc(zlib_deflate_workspacesize()); -+ if (!coa) { -+ ret = -ENOMEM; -+ break; -+ } -+ memset(coa, 0, zlib_deflate_workspacesize()); -+ break; -+ case TFMA_READ: /* decompress */ -+ coa = reiser4_vmalloc(zlib_inflate_workspacesize()); -+ if (!coa) { -+ ret = -ENOMEM; -+ break; -+ } -+ memset(coa, 0, zlib_inflate_workspacesize()); -+ break; -+ default: -+ impossible("edward-767", -+ "trying to alloc workspace for unknown tfm action"); -+ } -+ if (ret) { -+ warning("edward-768", -+ "alloc workspace for gzip1 (tfm action = %d) failed\n", -+ act); -+ return ERR_PTR(ret); -+ } -+#endif -+ return coa; -+} -+ -+static void gzip1_free(coa_t coa, tfm_action act) -+{ -+ assert("edward-769", coa != NULL); -+ -+ switch (act) { -+ case TFMA_WRITE: /* compress */ -+ vfree(coa); -+ break; -+ case TFMA_READ: /* decompress */ -+ vfree(coa); -+ break; -+ 
default: -+ impossible("edward-770", "unknown tfm action"); -+ } -+ return; -+} -+ -+static int gzip1_min_size_deflate(void) -+{ -+ return 64; -+} -+ -+static void -+gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len) -+{ -+#if REISER4_ZLIB -+ int ret = 0; -+ struct z_stream_s stream; -+ -+ memset(&stream, 0, sizeof(stream)); -+ -+ assert("edward-842", coa != NULL); -+ assert("edward-875", src_len != 0); -+ -+ stream.workspace = coa; -+ ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED, -+ -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL, -+ Z_DEFAULT_STRATEGY); -+ if (ret != Z_OK) { -+ warning("edward-771", "zlib_deflateInit2 returned %d\n", ret); -+ goto rollback; -+ } -+ ret = zlib_deflateReset(&stream); -+ if (ret != Z_OK) { -+ warning("edward-772", "zlib_deflateReset returned %d\n", ret); -+ goto rollback; -+ } -+ stream.next_in = src_first; -+ stream.avail_in = src_len; -+ stream.next_out = dst_first; -+ stream.avail_out = *dst_len; -+ -+ ret = zlib_deflate(&stream, Z_FINISH); -+ if (ret != Z_STREAM_END) { -+ if (ret != Z_OK) -+ warning("edward-773", -+ "zlib_deflate returned %d\n", ret); -+ goto rollback; -+ } -+ *dst_len = stream.total_out; -+ return; -+ rollback: -+ *dst_len = src_len; -+#endif -+ return; -+} -+ -+static void -+gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len) -+{ -+#if REISER4_ZLIB -+ int ret = 0; -+ struct z_stream_s stream; -+ -+ memset(&stream, 0, sizeof(stream)); -+ -+ assert("edward-843", coa != NULL); -+ assert("edward-876", src_len != 0); -+ -+ stream.workspace = coa; -+ ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS); -+ if (ret != Z_OK) { -+ warning("edward-774", "zlib_inflateInit2 returned %d\n", ret); -+ return; -+ } -+ ret = zlib_inflateReset(&stream); -+ if (ret != Z_OK) { -+ warning("edward-775", "zlib_inflateReset returned %d\n", ret); -+ return; -+ } -+ -+ stream.next_in = src_first; -+ stream.avail_in = src_len; 
-+ stream.next_out = dst_first; -+ stream.avail_out = *dst_len; -+ -+ ret = zlib_inflate(&stream, Z_SYNC_FLUSH); -+ /* -+ * Work around a bug in zlib, which sometimes wants to taste an extra -+ * byte when being used in the (undocumented) raw deflate mode. -+ * (From USAGI). -+ */ -+ if (ret == Z_OK && !stream.avail_in && stream.avail_out) { -+ u8 zerostuff = 0; -+ stream.next_in = &zerostuff; -+ stream.avail_in = 1; -+ ret = zlib_inflate(&stream, Z_FINISH); -+ } -+ if (ret != Z_STREAM_END) { -+ warning("edward-776", "zlib_inflate returned %d\n", ret); -+ return; -+ } -+ *dst_len = stream.total_out; -+#endif -+ return; -+} -+ -+/******************************************************************************/ -+/* lzo1 compression */ -+/******************************************************************************/ -+ -+static int lzo1_init(void) -+{ -+ int ret; -+ ret = lzo_init(); -+ if (ret != LZO_E_OK) -+ warning("edward-848", "lzo_init() failed with ret = %d\n", ret); -+ return ret; -+} -+ -+static int lzo1_overrun(unsigned in_len) -+{ -+ return in_len / 64 + 16 + 3; -+} -+ -+#define LZO_HEAP_SIZE(size) \ -+ sizeof(lzo_align_t) * (((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t)) -+ -+static coa_t lzo1_alloc(tfm_action act) -+{ -+ int ret = 0; -+ coa_t coa = NULL; -+ -+ switch (act) { -+ case TFMA_WRITE: /* compress */ -+ coa = reiser4_vmalloc(LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS)); -+ if (!coa) { -+ ret = -ENOMEM; -+ break; -+ } -+ memset(coa, 0, LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS)); -+ case TFMA_READ: /* decompress */ -+ break; -+ default: -+ impossible("edward-877", -+ "trying to alloc workspace for unknown tfm action"); -+ } -+ if (ret) { -+ warning("edward-878", -+ "alloc workspace for lzo1 (tfm action = %d) failed\n", -+ act); -+ return ERR_PTR(ret); -+ } -+ return coa; -+} -+ -+static void lzo1_free(coa_t coa, tfm_action act) -+{ -+ assert("edward-879", coa != NULL); -+ -+ switch (act) { -+ case TFMA_WRITE: /* compress */ -+ vfree(coa); -+ break; 
-+ case TFMA_READ: /* decompress */ -+ impossible("edward-1304", -+ "trying to free non-allocated workspace"); -+ default: -+ impossible("edward-880", "unknown tfm action"); -+ } -+ return; -+} -+ -+static int lzo1_min_size_deflate(void) -+{ -+ return 256; -+} -+ -+static void -+lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len) -+{ -+ int result; -+ -+ assert("edward-846", coa != NULL); -+ assert("edward-847", src_len != 0); -+ -+ result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa); -+ if (result != LZO_E_OK) { -+ warning("edward-849", "lzo1x_1_compress failed\n"); -+ goto out; -+ } -+ if (*dst_len >= src_len) { -+ //warning("edward-850", "lzo1x_1_compress: incompressible data\n"); -+ goto out; -+ } -+ return; -+ out: -+ *dst_len = src_len; -+ return; -+} -+ -+static void -+lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len) -+{ -+ int result; -+ -+ assert("edward-851", coa == NULL); -+ assert("edward-852", src_len != 0); -+ -+ result = lzo1x_decompress(src_first, src_len, dst_first, dst_len, NULL); -+ if (result != LZO_E_OK) -+ warning("edward-853", "lzo1x_1_decompress failed\n"); -+ return; -+} -+ -+compression_plugin compression_plugins[LAST_COMPRESSION_ID] = { -+ [LZO1_COMPRESSION_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .id = LZO1_COMPRESSION_ID, -+ .pops = &compression_plugin_ops, -+ .label = "lzo1", -+ .desc = "lzo1 compression transform", -+ .linkage = {NULL, NULL} -+ }, -+ .init = lzo1_init, -+ .overrun = lzo1_overrun, -+ .alloc = lzo1_alloc, -+ .free = lzo1_free, -+ .min_size_deflate = lzo1_min_size_deflate, -+ .checksum = reiser4_adler32, -+ .compress = lzo1_compress, -+ .decompress = lzo1_decompress -+ }, -+ [GZIP1_COMPRESSION_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .id = GZIP1_COMPRESSION_ID, -+ .pops = &compression_plugin_ops, -+ .label = "gzip1", -+ .desc = "gzip1 compression 
transform", -+ .linkage = {NULL, NULL} -+ }, -+ .init = gzip1_init, -+ .overrun = gzip1_overrun, -+ .alloc = gzip1_alloc, -+ .free = gzip1_free, -+ .min_size_deflate = gzip1_min_size_deflate, -+ .checksum = reiser4_adler32, -+ .compress = gzip1_compress, -+ .decompress = gzip1_decompress -+ } -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.20/fs/reiser4/plugin/compress/compress.h ---- linux-2.6.20.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/compress/compress.h 2007-05-06 14:50:43.746990723 +0400 -@@ -0,0 +1,38 @@ -+#if !defined( __FS_REISER4_COMPRESS_H__ ) -+#define __FS_REISER4_COMPRESS_H__ -+ -+#include -+#include -+ -+typedef enum { -+ TFMA_READ, -+ TFMA_WRITE, -+ TFMA_LAST -+} tfm_action; -+ -+/* builtin compression plugins */ -+ -+typedef enum { -+ LZO1_COMPRESSION_ID, -+ GZIP1_COMPRESSION_ID, -+ LAST_COMPRESSION_ID, -+} reiser4_compression_id; -+ -+typedef unsigned long cloff_t; -+typedef void *coa_t; -+typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST]; -+ -+__u32 reiser4_adler32(char *data, __u32 len); -+ -+#endif /* __FS_REISER4_COMPRESS_H__ */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.20/fs/reiser4/plugin/compress/compress_mode.c ---- linux-2.6.20.orig/fs/reiser4/plugin/compress/compress_mode.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/compress/compress_mode.c 2007-05-06 14:50:43.750991972 +0400 -@@ -0,0 +1,162 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* This file contains Reiser4 compression mode plugins. -+ -+ Compression mode plugin is a set of handlers called by compressor -+ at flush time and represent some heuristics including the ones -+ which are to avoid compression of incompressible data, see -+ http://www.namesys.com/cryptcompress_design.html for more details. -+*/ -+#include "../../inode.h" -+#include "../plugin.h" -+ -+static int should_deflate_none(struct inode * inode, cloff_t index) -+{ -+ return 0; -+} -+ -+static int should_deflate_common(struct inode * inode, cloff_t index) -+{ -+ return compression_is_on(cryptcompress_inode_data(inode)); -+} -+ -+static int discard_hook_ultim(struct inode *inode, cloff_t index) -+{ -+ turn_off_compression(cryptcompress_inode_data(inode)); -+ return 0; -+} -+ -+static int discard_hook_lattd(struct inode *inode, cloff_t index) -+{ -+ cryptcompress_info_t * info = cryptcompress_inode_data(inode); -+ -+ assert("edward-1462", -+ get_lattice_factor(info) >= MIN_LATTICE_FACTOR && -+ get_lattice_factor(info) <= MAX_LATTICE_FACTOR); -+ -+ turn_off_compression(info); -+ if (get_lattice_factor(info) < MAX_LATTICE_FACTOR) -+ set_lattice_factor(info, get_lattice_factor(info) << 1); -+ return 0; -+} -+ -+static int accept_hook_lattd(struct inode *inode, cloff_t index) -+{ -+ turn_on_compression(cryptcompress_inode_data(inode)); -+ set_lattice_factor(cryptcompress_inode_data(inode), 
MIN_LATTICE_FACTOR); -+ return 0; -+} -+ -+/* Check on dynamic lattice, the adaptive compression modes which -+ defines the following behavior: -+ -+ Compression is on: try to compress everything and turn -+ it off, whenever cluster is incompressible. -+ -+ Compression is off: try to compress clusters of indexes -+ k * FACTOR (k = 0, 1, 2, ...) and turn it on, if some of -+ them is compressible. If incompressible, then increase FACTOR */ -+ -+/* check if @index belongs to one-dimensional lattice -+ of sparce factor @factor */ -+static int is_on_lattice(cloff_t index, int factor) -+{ -+ return (factor ? index % factor == 0: index == 0); -+} -+ -+static int should_deflate_lattd(struct inode * inode, cloff_t index) -+{ -+ return should_deflate_common(inode, index) || -+ is_on_lattice(index, -+ get_lattice_factor -+ (cryptcompress_inode_data(inode))); -+} -+ -+/* compression mode_plugins */ -+compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = { -+ [NONE_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = NONE_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "none", -+ .desc = "Compress nothing", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = should_deflate_none, -+ .accept_hook = NULL, -+ .discard_hook = NULL -+ }, -+ /* Check-on-dynamic-lattice adaptive compression mode */ -+ [LATTD_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = LATTD_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "lattd", -+ .desc = "Check on dynamic lattice", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = should_deflate_lattd, -+ .accept_hook = accept_hook_lattd, -+ .discard_hook = discard_hook_lattd -+ }, -+ /* Check-ultimately compression mode: -+ Turn off compression forever as soon as we meet -+ incompressible data */ -+ [ULTIM_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = ULTIM_COMPRESSION_MODE_ID, -+ .pops = 
NULL, -+ .label = "ultim", -+ .desc = "Check ultimately", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = should_deflate_common, -+ .accept_hook = NULL, -+ .discard_hook = discard_hook_ultim -+ }, -+ /* Force-to-compress-everything compression mode */ -+ [FORCE_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = FORCE_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "force", -+ .desc = "Force to compress everything", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = NULL, -+ .accept_hook = NULL, -+ .discard_hook = NULL -+ }, -+ /* Convert-to-extent compression mode. -+ In this mode items will be converted to extents and management -+ will be passed to (classic) unix file plugin as soon as ->write() -+ detects that the first complete logical cluster (of index #0) is -+ incompressible. */ -+ [CONVX_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = CONVX_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "conv", -+ .desc = "Convert to extent", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = should_deflate_common, -+ .accept_hook = NULL, -+ .discard_hook = NULL -+ } -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/lzoconf.h linux-2.6.20/fs/reiser4/plugin/compress/lzoconf.h ---- linux-2.6.20.orig/fs/reiser4/plugin/compress/lzoconf.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/compress/lzoconf.h 2007-05-06 14:50:43.750991972 +0400 -@@ -0,0 +1,216 @@ -+/* lzoconf.h -- configuration for the LZO real-time data compression library -+ adopted for reiser4 compression transform plugin. -+ -+ This file is part of the LZO real-time data compression library -+ and not included in any proprietary licenses of reiser4. 
-+ -+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer -+ All Rights Reserved. -+ -+ The LZO library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU General Public License as -+ published by the Free Software Foundation; either version 2 of -+ the License, or (at your option) any later version. -+ -+ The LZO library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with the LZO library; see the file COPYING. -+ If not, write to the Free Software Foundation, Inc., -+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -+ -+ Markus F.X.J. 
Oberhumer -+ -+ http://www.oberhumer.com/opensource/lzo/ -+ */ -+ -+#include /* for UINT_MAX, ULONG_MAX - edward */ -+ -+#ifndef __LZOCONF_H -+#define __LZOCONF_H -+ -+#define LZO_VERSION 0x1080 -+#define LZO_VERSION_STRING "1.08" -+#define LZO_VERSION_DATE "Jul 12 2002" -+ -+/* internal Autoconf configuration file - only used when building LZO */ -+ -+/*********************************************************************** -+// LZO requires a conforming -+************************************************************************/ -+ -+#define CHAR_BIT 8 -+#define USHRT_MAX 0xffff -+ -+/* workaround a cpp bug under hpux 10.20 */ -+#define LZO_0xffffffffL 4294967295ul -+ -+/*********************************************************************** -+// architecture defines -+************************************************************************/ -+ -+#if !defined(__LZO_i386) -+# if defined(__i386__) || defined(__386__) || defined(_M_IX86) -+# define __LZO_i386 -+# endif -+#endif -+ -+/* memory checkers */ -+#if !defined(__LZO_CHECKER) -+# if defined(__BOUNDS_CHECKING_ON) -+# define __LZO_CHECKER -+# elif defined(__CHECKER__) -+# define __LZO_CHECKER -+# elif defined(__INSURE__) -+# define __LZO_CHECKER -+# elif defined(__PURIFY__) -+# define __LZO_CHECKER -+# endif -+#endif -+ -+/*********************************************************************** -+// integral and pointer types -+************************************************************************/ -+ -+/* Integral types with 32 bits or more */ -+#if !defined(LZO_UINT32_MAX) -+# if (UINT_MAX >= LZO_0xffffffffL) -+ typedef unsigned int lzo_uint32; -+ typedef int lzo_int32; -+# define LZO_UINT32_MAX UINT_MAX -+# define LZO_INT32_MAX INT_MAX -+# define LZO_INT32_MIN INT_MIN -+# elif (ULONG_MAX >= LZO_0xffffffffL) -+ typedef unsigned long lzo_uint32; -+ typedef long lzo_int32; -+# define LZO_UINT32_MAX ULONG_MAX -+# define LZO_INT32_MAX LONG_MAX -+# define LZO_INT32_MIN LONG_MIN -+# else -+# error "lzo_uint32" -+# 
endif -+#endif -+ -+/* lzo_uint is used like size_t */ -+#if !defined(LZO_UINT_MAX) -+# if (UINT_MAX >= LZO_0xffffffffL) -+ typedef unsigned int lzo_uint; -+ typedef int lzo_int; -+# define LZO_UINT_MAX UINT_MAX -+# define LZO_INT_MAX INT_MAX -+# define LZO_INT_MIN INT_MIN -+# elif (ULONG_MAX >= LZO_0xffffffffL) -+ typedef unsigned long lzo_uint; -+ typedef long lzo_int; -+# define LZO_UINT_MAX ULONG_MAX -+# define LZO_INT_MAX LONG_MAX -+# define LZO_INT_MIN LONG_MIN -+# else -+# error "lzo_uint" -+# endif -+#endif -+ -+ typedef int lzo_bool; -+ -+/*********************************************************************** -+// memory models -+************************************************************************/ -+ -+/* Memory model that allows to access memory at offsets of lzo_uint. */ -+#if !defined(__LZO_MMODEL) -+# if (LZO_UINT_MAX <= UINT_MAX) -+# define __LZO_MMODEL -+# else -+# error "__LZO_MMODEL" -+# endif -+#endif -+ -+/* no typedef here because of const-pointer issues */ -+#define lzo_byte unsigned char __LZO_MMODEL -+#define lzo_bytep unsigned char __LZO_MMODEL * -+#define lzo_charp char __LZO_MMODEL * -+#define lzo_voidp void __LZO_MMODEL * -+#define lzo_shortp short __LZO_MMODEL * -+#define lzo_ushortp unsigned short __LZO_MMODEL * -+#define lzo_uint32p lzo_uint32 __LZO_MMODEL * -+#define lzo_int32p lzo_int32 __LZO_MMODEL * -+#define lzo_uintp lzo_uint __LZO_MMODEL * -+#define lzo_intp lzo_int __LZO_MMODEL * -+#define lzo_voidpp lzo_voidp __LZO_MMODEL * -+#define lzo_bytepp lzo_bytep __LZO_MMODEL * -+ -+#ifndef lzo_sizeof_dict_t -+# define lzo_sizeof_dict_t sizeof(lzo_bytep) -+#endif -+ -+typedef int (*lzo_compress_t) (const lzo_byte * src, lzo_uint src_len, -+ lzo_byte * dst, lzo_uintp dst_len, -+ lzo_voidp wrkmem); -+ -+ -+/*********************************************************************** -+// error codes and prototypes -+************************************************************************/ -+ -+/* Error codes for the 
compression/decompression functions. Negative -+ * values are errors, positive values will be used for special but -+ * normal events. -+ */ -+#define LZO_E_OK 0 -+#define LZO_E_ERROR (-1) -+#define LZO_E_OUT_OF_MEMORY (-2) /* not used right now */ -+#define LZO_E_NOT_COMPRESSIBLE (-3) /* not used right now */ -+#define LZO_E_INPUT_OVERRUN (-4) -+#define LZO_E_OUTPUT_OVERRUN (-5) -+#define LZO_E_LOOKBEHIND_OVERRUN (-6) -+#define LZO_E_EOF_NOT_FOUND (-7) -+#define LZO_E_INPUT_NOT_CONSUMED (-8) -+ -+/* lzo_init() should be the first function you call. -+ * Check the return code ! -+ * -+ * lzo_init() is a macro to allow checking that the library and the -+ * compiler's view of various types are consistent. -+ */ -+#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\ -+ (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\ -+ (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\ -+ (int)sizeof(lzo_compress_t)) -+ extern int __lzo_init2(unsigned, int, int, int, int, int, int, -+ int, int, int); -+ -+/* checksum functions */ -+extern lzo_uint32 lzo_crc32(lzo_uint32 _c, const lzo_byte * _buf, -+ lzo_uint _len); -+/* misc. 
*/ -+ typedef union { -+ lzo_bytep p; -+ lzo_uint u; -+ } __lzo_pu_u; -+ typedef union { -+ lzo_bytep p; -+ lzo_uint32 u32; -+ } __lzo_pu32_u; -+ typedef union { -+ void *vp; -+ lzo_bytep bp; -+ lzo_uint32 u32; -+ long l; -+ } lzo_align_t; -+ -+#define LZO_PTR_ALIGN_UP(_ptr,_size) \ -+ ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size))) -+ -+/* deprecated - only for backward compatibility */ -+#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size) -+ -+#endif /* already included */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.20/fs/reiser4/plugin/compress/Makefile ---- linux-2.6.20.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/compress/Makefile 2007-05-06 14:50:43.750991972 +0400 -@@ -0,0 +1,6 @@ -+obj-$(CONFIG_REISER4_FS) += compress_plugins.o -+ -+compress_plugins-objs := \ -+ compress.o \ -+ minilzo.o \ -+ compress_mode.o -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/minilzo.c linux-2.6.20/fs/reiser4/plugin/compress/minilzo.c ---- linux-2.6.20.orig/fs/reiser4/plugin/compress/minilzo.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/compress/minilzo.c 2007-05-06 14:50:43.754993222 +0400 -@@ -0,0 +1,1967 @@ -+/* minilzo.c -- mini subset of the LZO real-time data compression library -+ adopted for reiser4 compression transform plugin. -+ -+ This file is part of the LZO real-time data compression library -+ and not included in any proprietary licenses of reiser4. -+ -+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer -+ All Rights Reserved. 
-+ -+ The LZO library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU General Public License as -+ published by the Free Software Foundation; either version 2 of -+ the License, or (at your option) any later version. -+ -+ The LZO library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with the LZO library; see the file COPYING. -+ If not, write to the Free Software Foundation, Inc., -+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -+ -+ Markus F.X.J. Oberhumer -+ -+ http://www.oberhumer.com/opensource/lzo/ -+ */ -+ -+/* -+ * NOTE: -+ * the full LZO package can be found at -+ * http://www.oberhumer.com/opensource/lzo/ -+ */ -+ -+#include "../../debug.h" /* for reiser4 assert macro -edward */ -+ -+#define __LZO_IN_MINILZO -+#define LZO_BUILD -+ -+#include "minilzo.h" -+ -+#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080) -+# error "version mismatch in miniLZO source files" -+#endif -+ -+#ifndef __LZO_CONF_H -+#define __LZO_CONF_H -+ -+# define BOUNDS_CHECKING_OFF_DURING(stmt) stmt -+# define BOUNDS_CHECKING_OFF_IN_EXPR(expr) (expr) -+ -+# define HAVE_MEMCMP -+# define HAVE_MEMCPY -+# define HAVE_MEMMOVE -+# define HAVE_MEMSET -+ -+#undef NDEBUG -+#if !defined(LZO_DEBUG) -+# define NDEBUG -+#endif -+#if defined(LZO_DEBUG) || !defined(NDEBUG) -+# if !defined(NO_STDIO_H) -+# include -+# endif -+#endif -+ -+#if !defined(LZO_COMPILE_TIME_ASSERT) -+# define LZO_COMPILE_TIME_ASSERT(expr) \ -+ { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; } -+#endif -+ -+#if !defined(LZO_UNUSED) -+# if 1 -+# define LZO_UNUSED(var) ((void)&var) -+# elif 0 -+# define LZO_UNUSED(var) { typedef int __lzo_unused[sizeof(var) ? 
2 : 1]; } -+# else -+# define LZO_UNUSED(parm) (parm = parm) -+# endif -+#endif -+ -+#if defined(NO_MEMCMP) -+# undef HAVE_MEMCMP -+#endif -+ -+#if !defined(HAVE_MEMSET) -+# undef memset -+# define memset lzo_memset -+#endif -+ -+# define LZO_BYTE(x) ((unsigned char) ((x) & 0xff)) -+ -+#define LZO_MAX(a,b) ((a) >= (b) ? (a) : (b)) -+#define LZO_MIN(a,b) ((a) <= (b) ? (a) : (b)) -+#define LZO_MAX3(a,b,c) ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c)) -+#define LZO_MIN3(a,b,c) ((a) <= (b) ? LZO_MIN(a,c) : LZO_MIN(b,c)) -+ -+#define lzo_sizeof(type) ((lzo_uint) (sizeof(type))) -+ -+#define LZO_HIGH(array) ((lzo_uint) (sizeof(array)/sizeof(*(array)))) -+ -+#define LZO_SIZE(bits) (1u << (bits)) -+#define LZO_MASK(bits) (LZO_SIZE(bits) - 1) -+ -+#define LZO_LSIZE(bits) (1ul << (bits)) -+#define LZO_LMASK(bits) (LZO_LSIZE(bits) - 1) -+ -+#define LZO_USIZE(bits) ((lzo_uint) 1 << (bits)) -+#define LZO_UMASK(bits) (LZO_USIZE(bits) - 1) -+ -+#define LZO_STYPE_MAX(b) (((1l << (8*(b)-2)) - 1l) + (1l << (8*(b)-2))) -+#define LZO_UTYPE_MAX(b) (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1))) -+ -+#if !defined(SIZEOF_UNSIGNED) -+# if (UINT_MAX == 0xffff) -+# define SIZEOF_UNSIGNED 2 -+# elif (UINT_MAX == LZO_0xffffffffL) -+# define SIZEOF_UNSIGNED 4 -+# elif (UINT_MAX >= LZO_0xffffffffL) -+# define SIZEOF_UNSIGNED 8 -+# else -+# error "SIZEOF_UNSIGNED" -+# endif -+#endif -+ -+#if !defined(SIZEOF_UNSIGNED_LONG) -+# if (ULONG_MAX == LZO_0xffffffffL) -+# define SIZEOF_UNSIGNED_LONG 4 -+# elif (ULONG_MAX >= LZO_0xffffffffL) -+# define SIZEOF_UNSIGNED_LONG 8 -+# else -+# error "SIZEOF_UNSIGNED_LONG" -+# endif -+#endif -+ -+#if !defined(SIZEOF_SIZE_T) -+# define SIZEOF_SIZE_T SIZEOF_UNSIGNED -+#endif -+#if !defined(SIZE_T_MAX) -+# define SIZE_T_MAX LZO_UTYPE_MAX(SIZEOF_SIZE_T) -+#endif -+ -+#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL) -+# if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff) -+# define LZO_UNALIGNED_OK_2 -+# endif -+# if !defined(LZO_UNALIGNED_OK_4) && 
(LZO_UINT32_MAX == LZO_0xffffffffL) -+# define LZO_UNALIGNED_OK_4 -+# endif -+#endif -+ -+#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4) -+# if !defined(LZO_UNALIGNED_OK) -+# define LZO_UNALIGNED_OK -+# endif -+#endif -+ -+#if defined(__LZO_NO_UNALIGNED) -+# undef LZO_UNALIGNED_OK -+# undef LZO_UNALIGNED_OK_2 -+# undef LZO_UNALIGNED_OK_4 -+#endif -+ -+#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff) -+# error "LZO_UNALIGNED_OK_2 must not be defined on this system" -+#endif -+#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL) -+# error "LZO_UNALIGNED_OK_4 must not be defined on this system" -+#endif -+ -+#if defined(__LZO_NO_ALIGNED) -+# undef LZO_ALIGNED_OK_4 -+#endif -+ -+#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL) -+# error "LZO_ALIGNED_OK_4 must not be defined on this system" -+#endif -+ -+#define LZO_LITTLE_ENDIAN 1234 -+#define LZO_BIG_ENDIAN 4321 -+#define LZO_PDP_ENDIAN 3412 -+ -+#if !defined(LZO_BYTE_ORDER) -+# if defined(MFX_BYTE_ORDER) -+# define LZO_BYTE_ORDER MFX_BYTE_ORDER -+# elif defined(__LZO_i386) -+# define LZO_BYTE_ORDER LZO_LITTLE_ENDIAN -+# elif defined(BYTE_ORDER) -+# define LZO_BYTE_ORDER BYTE_ORDER -+# elif defined(__BYTE_ORDER) -+# define LZO_BYTE_ORDER __BYTE_ORDER -+# endif -+#endif -+ -+#if defined(LZO_BYTE_ORDER) -+# if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \ -+ (LZO_BYTE_ORDER != LZO_BIG_ENDIAN) -+# error "invalid LZO_BYTE_ORDER" -+# endif -+#endif -+ -+#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER) -+# error "LZO_BYTE_ORDER is not defined" -+#endif -+ -+#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY -+ -+#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER) -+# if defined(__GNUC__) && defined(__i386__) -+# if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY) -+# define LZO_OPTIMIZE_GNUC_i386 -+# endif -+# endif -+#endif -+ -+extern const lzo_uint32 _lzo_crc32_table[256]; -+ -+#define _LZO_STRINGIZE(x) #x -+#define _LZO_MEXPAND(x) _LZO_STRINGIZE(x) -+ 
-+#define _LZO_CONCAT2(a,b) a ## b -+#define _LZO_CONCAT3(a,b,c) a ## b ## c -+#define _LZO_CONCAT4(a,b,c,d) a ## b ## c ## d -+#define _LZO_CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e -+ -+#define _LZO_ECONCAT2(a,b) _LZO_CONCAT2(a,b) -+#define _LZO_ECONCAT3(a,b,c) _LZO_CONCAT3(a,b,c) -+#define _LZO_ECONCAT4(a,b,c,d) _LZO_CONCAT4(a,b,c,d) -+#define _LZO_ECONCAT5(a,b,c,d,e) _LZO_CONCAT5(a,b,c,d,e) -+ -+#ifndef __LZO_PTR_H -+#define __LZO_PTR_H -+ -+#if !defined(lzo_ptrdiff_t) -+# if (UINT_MAX >= LZO_0xffffffffL) -+typedef ptrdiff_t lzo_ptrdiff_t; -+# else -+typedef long lzo_ptrdiff_t; -+# endif -+#endif -+ -+#if !defined(__LZO_HAVE_PTR_T) -+# if defined(lzo_ptr_t) -+# define __LZO_HAVE_PTR_T -+# endif -+#endif -+#if !defined(__LZO_HAVE_PTR_T) -+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG) -+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG) -+typedef unsigned long lzo_ptr_t; -+typedef long lzo_sptr_t; -+# define __LZO_HAVE_PTR_T -+# endif -+# endif -+#endif -+#if !defined(__LZO_HAVE_PTR_T) -+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED) -+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED) -+typedef unsigned int lzo_ptr_t; -+typedef int lzo_sptr_t; -+# define __LZO_HAVE_PTR_T -+# endif -+# endif -+#endif -+#if !defined(__LZO_HAVE_PTR_T) -+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT) -+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT) -+typedef unsigned short lzo_ptr_t; -+typedef short lzo_sptr_t; -+# define __LZO_HAVE_PTR_T -+# endif -+# endif -+#endif -+#if !defined(__LZO_HAVE_PTR_T) -+# if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P) -+# error "no suitable type for lzo_ptr_t" -+# else -+typedef unsigned long lzo_ptr_t; -+typedef long lzo_sptr_t; -+# define __LZO_HAVE_PTR_T -+# endif -+#endif -+ -+#define PTR(a) ((lzo_ptr_t) (a)) -+#define PTR_LINEAR(a) PTR(a) -+#define PTR_ALIGNED_4(a) ((PTR_LINEAR(a) & 3) == 0) -+#define PTR_ALIGNED_8(a) ((PTR_LINEAR(a) & 7) == 0) -+#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0) 
-+#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0) -+ -+#define PTR_LT(a,b) (PTR(a) < PTR(b)) -+#define PTR_GE(a,b) (PTR(a) >= PTR(b)) -+#define PTR_DIFF(a,b) ((lzo_ptrdiff_t) (PTR(a) - PTR(b))) -+#define pd(a,b) ((lzo_uint) ((a)-(b))) -+ -+typedef union { -+ char a_char; -+ unsigned char a_uchar; -+ short a_short; -+ unsigned short a_ushort; -+ int a_int; -+ unsigned int a_uint; -+ long a_long; -+ unsigned long a_ulong; -+ lzo_int a_lzo_int; -+ lzo_uint a_lzo_uint; -+ lzo_int32 a_lzo_int32; -+ lzo_uint32 a_lzo_uint32; -+ ptrdiff_t a_ptrdiff_t; -+ lzo_ptrdiff_t a_lzo_ptrdiff_t; -+ lzo_ptr_t a_lzo_ptr_t; -+ lzo_voidp a_lzo_voidp; -+ void *a_void_p; -+ lzo_bytep a_lzo_bytep; -+ lzo_bytepp a_lzo_bytepp; -+ lzo_uintp a_lzo_uintp; -+ lzo_uint *a_lzo_uint_p; -+ lzo_uint32p a_lzo_uint32p; -+ lzo_uint32 *a_lzo_uint32_p; -+ unsigned char *a_uchar_p; -+ char *a_char_p; -+} lzo_full_align_t; -+ -+#endif -+#define LZO_DETERMINISTIC -+#define LZO_DICT_USE_PTR -+# define lzo_dict_t const lzo_bytep -+# define lzo_dict_p lzo_dict_t __LZO_MMODEL * -+#if !defined(lzo_moff_t) -+#define lzo_moff_t lzo_uint -+#endif -+#endif -+static lzo_ptr_t __lzo_ptr_linear(const lzo_voidp ptr) -+{ -+ return PTR_LINEAR(ptr); -+} -+ -+static unsigned __lzo_align_gap(const lzo_voidp ptr, lzo_uint size) -+{ -+ lzo_ptr_t p, s, n; -+ -+ assert("lzo-01", size > 0); -+ -+ p = __lzo_ptr_linear(ptr); -+ s = (lzo_ptr_t) (size - 1); -+ n = (((p + s) / size) * size) - p; -+ -+ assert("lzo-02", (long)n >= 0); -+ assert("lzo-03", n <= s); -+ -+ return (unsigned)n; -+} -+ -+#ifndef __LZO_UTIL_H -+#define __LZO_UTIL_H -+ -+#ifndef __LZO_CONF_H -+#endif -+ -+#if 1 && defined(HAVE_MEMCPY) -+#define MEMCPY8_DS(dest,src,len) \ -+ memcpy(dest,src,len); \ -+ dest += len; \ -+ src += len -+#endif -+ -+#if !defined(MEMCPY8_DS) -+ -+#define MEMCPY8_DS(dest,src,len) \ -+ { register lzo_uint __l = (len) / 8; \ -+ do { \ -+ *dest++ = *src++; \ -+ *dest++ = *src++; \ -+ *dest++ = *src++; \ -+ *dest++ = 
*src++; \ -+ *dest++ = *src++; \ -+ *dest++ = *src++; \ -+ *dest++ = *src++; \ -+ *dest++ = *src++; \ -+ } while (--__l > 0); } -+ -+#endif -+ -+#define MEMCPY_DS(dest,src,len) \ -+ do *dest++ = *src++; \ -+ while (--len > 0) -+ -+#define MEMMOVE_DS(dest,src,len) \ -+ do *dest++ = *src++; \ -+ while (--len > 0) -+ -+#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET) -+ -+#define BZERO8_PTR(s,l,n) memset((s),0,(lzo_uint)(l)*(n)) -+ -+#else -+ -+#define BZERO8_PTR(s,l,n) \ -+ lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n)) -+ -+#endif -+#endif -+ -+/* If you use the LZO library in a product, you *must* keep this -+ * copyright string in the executable of your product. -+ */ -+ -+static const lzo_byte __lzo_copyright[] = -+#if !defined(__LZO_IN_MINLZO) -+ LZO_VERSION_STRING; -+#else -+ "\n\n\n" -+ "LZO real-time data compression library.\n" -+ "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n" -+ "\n" -+ "http://www.oberhumer.com/opensource/lzo/\n" -+ "\n" -+ "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n" -+ "LZO build date: " __DATE__ " " __TIME__ "\n\n" -+ "LZO special compilation options:\n" -+#ifdef __cplusplus -+ " __cplusplus\n" -+#endif -+#if defined(__PIC__) -+ " __PIC__\n" -+#elif defined(__pic__) -+ " __pic__\n" -+#endif -+#if (UINT_MAX < LZO_0xffffffffL) -+ " 16BIT\n" -+#endif -+#if defined(__LZO_STRICT_16BIT) -+ " __LZO_STRICT_16BIT\n" -+#endif -+#if (UINT_MAX > LZO_0xffffffffL) -+ " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n" -+#endif -+#if (ULONG_MAX > LZO_0xffffffffL) -+ " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n" -+#endif -+#if defined(LZO_BYTE_ORDER) -+ " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n" -+#endif -+#if defined(LZO_UNALIGNED_OK_2) -+ " LZO_UNALIGNED_OK_2\n" -+#endif -+#if defined(LZO_UNALIGNED_OK_4) -+ " LZO_UNALIGNED_OK_4\n" -+#endif -+#if defined(LZO_ALIGNED_OK_4) -+ " LZO_ALIGNED_OK_4\n" -+#endif -+#if defined(LZO_DICT_USE_PTR) -+ " LZO_DICT_USE_PTR\n" -+#endif -+#if 
defined(__LZO_QUERY_COMPRESS) -+ " __LZO_QUERY_COMPRESS\n" -+#endif -+#if defined(__LZO_QUERY_DECOMPRESS) -+ " __LZO_QUERY_DECOMPRESS\n" -+#endif -+#if defined(__LZO_IN_MINILZO) -+ " __LZO_IN_MINILZO\n" -+#endif -+ "\n\n" "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__ -+#if defined(__GNUC__) && defined(__VERSION__) -+ " by gcc " __VERSION__ -+#elif defined(__BORLANDC__) -+ " by Borland C " _LZO_MEXPAND(__BORLANDC__) -+#elif defined(_MSC_VER) -+ " by Microsoft C " _LZO_MEXPAND(_MSC_VER) -+#elif defined(__PUREC__) -+ " by Pure C " _LZO_MEXPAND(__PUREC__) -+#elif defined(__SC__) -+ " by Symantec C " _LZO_MEXPAND(__SC__) -+#elif defined(__TURBOC__) -+ " by Turbo C " _LZO_MEXPAND(__TURBOC__) -+#elif defined(__WATCOMC__) -+ " by Watcom C " _LZO_MEXPAND(__WATCOMC__) -+#endif -+ " $\n" -+ "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n"; -+#endif -+ -+#define LZO_BASE 65521u -+#define LZO_NMAX 5552 -+ -+#define LZO_DO1(buf,i) {s1 += buf[i]; s2 += s1;} -+#define LZO_DO2(buf,i) LZO_DO1(buf,i); LZO_DO1(buf,i+1); -+#define LZO_DO4(buf,i) LZO_DO2(buf,i); LZO_DO2(buf,i+2); -+#define LZO_DO8(buf,i) LZO_DO4(buf,i); LZO_DO4(buf,i+4); -+#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8); -+ -+# define IS_SIGNED(type) (((type) (-1)) < ((type) 0)) -+# define IS_UNSIGNED(type) (((type) (-1)) > ((type) 0)) -+ -+#define IS_POWER_OF_2(x) (((x) & ((x) - 1)) == 0) -+ -+static lzo_bool schedule_insns_bug(void); -+static lzo_bool strength_reduce_bug(int *); -+ -+# define __lzo_assert(x) ((x) ? 
1 : 0) -+ -+#undef COMPILE_TIME_ASSERT -+ -+# define COMPILE_TIME_ASSERT(expr) LZO_COMPILE_TIME_ASSERT(expr) -+ -+static lzo_bool basic_integral_check(void) -+{ -+ lzo_bool r = 1; -+ -+ COMPILE_TIME_ASSERT(CHAR_BIT == 8); -+ COMPILE_TIME_ASSERT(sizeof(char) == 1); -+ COMPILE_TIME_ASSERT(sizeof(short) >= 2); -+ COMPILE_TIME_ASSERT(sizeof(long) >= 4); -+ COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short)); -+ COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int)); -+ -+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int)); -+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32)); -+ -+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4); -+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned)); -+#if defined(__LZO_STRICT_16BIT) -+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2); -+#else -+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4); -+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned)); -+#endif -+ -+#if (USHRT_MAX == 65535u) -+ COMPILE_TIME_ASSERT(sizeof(short) == 2); -+#elif (USHRT_MAX == LZO_0xffffffffL) -+ COMPILE_TIME_ASSERT(sizeof(short) == 4); -+#elif (USHRT_MAX >= LZO_0xffffffffL) -+ COMPILE_TIME_ASSERT(sizeof(short) > 4); -+#endif -+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char)); -+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short)); -+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned)); -+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long)); -+ COMPILE_TIME_ASSERT(IS_SIGNED(short)); -+ COMPILE_TIME_ASSERT(IS_SIGNED(int)); -+ COMPILE_TIME_ASSERT(IS_SIGNED(long)); -+ -+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32)); -+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint)); -+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32)); -+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int)); -+ -+ COMPILE_TIME_ASSERT(INT_MAX == LZO_STYPE_MAX(sizeof(int))); -+ COMPILE_TIME_ASSERT(UINT_MAX == LZO_UTYPE_MAX(sizeof(unsigned))); -+ COMPILE_TIME_ASSERT(LONG_MAX == LZO_STYPE_MAX(sizeof(long))); -+ COMPILE_TIME_ASSERT(ULONG_MAX == LZO_UTYPE_MAX(sizeof(unsigned long))); -+ COMPILE_TIME_ASSERT(USHRT_MAX == 
LZO_UTYPE_MAX(sizeof(unsigned short))); -+ COMPILE_TIME_ASSERT(LZO_UINT32_MAX == -+ LZO_UTYPE_MAX(sizeof(lzo_uint32))); -+ COMPILE_TIME_ASSERT(LZO_UINT_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint))); -+ -+ r &= __lzo_assert(LZO_BYTE(257) == 1); -+ -+ return r; -+} -+ -+static lzo_bool basic_ptr_check(void) -+{ -+ lzo_bool r = 1; -+ -+ COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int)); -+ COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *)); -+ -+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *)); -+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp)); -+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp)); -+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint)); -+ -+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp)); -+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t)); -+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint)); -+ -+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4); -+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t)); -+ -+ COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t)); -+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint)); -+ -+#if defined(SIZEOF_CHAR_P) -+ COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *)); -+#endif -+#if defined(SIZEOF_PTRDIFF_T) -+ COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t)); -+#endif -+ -+ COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t)); -+ COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t)); -+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t)); -+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t)); -+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t)); -+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t)); -+ -+ return r; -+} -+ -+static lzo_bool ptr_check(void) -+{ -+ lzo_bool r = 1; -+ int i; -+ char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)]; -+ lzo_bytep wrkmem; -+ lzo_bytepp dict; -+ unsigned char x[4 * sizeof(lzo_full_align_t)]; -+ long d; -+ lzo_full_align_t a; -+ lzo_full_align_t u; -+ -+ for (i = 0; i < (int)sizeof(x); i++) -+ 
x[i] = LZO_BYTE(i); -+ -+ wrkmem = -+ LZO_PTR_ALIGN_UP((lzo_byte *) _wrkmem, sizeof(lzo_full_align_t)); -+ -+ u.a_lzo_bytep = wrkmem; -+ dict = u.a_lzo_bytepp; -+ -+ d = (long)((const lzo_bytep)dict - (const lzo_bytep)_wrkmem); -+ r &= __lzo_assert(d >= 0); -+ r &= __lzo_assert(d < (long)sizeof(lzo_full_align_t)); -+ -+ memset(&a, 0, sizeof(a)); -+ r &= __lzo_assert(a.a_lzo_voidp == NULL); -+ -+ memset(&a, 0xff, sizeof(a)); -+ r &= __lzo_assert(a.a_ushort == USHRT_MAX); -+ r &= __lzo_assert(a.a_uint == UINT_MAX); -+ r &= __lzo_assert(a.a_ulong == ULONG_MAX); -+ r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX); -+ r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX); -+ -+ if (r == 1) { -+ for (i = 0; i < 8; i++) -+ r &= __lzo_assert((const lzo_voidp)(&dict[i]) == -+ (const -+ lzo_voidp)(&wrkmem[i * -+ sizeof(lzo_byte -+ *)])); -+ } -+ -+ memset(&a, 0, sizeof(a)); -+ r &= __lzo_assert(a.a_char_p == NULL); -+ r &= __lzo_assert(a.a_lzo_bytep == NULL); -+ r &= __lzo_assert(NULL == (void *)0); -+ if (r == 1) { -+ for (i = 0; i < 10; i++) -+ dict[i] = wrkmem; -+ BZERO8_PTR(dict + 1, sizeof(dict[0]), 8); -+ r &= __lzo_assert(dict[0] == wrkmem); -+ for (i = 1; i < 9; i++) -+ r &= __lzo_assert(dict[i] == NULL); -+ r &= __lzo_assert(dict[9] == wrkmem); -+ } -+ -+ if (r == 1) { -+ unsigned k = 1; -+ const unsigned n = (unsigned)sizeof(lzo_uint32); -+ lzo_byte *p0; -+ lzo_byte *p1; -+ -+ k += __lzo_align_gap(&x[k], n); -+ p0 = (lzo_bytep) & x[k]; -+#if defined(PTR_LINEAR) -+ r &= __lzo_assert((PTR_LINEAR(p0) & (n - 1)) == 0); -+#else -+ r &= __lzo_assert(n == 4); -+ r &= __lzo_assert(PTR_ALIGNED_4(p0)); -+#endif -+ -+ r &= __lzo_assert(k >= 1); -+ p1 = (lzo_bytep) & x[1]; -+ r &= __lzo_assert(PTR_GE(p0, p1)); -+ -+ r &= __lzo_assert(k < 1 + n); -+ p1 = (lzo_bytep) & x[1 + n]; -+ r &= __lzo_assert(PTR_LT(p0, p1)); -+ -+ if (r == 1) { -+ lzo_uint32 v0, v1; -+ -+ u.a_uchar_p = &x[k]; -+ v0 = *u.a_lzo_uint32_p; -+ u.a_uchar_p = &x[k + n]; -+ v1 = *u.a_lzo_uint32_p; -+ -+ r &= 
__lzo_assert(v0 > 0); -+ r &= __lzo_assert(v1 > 0); -+ } -+ } -+ -+ return r; -+} -+ -+static int _lzo_config_check(void) -+{ -+ lzo_bool r = 1; -+ int i; -+ union { -+ lzo_uint32 a; -+ unsigned short b; -+ lzo_uint32 aa[4]; -+ unsigned char x[4 * sizeof(lzo_full_align_t)]; -+ } u; -+ -+ COMPILE_TIME_ASSERT((int)((unsigned char)((signed char)-1)) == 255); -+ COMPILE_TIME_ASSERT((((unsigned char)128) << (int)(8 * sizeof(int) - 8)) -+ < 0); -+ -+ r &= basic_integral_check(); -+ r &= basic_ptr_check(); -+ if (r != 1) -+ return LZO_E_ERROR; -+ -+ u.a = 0; -+ u.b = 0; -+ for (i = 0; i < (int)sizeof(u.x); i++) -+ u.x[i] = LZO_BYTE(i); -+ -+#if defined(LZO_BYTE_ORDER) -+ if (r == 1) { -+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN) -+ lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL); -+ unsigned short b = (unsigned short)(u.b & 0xffff); -+ r &= __lzo_assert(a == 0x03020100L); -+ r &= __lzo_assert(b == 0x0100); -+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN) -+ lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32); -+ unsigned short b = u.b >> (8 * sizeof(u.b) - 16); -+ r &= __lzo_assert(a == 0x00010203L); -+ r &= __lzo_assert(b == 0x0001); -+# else -+# error "invalid LZO_BYTE_ORDER" -+# endif -+ } -+#endif -+ -+#if defined(LZO_UNALIGNED_OK_2) -+ COMPILE_TIME_ASSERT(sizeof(short) == 2); -+ if (r == 1) { -+ unsigned short b[4]; -+ -+ for (i = 0; i < 4; i++) -+ b[i] = *(const unsigned short *)&u.x[i]; -+ -+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN) -+ r &= __lzo_assert(b[0] == 0x0100); -+ r &= __lzo_assert(b[1] == 0x0201); -+ r &= __lzo_assert(b[2] == 0x0302); -+ r &= __lzo_assert(b[3] == 0x0403); -+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN) -+ r &= __lzo_assert(b[0] == 0x0001); -+ r &= __lzo_assert(b[1] == 0x0102); -+ r &= __lzo_assert(b[2] == 0x0203); -+ r &= __lzo_assert(b[3] == 0x0304); -+# endif -+ } -+#endif -+ -+#if defined(LZO_UNALIGNED_OK_4) -+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4); -+ if (r == 1) { -+ lzo_uint32 a[4]; -+ -+ for (i = 0; i < 4; i++) -+ a[i] = *(const 
lzo_uint32 *)&u.x[i]; -+ -+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN) -+ r &= __lzo_assert(a[0] == 0x03020100L); -+ r &= __lzo_assert(a[1] == 0x04030201L); -+ r &= __lzo_assert(a[2] == 0x05040302L); -+ r &= __lzo_assert(a[3] == 0x06050403L); -+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN) -+ r &= __lzo_assert(a[0] == 0x00010203L); -+ r &= __lzo_assert(a[1] == 0x01020304L); -+ r &= __lzo_assert(a[2] == 0x02030405L); -+ r &= __lzo_assert(a[3] == 0x03040506L); -+# endif -+ } -+#endif -+ -+#if defined(LZO_ALIGNED_OK_4) -+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4); -+#endif -+ -+ COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t)); -+ -+ if (r == 1) { -+ r &= __lzo_assert(!schedule_insns_bug()); -+ } -+ -+ if (r == 1) { -+ static int x[3]; -+ static unsigned xn = 3; -+ register unsigned j; -+ -+ for (j = 0; j < xn; j++) -+ x[j] = (int)j - 3; -+ r &= __lzo_assert(!strength_reduce_bug(x)); -+ } -+ -+ if (r == 1) { -+ r &= ptr_check(); -+ } -+ -+ return r == 1 ? LZO_E_OK : LZO_E_ERROR; -+} -+ -+static lzo_bool schedule_insns_bug(void) -+{ -+#if defined(__LZO_CHECKER) -+ return 0; -+#else -+ const int clone[] = { 1, 2, 0 }; -+ const int *q; -+ q = clone; -+ return (*q) ? 
0 : 1; -+#endif -+} -+ -+static lzo_bool strength_reduce_bug(int *x) -+{ -+ return x[0] != -3 || x[1] != -2 || x[2] != -1; -+} -+ -+#undef COMPILE_TIME_ASSERT -+ -+int __lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5, -+ int s6, int s7, int s8, int s9) -+{ -+ int r; -+ -+ if (v == 0) -+ return LZO_E_ERROR; -+ -+ r = (s1 == -1 || s1 == (int)sizeof(short)) && -+ (s2 == -1 || s2 == (int)sizeof(int)) && -+ (s3 == -1 || s3 == (int)sizeof(long)) && -+ (s4 == -1 || s4 == (int)sizeof(lzo_uint32)) && -+ (s5 == -1 || s5 == (int)sizeof(lzo_uint)) && -+ (s6 == -1 || s6 == (int)lzo_sizeof_dict_t) && -+ (s7 == -1 || s7 == (int)sizeof(char *)) && -+ (s8 == -1 || s8 == (int)sizeof(lzo_voidp)) && -+ (s9 == -1 || s9 == (int)sizeof(lzo_compress_t)); -+ if (!r) -+ return LZO_E_ERROR; -+ -+ r = _lzo_config_check(); -+ if (r != LZO_E_OK) -+ return r; -+ -+ return r; -+} -+ -+#define do_compress _lzo1x_1_do_compress -+ -+#define LZO_NEED_DICT_H -+#define D_BITS 14 -+#define D_INDEX1(d,p) d = DM((0x21*DX3(p,5,5,6)) >> 5) -+#define D_INDEX2(d,p) d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f) -+ -+#ifndef __LZO_CONFIG1X_H -+#define __LZO_CONFIG1X_H -+ -+#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z) -+# define LZO1X -+#endif -+ -+#define LZO_EOF_CODE -+#undef LZO_DETERMINISTIC -+ -+#define M1_MAX_OFFSET 0x0400 -+#ifndef M2_MAX_OFFSET -+#define M2_MAX_OFFSET 0x0800 -+#endif -+#define M3_MAX_OFFSET 0x4000 -+#define M4_MAX_OFFSET 0xbfff -+ -+#define MX_MAX_OFFSET (M1_MAX_OFFSET + M2_MAX_OFFSET) -+ -+#define M1_MIN_LEN 2 -+#define M1_MAX_LEN 2 -+#define M2_MIN_LEN 3 -+#ifndef M2_MAX_LEN -+#define M2_MAX_LEN 8 -+#endif -+#define M3_MIN_LEN 3 -+#define M3_MAX_LEN 33 -+#define M4_MIN_LEN 3 -+#define M4_MAX_LEN 9 -+ -+#define M1_MARKER 0 -+#define M2_MARKER 64 -+#define M3_MARKER 32 -+#define M4_MARKER 16 -+ -+#ifndef MIN_LOOKAHEAD -+#define MIN_LOOKAHEAD (M2_MAX_LEN + 1) -+#endif -+ -+#if defined(LZO_NEED_DICT_H) -+ -+#ifndef LZO_HASH -+#define LZO_HASH 
LZO_HASH_LZO_INCREMENTAL_B -+#endif -+#define DL_MIN_LEN M2_MIN_LEN -+ -+#ifndef __LZO_DICT_H -+#define __LZO_DICT_H -+ -+#if !defined(D_BITS) && defined(DBITS) -+# define D_BITS DBITS -+#endif -+#if !defined(D_BITS) -+# error "D_BITS is not defined" -+#endif -+#if (D_BITS < 16) -+# define D_SIZE LZO_SIZE(D_BITS) -+# define D_MASK LZO_MASK(D_BITS) -+#else -+# define D_SIZE LZO_USIZE(D_BITS) -+# define D_MASK LZO_UMASK(D_BITS) -+#endif -+#define D_HIGH ((D_MASK >> 1) + 1) -+ -+#if !defined(DD_BITS) -+# define DD_BITS 0 -+#endif -+#define DD_SIZE LZO_SIZE(DD_BITS) -+#define DD_MASK LZO_MASK(DD_BITS) -+ -+#if !defined(DL_BITS) -+# define DL_BITS (D_BITS - DD_BITS) -+#endif -+#if (DL_BITS < 16) -+# define DL_SIZE LZO_SIZE(DL_BITS) -+# define DL_MASK LZO_MASK(DL_BITS) -+#else -+# define DL_SIZE LZO_USIZE(DL_BITS) -+# define DL_MASK LZO_UMASK(DL_BITS) -+#endif -+ -+#if (D_BITS != DL_BITS + DD_BITS) -+# error "D_BITS does not match" -+#endif -+#if (D_BITS < 8 || D_BITS > 18) -+# error "invalid D_BITS" -+#endif -+#if (DL_BITS < 8 || DL_BITS > 20) -+# error "invalid DL_BITS" -+#endif -+#if (DD_BITS < 0 || DD_BITS > 6) -+# error "invalid DD_BITS" -+#endif -+ -+#if !defined(DL_MIN_LEN) -+# define DL_MIN_LEN 3 -+#endif -+#if !defined(DL_SHIFT) -+# define DL_SHIFT ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN) -+#endif -+ -+#define LZO_HASH_GZIP 1 -+#define LZO_HASH_GZIP_INCREMENTAL 2 -+#define LZO_HASH_LZO_INCREMENTAL_A 3 -+#define LZO_HASH_LZO_INCREMENTAL_B 4 -+ -+#if !defined(LZO_HASH) -+# error "choose a hashing strategy" -+#endif -+ -+#if (DL_MIN_LEN == 3) -+# define _DV2_A(p,shift1,shift2) \ -+ (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2]) -+# define _DV2_B(p,shift1,shift2) \ -+ (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0]) -+# define _DV3_B(p,shift1,shift2,shift3) \ -+ ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0]) -+#elif (DL_MIN_LEN == 2) -+# define _DV2_A(p,shift1,shift2) \ -+ (( (lzo_uint32)(p[0]) << shift1) ^ p[1]) -+# 
define _DV2_B(p,shift1,shift2) \ -+ (( (lzo_uint32)(p[1]) << shift1) ^ p[2]) -+#else -+# error "invalid DL_MIN_LEN" -+#endif -+#define _DV_A(p,shift) _DV2_A(p,shift,shift) -+#define _DV_B(p,shift) _DV2_B(p,shift,shift) -+#define DA2(p,s1,s2) \ -+ (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0]) -+#define DS2(p,s1,s2) \ -+ (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0]) -+#define DX2(p,s1,s2) \ -+ (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0]) -+#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0]) -+#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0]) -+#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0]) -+#define DMS(v,s) ((lzo_uint) (((v) & (D_MASK >> (s))) << (s))) -+#define DM(v) DMS(v,0) -+ -+#if (LZO_HASH == LZO_HASH_GZIP) -+# define _DINDEX(dv,p) (_DV_A((p),DL_SHIFT)) -+ -+#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL) -+# define __LZO_HASH_INCREMENTAL -+# define DVAL_FIRST(dv,p) dv = _DV_A((p),DL_SHIFT) -+# define DVAL_NEXT(dv,p) dv = (((dv) << DL_SHIFT) ^ p[2]) -+# define _DINDEX(dv,p) (dv) -+# define DVAL_LOOKAHEAD DL_MIN_LEN -+ -+#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A) -+# define __LZO_HASH_INCREMENTAL -+# define DVAL_FIRST(dv,p) dv = _DV_A((p),5) -+# define DVAL_NEXT(dv,p) \ -+ dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2]) -+# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5) -+# define DVAL_LOOKAHEAD DL_MIN_LEN -+ -+#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B) -+# define __LZO_HASH_INCREMENTAL -+# define DVAL_FIRST(dv,p) dv = _DV_B((p),5) -+# define DVAL_NEXT(dv,p) \ -+ dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5))) -+# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5) -+# define DVAL_LOOKAHEAD DL_MIN_LEN -+ -+#else -+# error "choose a hashing strategy" -+#endif -+ -+#ifndef DINDEX -+#define DINDEX(dv,p) ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS) -+#endif -+#if !defined(DINDEX1) && defined(D_INDEX1) -+#define DINDEX1 D_INDEX1 -+#endif -+#if 
!defined(DINDEX2) && defined(D_INDEX2) -+#define DINDEX2 D_INDEX2 -+#endif -+ -+#if !defined(__LZO_HASH_INCREMENTAL) -+# define DVAL_FIRST(dv,p) ((void) 0) -+# define DVAL_NEXT(dv,p) ((void) 0) -+# define DVAL_LOOKAHEAD 0 -+#endif -+ -+#if !defined(DVAL_ASSERT) -+#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG) -+static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte * p) -+{ -+ lzo_uint32 df; -+ DVAL_FIRST(df, (p)); -+ assert(DINDEX(dv, p) == DINDEX(df, p)); -+} -+#else -+# define DVAL_ASSERT(dv,p) ((void) 0) -+#endif -+#endif -+ -+# define DENTRY(p,in) (p) -+# define GINDEX(m_pos,m_off,dict,dindex,in) m_pos = dict[dindex] -+ -+#if (DD_BITS == 0) -+ -+# define UPDATE_D(dict,drun,dv,p,in) dict[ DINDEX(dv,p) ] = DENTRY(p,in) -+# define UPDATE_I(dict,drun,index,p,in) dict[index] = DENTRY(p,in) -+# define UPDATE_P(ptr,drun,p,in) (ptr)[0] = DENTRY(p,in) -+ -+#else -+ -+# define UPDATE_D(dict,drun,dv,p,in) \ -+ dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK -+# define UPDATE_I(dict,drun,index,p,in) \ -+ dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK -+# define UPDATE_P(ptr,drun,p,in) \ -+ (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK -+ -+#endif -+ -+#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \ -+ (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset) -+ -+#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \ -+ (BOUNDS_CHECKING_OFF_IN_EXPR( \ -+ (PTR_LT(m_pos,in) || \ -+ (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \ -+ m_off > max_offset) )) -+ -+#if defined(LZO_DETERMINISTIC) -+# define LZO_CHECK_MPOS LZO_CHECK_MPOS_DET -+#else -+# define LZO_CHECK_MPOS LZO_CHECK_MPOS_NON_DET -+#endif -+#endif -+#endif -+#endif -+#define DO_COMPRESS lzo1x_1_compress -+static -+lzo_uint do_compress(const lzo_byte * in, lzo_uint in_len, -+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem) -+{ -+ register const lzo_byte *ip; -+ lzo_byte *op; -+ const lzo_byte *const in_end = in + in_len; -+ const lzo_byte 
*const ip_end = in + in_len - M2_MAX_LEN - 5; -+ const lzo_byte *ii; -+ lzo_dict_p const dict = (lzo_dict_p) wrkmem; -+ -+ op = out; -+ ip = in; -+ ii = ip; -+ -+ ip += 4; -+ for (;;) { -+ register const lzo_byte *m_pos; -+ -+ lzo_moff_t m_off; -+ lzo_uint m_len; -+ lzo_uint dindex; -+ -+ DINDEX1(dindex, ip); -+ GINDEX(m_pos, m_off, dict, dindex, in); -+ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET)) -+ goto literal; -+#if 1 -+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3]) -+ goto try_match; -+ DINDEX2(dindex, ip); -+#endif -+ GINDEX(m_pos, m_off, dict, dindex, in); -+ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET)) -+ goto literal; -+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3]) -+ goto try_match; -+ goto literal; -+ -+ try_match: -+#if 1 && defined(LZO_UNALIGNED_OK_2) -+ if (*(const lzo_ushortp)m_pos != *(const lzo_ushortp)ip) { -+#else -+ if (m_pos[0] != ip[0] || m_pos[1] != ip[1]) { -+#endif -+ ; -+ } else { -+ if (m_pos[2] == ip[2]) { -+ goto match; -+ } else { -+ ; -+ } -+ } -+ -+ literal: -+ UPDATE_I(dict, 0, dindex, ip, in); -+ ++ip; -+ if (ip >= ip_end) -+ break; -+ continue; -+ -+ match: -+ UPDATE_I(dict, 0, dindex, ip, in); -+ if (pd(ip, ii) > 0) { -+ register lzo_uint t = pd(ip, ii); -+ -+ if (t <= 3) { -+ assert("lzo-04", op - 2 > out); -+ op[-2] |= LZO_BYTE(t); -+ } else if (t <= 18) -+ *op++ = LZO_BYTE(t - 3); -+ else { -+ register lzo_uint tt = t - 18; -+ -+ *op++ = 0; -+ while (tt > 255) { -+ tt -= 255; -+ *op++ = 0; -+ } -+ assert("lzo-05", tt > 0); -+ *op++ = LZO_BYTE(tt); -+ } -+ do -+ *op++ = *ii++; -+ while (--t > 0); -+ } -+ -+ assert("lzo-06", ii == ip); -+ ip += 3; -+ if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++ -+ || m_pos[6] != *ip++ || m_pos[7] != *ip++ -+ || m_pos[8] != *ip++ -+#ifdef LZO1Y -+ || m_pos[9] != *ip++ || m_pos[10] != *ip++ -+ || m_pos[11] != *ip++ || m_pos[12] != *ip++ -+ || m_pos[13] != *ip++ || m_pos[14] != *ip++ -+#endif -+ ) { -+ --ip; -+ m_len = ip - ii; -+ 
assert("lzo-07", m_len >= 3); -+ assert("lzo-08", m_len <= M2_MAX_LEN); -+ -+ if (m_off <= M2_MAX_OFFSET) { -+ m_off -= 1; -+#if defined(LZO1X) -+ *op++ = -+ LZO_BYTE(((m_len - -+ 1) << 5) | ((m_off & 7) << 2)); -+ *op++ = LZO_BYTE(m_off >> 3); -+#elif defined(LZO1Y) -+ *op++ = -+ LZO_BYTE(((m_len + -+ 1) << 4) | ((m_off & 3) << 2)); -+ *op++ = LZO_BYTE(m_off >> 2); -+#endif -+ } else if (m_off <= M3_MAX_OFFSET) { -+ m_off -= 1; -+ *op++ = LZO_BYTE(M3_MARKER | (m_len - 2)); -+ goto m3_m4_offset; -+ } else -+#if defined(LZO1X) -+ { -+ m_off -= 0x4000; -+ assert("lzo-09", m_off > 0); -+ assert("lzo-10", m_off <= 0x7fff); -+ *op++ = LZO_BYTE(M4_MARKER | -+ ((m_off & 0x4000) >> 11) | -+ (m_len - 2)); -+ goto m3_m4_offset; -+ } -+#elif defined(LZO1Y) -+ goto m4_match; -+#endif -+ } else { -+ { -+ const lzo_byte *end = in_end; -+ const lzo_byte *m = m_pos + M2_MAX_LEN + 1; -+ while (ip < end && *m == *ip) -+ m++, ip++; -+ m_len = (ip - ii); -+ } -+ assert("lzo-11", m_len > M2_MAX_LEN); -+ -+ if (m_off <= M3_MAX_OFFSET) { -+ m_off -= 1; -+ if (m_len <= 33) -+ *op++ = -+ LZO_BYTE(M3_MARKER | (m_len - 2)); -+ else { -+ m_len -= 33; -+ *op++ = M3_MARKER | 0; -+ goto m3_m4_len; -+ } -+ } else { -+#if defined(LZO1Y) -+ m4_match: -+#endif -+ m_off -= 0x4000; -+ assert("lzo-12", m_off > 0); -+ assert("lzo-13", m_off <= 0x7fff); -+ if (m_len <= M4_MAX_LEN) -+ *op++ = LZO_BYTE(M4_MARKER | -+ ((m_off & 0x4000) >> -+ 11) | (m_len - 2)); -+ else { -+ m_len -= M4_MAX_LEN; -+ *op++ = -+ LZO_BYTE(M4_MARKER | -+ ((m_off & 0x4000) >> 11)); -+ m3_m4_len: -+ while (m_len > 255) { -+ m_len -= 255; -+ *op++ = 0; -+ } -+ assert("lzo-14", m_len > 0); -+ *op++ = LZO_BYTE(m_len); -+ } -+ } -+ -+ m3_m4_offset: -+ *op++ = LZO_BYTE((m_off & 63) << 2); -+ *op++ = LZO_BYTE(m_off >> 6); -+ } -+ -+ ii = ip; -+ if (ip >= ip_end) -+ break; -+ } -+ -+ *out_len = op - out; -+ return pd(in_end, ii); -+} -+ -+int DO_COMPRESS(const lzo_byte * in, lzo_uint in_len, -+ lzo_byte * out, lzo_uintp out_len, lzo_voidp 
wrkmem) -+{ -+ lzo_byte *op = out; -+ lzo_uint t; -+ -+#if defined(__LZO_QUERY_COMPRESS) -+ if (__LZO_IS_COMPRESS_QUERY(in, in_len, out, out_len, wrkmem)) -+ return __LZO_QUERY_COMPRESS(in, in_len, out, out_len, wrkmem, -+ D_SIZE, lzo_sizeof(lzo_dict_t)); -+#endif -+ -+ if (in_len <= M2_MAX_LEN + 5) -+ t = in_len; -+ else { -+ t = do_compress(in, in_len, op, out_len, wrkmem); -+ op += *out_len; -+ } -+ -+ if (t > 0) { -+ const lzo_byte *ii = in + in_len - t; -+ -+ if (op == out && t <= 238) -+ *op++ = LZO_BYTE(17 + t); -+ else if (t <= 3) -+ op[-2] |= LZO_BYTE(t); -+ else if (t <= 18) -+ *op++ = LZO_BYTE(t - 3); -+ else { -+ lzo_uint tt = t - 18; -+ -+ *op++ = 0; -+ while (tt > 255) { -+ tt -= 255; -+ *op++ = 0; -+ } -+ assert("lzo-15", tt > 0); -+ *op++ = LZO_BYTE(tt); -+ } -+ do -+ *op++ = *ii++; -+ while (--t > 0); -+ } -+ -+ *op++ = M4_MARKER | 1; -+ *op++ = 0; -+ *op++ = 0; -+ -+ *out_len = op - out; -+ return LZO_E_OK; -+} -+ -+#undef do_compress -+#undef DO_COMPRESS -+#undef LZO_HASH -+ -+#undef LZO_TEST_DECOMPRESS_OVERRUN -+#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT -+#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT -+#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND -+#undef DO_DECOMPRESS -+#define DO_DECOMPRESS lzo1x_decompress -+ -+#if defined(LZO_TEST_DECOMPRESS_OVERRUN) -+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT) -+# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2 -+# endif -+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT) -+# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2 -+# endif -+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND) -+# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND -+# endif -+#endif -+ -+#undef TEST_IP -+#undef TEST_OP -+#undef TEST_LOOKBEHIND -+#undef NEED_IP -+#undef NEED_OP -+#undef HAVE_TEST_IP -+#undef HAVE_TEST_OP -+#undef HAVE_NEED_IP -+#undef HAVE_NEED_OP -+#undef HAVE_ANY_IP -+#undef HAVE_ANY_OP -+ -+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT) -+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1) -+# define TEST_IP (ip < ip_end) -+# 
endif -+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2) -+# define NEED_IP(x) \ -+ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun -+# endif -+#endif -+ -+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT) -+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1) -+# define TEST_OP (op <= op_end) -+# endif -+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2) -+# undef TEST_OP -+# define NEED_OP(x) \ -+ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun -+# endif -+#endif -+ -+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND) -+# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun -+#else -+# define TEST_LOOKBEHIND(m_pos,op) ((void) 0) -+#endif -+ -+#if !defined(LZO_EOF_CODE) && !defined(TEST_IP) -+# define TEST_IP (ip < ip_end) -+#endif -+ -+#if defined(TEST_IP) -+# define HAVE_TEST_IP -+#else -+# define TEST_IP 1 -+#endif -+#if defined(TEST_OP) -+# define HAVE_TEST_OP -+#else -+# define TEST_OP 1 -+#endif -+ -+#if defined(NEED_IP) -+# define HAVE_NEED_IP -+#else -+# define NEED_IP(x) ((void) 0) -+#endif -+#if defined(NEED_OP) -+# define HAVE_NEED_OP -+#else -+# define NEED_OP(x) ((void) 0) -+#endif -+ -+#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP) -+# define HAVE_ANY_IP -+#endif -+#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP) -+# define HAVE_ANY_OP -+#endif -+ -+#undef __COPY4 -+#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src) -+ -+#undef COPY4 -+#if defined(LZO_UNALIGNED_OK_4) -+# define COPY4(dst,src) __COPY4(dst,src) -+#elif defined(LZO_ALIGNED_OK_4) -+# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src)) -+#endif -+ -+#if defined(DO_DECOMPRESS) -+int DO_DECOMPRESS(const lzo_byte * in, lzo_uint in_len, -+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem) -+#endif -+{ -+ register lzo_byte *op; -+ register const lzo_byte *ip; -+ register lzo_uint t; -+#if defined(COPY_DICT) -+ lzo_uint m_off; -+ const lzo_byte *dict_end; -+#else -+ register const lzo_byte *m_pos; -+#endif -+ 
-+ const lzo_byte *const ip_end = in + in_len; -+#if defined(HAVE_ANY_OP) -+ lzo_byte *const op_end = out + *out_len; -+#endif -+#if defined(LZO1Z) -+ lzo_uint last_m_off = 0; -+#endif -+ -+ LZO_UNUSED(wrkmem); -+ -+#if defined(__LZO_QUERY_DECOMPRESS) -+ if (__LZO_IS_DECOMPRESS_QUERY(in, in_len, out, out_len, wrkmem)) -+ return __LZO_QUERY_DECOMPRESS(in, in_len, out, out_len, wrkmem, -+ 0, 0); -+#endif -+ -+#if defined(COPY_DICT) -+ if (dict) { -+ if (dict_len > M4_MAX_OFFSET) { -+ dict += dict_len - M4_MAX_OFFSET; -+ dict_len = M4_MAX_OFFSET; -+ } -+ dict_end = dict + dict_len; -+ } else { -+ dict_len = 0; -+ dict_end = NULL; -+ } -+#endif -+ -+ *out_len = 0; -+ -+ op = out; -+ ip = in; -+ -+ if (*ip > 17) { -+ t = *ip++ - 17; -+ if (t < 4) -+ goto match_next; -+ assert("lzo-16", t > 0); -+ NEED_OP(t); -+ NEED_IP(t + 1); -+ do -+ *op++ = *ip++; -+ while (--t > 0); -+ goto first_literal_run; -+ } -+ -+ while (TEST_IP && TEST_OP) { -+ t = *ip++; -+ if (t >= 16) -+ goto match; -+ if (t == 0) { -+ NEED_IP(1); -+ while (*ip == 0) { -+ t += 255; -+ ip++; -+ NEED_IP(1); -+ } -+ t += 15 + *ip++; -+ } -+ assert("lzo-17", t > 0); -+ NEED_OP(t + 3); -+ NEED_IP(t + 4); -+#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4) -+#if !defined(LZO_UNALIGNED_OK_4) -+ if (PTR_ALIGNED2_4(op, ip)) { -+#endif -+ COPY4(op, ip); -+ op += 4; -+ ip += 4; -+ if (--t > 0) { -+ if (t >= 4) { -+ do { -+ COPY4(op, ip); -+ op += 4; -+ ip += 4; -+ t -= 4; -+ } while (t >= 4); -+ if (t > 0) -+ do -+ *op++ = *ip++; -+ while (--t > 0); -+ } else -+ do -+ *op++ = *ip++; -+ while (--t > 0); -+ } -+#if !defined(LZO_UNALIGNED_OK_4) -+ } else -+#endif -+#endif -+#if !defined(LZO_UNALIGNED_OK_4) -+ { -+ *op++ = *ip++; -+ *op++ = *ip++; -+ *op++ = *ip++; -+ do -+ *op++ = *ip++; -+ while (--t > 0); -+ } -+#endif -+ -+ first_literal_run: -+ -+ t = *ip++; -+ if (t >= 16) -+ goto match; -+#if defined(COPY_DICT) -+#if defined(LZO1Z) -+ m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2); -+ last_m_off 
= m_off; -+#else -+ m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2); -+#endif -+ NEED_OP(3); -+ t = 3; -+ COPY_DICT(t, m_off) -+#else -+#if defined(LZO1Z) -+ t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2); -+ m_pos = op - t; -+ last_m_off = t; -+#else -+ m_pos = op - (1 + M2_MAX_OFFSET); -+ m_pos -= t >> 2; -+ m_pos -= *ip++ << 2; -+#endif -+ TEST_LOOKBEHIND(m_pos, out); -+ NEED_OP(3); -+ *op++ = *m_pos++; -+ *op++ = *m_pos++; -+ *op++ = *m_pos; -+#endif -+ goto match_done; -+ -+ while (TEST_IP && TEST_OP) { -+ match: -+ if (t >= 64) { -+#if defined(COPY_DICT) -+#if defined(LZO1X) -+ m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3); -+ t = (t >> 5) - 1; -+#elif defined(LZO1Y) -+ m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2); -+ t = (t >> 4) - 3; -+#elif defined(LZO1Z) -+ m_off = t & 0x1f; -+ if (m_off >= 0x1c) -+ m_off = last_m_off; -+ else { -+ m_off = 1 + (m_off << 6) + (*ip++ >> 2); -+ last_m_off = m_off; -+ } -+ t = (t >> 5) - 1; -+#endif -+#else -+#if defined(LZO1X) -+ m_pos = op - 1; -+ m_pos -= (t >> 2) & 7; -+ m_pos -= *ip++ << 3; -+ t = (t >> 5) - 1; -+#elif defined(LZO1Y) -+ m_pos = op - 1; -+ m_pos -= (t >> 2) & 3; -+ m_pos -= *ip++ << 2; -+ t = (t >> 4) - 3; -+#elif defined(LZO1Z) -+ { -+ lzo_uint off = t & 0x1f; -+ m_pos = op; -+ if (off >= 0x1c) { -+ assert(last_m_off > 0); -+ m_pos -= last_m_off; -+ } else { -+ off = -+ 1 + (off << 6) + -+ (*ip++ >> 2); -+ m_pos -= off; -+ last_m_off = off; -+ } -+ } -+ t = (t >> 5) - 1; -+#endif -+ TEST_LOOKBEHIND(m_pos, out); -+ assert("lzo-18", t > 0); -+ NEED_OP(t + 3 - 1); -+ goto copy_match; -+#endif -+ } else if (t >= 32) { -+ t &= 31; -+ if (t == 0) { -+ NEED_IP(1); -+ while (*ip == 0) { -+ t += 255; -+ ip++; -+ NEED_IP(1); -+ } -+ t += 31 + *ip++; -+ } -+#if defined(COPY_DICT) -+#if defined(LZO1Z) -+ m_off = 1 + (ip[0] << 6) + (ip[1] >> 2); -+ last_m_off = m_off; -+#else -+ m_off = 1 + (ip[0] >> 2) + (ip[1] << 6); -+#endif -+#else -+#if defined(LZO1Z) -+ { -+ lzo_uint off = -+ 1 + (ip[0] << 6) + (ip[1] >> 2); 
-+ m_pos = op - off; -+ last_m_off = off; -+ } -+#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN) -+ m_pos = op - 1; -+ m_pos -= (*(const lzo_ushortp)ip) >> 2; -+#else -+ m_pos = op - 1; -+ m_pos -= (ip[0] >> 2) + (ip[1] << 6); -+#endif -+#endif -+ ip += 2; -+ } else if (t >= 16) { -+#if defined(COPY_DICT) -+ m_off = (t & 8) << 11; -+#else -+ m_pos = op; -+ m_pos -= (t & 8) << 11; -+#endif -+ t &= 7; -+ if (t == 0) { -+ NEED_IP(1); -+ while (*ip == 0) { -+ t += 255; -+ ip++; -+ NEED_IP(1); -+ } -+ t += 7 + *ip++; -+ } -+#if defined(COPY_DICT) -+#if defined(LZO1Z) -+ m_off += (ip[0] << 6) + (ip[1] >> 2); -+#else -+ m_off += (ip[0] >> 2) + (ip[1] << 6); -+#endif -+ ip += 2; -+ if (m_off == 0) -+ goto eof_found; -+ m_off += 0x4000; -+#if defined(LZO1Z) -+ last_m_off = m_off; -+#endif -+#else -+#if defined(LZO1Z) -+ m_pos -= (ip[0] << 6) + (ip[1] >> 2); -+#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN) -+ m_pos -= (*(const lzo_ushortp)ip) >> 2; -+#else -+ m_pos -= (ip[0] >> 2) + (ip[1] << 6); -+#endif -+ ip += 2; -+ if (m_pos == op) -+ goto eof_found; -+ m_pos -= 0x4000; -+#if defined(LZO1Z) -+ last_m_off = op - m_pos; -+#endif -+#endif -+ } else { -+#if defined(COPY_DICT) -+#if defined(LZO1Z) -+ m_off = 1 + (t << 6) + (*ip++ >> 2); -+ last_m_off = m_off; -+#else -+ m_off = 1 + (t >> 2) + (*ip++ << 2); -+#endif -+ NEED_OP(2); -+ t = 2; -+ COPY_DICT(t, m_off) -+#else -+#if defined(LZO1Z) -+ t = 1 + (t << 6) + (*ip++ >> 2); -+ m_pos = op - t; -+ last_m_off = t; -+#else -+ m_pos = op - 1; -+ m_pos -= t >> 2; -+ m_pos -= *ip++ << 2; -+#endif -+ TEST_LOOKBEHIND(m_pos, out); -+ NEED_OP(2); -+ *op++ = *m_pos++; -+ *op++ = *m_pos; -+#endif -+ goto match_done; -+ } -+ -+#if defined(COPY_DICT) -+ -+ NEED_OP(t + 3 - 1); -+ t += 3 - 1; -+ COPY_DICT(t, m_off) -+#else -+ -+ TEST_LOOKBEHIND(m_pos, out); -+ assert("lzo-19", t > 0); -+ NEED_OP(t + 3 - 1); -+#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4) -+#if 
!defined(LZO_UNALIGNED_OK_4) -+ if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op, m_pos)) { -+ assert((op - m_pos) >= 4); -+#else -+ if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) { -+#endif -+ COPY4(op, m_pos); -+ op += 4; -+ m_pos += 4; -+ t -= 4 - (3 - 1); -+ do { -+ COPY4(op, m_pos); -+ op += 4; -+ m_pos += 4; -+ t -= 4; -+ } while (t >= 4); -+ if (t > 0) -+ do -+ *op++ = *m_pos++; -+ while (--t > 0); -+ } else -+#endif -+ { -+ copy_match: -+ *op++ = *m_pos++; -+ *op++ = *m_pos++; -+ do -+ *op++ = *m_pos++; -+ while (--t > 0); -+ } -+ -+#endif -+ -+ match_done: -+#if defined(LZO1Z) -+ t = ip[-1] & 3; -+#else -+ t = ip[-2] & 3; -+#endif -+ if (t == 0) -+ break; -+ -+ match_next: -+ assert("lzo-20", t > 0); -+ NEED_OP(t); -+ NEED_IP(t + 1); -+ do -+ *op++ = *ip++; -+ while (--t > 0); -+ t = *ip++; -+ } -+ } -+ -+#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP) -+ *out_len = op - out; -+ return LZO_E_EOF_NOT_FOUND; -+#endif -+ -+ eof_found: -+ assert("lzo-21", t == 1); -+ *out_len = op - out; -+ return (ip == ip_end ? LZO_E_OK : -+ (ip < ip_end ? 
LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN)); -+ -+#if defined(HAVE_NEED_IP) -+ input_overrun: -+ *out_len = op - out; -+ return LZO_E_INPUT_OVERRUN; -+#endif -+ -+#if defined(HAVE_NEED_OP) -+ output_overrun: -+ *out_len = op - out; -+ return LZO_E_OUTPUT_OVERRUN; -+#endif -+ -+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND) -+ lookbehind_overrun: -+ *out_len = op - out; -+ return LZO_E_LOOKBEHIND_OVERRUN; -+#endif -+} -+ -+#define LZO_TEST_DECOMPRESS_OVERRUN -+#undef DO_DECOMPRESS -+#define DO_DECOMPRESS lzo1x_decompress_safe -+ -+#if defined(LZO_TEST_DECOMPRESS_OVERRUN) -+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT) -+# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2 -+# endif -+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT) -+# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2 -+# endif -+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND) -+# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND -+# endif -+#endif -+ -+#undef TEST_IP -+#undef TEST_OP -+#undef TEST_LOOKBEHIND -+#undef NEED_IP -+#undef NEED_OP -+#undef HAVE_TEST_IP -+#undef HAVE_TEST_OP -+#undef HAVE_NEED_IP -+#undef HAVE_NEED_OP -+#undef HAVE_ANY_IP -+#undef HAVE_ANY_OP -+ -+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT) -+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1) -+# define TEST_IP (ip < ip_end) -+# endif -+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2) -+# define NEED_IP(x) \ -+ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun -+# endif -+#endif -+ -+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT) -+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1) -+# define TEST_OP (op <= op_end) -+# endif -+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2) -+# undef TEST_OP -+# define NEED_OP(x) \ -+ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun -+# endif -+#endif -+ -+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND) -+# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun -+#else -+# define TEST_LOOKBEHIND(m_pos,op) ((void) 0) -+#endif -+ -+#if 
!defined(LZO_EOF_CODE) && !defined(TEST_IP) -+# define TEST_IP (ip < ip_end) -+#endif -+ -+#if defined(TEST_IP) -+# define HAVE_TEST_IP -+#else -+# define TEST_IP 1 -+#endif -+#if defined(TEST_OP) -+# define HAVE_TEST_OP -+#else -+# define TEST_OP 1 -+#endif -+ -+#if defined(NEED_IP) -+# define HAVE_NEED_IP -+#else -+# define NEED_IP(x) ((void) 0) -+#endif -+#if defined(NEED_OP) -+# define HAVE_NEED_OP -+#else -+# define NEED_OP(x) ((void) 0) -+#endif -+ -+#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP) -+# define HAVE_ANY_IP -+#endif -+#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP) -+# define HAVE_ANY_OP -+#endif -+ -+#undef __COPY4 -+#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src) -+ -+#undef COPY4 -+#if defined(LZO_UNALIGNED_OK_4) -+# define COPY4(dst,src) __COPY4(dst,src) -+#elif defined(LZO_ALIGNED_OK_4) -+# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src)) -+#endif -+ -+/***** End of minilzo.c *****/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/compress/minilzo.h linux-2.6.20/fs/reiser4/plugin/compress/minilzo.h ---- linux-2.6.20.orig/fs/reiser4/plugin/compress/minilzo.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/compress/minilzo.h 2007-05-06 14:50:43.754993222 +0400 -@@ -0,0 +1,70 @@ -+/* minilzo.h -- mini subset of the LZO real-time data compression library -+ adopted for reiser4 compression transform plugin. -+ -+ This file is part of the LZO real-time data compression library -+ and not included in any proprietary licenses of reiser4. -+ -+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer -+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer -+ All Rights Reserved. 
-+ -+ The LZO library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU General Public License as -+ published by the Free Software Foundation; either version 2 of -+ the License, or (at your option) any later version. -+ -+ The LZO library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with the LZO library; see the file COPYING. -+ If not, write to the Free Software Foundation, Inc., -+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -+ -+ Markus F.X.J. Oberhumer -+ -+ http://www.oberhumer.com/opensource/lzo/ -+ */ -+ -+/* -+ * NOTE: -+ * the full LZO package can be found at -+ * http://www.oberhumer.com/opensource/lzo/ -+ */ -+ -+#ifndef __MINILZO_H -+#define __MINILZO_H -+ -+#define MINILZO_VERSION 0x1080 -+ -+#include "lzoconf.h" -+ -+/* Memory required for the wrkmem parameter. -+ * When the required size is 0, you can also pass a NULL pointer. 
-+ */ -+ -+#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS -+#define LZO1X_1_MEM_COMPRESS ((lzo_uint32) (16384L * lzo_sizeof_dict_t)) -+#define LZO1X_MEM_DECOMPRESS (0) -+ -+/* compression */ -+extern int lzo1x_1_compress(const lzo_byte * src, lzo_uint src_len, -+ lzo_byte * dst, lzo_uintp dst_len, -+ lzo_voidp wrkmem); -+/* decompression */ -+extern int lzo1x_decompress(const lzo_byte * src, lzo_uint src_len, -+ lzo_byte * dst, lzo_uintp dst_len, -+ lzo_voidp wrkmem /* NOT USED */); -+/* safe decompression with overrun testing */ -+extern int lzo1x_decompress_safe(const lzo_byte * src, lzo_uint src_len, -+ lzo_byte * dst, lzo_uintp dst_len, -+ lzo_voidp wrkmem /* NOT USED */ ); -+ -+#endif /* already included */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.20/fs/reiser4/plugin/crypto/cipher.c ---- linux-2.6.20.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/crypto/cipher.c 2007-05-06 14:50:43.754993222 +0400 -@@ -0,0 +1,37 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, -+ licensing governed by reiser4/README */ -+/* Reiser4 cipher transform plugins */ -+ -+#include "../../debug.h" -+#include "../plugin.h" -+ -+cipher_plugin cipher_plugins[LAST_CIPHER_ID] = { -+ [NONE_CIPHER_ID] = { -+ .h = { -+ .type_id = REISER4_CIPHER_PLUGIN_TYPE, -+ .id = NONE_CIPHER_ID, -+ .pops = NULL, -+ .label = "none", -+ .desc = "no cipher transform", -+ .linkage = {NULL, NULL} -+ }, -+ .alloc = NULL, -+ .free = NULL, -+ .scale = NULL, -+ .align_stream = NULL, -+ .setkey = NULL, -+ .encrypt = NULL, -+ .decrypt = NULL -+ } -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.20/fs/reiser4/plugin/crypto/cipher.h ---- linux-2.6.20.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/crypto/cipher.h 2007-05-06 14:50:43.754993222 +0400 -@@ -0,0 +1,55 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* This file contains definitions for the objects operated -+ by reiser4 key manager, which is something like keyring -+ wrapped by appropriate reiser4 plugin */ -+ -+#if !defined( __FS_REISER4_CRYPT_H__ ) -+#define __FS_REISER4_CRYPT_H__ -+ -+#include -+ -+/* key info imported from user space */ -+typedef struct crypto_data { -+ int keysize; /* uninstantiated key size */ -+ __u8 * key; /* uninstantiated key */ -+ int keyid_size; /* size of passphrase */ -+ __u8 * keyid; /* passphrase */ -+} crypto_data_t; -+ -+/* This object contains all needed infrastructure to implement -+ cipher transform. This is operated (allocating, inheriting, -+ validating, binding to host inode, etc..) by reiser4 key manager. -+ -+ This info can be allocated in two cases: -+ 1. importing a key from user space. -+ 2. reading inode from disk */ -+typedef struct crypto_stat { -+ struct inode * host; -+ struct crypto_hash * digest; -+ struct crypto_blkcipher * cipher; -+#if 0 -+ cipher_key_plugin * kplug; /* key manager */ -+#endif -+ __u8 * keyid; /* key fingerprint, created by digest plugin, -+ using uninstantiated key and passphrase. 
-+ supposed to be stored in disk stat-data */ -+ int inst; /* this indicates if the cipher key is -+ instantiated (case 1 above) */ -+ int keysize; /* uninstantiated key size (bytes), supposed -+ to be stored in disk stat-data */ -+ int keyload_count; /* number of the objects which has this -+ crypto-stat attached */ -+} crypto_stat_t; -+ -+#endif /* __FS_REISER4_CRYPT_H__ */ -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.20/fs/reiser4/plugin/crypto/digest.c ---- linux-2.6.20.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/crypto/digest.c 2007-05-06 14:50:43.754993222 +0400 -@@ -0,0 +1,58 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */ -+/* EDWARD-FIXME-HANS: and it does what? a digest is a what? 
*/ -+#include "../../debug.h" -+#include "../plugin_header.h" -+#include "../plugin.h" -+#include "../file/cryptcompress.h" -+ -+#include -+ -+extern digest_plugin digest_plugins[LAST_DIGEST_ID]; -+ -+static struct crypto_hash * alloc_sha256 (void) -+{ -+#if REISER4_SHA256 -+ return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC); -+#else -+ warning("edward-1418", "sha256 unsupported"); -+ return ERR_PTR(-EINVAL); -+#endif -+} -+ -+static void free_sha256 (struct crypto_hash * tfm) -+{ -+#if REISER4_SHA256 -+ crypto_free_hash(tfm); -+#endif -+ return; -+} -+ -+/* digest plugins */ -+digest_plugin digest_plugins[LAST_DIGEST_ID] = { -+ [SHA256_32_DIGEST_ID] = { -+ .h = { -+ .type_id = REISER4_DIGEST_PLUGIN_TYPE, -+ .id = SHA256_32_DIGEST_ID, -+ .pops = NULL, -+ .label = "sha256_32", -+ .desc = "sha256_32 digest transform", -+ .linkage = {NULL, NULL} -+ }, -+ .fipsize = sizeof(__u32), -+ .alloc = alloc_sha256, -+ .free = free_sha256 -+ } -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.20/fs/reiser4/plugin/dir/dir.h ---- linux-2.6.20.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/dir/dir.h 2007-05-06 14:50:43.754993222 +0400 -@@ -0,0 +1,36 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* this file contains declarations of methods implementing directory plugins */ -+ -+#if !defined( __REISER4_DIR_H__ ) -+#define __REISER4_DIR_H__ -+ -+/*#include "../../key.h" -+ -+#include */ -+ -+/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */ -+ -+/* "hashed" directory methods of dir plugin */ -+void build_entry_key_hashed(const struct inode *, const struct qstr *, -+ reiser4_key *); -+ -+/* declarations of functions implementing 
SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */ -+ -+/* "seekable" directory methods of dir plugin */ -+void build_entry_key_seekable(const struct inode *, const struct qstr *, -+ reiser4_key *); -+ -+/* __REISER4_DIR_H__ */ -+#endif -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.20/fs/reiser4/plugin/dir/hashed_dir.c ---- linux-2.6.20.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/dir/hashed_dir.c 2007-05-06 14:50:43.754993222 +0400 -@@ -0,0 +1,81 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file -+ names to the files. */ -+ -+/* -+ * Hashed directory logically consists of persistent directory -+ * entries. Directory entry is a pair of a file name and a key of stat-data of -+ * a file that has this name in the given directory. -+ * -+ * Directory entries are stored in the tree in the form of directory -+ * items. Directory item should implement dir_entry_ops portion of item plugin -+ * interface (see plugin/item/item.h). Hashed directory interacts with -+ * directory item plugin exclusively through dir_entry_ops operations. -+ * -+ * Currently there are two implementations of directory items: "simple -+ * directory item" (plugin/item/sde.[ch]), and "compound directory item" -+ * (plugin/item/cde.[ch]) with the latter being the default. -+ * -+ * There is, however some delicate way through which directory code interferes -+ * with item plugin: key assignment policy. A key for a directory item is -+ * chosen by directory code, and as described in kassign.c, this key contains -+ * a portion of file name. 
Directory item uses this knowledge to avoid storing -+ * this portion of file name twice: in the key and in the directory item body. -+ * -+ */ -+ -+#include "../../inode.h" -+ -+void complete_entry_key(const struct inode *, const char *name, -+ int len, reiser4_key * result); -+ -+/* this is implementation of build_entry_key method of dir -+ plugin for HASHED_DIR_PLUGIN_ID -+ */ -+void build_entry_key_hashed(const struct inode *dir, /* directory where entry is -+ * (or will be) in.*/ -+ const struct qstr *qname, /* name of file referenced -+ * by this entry */ -+ reiser4_key * result /* resulting key of directory -+ * entry */ ) -+{ -+ const char *name; -+ int len; -+ -+ assert("nikita-1139", dir != NULL); -+ assert("nikita-1140", qname != NULL); -+ assert("nikita-1141", qname->name != NULL); -+ assert("nikita-1142", result != NULL); -+ -+ name = qname->name; -+ len = qname->len; -+ -+ assert("nikita-2867", strlen(name) == len); -+ -+ reiser4_key_init(result); -+ /* locality of directory entry's key is objectid of parent -+ directory */ -+ set_key_locality(result, get_inode_oid(dir)); -+ /* minor packing locality is constant */ -+ set_key_type(result, KEY_FILE_NAME_MINOR); -+ /* dot is special case---we always want it to be first entry in -+ a directory. Actually, we just want to have smallest -+ directory entry. 
-+ */ -+ if (len == 1 && name[0] == '.') -+ return; -+ -+ /* initialize part of entry key which depends on file name */ -+ complete_entry_key(dir, name, len, result); -+} -+ -+/* Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.20/fs/reiser4/plugin/dir/Makefile ---- linux-2.6.20.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/dir/Makefile 2007-05-06 14:50:43.758994472 +0400 -@@ -0,0 +1,5 @@ -+obj-$(CONFIG_REISER4_FS) += dir_plugins.o -+ -+dir_plugins-objs := \ -+ hashed_dir.o \ -+ seekable_dir.o -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.20/fs/reiser4/plugin/dir/seekable_dir.c ---- linux-2.6.20.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/dir/seekable_dir.c 2007-05-06 14:50:43.758994472 +0400 -@@ -0,0 +1,46 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "../../inode.h" -+ -+/* this is implementation of build_entry_key method of dir -+ plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID -+ This is for directories where we want repeatable and restartable readdir() -+ even in case 32bit user level struct dirent (readdir(3)). 
-+*/ -+void -+build_entry_key_seekable(const struct inode *dir, const struct qstr *name, -+ reiser4_key * result) -+{ -+ oid_t objectid; -+ -+ assert("nikita-2283", dir != NULL); -+ assert("nikita-2284", name != NULL); -+ assert("nikita-2285", name->name != NULL); -+ assert("nikita-2286", result != NULL); -+ -+ reiser4_key_init(result); -+ /* locality of directory entry's key is objectid of parent -+ directory */ -+ set_key_locality(result, get_inode_oid(dir)); -+ /* minor packing locality is constant */ -+ set_key_type(result, KEY_FILE_NAME_MINOR); -+ /* dot is special case---we always want it to be first entry in -+ a directory. Actually, we just want to have smallest -+ directory entry. -+ */ -+ if ((name->len == 1) && (name->name[0] == '.')) -+ return; -+ -+ /* objectid of key is 31 lowest bits of hash. */ -+ objectid = -+ inode_hash_plugin(dir)->hash(name->name, -+ (int)name->len) & 0x7fffffff; -+ -+ assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK)); -+ set_key_objectid(result, objectid); -+ -+ /* offset is always 0. 
*/ -+ set_key_offset(result, (__u64) 0); -+ return; -+} -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.20/fs/reiser4/plugin/dir_plugin_common.c ---- linux-2.6.20.orig/fs/reiser4/plugin/dir_plugin_common.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/dir_plugin_common.c 2007-05-06 14:50:43.758994472 +0400 -@@ -0,0 +1,872 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* this file contains typical implementations for most of methods of -+ directory plugin -+*/ -+ -+#include "../inode.h" -+ -+int reiser4_find_entry(struct inode *dir, struct dentry *name, -+ lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *); -+int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key); -+void check_light_weight(struct inode *inode, struct inode *parent); -+ -+/* this is common implementation of get_parent method of dir plugin -+ this is used by NFS kernel server to "climb" up directory tree to -+ check permissions -+ */ -+struct dentry *get_parent_common(struct inode *child) -+{ -+ struct super_block *s; -+ struct inode *parent; -+ struct dentry dotdot; -+ struct dentry *dentry; -+ reiser4_key key; -+ int result; -+ -+ /* -+ * lookup dotdot entry. -+ */ -+ -+ s = child->i_sb; -+ memset(&dotdot, 0, sizeof(dotdot)); -+ dotdot.d_name.name = ".."; -+ dotdot.d_name.len = 2; -+ dotdot.d_op = &get_super_private(s)->ops.dentry; -+ -+ result = reiser4_lookup_name(child, &dotdot, &key); -+ if (result != 0) -+ return ERR_PTR(result); -+ -+ parent = reiser4_iget(s, &key, 1); -+ if (!IS_ERR(parent)) { -+ /* -+ * FIXME-NIKITA dubious: attributes are inherited from @child -+ * to @parent. But: -+ * -+ * (*) this is the only this we can do -+ * -+ * (*) attributes of light-weight object are inherited -+ * from a parent through which object was looked up first, -+ * so it is ambiguous anyway. 
-+ * -+ */ -+ check_light_weight(parent, child); -+ reiser4_iget_complete(parent); -+ dentry = d_alloc_anon(parent); -+ if (dentry == NULL) { -+ iput(parent); -+ dentry = ERR_PTR(RETERR(-ENOMEM)); -+ } else -+ dentry->d_op = &get_super_private(s)->ops.dentry; -+ } else if (PTR_ERR(parent) == -ENOENT) -+ dentry = ERR_PTR(RETERR(-ESTALE)); -+ else -+ dentry = (void *)parent; -+ return dentry; -+} -+ -+/* this is common implementation of is_name_acceptable method of dir -+ plugin -+ */ -+int is_name_acceptable_common(const struct inode *inode, /* directory to check */ -+ const char *name UNUSED_ARG, /* name to check */ -+ int len /* @name's length */ ) -+{ -+ assert("nikita-733", inode != NULL); -+ assert("nikita-734", name != NULL); -+ assert("nikita-735", len > 0); -+ -+ return len <= reiser4_max_filename_len(inode); -+} -+ -+/* there is no common implementation of build_entry_key method of dir -+ plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or -+ plugin/dir/seekable.c:build_entry_key_seekable() for example -+*/ -+ -+/* this is common implementation of build_readdir_key method of dir -+ plugin -+ see reiser4_readdir_common for more details -+*/ -+int build_readdir_key_common(struct file *dir /* directory being read */ , -+ reiser4_key * result /* where to store key */ ) -+{ -+ reiser4_file_fsdata *fdata; -+ struct inode *inode; -+ -+ assert("nikita-1361", dir != NULL); -+ assert("nikita-1362", result != NULL); -+ assert("nikita-1363", dir->f_dentry != NULL); -+ inode = dir->f_dentry->d_inode; -+ assert("nikita-1373", inode != NULL); -+ -+ fdata = reiser4_get_file_fsdata(dir); -+ if (IS_ERR(fdata)) -+ return PTR_ERR(fdata); -+ assert("nikita-1364", fdata != NULL); -+ return extract_key_from_de_id(get_inode_oid(inode), -+ &fdata->dir.readdir.position. 
-+ dir_entry_key, result); -+ -+} -+ -+void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset, -+ int adj); -+ -+/* this is common implementation of add_entry method of dir plugin -+*/ -+int reiser4_add_entry_common(struct inode *object, /* directory to add new name -+ * in */ -+ struct dentry *where, /* new name */ -+ reiser4_object_create_data * data, /* parameters of -+ * new object */ -+ reiser4_dir_entry_desc * entry /* parameters of -+ * new directory -+ * entry */) -+{ -+ int result; -+ coord_t *coord; -+ lock_handle lh; -+ reiser4_dentry_fsdata *fsdata; -+ reiser4_block_nr reserve; -+ -+ assert("nikita-1114", object != NULL); -+ assert("nikita-1250", where != NULL); -+ -+ fsdata = reiser4_get_dentry_fsdata(where); -+ if (unlikely(IS_ERR(fsdata))) -+ return PTR_ERR(fsdata); -+ -+ reserve = inode_dir_plugin(object)->estimate.add_entry(object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ -+ init_lh(&lh); -+ coord = &fsdata->dec.entry_coord; -+ coord_clear_iplug(coord); -+ -+ /* check for this entry in a directory. This is plugin method. */ -+ result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK, -+ entry); -+ if (likely(result == -ENOENT)) { -+ /* add new entry. Just pass control to the directory -+ item plugin. 
*/ -+ assert("nikita-1709", inode_dir_item_plugin(object)); -+ assert("nikita-2230", coord->node == lh.node); -+ reiser4_seal_done(&fsdata->dec.entry_seal); -+ result = -+ inode_dir_item_plugin(object)->s.dir.add_entry(object, -+ coord, &lh, -+ where, -+ entry); -+ if (result == 0) { -+ reiser4_adjust_dir_file(object, where, -+ fsdata->dec.pos + 1, +1); -+ INODE_INC_FIELD(object, i_size); -+ } -+ } else if (result == 0) { -+ assert("nikita-2232", coord->node == lh.node); -+ result = RETERR(-EEXIST); -+ } -+ done_lh(&lh); -+ -+ return result; -+} -+ -+/** -+ * rem_entry - remove entry from directory item -+ * @dir: directory the entry is removed from; its directory item plugin does the removal -+ * @dentry: dentry of the entry being removed; ->d_inode must not be NULL, ->d_name is handed to the item plugin -+ * @entry: description of the entry, passed through to the item plugin -+ * @coord: coordinate of the entry within the directory item -+ * @lh: lock handle, passed through to the item plugin -+ * -+ * Checks (only if REISER4_DEBUG) that the key stored at @coord is that of @dentry->d_inode, returning -EIO on mismatch, and calls item plugin -+ * method to cut entry. -+ */ -+static int -+rem_entry(struct inode *dir, struct dentry *dentry, -+ reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh) -+{ -+ item_plugin *iplug; -+ struct inode *child; -+ -+ iplug = inode_dir_item_plugin(dir); -+ child = dentry->d_inode; -+ assert("nikita-3399", child != NULL); -+ -+ /* check that we are really destroying an entry for @child */ -+ if (REISER4_DEBUG) { -+ int result; -+ reiser4_key key; -+ -+ result = iplug->s.dir.extract_key(coord, &key); -+ if (result != 0) -+ return result; -+ if (get_key_objectid(&key) != get_inode_oid(child)) { -+ warning("nikita-3397", -+ "rem_entry: %#llx != %#llx\n", -+ get_key_objectid(&key), -+ (unsigned long long)get_inode_oid(child)); -+ return RETERR(-EIO); -+ } -+ } -+ return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry); -+} -+ -+/** -+ * reiser4_rem_entry_common - remove entry from a directory -+ * @dir: directory to remove entry from -+ * @dentry: name that is being removed -+ * @entry: description of entry being removed -+ * -+ * This is common implementation of rem_entry method of dir plugin. 
-+ */ -+int reiser4_rem_entry_common(struct inode *dir, -+ struct dentry *dentry, -+ reiser4_dir_entry_desc *entry) -+{ -+ int result; -+ coord_t *coord; -+ lock_handle lh; -+ reiser4_dentry_fsdata *fsdata; -+ __u64 tograb; -+ -+ assert("nikita-1124", dir != NULL); -+ assert("nikita-1125", dentry != NULL); -+ -+ tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir); -+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED); -+ if (result != 0) -+ return RETERR(-ENOSPC); -+ -+ init_lh(&lh); -+ -+ /* check for this entry in a directory. This is plugin method. */ -+ result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry); -+ fsdata = reiser4_get_dentry_fsdata(dentry); -+ if (IS_ERR(fsdata)) { -+ done_lh(&lh); -+ return PTR_ERR(fsdata); -+ } -+ -+ coord = &fsdata->dec.entry_coord; -+ -+ assert("nikita-3404", -+ get_inode_oid(dentry->d_inode) != get_inode_oid(dir) || -+ dir->i_size <= 1); -+ -+ coord_clear_iplug(coord); -+ if (result == 0) { -+ /* remove entry. Just pass control to the directory item -+ plugin. */ -+ assert("vs-542", inode_dir_item_plugin(dir)); -+ reiser4_seal_done(&fsdata->dec.entry_seal); -+ reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1); -+ result = -+ WITH_COORD(coord, -+ rem_entry(dir, dentry, entry, coord, &lh)); -+ if (result == 0) { -+ if (dir->i_size >= 1) -+ INODE_DEC_FIELD(dir, i_size); -+ else { -+ warning("nikita-2509", "Dir %llu is runt", -+ (unsigned long long) -+ get_inode_oid(dir)); -+ result = RETERR(-EIO); -+ } -+ -+ assert("nikita-3405", dentry->d_inode->i_nlink != 1 || -+ dentry->d_inode->i_size != 2 || -+ inode_dir_plugin(dentry->d_inode) == NULL); -+ } -+ } -+ done_lh(&lh); -+ -+ return result; -+} -+ -+static reiser4_block_nr estimate_init(struct inode *parent, -+ struct inode *object); -+static int create_dot_dotdot(struct inode *object, struct inode *parent); -+ -+/* this is common implementation of init method of dir plugin -+ create "." and ".." 
entries -+*/ -+int reiser4_dir_init_common(struct inode *object, /* new directory */ -+ struct inode *parent, /* parent directory */ -+ reiser4_object_create_data * data /* info passed -+ * to us, this -+ * is filled by -+ * reiser4() -+ * syscall in -+ * particular */) -+{ -+ reiser4_block_nr reserve; -+ -+ assert("nikita-680", object != NULL); -+ assert("nikita-681", S_ISDIR(object->i_mode)); -+ assert("nikita-682", parent != NULL); -+ assert("nikita-684", data != NULL); -+ assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID); -+ assert("nikita-687", object->i_mode & S_IFDIR); -+ -+ reserve = estimate_init(parent, object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ -+ return create_dot_dotdot(object, parent); -+} -+ -+/* this is common implementation of done method of dir plugin -+ remove "." entry -+*/ -+int reiser4_dir_done_common(struct inode *object /* object being deleted */ ) -+{ -+ int result; -+ reiser4_block_nr reserve; -+ struct dentry goodby_dots; -+ reiser4_dir_entry_desc entry; -+ -+ assert("nikita-1449", object != NULL); -+ -+ if (reiser4_inode_get_flag(object, REISER4_NO_SD)) -+ return 0; -+ -+ /* of course, this can be rewritten to sweep everything in one -+ reiser4_cut_tree(). 
*/ -+ memset(&entry, 0, sizeof entry); -+ -+ /* FIXME: this done method is called from reiser4_delete_dir_common which -+ * reserved space already */ -+ reserve = inode_dir_plugin(object)->estimate.rem_entry(object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED)) -+ return RETERR(-ENOSPC); -+ -+ memset(&goodby_dots, 0, sizeof goodby_dots); -+ entry.obj = goodby_dots.d_inode = object; -+ goodby_dots.d_name.name = "."; -+ goodby_dots.d_name.len = 1; -+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry); -+ reiser4_free_dentry_fsdata(&goodby_dots); -+ if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT)) -+ /* only worth a warning -+ -+ "values of B will give rise to dom!\n" -+ -- v6src/s2/mv.c:89 -+ */ -+ warning("nikita-2252", "Cannot remove dot of %lli: %i", -+ (unsigned long long)get_inode_oid(object), result); -+ return 0; -+} -+ -+/* this is common implementation of attach method of dir plugin -+*/ -+int reiser4_attach_common(struct inode *child UNUSED_ARG, -+ struct inode *parent UNUSED_ARG) -+{ -+ assert("nikita-2647", child != NULL); -+ assert("nikita-2648", parent != NULL); -+ -+ return 0; -+} -+ -+/* this is common implementation of detach method of dir plugin -+ remove "..", decrease nlink on parent -+*/ -+int reiser4_detach_common(struct inode *object, struct inode *parent) -+{ -+ int result; -+ struct dentry goodby_dots; -+ reiser4_dir_entry_desc entry; -+ -+ assert("nikita-2885", object != NULL); -+ assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD)); -+ -+ memset(&entry, 0, sizeof entry); -+ -+ /* NOTE-NIKITA this only works if @parent is -the- parent of -+ @object, viz. object whose key is stored in dotdot -+ entry. Wouldn't work with hard-links on directories. 
*/ -+ memset(&goodby_dots, 0, sizeof goodby_dots); -+ entry.obj = goodby_dots.d_inode = parent; -+ goodby_dots.d_name.name = ".."; -+ goodby_dots.d_name.len = 2; -+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry); -+ reiser4_free_dentry_fsdata(&goodby_dots); -+ if (result == 0) { -+ /* the dot should be the only entry remaining at this time... */ -+ assert("nikita-3400", -+ object->i_size == 1 && object->i_nlink <= 2); -+#if 0 -+ /* and, together with the only name directory can have, they -+ * provides for the last 2 remaining references. If we get -+ * here as part of error handling during mkdir, @object -+ * possibly has no name yet, so its nlink == 1. If we get here -+ * from rename (targeting empty directory), it has no name -+ * already, so its nlink == 1. */ -+ assert("nikita-3401", -+ object->i_nlink == 2 || object->i_nlink == 1); -+#endif -+ -+ /* decrement nlink of directory removed ".." pointed -+ to */ -+ reiser4_del_nlink(parent, NULL, 0); -+ } -+ return result; -+} -+ -+/* this is common implementation of estimate.add_entry method of -+ dir plugin -+ estimation of adding entry which supposes that entry is inserting a -+ unit into item -+*/ -+reiser4_block_nr estimate_add_entry_common(const struct inode * inode) -+{ -+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode)); -+} -+ -+/* this is common implementation of estimate.rem_entry method of dir -+ plugin -+*/ -+reiser4_block_nr estimate_rem_entry_common(const struct inode * inode) -+{ -+ return estimate_one_item_removal(reiser4_tree_by_inode(inode)); -+} -+ -+/* this is common implementation of estimate.unlink method of dir -+ plugin -+*/ -+reiser4_block_nr -+dir_estimate_unlink_common(const struct inode * parent, -+ const struct inode * object) -+{ -+ reiser4_block_nr res; -+ -+ /* hashed_rem_entry(object) */ -+ res = inode_dir_plugin(object)->estimate.rem_entry(object); -+ /* del_nlink(parent) */ -+ res += 2 * inode_file_plugin(parent)->estimate.update(parent); -+ -+ 
return res; -+} -+ -+/* -+ * helper for inode_ops ->lookup() and dir plugin's ->get_parent() -+ * methods: if @inode is a light-weight file, setup its credentials -+ * that are not stored in the stat-data in this case -+ */ -+void check_light_weight(struct inode *inode, struct inode *parent) -+{ -+ if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) { -+ inode->i_uid = parent->i_uid; -+ inode->i_gid = parent->i_gid; -+ /* clear light-weight flag. If inode would be read by any -+ other name, [ug]id wouldn't change. */ -+ reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT); -+ } -+} -+ -+/* looks for name specified in @dentry in directory @parent and if name is -+ found - key of the object the found entry points to is stored in @key */ -+int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup for -+ * name in */ -+ struct dentry *dentry, /* name to look for */ -+ reiser4_key * key /* place to store key */ ) -+{ -+ int result; -+ coord_t *coord; -+ lock_handle lh; -+ const char *name; -+ int len; -+ reiser4_dir_entry_desc entry; -+ reiser4_dentry_fsdata *fsdata; -+ -+ assert("nikita-1247", parent != NULL); -+ assert("nikita-1248", dentry != NULL); -+ assert("nikita-1123", dentry->d_name.name != NULL); -+ assert("vs-1486", -+ dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry); -+ -+ name = dentry->d_name.name; -+ len = dentry->d_name.len; -+ -+ if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len)) -+ /* some arbitrary error code to return */ -+ return RETERR(-ENAMETOOLONG); -+ -+ fsdata = reiser4_get_dentry_fsdata(dentry); -+ if (IS_ERR(fsdata)) -+ return PTR_ERR(fsdata); -+ -+ coord = &fsdata->dec.entry_coord; -+ coord_clear_iplug(coord); -+ init_lh(&lh); -+ -+ /* find entry in a directory. This is plugin method. */ -+ result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK, -+ &entry); -+ if (result == 0) { -+ /* entry was found, extract object key from it. 
*/ -+ result = -+ WITH_COORD(coord, -+ item_plugin_by_coord(coord)->s.dir. -+ extract_key(coord, key)); -+ } -+ done_lh(&lh); -+ return result; -+ -+} -+ -+/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve for creating "." and ".." in @object; returns the sum of the four per-step estimates below */ -+static reiser4_block_nr -+estimate_init(struct inode *parent, struct inode *object) -+{ -+ reiser4_block_nr res = 0; -+ -+ assert("vpf-321", parent != NULL); -+ assert("vpf-322", object != NULL); -+ -+ /* hashed_add_entry(object): the "." entry */ -+ res += inode_dir_plugin(object)->estimate.add_entry(object); -+ /* reiser4_add_nlink(object) */ -+ res += inode_file_plugin(object)->estimate.update(object); -+ /* hashed_add_entry(object): the ".." entry */ -+ res += inode_dir_plugin(object)->estimate.add_entry(object); -+ /* reiser4_add_nlink(parent) */ -+ res += inode_file_plugin(parent)->estimate.update(parent); -+ -+ return res; /* the accumulated estimate, so that the caller really reserves space */ -+} -+ -+/* helper function for reiser4_dir_init_common(). Create "." and ".." */ -+static int create_dot_dotdot(struct inode *object /* object to create dot and -+ * dotdot for */ , -+ struct inode *parent /* parent of @object */) -+{ -+ int result; -+ struct dentry dots_entry; -+ reiser4_dir_entry_desc entry; -+ -+ assert("nikita-688", object != NULL); -+ assert("nikita-689", S_ISDIR(object->i_mode)); -+ assert("nikita-691", parent != NULL); -+ -+ /* We store dot and dotdot as normal directory entries. This is -+ not necessary, because almost all information stored in them -+ is already in the stat-data of directory, the only thing -+ being missed is objectid of grand-parent directory that can -+ easily be added there as extension. -+ -+ But it is done the way it is done, because not storing dot -+ and dotdot will lead to the following complications: -+ -+ . special case handling in ->lookup(). -+ . addition of another extension to the sd. -+ . dependency on key allocation policy for stat data. 
-+ -+ */ -+ -+ memset(&entry, 0, sizeof entry); -+ memset(&dots_entry, 0, sizeof dots_entry); -+ entry.obj = dots_entry.d_inode = object; -+ dots_entry.d_name.name = "."; -+ dots_entry.d_name.len = 1; -+ result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry); -+ reiser4_free_dentry_fsdata(&dots_entry); -+ -+ if (result == 0) { -+ result = reiser4_add_nlink(object, object, 0); -+ if (result == 0) { -+ entry.obj = dots_entry.d_inode = parent; -+ dots_entry.d_name.name = ".."; -+ dots_entry.d_name.len = 2; -+ result = reiser4_add_entry_common(object, -+ &dots_entry, NULL, &entry); -+ reiser4_free_dentry_fsdata(&dots_entry); -+ /* if creation of ".." failed, iput() will delete -+ object with ".". */ -+ if (result == 0) { -+ result = reiser4_add_nlink(parent, object, 0); -+ if (result != 0) -+ /* -+ * if we failed to bump i_nlink, try -+ * to remove ".." -+ */ -+ reiser4_detach_common(object, parent); -+ } -+ } -+ } -+ -+ if (result != 0) { -+ /* -+ * in the case of error, at least update stat-data so that, -+ * ->i_nlink updates are not lingering. -+ */ -+ reiser4_update_sd(object); -+ reiser4_update_sd(parent); -+ } -+ -+ return result; -+} -+ -+/* -+ * return 0 iff @coord contains a directory entry for the file with the name -+ * @name. 
-+ */ -+static int -+check_item(const struct inode *dir, const coord_t * coord, const char *name) -+{ -+ item_plugin *iplug; -+ char buf[DE_NAME_BUF_LEN]; -+ -+ iplug = item_plugin_by_coord(coord); -+ if (iplug == NULL) { -+ warning("nikita-1135", "Cannot get item plugin"); -+ print_coord("coord", coord, 1); -+ return RETERR(-EIO); -+ } else if (item_id_by_coord(coord) != -+ item_id_by_plugin(inode_dir_item_plugin(dir))) { -+ /* item id of current item does not match to id of items a -+ directory is built of */ -+ warning("nikita-1136", "Wrong item plugin"); -+ print_coord("coord", coord, 1); -+ return RETERR(-EIO); -+ } -+ assert("nikita-1137", iplug->s.dir.extract_name); -+ -+ /* Compare name stored in this entry with name we are looking for. -+ -+ NOTE-NIKITA Here should go code for support of something like -+ unicode, code tables, etc. -+ */ -+ return !!strcmp(name, iplug->s.dir.extract_name(coord, buf)); -+} -+ -+static int -+check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name) -+{ -+ return WITH_COORD(coord, check_item(dir, coord, name->name)); -+} -+ -+/* -+ * argument package used by entry_actor to scan entries with identical keys. -+ */ -+typedef struct entry_actor_args { -+ /* name we are looking for */ -+ const char *name; -+ /* key of directory entry. entry_actor() scans through sequence of -+ * items/units having the same key */ -+ reiser4_key *key; -+ /* how many entries with duplicate key was scanned so far. 
*/ -+ int non_uniq; -+#if REISER4_USE_COLLISION_LIMIT -+ /* scan limit */ -+ int max_non_uniq; -+#endif -+ /* return parameter: set to true, if ->name wasn't found */ -+ int not_found; -+ /* what type of lock to take when moving to the next node during -+ * scan */ -+ znode_lock_mode mode; -+ -+ /* last coord that was visited during scan */ -+ coord_t last_coord; -+ /* last node locked during scan */ -+ lock_handle last_lh; -+ /* inode of directory */ -+ const struct inode *inode; -+} entry_actor_args; -+ -+/* Function called by reiser4_find_entry() to look for given name -+ in the directory. */ -+static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ , -+ coord_t * coord /* current coord */ , -+ lock_handle * lh /* current lock handle */ , -+ void *entry_actor_arg /* argument to scan */ ) -+{ -+ reiser4_key unit_key; -+ entry_actor_args *args; -+ -+ assert("nikita-1131", tree != NULL); -+ assert("nikita-1132", coord != NULL); -+ assert("nikita-1133", entry_actor_arg != NULL); -+ -+ args = entry_actor_arg; -+ ++args->non_uniq; -+#if REISER4_USE_COLLISION_LIMIT -+ if (args->non_uniq > args->max_non_uniq) { -+ args->not_found = 1; -+ /* hash collision overflow. */ -+ return RETERR(-EBUSY); -+ } -+#endif -+ -+ /* -+ * did we just reach the end of the sequence of items/units with -+ * identical keys? -+ */ -+ if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) { -+ assert("nikita-1791", -+ keylt(args->key, unit_key_by_coord(coord, &unit_key))); -+ args->not_found = 1; -+ args->last_coord.between = AFTER_UNIT; -+ return 0; -+ } -+ -+ coord_dup(&args->last_coord, coord); -+ /* -+ * did scan just moved to the next node? 
-+ */ -+ if (args->last_lh.node != lh->node) { -+ int lock_result; -+ -+ /* -+ * if so, lock new node with the mode requested by the caller -+ */ -+ done_lh(&args->last_lh); -+ assert("nikita-1896", znode_is_any_locked(lh->node)); -+ lock_result = longterm_lock_znode(&args->last_lh, lh->node, -+ args->mode, ZNODE_LOCK_HIPRI); -+ if (lock_result != 0) -+ return lock_result; -+ } -+ return check_item(args->inode, coord, args->name); -+} -+ -+/* Look for given @name within directory @dir. -+ -+ This is called during lookup, creation and removal of directory -+ entries and on reiser4_rename_common -+ -+ First calculate key that directory entry for @name would have. Search -+ for this key in the tree. If such key is found, scan all items with -+ the same key, checking name in each directory entry along the way. -+*/ -+int reiser4_find_entry(struct inode *dir, /* directory to scan */ -+ struct dentry *de, /* name to search for */ -+ lock_handle * lh, /* resulting lock handle */ -+ znode_lock_mode mode, /* required lock mode */ -+ reiser4_dir_entry_desc * entry /* parameters of found -+ directory entry */) -+{ -+ const struct qstr *name; -+ seal_t *seal; -+ coord_t *coord; -+ int result; -+ __u32 flags; -+ de_location *dec; -+ reiser4_dentry_fsdata *fsdata; -+ -+ assert("nikita-1130", lh != NULL); -+ assert("nikita-1128", dir != NULL); -+ -+ name = &de->d_name; -+ assert("nikita-1129", name != NULL); -+ -+ /* dentry private data don't require lock, because dentry -+ manipulations are protected by i_mutex on parent. -+ -+ This is not so for inodes, because there is no -the- parent in -+ inode case. 
-+ */ -+ fsdata = reiser4_get_dentry_fsdata(de); -+ if (IS_ERR(fsdata)) -+ return PTR_ERR(fsdata); -+ dec = &fsdata->dec; -+ -+ coord = &dec->entry_coord; -+ coord_clear_iplug(coord); -+ seal = &dec->entry_seal; -+ /* compose key of directory entry for @name */ -+ inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key); -+ -+ if (reiser4_seal_is_set(seal)) { -+ /* check seal */ -+ result = reiser4_seal_validate(seal, coord, &entry->key, -+ lh, mode, ZNODE_LOCK_LOPRI); -+ if (result == 0) { -+ /* key was found. Check that it is really item we are -+ looking for. */ -+ result = check_entry(dir, coord, name); -+ if (result == 0) -+ return 0; -+ } -+ } -+ flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0; -+ /* -+ * find place in the tree where directory item should be located. -+ */ -+ result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode, -+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, -+ flags, NULL /*ra_info */ ); -+ if (result == CBK_COORD_FOUND) { -+ entry_actor_args arg; -+ -+ /* fast path: no hash collisions */ -+ result = check_entry(dir, coord, name); -+ if (result == 0) { -+ reiser4_seal_init(seal, coord, &entry->key); -+ dec->pos = 0; -+ } else if (result > 0) { -+ /* Iterate through all units with the same keys. */ -+ arg.name = name->name; -+ arg.key = &entry->key; -+ arg.not_found = 0; -+ arg.non_uniq = 0; -+#if REISER4_USE_COLLISION_LIMIT -+ arg.max_non_uniq = max_hash_collisions(dir); -+ assert("nikita-2851", arg.max_non_uniq > 1); -+#endif -+ arg.mode = mode; -+ arg.inode = dir; -+ coord_init_zero(&arg.last_coord); -+ init_lh(&arg.last_lh); -+ -+ result = reiser4_iterate_tree -+ (reiser4_tree_by_inode(dir), -+ coord, lh, -+ entry_actor, &arg, mode, 1); -+ /* if end of the tree or extent was reached during -+ scanning. 
*/ -+ if (arg.not_found || (result == -E_NO_NEIGHBOR)) { -+ /* step back */ -+ done_lh(lh); -+ -+ result = zload(arg.last_coord.node); -+ if (result == 0) { -+ coord_clear_iplug(&arg.last_coord); -+ coord_dup(coord, &arg.last_coord); -+ move_lh(lh, &arg.last_lh); -+ result = RETERR(-ENOENT); -+ zrelse(arg.last_coord.node); -+ --arg.non_uniq; -+ } -+ } -+ -+ done_lh(&arg.last_lh); -+ if (result == 0) -+ reiser4_seal_init(seal, coord, &entry->key); -+ -+ if (result == 0 || result == -ENOENT) { -+ assert("nikita-2580", arg.non_uniq > 0); -+ dec->pos = arg.non_uniq - 1; -+ } -+ } -+ } else -+ dec->pos = -1; -+ return result; -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format40.c ---- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format40.c 2007-05-06 14:50:43.762995722 +0400 -@@ -0,0 +1,655 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../key.h" -+#include "../node/node.h" -+#include "../space/space_allocator.h" -+#include "disk_format40.h" -+#include "../plugin.h" -+#include "../../txnmgr.h" -+#include "../../jnode.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../../wander.h" -+#include "../../inode.h" -+#include "../../ktxnmgrd.h" -+#include "../../status_flags.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block */ -+#include -+ -+/* reiser 4.0 default disk layout */ -+ -+/* Amount of free blocks needed to perform release_format40 when fs gets -+ mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header -+ & tx record. 
*/ -+#define RELEASE_RESERVED 4 -+ -+/* The greatest supported format40 version number */ -+#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION -+ -+/* This flag indicates that backup should be updated -+ (the update is performed by fsck). It is the top bit of the 32-bit on-disk version field; the constant is unsigned so that the shift and its complement are well defined */ -+#define FORMAT40_UPDATE_BACKUP (1U << 31) -+ -+/* functions to access fields of format40_disk_super_block: each reads the (possibly unaligned) little-endian on-disk field and converts it to CPU byte order */ -+static __u64 get_format40_block_count(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->block_count)); -+} -+ -+static __u64 get_format40_free_blocks(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->free_blocks)); -+} -+ -+static __u64 get_format40_root_block(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->root_block)); -+} -+ -+static __u16 get_format40_tree_height(const format40_disk_super_block * sb) -+{ -+ return le16_to_cpu(get_unaligned(&sb->tree_height)); -+} -+ -+static __u64 get_format40_file_count(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->file_count)); -+} -+ -+static __u64 get_format40_oid(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->oid)); -+} -+ -+static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb) -+{ -+ return le32_to_cpu(get_unaligned(&sb->mkfs_id)); -+} -+ -+static __u64 get_format40_flags(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->flags)); -+} -+ -+static __u32 get_format40_version(const format40_disk_super_block * sb) -+{ -+ return le32_to_cpu(get_unaligned(&sb->version)) & -+ ~FORMAT40_UPDATE_BACKUP; -+} -+ -+static int update_backup_version(const format40_disk_super_block * sb) -+{ -+ return (le32_to_cpu(get_unaligned(&sb->version)) & -+ FORMAT40_UPDATE_BACKUP); -+} -+ -+static int update_disk_version(const format40_disk_super_block * sb) -+{ -+ return (get_format40_version(sb) < FORMAT40_VERSION); -+} -+ -+static int incomplete_compatibility(const 
format40_disk_super_block * sb) -+{ -+ return (get_format40_version(sb) > FORMAT40_VERSION); -+} -+ -+static format40_super_info *get_sb_info(struct super_block *super) -+{ -+ return &get_super_private(super)->u.format40; -+} -+ -+static int consult_diskmap(struct super_block *s) -+{ -+ format40_super_info *info; -+ journal_location *jloc; -+ -+ info = get_sb_info(s); -+ jloc = &get_super_private(s)->jloc; -+ /* Default format-specific locations, if there is nothing in -+ * diskmap */ -+ jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR; -+ jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR; -+ info->loc.super = FORMAT40_OFFSET / s->s_blocksize; -+#ifdef CONFIG_REISER4_BADBLOCKS -+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF, -+ &jloc->footer); -+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH, -+ &jloc->header); -+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER, -+ &info->loc.super); -+#endif -+ return 0; -+} -+ -+/* find any valid super block of disk_format40 (even if the first -+ super block is destroyed), will change block numbers of actual journal header/footer (jf/jh) -+ if needed */ -+static struct buffer_head *find_a_disk_format40_super_block(struct super_block -+ *s) -+{ -+ struct buffer_head *super_bh; -+ format40_disk_super_block *disk_sb; -+ format40_super_info *info; -+ -+ assert("umka-487", s != NULL); -+ -+ info = get_sb_info(s); -+ -+ super_bh = sb_bread(s, info->loc.super); -+ if (super_bh == NULL) -+ return ERR_PTR(RETERR(-EIO)); -+ -+ disk_sb = (format40_disk_super_block *) super_bh->b_data; -+ if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) { -+ brelse(super_bh); -+ return ERR_PTR(RETERR(-EINVAL)); -+ } -+ -+ reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count))); -+ reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) - -+ le64_to_cpu(get_unaligned(&disk_sb->free_blocks))); -+ reiser4_set_free_blocks(s, 
le64_to_cpu(get_unaligned(&disk_sb->free_blocks))); -+ -+ return super_bh; -+} -+ -+/* find the most recent version of super block. This is called after journal is -+ replayed */ -+static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG) -+{ -+ /* Here the most recent superblock copy has to be read. However, as -+ journal replay isn't complete, we are using -+ find_a_disk_format40_super_block() function. */ -+ return find_a_disk_format40_super_block(s); -+} -+ -+static int get_super_jnode(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ jnode *sb_jnode; -+ int ret; -+ -+ sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super); -+ -+ ret = jload(sb_jnode); -+ -+ if (ret) { -+ reiser4_drop_io_head(sb_jnode); -+ return ret; -+ } -+ -+ pin_jnode_data(sb_jnode); -+ jrelse(sb_jnode); -+ -+ sbinfo->u.format40.sb_jnode = sb_jnode; -+ -+ return 0; -+} -+ -+static void done_super_jnode(struct super_block *s) -+{ -+ jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode; -+ -+ if (sb_jnode) { -+ unpin_jnode_data(sb_jnode); -+ reiser4_drop_io_head(sb_jnode); -+ } -+} -+ -+typedef enum format40_init_stage { -+ NONE_DONE = 0, -+ CONSULT_DISKMAP, -+ FIND_A_SUPER, -+ INIT_JOURNAL_INFO, -+ INIT_STATUS, -+ JOURNAL_REPLAY, -+ READ_SUPER, -+ KEY_CHECK, -+ INIT_OID, -+ INIT_TREE, -+ JOURNAL_RECOVER, -+ INIT_SA, -+ INIT_JNODE, -+ ALL_DONE -+} format40_init_stage; -+ -+static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh) -+{ -+ format40_disk_super_block *sb_copy; -+ -+ sb_copy = kmalloc(sizeof(format40_disk_super_block), -+ reiser4_ctx_gfp_mask_get()); -+ if (sb_copy == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data), -+ sizeof(format40_disk_super_block)); -+ return sb_copy; -+} -+ -+static int check_key_format(const format40_disk_super_block *sb_copy) -+{ -+ if (!equi(REISER4_LARGE_KEY, -+ get_format40_flags(sb_copy) & (1 << 
FORMAT40_LARGE_KEYS))) { -+ warning("nikita-3228", "Key format mismatch. " -+ "Only %s keys are supported.", -+ REISER4_LARGE_KEY ? "large" : "small"); -+ return RETERR(-EINVAL); -+ } -+ return 0; -+} -+ -+/** -+ * try_init_format40 -+ * @super: -+ * @stage: -+ * -+ */ -+static int try_init_format40(struct super_block *super, -+ format40_init_stage *stage) -+{ -+ int result; -+ struct buffer_head *super_bh; -+ reiser4_super_info_data *sbinfo; -+ format40_disk_super_block *sb_copy; -+ tree_level height; -+ reiser4_block_nr root_block; -+ node_plugin *nplug; -+ -+ assert("vs-475", super != NULL); -+ assert("vs-474", get_super_private(super)); -+ -+ *stage = NONE_DONE; -+ -+ result = consult_diskmap(super); -+ if (result) -+ return result; -+ *stage = CONSULT_DISKMAP; -+ -+ super_bh = find_a_disk_format40_super_block(super); -+ if (IS_ERR(super_bh)) -+ return PTR_ERR(super_bh); -+ brelse(super_bh); -+ *stage = FIND_A_SUPER; -+ -+ /* ok, we are sure that filesystem format is a format40 format */ -+ -+ /* map jnodes for journal control blocks (header, footer) to disk */ -+ result = reiser4_init_journal_info(super); -+ if (result) -+ return result; -+ *stage = INIT_JOURNAL_INFO; -+ -+ /* ok, we are sure that filesystem format is a format40 format */ -+ /* Now check it's state */ -+ result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR); -+ if (result != 0 && result != -EINVAL) -+ /* -EINVAL means there is no magic, so probably just old -+ * fs. 
*/ -+ return result; -+ *stage = INIT_STATUS; -+ -+ result = reiser4_status_query(NULL, NULL); -+ if (result == REISER4_STATUS_MOUNT_WARN) -+ notice("vpf-1363", "Warning: mounting %s with errors.", -+ super->s_id); -+ if (result == REISER4_STATUS_MOUNT_RO) -+ notice("vpf-1364", "Warning: mounting %s with fatal errors," -+ " forcing read-only mount.", super->s_id); -+ result = reiser4_journal_replay(super); -+ if (result) -+ return result; -+ *stage = JOURNAL_REPLAY; -+ -+ super_bh = read_super_block(super); -+ if (IS_ERR(super_bh)) -+ return PTR_ERR(super_bh); -+ *stage = READ_SUPER; -+ -+ /* allocate and make a copy of format40_disk_super_block */ -+ sb_copy = copy_sb(super_bh); -+ brelse(super_bh); -+ -+ if (IS_ERR(sb_copy)) -+ return PTR_ERR(sb_copy); -+ printk("reiser4: %s: found disk format 4.0.%u.\n", -+ super->s_id, -+ get_format40_version(sb_copy)); -+ if (incomplete_compatibility(sb_copy)) -+ printk("reiser4: Warning: The last completely supported " -+ "version of disk format40 is %u. 
Some objects of " -+ "the semantic tree can be unaccessible.\n", -+ FORMAT40_VERSION); -+ /* make sure that key format of kernel and filesystem match */ -+ result = check_key_format(sb_copy); -+ if (result) { -+ kfree(sb_copy); -+ return result; -+ } -+ *stage = KEY_CHECK; -+ -+ result = oid_init_allocator(super, get_format40_file_count(sb_copy), -+ get_format40_oid(sb_copy)); -+ if (result) { -+ kfree(sb_copy); -+ return result; -+ } -+ *stage = INIT_OID; -+ -+ /* get things necessary to init reiser4_tree */ -+ root_block = get_format40_root_block(sb_copy); -+ height = get_format40_tree_height(sb_copy); -+ nplug = node_plugin_by_id(NODE40_ID); -+ -+ /* initialize reiser4_super_info_data */ -+ sbinfo = get_super_private(super); -+ assert("", sbinfo->tree.super == super); -+ /* init reiser4_tree for the filesystem */ -+ result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug); -+ if (result) { -+ kfree(sb_copy); -+ return result; -+ } -+ *stage = INIT_TREE; -+ -+ /* -+ * initialize reiser4_super_info_data with data from format40 super -+ * block -+ */ -+ sbinfo->default_uid = 0; -+ sbinfo->default_gid = 0; -+ sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy); -+ /* number of blocks in filesystem and reserved space */ -+ reiser4_set_block_count(super, get_format40_block_count(sb_copy)); -+ sbinfo->blocks_free = get_format40_free_blocks(sb_copy); -+ sbinfo->version = get_format40_version(sb_copy); -+ kfree(sb_copy); -+ -+ if (update_backup_version(sb_copy)) -+ printk("reiser4: Warning: metadata backup is not updated. 
" -+ "Please run 'fsck.reiser4 --fix' on %s.\n", -+ super->s_id); -+ -+ sbinfo->fsuid = 0; -+ sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories -+ * are not supported */ -+ sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in -+ * layout 40 are -+ * of one -+ * plugin */ -+ /* sbinfo->tmgr is initialized already */ -+ -+ /* recover sb data which were logged separately from sb block */ -+ -+ /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls -+ * oid_init_allocator() and reiser4_set_free_blocks() with new -+ * data. What's the reason to call them above? */ -+ result = reiser4_journal_recover_sb_data(super); -+ if (result != 0) -+ return result; -+ *stage = JOURNAL_RECOVER; -+ -+ /* -+ * Set number of used blocks. The number of used blocks is not stored -+ * neither in on-disk super block nor in the journal footer blocks. At -+ * this moment actual values of total blocks and free block counters -+ * are set in the reiser4 super block (in-memory structure) and we can -+ * calculate number of used blocks from them. 
-+ */ -+ reiser4_set_data_blocks(super, -+ reiser4_block_count(super) - -+ reiser4_free_blocks(super)); -+ -+#if REISER4_DEBUG -+ sbinfo->min_blocks_used = 16 /* reserved area */ + -+ 2 /* super blocks */ + -+ 2 /* journal footer and header */ ; -+#endif -+ -+ /* init disk space allocator */ -+ result = sa_init_allocator(reiser4_get_space_allocator(super), -+ super, NULL); -+ if (result) -+ return result; -+ *stage = INIT_SA; -+ -+ result = get_super_jnode(super); -+ if (result == 0) -+ *stage = ALL_DONE; -+ return result; -+} -+ -+/* plugin->u.format.get_ready */ -+int init_format_format40(struct super_block *s, void *data UNUSED_ARG) -+{ -+ int result; -+ format40_init_stage stage; -+ -+ result = try_init_format40(s, &stage); -+ switch (stage) { -+ case ALL_DONE: -+ assert("nikita-3458", result == 0); -+ break; -+ case INIT_JNODE: -+ done_super_jnode(s); -+ case INIT_SA: -+ sa_destroy_allocator(reiser4_get_space_allocator(s), s); -+ case JOURNAL_RECOVER: -+ case INIT_TREE: -+ reiser4_done_tree(&get_super_private(s)->tree); -+ case INIT_OID: -+ case KEY_CHECK: -+ case READ_SUPER: -+ case JOURNAL_REPLAY: -+ case INIT_STATUS: -+ reiser4_status_finish(); -+ case INIT_JOURNAL_INFO: -+ reiser4_done_journal_info(s); -+ case FIND_A_SUPER: -+ case CONSULT_DISKMAP: -+ case NONE_DONE: -+ break; -+ default: -+ impossible("nikita-3457", "init stage: %i", stage); -+ } -+ -+ if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED) -+ return RETERR(-ENOSPC); -+ -+ return result; -+} -+ -+static void pack_format40_super(const struct super_block *s, char *data) -+{ -+ format40_disk_super_block *super_data = -+ (format40_disk_super_block *) data; -+ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ -+ assert("zam-591", data != NULL); -+ -+ put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)), -+ &super_data->free_blocks); -+ -+ put_unaligned(cpu_to_le64(sbinfo->tree.root_block), -+ &super_data->root_block); -+ -+ put_unaligned(cpu_to_le64(oid_next(s)), -+ 
&super_data->oid); -+ -+ put_unaligned(cpu_to_le64(oids_used(s)), -+ &super_data->file_count); -+ -+ put_unaligned(cpu_to_le16(sbinfo->tree.height), -+ &super_data->tree_height); -+ -+ if (update_disk_version(super_data)) { -+ __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP; -+ -+ put_unaligned(cpu_to_le32(version), &super_data->version); -+ } -+} -+ -+/* plugin->u.format.log_super -+ return a jnode which should be added to transaction when the super block -+ gets logged */ -+jnode *log_super_format40(struct super_block *s) -+{ -+ jnode *sb_jnode; -+ -+ sb_jnode = get_super_private(s)->u.format40.sb_jnode; -+ -+ jload(sb_jnode); -+ -+ pack_format40_super(s, jdata(sb_jnode)); -+ -+ jrelse(sb_jnode); -+ -+ return sb_jnode; -+} -+ -+/* plugin->u.format.release */ -+int release_format40(struct super_block *s) -+{ -+ int ret; -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(s); -+ assert("zam-579", sbinfo != NULL); -+ -+ if (!rofs_super(s)) { -+ ret = reiser4_capture_super_block(s); -+ if (ret != 0) -+ warning("vs-898", -+ "reiser4_capture_super_block failed: %d", -+ ret); -+ -+ ret = txnmgr_force_commit_all(s, 1); -+ if (ret != 0) -+ warning("jmacd-74438", "txn_force failed: %d", ret); -+ -+ all_grabbed2free(); -+ } -+ -+ sa_destroy_allocator(&sbinfo->space_allocator, s); -+ reiser4_done_journal_info(s); -+ done_super_jnode(s); -+ -+ rcu_barrier(); -+ reiser4_done_tree(&sbinfo->tree); -+ /* call finish_rcu(), because some znode were "released" in -+ * reiser4_done_tree(). 
*/ -+ rcu_barrier(); -+ -+ return 0; -+} -+ -+#define FORMAT40_ROOT_LOCALITY 41 -+#define FORMAT40_ROOT_OBJECTID 42 -+ -+/* plugin->u.format.root_dir_key */ -+const reiser4_key *root_dir_key_format40(const struct super_block *super -+ UNUSED_ARG) -+{ -+ static const reiser4_key FORMAT40_ROOT_DIR_KEY = { -+ .el = { -+ __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR), -+#if REISER4_LARGE_KEY -+ ON_LARGE_KEY(0ull,) -+#endif -+ __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID), -+ 0ull -+ } -+ }; -+ -+ return &FORMAT40_ROOT_DIR_KEY; -+} -+ -+/* plugin->u.format.check_open. -+ Check the opened object for validness. For now it checks for the valid oid & -+ locality only, can be improved later and it its work may depend on the mount -+ options. */ -+int check_open_format40(const struct inode *object) -+{ -+ oid_t max, oid; -+ -+ max = oid_next(object->i_sb) - 1; -+ -+ /* Check the oid. */ -+ oid = get_inode_oid(object); -+ if (oid > max) { -+ warning("vpf-1360", "The object with the oid %llu " -+ "greater then the max used oid %llu found.", -+ (unsigned long long)oid, (unsigned long long)max); -+ -+ return RETERR(-EIO); -+ } -+ -+ /* Check the locality. */ -+ oid = reiser4_inode_data(object)->locality_id; -+ if (oid > max) { -+ warning("vpf-1361", "The object with the locality %llu " -+ "greater then the max used oid %llu found.", -+ (unsigned long long)oid, (unsigned long long)max); -+ -+ return RETERR(-EIO); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.format.version_update. -+ Perform all version update operations from the on-disk -+ format40_disk_super_block.version on disk to FORMAT40_VERSION. -+ */ -+int version_update_format40(struct super_block *super) { -+ txn_handle * trans; -+ lock_handle lh; -+ txn_atom *atom; -+ int ret; -+ -+ /* Nothing to do if RO mount or the on-disk version is not less. 
*/ -+ if (super->s_flags & MS_RDONLY) -+ return 0; -+ -+ if (get_super_private(super)->version >= FORMAT40_VERSION) -+ return 0; -+ -+ printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata " -+ "backup is left unchanged. Please run 'fsck.reiser4 --fix' " -+ "on %s to update it too.\n", FORMAT40_VERSION, super->s_id); -+ -+ /* Mark the uber znode dirty to call log_super on write_logs. */ -+ init_lh(&lh); -+ ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_HIPRI, &lh); -+ if (ret != 0) -+ return ret; -+ -+ znode_make_dirty(lh.node); -+ done_lh(&lh); -+ -+ /* Update the backup blocks. */ -+ -+ /* Force write_logs immediately. */ -+ trans = get_current_context()->trans; -+ atom = get_current_atom_locked(); -+ assert("vpf-1906", atom != NULL); -+ -+ spin_lock_txnh(trans); -+ return force_commit_atom(trans); -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format40.h ---- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format40.h 2007-05-06 14:50:43.762995722 +0400 -@@ -0,0 +1,109 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* this file contains: -+ - definition of ondisk super block of standart disk layout for -+ reiser 4.0 (layout 40) -+ - definition of layout 40 specific portion of in-core super block -+ - declarations of functions implementing methods of layout plugin -+ for layout 40 -+ - declarations of functions used to get/set fields in layout 40 super block -+*/ -+ -+#ifndef __DISK_FORMAT40_H__ -+#define __DISK_FORMAT40_H__ -+ -+/* magic for default reiser4 layout */ -+#define FORMAT40_MAGIC "ReIsEr40FoRmAt" 
-+#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE) -+ -+#include "../../dformat.h" -+ -+#include /* for struct super_block */ -+ -+typedef enum { -+ FORMAT40_LARGE_KEYS -+} format40_flags; -+ -+/* ondisk super block for format 40. It is 512 bytes long */ -+typedef struct format40_disk_super_block { -+ /* 0 */ d64 block_count; -+ /* number of block in a filesystem */ -+ /* 8 */ d64 free_blocks; -+ /* number of free blocks */ -+ /* 16 */ d64 root_block; -+ /* filesystem tree root block */ -+ /* 24 */ d64 oid; -+ /* smallest free objectid */ -+ /* 32 */ d64 file_count; -+ /* number of files in a filesystem */ -+ /* 40 */ d64 flushes; -+ /* number of times super block was -+ flushed. Needed if format 40 -+ will have few super blocks */ -+ /* 48 */ d32 mkfs_id; -+ /* unique identifier of fs */ -+ /* 52 */ char magic[16]; -+ /* magic string ReIsEr40FoRmAt */ -+ /* 68 */ d16 tree_height; -+ /* height of filesystem tree */ -+ /* 70 */ d16 formatting_policy; -+ /* not used anymore */ -+ /* 72 */ d64 flags; -+ /* 80 */ d32 version; -+ /* on-disk format version number -+ initially assigned by mkfs as the greatest format40 -+ version number supported by reiser4progs and updated -+ in mount time in accordance with the greatest format40 -+ version number supported by kernel. -+ Is used by fsck to catch possible corruption and -+ for various compatibility issues */ -+ /* 84 */ char not_used[428]; -+} format40_disk_super_block; -+ -+/* format 40 specific part of reiser4_super_info_data */ -+typedef struct format40_super_info { -+/* format40_disk_super_block actual_sb; */ -+ jnode *sb_jnode; -+ struct { -+ reiser4_block_nr super; -+ } loc; -+} format40_super_info; -+ -+/* Defines for journal header and footer respectively. 
*/ -+#define FORMAT40_JOURNAL_HEADER_BLOCKNR \ -+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3) -+ -+#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \ -+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4) -+ -+#define FORMAT40_STATUS_BLOCKNR \ -+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5) -+ -+/* Diskmap declarations */ -+#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID)) -+#define FORMAT40_SUPER 1 -+#define FORMAT40_JH 2 -+#define FORMAT40_JF 3 -+ -+/* declarations of functions implementing methods of layout plugin for -+ format 40. The functions theirself are in disk_format40.c */ -+extern int init_format_format40(struct super_block *, void *data); -+extern const reiser4_key *root_dir_key_format40(const struct super_block *); -+extern int release_format40(struct super_block *s); -+extern jnode *log_super_format40(struct super_block *s); -+extern int check_open_format40(const struct inode *object); -+extern int version_update_format40(struct super_block *super); -+ -+/* __DISK_FORMAT40_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format.c ---- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format.c 2007-05-06 14:50:43.762995722 +0400 -@@ -0,0 +1,38 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../debug.h" -+#include "../plugin_header.h" -+#include "disk_format40.h" -+#include "disk_format.h" -+#include "../plugin.h" -+ -+/* initialization of disk layout plugins */ -+disk_format_plugin format_plugins[LAST_FORMAT_ID] = { -+ [FORMAT40_ID] = { -+ .h = { -+ .type_id = REISER4_FORMAT_PLUGIN_TYPE, -+ .id = FORMAT40_ID, -+ .pops = NULL, -+ .label = "reiser40", -+ .desc = "standard disk layout for reiser40", -+ .linkage = {NULL, NULL} -+ }, -+ .init_format = init_format_format40, -+ .root_dir_key = root_dir_key_format40, -+ .release = release_format40, -+ .log_super = log_super_format40, -+ .check_open = check_open_format40, -+ .version_update = version_update_format40 -+ } -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format.h ---- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/disk_format/disk_format.h 2007-05-06 14:50:43.762995722 +0400 -@@ -0,0 +1,27 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* identifiers for disk layouts, they are also used as indexes in array of disk -+ plugins */ -+ -+#if !defined( __REISER4_DISK_FORMAT_H__ ) -+#define __REISER4_DISK_FORMAT_H__ -+ -+typedef enum { -+ /* standard reiser4 disk layout plugin id */ -+ FORMAT40_ID, -+ LAST_FORMAT_ID -+} disk_format_id; -+ -+/* __REISER4_DISK_FORMAT_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.20/fs/reiser4/plugin/disk_format/Makefile ---- linux-2.6.20.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/disk_format/Makefile 2007-05-06 14:50:43.762995722 +0400 -@@ -0,0 +1,5 @@ -+obj-$(CONFIG_REISER4_FS) += df_plugins.o -+ -+df_plugins-objs := \ -+ disk_format40.o \ -+ disk_format.o -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/fibration.c linux-2.6.20/fs/reiser4/plugin/fibration.c ---- linux-2.6.20.orig/fs/reiser4/plugin/fibration.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/fibration.c 2007-05-06 14:50:43.762995722 +0400 -@@ -0,0 +1,175 @@ -+/* Copyright 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Directory fibrations */ -+ -+/* -+ * Suppose we have a 
directory tree with sources of some project. During -+ * compilation .o files are created within this tree. This makes access -+ * to the original source files less efficient, because source files are -+ * now "diluted" by object files: default directory plugin uses prefix -+ * of a file name as a part of the key for directory entry (and this -+ * part is also inherited by the key of file body). This means that -+ * foo.o will be located close to foo.c and foo.h in the tree. -+ * -+ * To avoid this effect directory plugin fill highest 7 (unused -+ * originally) bits of the second component of the directory entry key -+ * by bit-pattern depending on the file name (see -+ * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called -+ * "fibre". Fibre of the file name key is inherited by key of stat data -+ * and keys of file body (in the case of REISER4_LARGE_KEY). -+ * -+ * Fibre for a given file is chosen by per-directory fibration -+ * plugin. Names within given fibre are ordered lexicographically. -+ */ -+ -+#include "../debug.h" -+#include "plugin_header.h" -+#include "plugin.h" -+#include "../super.h" -+#include "../inode.h" -+ -+#include -+ -+static const int fibre_shift = 57; -+ -+#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift) -+ -+/* -+ * Trivial fibration: all files of directory are just ordered -+ * lexicographically. -+ */ -+static __u64 fibre_trivial(const struct inode *dir, const char *name, int len) -+{ -+ return FIBRE_NO(0); -+} -+ -+/* -+ * dot-o fibration: place .o files after all others. -+ */ -+static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len) -+{ -+ /* special treatment for .*\.o */ -+ if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.') -+ return FIBRE_NO(1); -+ else -+ return FIBRE_NO(0); -+} -+ -+/* -+ * ext.1 fibration: subdivide directory into 128 fibrations one for each -+ * 7bit extension character (file "foo.h" goes into fibre "h"), plus -+ * default fibre for the rest. 
-+ */ -+static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len) -+{ -+ if (len > 2 && name[len - 2] == '.') -+ return FIBRE_NO(name[len - 1]); -+ else -+ return FIBRE_NO(0); -+} -+ -+/* -+ * ext.3 fibration: try to separate files with different 3-character -+ * extensions from each other. -+ */ -+static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len) -+{ -+ if (len > 4 && name[len - 4] == '.') -+ return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]); -+ else -+ return FIBRE_NO(0); -+} -+ -+static int change_fibration(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ int result; -+ -+ assert("nikita-3503", inode != NULL); -+ assert("nikita-3504", plugin != NULL); -+ -+ assert("nikita-3505", is_reiser4_inode(inode)); -+ assert("nikita-3506", inode_dir_plugin(inode) != NULL); -+ assert("nikita-3507", -+ plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE); -+ -+ result = 0; -+ if (inode_fibration_plugin(inode) == NULL || -+ inode_fibration_plugin(inode)->h.id != plugin->h.id) { -+ if (is_dir_empty(inode) == 0) -+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset, -+ PSET_FIBRATION, plugin); -+ else -+ result = RETERR(-ENOTEMPTY); -+ -+ } -+ return result; -+} -+ -+static reiser4_plugin_ops fibration_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = change_fibration -+}; -+ -+/* fibration plugins */ -+fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = { -+ [FIBRATION_LEXICOGRAPHIC] = { -+ .h = { -+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_LEXICOGRAPHIC, -+ .pops = &fibration_plugin_ops, -+ .label = "lexicographic", -+ .desc = "no fibration", -+ .linkage = {NULL, NULL} -+ }, -+ .fibre = fibre_trivial -+ }, -+ [FIBRATION_DOT_O] = { -+ .h = { -+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_DOT_O, -+ .pops = &fibration_plugin_ops, -+ .label = "dot-o", -+ .desc = "fibrate .o files separately", -+ 
.linkage = {NULL, NULL} -+ }, -+ .fibre = fibre_dot_o -+ }, -+ [FIBRATION_EXT_1] = { -+ .h = { -+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_EXT_1, -+ .pops = &fibration_plugin_ops, -+ .label = "ext-1", -+ .desc = "fibrate file by single character extension", -+ .linkage = {NULL, NULL} -+ }, -+ .fibre = fibre_ext_1 -+ }, -+ [FIBRATION_EXT_3] = { -+ .h = { -+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_EXT_3, -+ .pops = &fibration_plugin_ops, -+ .label = "ext-3", -+ .desc = "fibrate file by three character extension", -+ .linkage = {NULL, NULL} -+ }, -+ .fibre = fibre_ext_3 -+ } -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/fibration.h linux-2.6.20/fs/reiser4/plugin/fibration.h ---- linux-2.6.20.orig/fs/reiser4/plugin/fibration.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/fibration.h 2007-05-06 14:50:43.762995722 +0400 -@@ -0,0 +1,37 @@ -+/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Fibration plugin used by hashed directory plugin to segment content -+ * of directory. See fs/reiser4/plugin/fibration.c for more on this. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ ) -+#define __FS_REISER4_PLUGIN_FIBRATION_H__ -+ -+#include "plugin_header.h" -+ -+typedef struct fibration_plugin { -+ /* generic fields */ -+ plugin_header h; -+ -+ __u64(*fibre) (const struct inode * dir, const char *name, int len); -+} fibration_plugin; -+ -+typedef enum { -+ FIBRATION_LEXICOGRAPHIC, -+ FIBRATION_DOT_O, -+ FIBRATION_EXT_1, -+ FIBRATION_EXT_3, -+ LAST_FIBRATION_ID -+} reiser4_fibration_id; -+ -+/* __FS_REISER4_PLUGIN_FIBRATION_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.20/fs/reiser4/plugin/file/cryptcompress.c ---- linux-2.6.20.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/file/cryptcompress.c 2007-05-06 14:50:43.770998222 +0400 -@@ -0,0 +1,3760 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* This file contains implementations of inode/file/address_space/file plugin -+ * operations specific for cryptcompress file plugin which manages files with -+ * compressed and encrypted bodies. "Cryptcompress file" is built of items of -+ * CTAIL_ID (see http://www.namesys.com/cryptcompress_design.html for details). -+ */ -+ -+#include "../../inode.h" -+#include "../cluster.h" -+#include "../object.h" -+#include "../../tree_walk.h" -+#include "cryptcompress.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* get cryptcompress specific portion of inode */ -+cryptcompress_info_t *cryptcompress_inode_data(const struct inode *inode) -+{ -+ return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info; -+} -+ -+/* plugin->u.file.init_inode_data */ -+void init_inode_data_cryptcompress(struct inode *inode, -+ reiser4_object_create_data * crd, -+ int create) -+{ -+ cryptcompress_info_t *data; -+ -+ data = cryptcompress_inode_data(inode); -+ assert("edward-685", data != NULL); -+ -+ memset(data, 0, sizeof(*data)); -+ -+ turn_on_compression(data); -+ set_lattice_factor(data, MIN_LATTICE_FACTOR); -+ init_inode_ordering(inode, crd, create); -+} -+ -+#if REISER4_DEBUG -+int cryptcompress_inode_ok(struct inode *inode) -+{ -+ if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE))) -+ return 0; -+ if (!cluster_shift_ok(inode_cluster_shift(inode))) -+ return 0; -+ return 1; -+} 
-+#endif -+ -+/* The following is a part of reiser4 cipher key manager -+ which is called when opening/creating a cryptcompress file */ -+ -+/* get/set cipher key info */ -+crypto_stat_t * inode_crypto_stat (struct inode * inode) -+{ -+ assert("edward-90", inode != NULL); -+ assert("edward-91", reiser4_inode_data(inode) != NULL); -+ return cryptcompress_inode_data(inode)->crypt; -+} -+ -+static void set_inode_crypto_stat (struct inode * inode, crypto_stat_t * stat) -+{ -+ cryptcompress_inode_data(inode)->crypt = stat; -+} -+ -+/* allocate a cipher key info */ -+crypto_stat_t * reiser4_alloc_crypto_stat (struct inode * inode) -+{ -+ crypto_stat_t * info; -+ int fipsize; -+ -+ info = kmalloc(sizeof(*info), reiser4_ctx_gfp_mask_get()); -+ if (!info) -+ return ERR_PTR(-ENOMEM); -+ memset(info, 0, sizeof (*info)); -+ fipsize = inode_digest_plugin(inode)->fipsize; -+ info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get()); -+ if (!info->keyid) { -+ kfree(info); -+ return ERR_PTR(-ENOMEM); -+ } -+ info->host = inode; -+ return info; -+} -+ -+#if 0 -+/* allocate/free low-level info for cipher and digest -+ transforms */ -+static int alloc_crypto_tfms(crypto_stat_t * info) -+{ -+ struct crypto_blkcipher * ctfm = NULL; -+ struct crypto_hash * dtfm = NULL; -+ cipher_plugin * cplug = inode_cipher_plugin(info->host); -+ digest_plugin * dplug = inode_digest_plugin(info->host); -+ -+ if (cplug->alloc) { -+ ctfm = cplug->alloc(); -+ if (IS_ERR(ctfm)) { -+ warning("edward-1364", -+ "Can not allocate info for %s\n", -+ cplug->h.desc); -+ return RETERR(PTR_ERR(ctfm)); -+ } -+ } -+ info_set_cipher(info, ctfm); -+ if (dplug->alloc) { -+ dtfm = dplug->alloc(); -+ if (IS_ERR(dtfm)) { -+ warning("edward-1365", -+ "Can not allocate info for %s\n", -+ dplug->h.desc); -+ goto unhappy_with_digest; -+ } -+ } -+ info_set_digest(info, dtfm); -+ return 0; -+ unhappy_with_digest: -+ if (cplug->free) { -+ cplug->free(ctfm); -+ info_set_cipher(info, NULL); -+ } -+ return RETERR(PTR_ERR(dtfm)); 
-+} -+#endif -+ -+static void -+free_crypto_tfms(crypto_stat_t * info) -+{ -+ assert("edward-1366", info != NULL); -+ if (!info_get_cipher(info)) { -+ assert("edward-1601", !info_get_digest(info)); -+ return; -+ } -+ inode_cipher_plugin(info->host)->free(info_get_cipher(info)); -+ info_set_cipher(info, NULL); -+ inode_digest_plugin(info->host)->free(info_get_digest(info)); -+ info_set_digest(info, NULL); -+ return; -+} -+ -+#if 0 -+/* create a key fingerprint for disk stat-data */ -+static int create_keyid (crypto_stat_t * info, crypto_data_t * data) -+{ -+ int ret = -ENOMEM; -+ size_t blk, pad; -+ __u8 * dmem; -+ __u8 * cmem; -+ struct hash_desc ddesc; -+ struct blkcipher_desc cdesc; -+ struct scatterlist sg; -+ -+ assert("edward-1367", info != NULL); -+ assert("edward-1368", info->keyid != NULL); -+ -+ ddesc.tfm = info_get_digest(info); -+ ddesc.flags = 0; -+ cdesc.tfm = info_get_cipher(info); -+ cdesc.flags = 0; -+ -+ dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm), -+ reiser4_ctx_gfp_mask_get()); -+ if (!dmem) -+ goto exit1; -+ -+ blk = crypto_blkcipher_blocksize(cdesc.tfm); -+ -+ pad = data->keyid_size % blk; -+ pad = (pad ? 
blk - pad : 0); -+ -+ cmem = kmalloc((size_t)data->keyid_size + pad, -+ reiser4_ctx_gfp_mask_get()); -+ if (!cmem) -+ goto exit2; -+ memcpy(cmem, data->keyid, data->keyid_size); -+ memset(cmem + data->keyid_size, 0, pad); -+ -+ sg.page = virt_to_page(cmem); -+ sg.offset = offset_in_page(cmem); -+ sg.length = data->keyid_size + pad; -+ -+ ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg, -+ data->keyid_size + pad); -+ if (ret) { -+ warning("edward-1369", -+ "encryption failed flags=%x\n", cdesc.flags); -+ goto exit3; -+ } -+ ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem); -+ if (ret) { -+ warning("edward-1602", -+ "digest failed flags=%x\n", ddesc.flags); -+ goto exit3; -+ } -+ memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize); -+ exit3: -+ kfree(cmem); -+ exit2: -+ kfree(dmem); -+ exit1: -+ return ret; -+} -+#endif -+ -+static void destroy_keyid(crypto_stat_t * info) -+{ -+ assert("edward-1370", info != NULL); -+ assert("edward-1371", info->keyid != NULL); -+ kfree(info->keyid); -+ return; -+} -+ -+static void __free_crypto_stat (struct inode * inode) -+{ -+ crypto_stat_t * info = inode_crypto_stat(inode); -+ assert("edward-1372", info != NULL); -+ -+ free_crypto_tfms(info); -+ destroy_keyid(info); -+ kfree(info); -+} -+ -+#if 0 -+static void instantiate_crypto_stat(crypto_stat_t * info) -+{ -+ assert("edward-1373", info != NULL); -+ assert("edward-1374", info->inst == 0); -+ info->inst = 1; -+} -+#endif -+ -+static void uninstantiate_crypto_stat(crypto_stat_t * info) -+{ -+ assert("edward-1375", info != NULL); -+ info->inst = 0; -+} -+ -+static int crypto_stat_instantiated(crypto_stat_t * info) -+{ -+ return info->inst; -+} -+ -+static int inode_has_cipher_key(struct inode * inode) -+{ -+ assert("edward-1376", inode != NULL); -+ return inode_crypto_stat(inode) && -+ crypto_stat_instantiated(inode_crypto_stat(inode)); -+} -+ -+static void free_crypto_stat (struct inode * inode) -+{ -+ uninstantiate_crypto_stat(inode_crypto_stat(inode)); -+ 
__free_crypto_stat(inode); -+} -+ -+static int need_cipher(struct inode * inode) -+{ -+ return inode_cipher_plugin(inode) != -+ cipher_plugin_by_id(NONE_CIPHER_ID); -+} -+ -+/* Create a crypto-stat and attach result to the @object. -+ If success is returned, then low-level cipher info contains -+ an instantiated key */ -+#if 0 -+crypto_stat_t * -+create_crypto_stat(struct inode * object, -+ crypto_data_t * data /* this contains a (uninstantiated) -+ cipher key imported from user -+ space */) -+{ -+ int ret; -+ crypto_stat_t * info; -+ -+ assert("edward-1377", data != NULL); -+ assert("edward-1378", need_cipher(object)); -+ -+ if (inode_file_plugin(object) != -+ file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID)) -+ return ERR_PTR(-EINVAL); -+ -+ info = reiser4_alloc_crypto_stat(object); -+ if (IS_ERR(info)) -+ return info; -+ ret = alloc_crypto_tfms(info); -+ if (ret) -+ goto err; -+ /* instantiating a key */ -+ ret = crypto_blkcipher_setkey(info_get_cipher(info), -+ data->key, -+ data->keysize); -+ if (ret) { -+ warning("edward-1379", -+ "setkey failed flags=%x\n", -+ crypto_blkcipher_get_flags(info_get_cipher(info))); -+ goto err; -+ } -+ info->keysize = data->keysize; -+ ret = create_keyid(info, data); -+ if (ret) -+ goto err; -+ instantiate_crypto_stat(info); -+ return info; -+ err: -+ __free_crypto_stat(object); -+ return ERR_PTR(ret); -+} -+#endif -+ -+/* increment/decrement a load counter when -+ attaching/detaching the crypto-stat to any object */ -+static void load_crypto_stat(crypto_stat_t * info) -+{ -+ assert("edward-1380", info != NULL); -+ inc_keyload_count(info); -+} -+ -+static void unload_crypto_stat(struct inode * inode) -+{ -+ crypto_stat_t * info = inode_crypto_stat(inode); -+ assert("edward-1381", info->keyload_count > 0); -+ -+ dec_keyload_count(inode_crypto_stat(inode)); -+ if (info->keyload_count == 0) -+ /* final release */ -+ free_crypto_stat(inode); -+} -+ -+/* attach/detach an existing crypto-stat */ -+void reiser4_attach_crypto_stat(struct 
inode * inode, crypto_stat_t * info) -+{ -+ assert("edward-1382", inode != NULL); -+ assert("edward-1383", info != NULL); -+ assert("edward-1384", inode_crypto_stat(inode) == NULL); -+ -+ set_inode_crypto_stat(inode, info); -+ load_crypto_stat(info); -+} -+ -+/* returns true, if crypto stat can be attached to the @host */ -+#if REISER4_DEBUG -+static int host_allows_crypto_stat(struct inode * host) -+{ -+ int ret; -+ file_plugin * fplug = inode_file_plugin(host); -+ -+ switch (fplug->h.id) { -+ case CRYPTCOMPRESS_FILE_PLUGIN_ID: -+ ret = 1; -+ break; -+ default: -+ ret = 0; -+ } -+ return ret; -+} -+#endif /* REISER4_DEBUG */ -+ -+static void reiser4_detach_crypto_stat(struct inode * inode) -+{ -+ assert("edward-1385", inode != NULL); -+ assert("edward-1386", host_allows_crypto_stat(inode)); -+ -+ if (inode_crypto_stat(inode)) -+ unload_crypto_stat(inode); -+ set_inode_crypto_stat(inode, NULL); -+} -+ -+#if 0 -+ -+/* compare fingerprints of @child and @parent */ -+static int keyid_eq(crypto_stat_t * child, crypto_stat_t * parent) -+{ -+ return !memcmp(child->keyid, parent->keyid, info_digest_plugin(parent)->fipsize); -+} -+ -+/* check if a crypto-stat (which is bound to @parent) can be inherited */ -+int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent) -+{ -+ if (!need_cipher(child)) -+ return 0; -+ /* the child is created */ -+ if (!inode_crypto_stat(child)) -+ return 1; -+ /* the child is looked up */ -+ if (!inode_crypto_stat(parent)) -+ return 0; -+ return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) && -+ inode_digest_plugin(child) == inode_digest_plugin(parent) && -+ inode_crypto_stat(child)->keysize == inode_crypto_stat(parent)->keysize && -+ keyid_eq(inode_crypto_stat(child), inode_crypto_stat(parent))); -+} -+#endif -+ -+/* helper functions for ->create() method of the cryptcompress plugin */ -+static int inode_set_crypto(struct inode * object) -+{ -+ reiser4_inode * info; -+ if (!inode_crypto_stat(object)) { -+ if 
(need_cipher(object)) -+ return RETERR(-EINVAL); -+ /* the file is not to be encrypted */ -+ return 0; -+ } -+ info = reiser4_inode_data(object); -+ info->extmask |= (1 << CRYPTO_STAT); -+ return 0; -+} -+ -+static int inode_init_compression(struct inode * object) -+{ -+ int result = 0; -+ assert("edward-1461", object != NULL); -+ if (inode_compression_plugin(object)->init) -+ result = inode_compression_plugin(object)->init(); -+ return result; -+} -+ -+static int inode_check_cluster(struct inode * object) -+{ -+ assert("edward-696", object != NULL); -+ -+ if (inode_cluster_size(object) < PAGE_CACHE_SIZE) { -+ warning("edward-1320", "Can not support '%s' " -+ "logical clusters (less then page size)", -+ inode_cluster_plugin(object)->h.label); -+ return RETERR(-EINVAL); -+ } -+ return 0; -+} -+ -+/* ->destroy_inode() method of the cryptcompress plugin */ -+void destroy_inode_cryptcompress(struct inode * inode) -+{ -+ assert("edward-23", cryptcompress_inode_data(inode)->pgcount == 0); -+ reiser4_detach_crypto_stat(inode); -+ return; -+} -+ -+/* ->create() method of the cryptcompress plugin -+ -+. install plugins -+. attach crypto info if specified -+. attach compression info if specified -+. 
attach cluster info -+*/ -+int -+create_cryptcompress(struct inode *object, struct inode *parent, -+ reiser4_object_create_data * data) -+{ -+ int result; -+ reiser4_inode *info; -+ -+ assert("edward-23", object != NULL); -+ assert("edward-24", parent != NULL); -+ assert("edward-30", data != NULL); -+ assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD)); -+ assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID); -+ -+ info = reiser4_inode_data(object); -+ -+ assert("edward-29", info != NULL); -+ -+ /* set file bit */ -+ info->plugin_mask |= (1 << PSET_FILE); -+ -+ /* set crypto */ -+ result = inode_set_crypto(object); -+ if (result) -+ goto error; -+ /* set compression */ -+ result = inode_init_compression(object); -+ if (result) -+ goto error; -+ /* set cluster */ -+ result = inode_check_cluster(object); -+ if (result) -+ goto error; -+ -+ /* save everything in disk stat-data */ -+ result = write_sd_by_inode_common(object); -+ if (!result) -+ return 0; -+ error: -+ reiser4_detach_crypto_stat(object); -+ return result; -+} -+ -+/* ->open() method of the cryptcompress plugin */ -+int open_object_cryptcompress(struct inode * inode, struct file * file) -+{ -+ int result; -+ struct inode * parent; -+ -+ assert("edward-1394", inode != NULL); -+ assert("edward-1395", file != NULL); -+ assert("edward-1396", file != NULL); -+ assert("edward-1397", file->f_dentry->d_inode == inode); -+ assert("edward-1398", file->f_dentry->d_parent != NULL); -+ assert("edward-1399", file->f_dentry->d_parent->d_inode != NULL); -+ assert("edward-698", -+ inode_file_plugin(inode) == -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ result = inode_check_cluster(inode); -+ if (result) -+ return result; -+ result = inode_init_compression(inode); -+ if (result) -+ return result; -+ if (!need_cipher(inode)) -+ /* the file is not to be ciphered */ -+ return 0; -+ parent = file->f_dentry->d_parent->d_inode; -+ if (!inode_has_cipher_key(inode)) -+ return RETERR(-EINVAL); -+ 
return 0; -+} -+ -+/* returns a blocksize, the attribute of a cipher algorithm */ -+static unsigned int -+cipher_blocksize(struct inode * inode) -+{ -+ assert("edward-758", need_cipher(inode)); -+ assert("edward-1400", inode_crypto_stat(inode) != NULL); -+ return crypto_blkcipher_blocksize -+ (info_get_cipher(inode_crypto_stat(inode))); -+} -+ -+/* returns offset translated by scale factor of the crypto-algorithm */ -+static loff_t inode_scaled_offset (struct inode * inode, -+ const loff_t src_off /* input offset */) -+{ -+ assert("edward-97", inode != NULL); -+ -+ if (!need_cipher(inode) || -+ src_off == get_key_offset(reiser4_min_key()) || -+ src_off == get_key_offset(reiser4_max_key())) -+ return src_off; -+ -+ return inode_cipher_plugin(inode)->scale(inode, -+ cipher_blocksize(inode), -+ src_off); -+} -+ -+/* returns disk cluster size */ -+size_t inode_scaled_cluster_size(struct inode * inode) -+{ -+ assert("edward-110", inode != NULL); -+ -+ return inode_scaled_offset(inode, inode_cluster_size(inode)); -+} -+ -+static int new_cluster(reiser4_cluster_t * clust, struct inode *inode) -+{ -+ return (clust_to_off(clust->index, inode) >= inode->i_size); -+} -+ -+/* set number of cluster pages */ -+static void set_cluster_nrpages(reiser4_cluster_t * clust, struct inode *inode) -+{ -+ reiser4_slide_t *win; -+ -+ assert("edward-180", clust != NULL); -+ assert("edward-1040", inode != NULL); -+ -+ win = clust->win; -+ if (!win) { -+ /* NOTE-EDWARD: i_size should be protected */ -+ clust->nr_pages = -+ count_to_nrpages(fsize_to_count(clust, inode)); -+ return; -+ } -+ assert("edward-1176", clust->op != PCL_UNKNOWN); -+ assert("edward-1064", win->off + win->count + win->delta != 0); -+ -+ if (win->stat == HOLE_WINDOW && -+ win->off == 0 && win->count == inode_cluster_size(inode)) { -+ /* special case: we start write hole from fake cluster */ -+ clust->nr_pages = 0; -+ return; -+ } -+ clust->nr_pages = -+ count_to_nrpages(max_count(win->off + win->count + win->delta, -+ 
fsize_to_count(clust, inode))); -+ return; -+} -+ -+/* ->key_by_inode() method of the cryptcompress plugin */ -+/* see plugin/plugin.h for details */ -+int -+key_by_inode_cryptcompress(struct inode *inode, loff_t off, reiser4_key * key) -+{ -+ loff_t clust_off; -+ -+ assert("edward-64", inode != 0); -+ // assert("edward-112", ergo(off != get_key_offset(reiser4_max_key()), !off_to_cloff(off, inode))); -+ /* don't come here with other offsets */ -+ -+ clust_off = -+ (off == -+ get_key_offset(reiser4_max_key())? get_key_offset(reiser4_max_key()) : -+ off_to_clust_to_off(off, inode)); -+ -+ key_by_inode_and_offset_common(inode, 0, key); -+ set_key_offset(key, -+ (__u64) (!inode_crypto_stat(inode) ? clust_off : -+ inode_scaled_offset(inode, clust_off))); -+ return 0; -+} -+ -+/* plugin->flow_by_inode */ -+int -+flow_by_inode_cryptcompress(struct inode *inode /* file to build flow for */ , -+ const char __user *buf /* user level buffer */ , -+ int user /* 1 if @buf is of user space, 0 - if it is -+ kernel space */ , -+ loff_t size /* buffer size */ , -+ loff_t off /* offset to start io from */ , -+ rw_op op /* READ or WRITE */ , -+ flow_t * f /* resulting flow */ ) -+{ -+ assert("edward-436", f != NULL); -+ assert("edward-149", inode != NULL); -+ assert("edward-150", inode_file_plugin(inode) != NULL); -+ -+ f->length = size; -+ memcpy(&f->data, &buf, sizeof(buf)); -+ f->user = user; -+ f->op = op; -+ -+ if (op == WRITE_OP && user == 1) -+ return 0; -+ return key_by_inode_cryptcompress(inode, off, &f->key); -+} -+ -+static int -+cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key, -+ znode_lock_mode lock_mode) -+{ -+ coord_t *coord; -+ -+ assert("edward-704", hint != NULL); -+ assert("edward-1089", !hint_is_valid(hint)); -+ assert("edward-706", hint->lh.owner == NULL); -+ -+ coord = &hint->ext_coord.coord; -+ -+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode) -+ /* hint either not set or set by different operation */ -+ return RETERR(-E_REPEAT); 
-+ -+ if (get_key_offset(key) != hint->offset) -+ /* hint is set for different key */ -+ return RETERR(-E_REPEAT); -+ -+ assert("edward-707", reiser4_schedulable()); -+ -+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, -+ key, &hint->lh, lock_mode, -+ ZNODE_LOCK_LOPRI); -+} -+ -+/* reserve disk space when writing a logical cluster */ -+static int reserve4cluster(struct inode *inode, reiser4_cluster_t *clust) -+{ -+ int result = 0; -+ -+ assert("edward-965", reiser4_schedulable()); -+ assert("edward-439", inode != NULL); -+ assert("edward-440", clust != NULL); -+ assert("edward-441", clust->pages != NULL); -+ -+ if (clust->nr_pages == 0) { -+ assert("edward-1152", clust->win != NULL); -+ assert("edward-1153", clust->win->stat == HOLE_WINDOW); -+ /* don't reserve space for fake disk clusteer */ -+ return 0; -+ } -+ assert("edward-442", jprivate(clust->pages[0]) != NULL); -+ -+ result = reiser4_grab_space_force(estimate_insert_cluster(inode) + -+ estimate_update_cluster(inode), -+ BA_CAN_COMMIT); -+ if (result) -+ return result; -+ clust->reserved = 1; -+ grabbed2cluster_reserved(estimate_insert_cluster(inode) + -+ estimate_update_cluster(inode)); -+#if REISER4_DEBUG -+ clust->reserved_prepped = estimate_update_cluster(inode); -+ clust->reserved_unprepped = estimate_insert_cluster(inode); -+#endif -+ /* there can be space grabbed by txnmgr_force_commit_all */ -+ return 0; -+} -+ -+/* free reserved disk space if writing a logical cluster fails */ -+static void -+free_reserved4cluster(struct inode *inode, reiser4_cluster_t * clust, int count) -+{ -+ assert("edward-967", clust->reserved == 1); -+ -+ cluster_reserved2free(count); -+ clust->reserved = 0; -+} -+ -+/* The core search procedure of the cryptcompress plugin. 
-+ If returned value is not cbk_errored, then current znode is locked */ -+static int find_cluster_item(hint_t * hint, -+ const reiser4_key * key, /* key of the item we are -+ looking for */ -+ znode_lock_mode lock_mode /* which lock */ , -+ ra_info_t * ra_info, lookup_bias bias, __u32 flags) -+{ -+ int result; -+ reiser4_key ikey; -+ int went_right = 0; -+ coord_t *coord = &hint->ext_coord.coord; -+ coord_t orig = *coord; -+ -+ assert("edward-152", hint != NULL); -+ -+ if (!hint_is_valid(hint)) { -+ result = cryptcompress_hint_validate(hint, key, lock_mode); -+ if (result == -E_REPEAT) -+ goto traverse_tree; -+ else if (result) { -+ assert("edward-1216", 0); -+ return result; -+ } -+ hint_set_valid(hint); -+ } -+ assert("edward-709", znode_is_any_locked(coord->node)); -+ -+ /* In-place lookup is going here, it means we just need to -+ check if next item of the @coord match to the @keyhint) */ -+ -+ if (equal_to_rdk(coord->node, key)) { -+ result = goto_right_neighbor(coord, &hint->lh); -+ if (result == -E_NO_NEIGHBOR) { -+ assert("edward-1217", 0); -+ return RETERR(-EIO); -+ } -+ if (result) -+ return result; -+ assert("edward-1218", equal_to_ldk(coord->node, key)); -+ went_right = 1; -+ } else { -+ coord->item_pos++; -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ } -+ result = zload(coord->node); -+ if (result) -+ return result; -+ assert("edward-1219", !node_is_empty(coord->node)); -+ -+ if (!coord_is_existing_item(coord)) { -+ zrelse(coord->node); -+ goto not_found; -+ } -+ item_key_by_coord(coord, &ikey); -+ zrelse(coord->node); -+ if (!keyeq(key, &ikey)) -+ goto not_found; -+ /* Ok, item is found, update node counts */ -+ if (went_right) -+ dclust_inc_extension_ncount(hint); -+ return CBK_COORD_FOUND; -+ -+ not_found: -+ assert("edward-1220", coord->item_pos > 0); -+ //coord->item_pos--; -+ /* roll back */ -+ *coord = orig; -+ ON_DEBUG(coord_update_v(coord)); -+ return CBK_COORD_NOTFOUND; -+ -+ traverse_tree: -+ assert("edward-713", hint->lh.owner == 
NULL); -+ assert("edward-714", reiser4_schedulable()); -+ -+ reiser4_unset_hint(hint); -+ dclust_init_extension(hint); -+ coord_init_zero(coord); -+ result = coord_by_key(current_tree, key, coord, &hint->lh, -+ lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL, -+ CBK_UNIQUE | flags, ra_info); -+ if (cbk_errored(result)) -+ return result; -+ if(result == CBK_COORD_FOUND) -+ dclust_inc_extension_ncount(hint); -+ hint_set_valid(hint); -+ return result; -+} -+ -+/* This function is called by deflate[inflate] manager when -+ creating a transformed/plain stream to check if we should -+ create/cut some overhead. If this returns true, then @oh -+ contains the size of this overhead. -+ */ -+static int -+need_cut_or_align(struct inode * inode, reiser4_cluster_t * clust, -+ rw_op rw, int * oh) -+{ -+ tfm_cluster_t * tc = &clust->tc; -+ switch (rw) { -+ case WRITE_OP: /* estimate align */ -+ *oh = tc->len % cipher_blocksize(inode); -+ if (*oh != 0) -+ return 1; -+ break; -+ case READ_OP: /* estimate cut */ -+ *oh = *(tfm_output_data(clust) + tc->len - 1); -+ break; -+ default: -+ impossible("edward-1401", "bad option"); -+ } -+ return (tc->len != tc->lsize); -+} -+ -+/* create/cut an overhead of transformed/plain stream */ -+static void -+align_or_cut_overhead(struct inode * inode, reiser4_cluster_t * clust, rw_op rw) -+{ -+ int oh; -+ cipher_plugin * cplug = inode_cipher_plugin(inode); -+ -+ assert("edward-1402", need_cipher(inode)); -+ -+ if (!need_cut_or_align(inode, clust, rw, &oh)) -+ return; -+ switch (rw) { -+ case WRITE_OP: /* do align */ -+ clust->tc.len += -+ cplug->align_stream(tfm_input_data(clust) + -+ clust->tc.len, clust->tc.len, -+ cipher_blocksize(inode)); -+ *(tfm_input_data(clust) + clust->tc.len - 1) = -+ cipher_blocksize(inode) - oh; -+ break; -+ case READ_OP: /* do cut */ -+ assert("edward-1403", oh <= cipher_blocksize(inode)); -+ clust->tc.len -= oh; -+ break; -+ default: -+ impossible("edward-1404", "bad option"); -+ } -+ return; -+} -+ -+/* the following two 
functions are to evaluate results -+ of compression transform */ -+static unsigned -+max_cipher_overhead(struct inode * inode) -+{ -+ if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream) -+ return 0; -+ return cipher_blocksize(inode); -+} -+ -+static int deflate_overhead(struct inode *inode) -+{ -+ return (inode_compression_plugin(inode)-> -+ checksum ? DC_CHECKSUM_SIZE : 0); -+} -+ -+static unsigned deflate_overrun(struct inode * inode, int ilen) -+{ -+ return coa_overrun(inode_compression_plugin(inode), ilen); -+} -+ -+/* Estimating compressibility of a logical cluster by various -+ policies represented by compression mode plugin. -+ If this returns false, then compressor won't be called for -+ the cluster of index @index. -+*/ -+static int should_compress(tfm_cluster_t * tc, cloff_t index, -+ struct inode *inode) -+{ -+ compression_plugin *cplug = inode_compression_plugin(inode); -+ compression_mode_plugin *mplug = inode_compression_mode_plugin(inode); -+ -+ assert("edward-1321", tc->len != 0); -+ assert("edward-1322", cplug != NULL); -+ assert("edward-1323", mplug != NULL); -+ -+ return /* estimate by size */ -+ (cplug->min_size_deflate ? -+ tc->len >= cplug->min_size_deflate() : -+ 1) && -+ /* estimate by compression mode plugin */ -+ (mplug->should_deflate ? -+ mplug->should_deflate(inode, index) : -+ 1); -+} -+ -+/* Evaluating results of compression transform. -+ Returns true, if we need to accept this results */ -+static int -+save_compressed(int size_before, int size_after, struct inode * inode) -+{ -+ return (size_after + deflate_overhead(inode) + -+ max_cipher_overhead(inode) < size_before); -+} -+ -+/* Guess result of the evaluation above */ -+static int -+need_inflate(reiser4_cluster_t * clust, struct inode *inode, -+ int encrypted /* is cluster encrypted */ ) -+{ -+ tfm_cluster_t *tc = &clust->tc; -+ -+ assert("edward-142", tc != 0); -+ assert("edward-143", inode != NULL); -+ -+ return tc->len < -+ (encrypted ? 
-+ inode_scaled_offset(inode, tc->lsize) : -+ tc->lsize); -+} -+ -+/* If results of compression were accepted, then we add -+ a checksum to catch possible disk cluster corruption. -+ The following is a format of the data stored in disk clusters: -+ -+ data This is (transformed) logical cluster. -+ cipher_overhead This is created by ->align() method -+ of cipher plugin. May be absent. -+ checksum (4) This is created by ->checksum method -+ of compression plugin to check -+ integrity. May be absent. -+ -+ Crypto overhead format: -+ -+ data -+ control_byte (1) contains aligned overhead size: -+ 1 <= overhead <= cipher_blksize -+*/ -+/* Append a checksum at the end of a transformed stream */ -+static void dc_set_checksum(compression_plugin * cplug, tfm_cluster_t * tc) -+{ -+ __u32 checksum; -+ -+ assert("edward-1309", tc != NULL); -+ assert("edward-1310", tc->len > 0); -+ assert("edward-1311", cplug->checksum != NULL); -+ -+ checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len); -+ put_unaligned(cpu_to_le32(checksum), -+ (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len)); -+ tc->len += (int)DC_CHECKSUM_SIZE; -+} -+ -+/* Check a disk cluster checksum. 
-+ Returns 0 if checksum is correct, otherwise returns 1 */ -+static int dc_check_checksum(compression_plugin * cplug, tfm_cluster_t * tc) -+{ -+ assert("edward-1312", tc != NULL); -+ assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE); -+ assert("edward-1314", cplug->checksum != NULL); -+ -+ if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM), -+ tc->len - (int)DC_CHECKSUM_SIZE) != -+ le32_to_cpu(get_unaligned((d32 *) -+ (tfm_stream_data(tc, INPUT_STREAM) -+ + tc->len - (int)DC_CHECKSUM_SIZE)))) { -+ warning("edward-156", -+ "Bad disk cluster checksum %d, (should be %d) Fsck?\n", -+ (int)le32_to_cpu -+ (get_unaligned((d32 *) -+ (tfm_stream_data(tc, INPUT_STREAM) + -+ tc->len - (int)DC_CHECKSUM_SIZE))), -+ (int)cplug->checksum -+ (tfm_stream_data(tc, INPUT_STREAM), -+ tc->len - (int)DC_CHECKSUM_SIZE)); -+ return 1; -+ } -+ tc->len -= (int)DC_CHECKSUM_SIZE; -+ return 0; -+} -+ -+/* get input/output stream for some transform action */ -+int grab_tfm_stream(struct inode * inode, tfm_cluster_t * tc, -+ tfm_stream_id id) -+{ -+ size_t size = inode_scaled_cluster_size(inode); -+ -+ assert("edward-901", tc != NULL); -+ assert("edward-1027", inode_compression_plugin(inode) != NULL); -+ -+ if (cluster_get_tfm_act(tc) == TFMA_WRITE) -+ size += deflate_overrun(inode, inode_cluster_size(inode)); -+ -+ if (!tfm_stream(tc, id) && id == INPUT_STREAM) -+ alternate_streams(tc); -+ if (!tfm_stream(tc, id)) -+ return alloc_tfm_stream(tc, size, id); -+ -+ assert("edward-902", tfm_stream_is_set(tc, id)); -+ -+ if (tfm_stream_size(tc, id) < size) -+ return realloc_tfm_stream(tc, size, id); -+ return 0; -+} -+ -+/* Common deflate manager */ -+int reiser4_deflate_cluster(reiser4_cluster_t * clust, struct inode * inode) -+{ -+ int result = 0; -+ int compressed = 0; -+ int encrypted = 0; -+ tfm_cluster_t * tc = &clust->tc; -+ compression_plugin * coplug; -+ -+ assert("edward-401", inode != NULL); -+ assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM)); -+ assert("edward-1348", 
cluster_get_tfm_act(tc) == TFMA_WRITE); -+ assert("edward-498", !tfm_cluster_is_uptodate(tc)); -+ -+ coplug = inode_compression_plugin(inode); -+ if (should_compress(tc, clust->index, inode)) { -+ /* try to compress, discard bad results */ -+ __u32 dst_len; -+ compression_mode_plugin * mplug = -+ inode_compression_mode_plugin(inode); -+ assert("edward-602", coplug != NULL); -+ assert("edward-1423", coplug->compress != NULL); -+ -+ result = grab_coa(tc, coplug); -+ if (result) { -+ warning("edward-1424", -+ "alloc_coa failed with ret=%d, skipped compression", -+ result); -+ goto cipher; -+ } -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) { -+ warning("edward-1425", -+ "alloc stream failed with ret=%d, skipped compression", -+ result); -+ goto cipher; -+ } -+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM); -+ coplug->compress(get_coa(tc, coplug->h.id, tc->act), -+ tfm_input_data(clust), tc->len, -+ tfm_output_data(clust), &dst_len); -+ /* make sure we didn't overwrite extra bytes */ -+ assert("edward-603", -+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM)); -+ -+ /* evaluate results of compression transform */ -+ if (save_compressed(tc->len, dst_len, inode)) { -+ /* good result, accept */ -+ tc->len = dst_len; -+ if (mplug->accept_hook != NULL) { -+ result = mplug->accept_hook(inode, clust->index); -+ if (result) -+ warning("edward-1426", -+ "accept_hook failed with ret=%d", -+ result); -+ } -+ compressed = 1; -+ } -+ else { -+ /* bad result, discard */ -+#if REISER4_DEBUG -+ if (cluster_is_complete(clust, inode)) -+ warning("edward-1338", -+ "incompressible cluster %lu (inode %llu)", -+ clust->index, -+ (unsigned long long)get_inode_oid(inode)); -+#endif -+ if (mplug->discard_hook != NULL && -+ cluster_is_complete(clust, inode)) { -+ result = mplug->discard_hook(inode, -+ clust->index); -+ if (result) -+ warning("edward-1427", -+ "discard_hook failed with ret=%d", -+ result); -+ } -+ } -+ } -+ cipher: -+ if (need_cipher(inode)) { -+ cipher_plugin 
* ciplug; -+ struct blkcipher_desc desc; -+ struct scatterlist src; -+ struct scatterlist dst; -+ -+ ciplug = inode_cipher_plugin(inode); -+ desc.tfm = info_get_cipher(inode_crypto_stat(inode)); -+ desc.flags = 0; -+ if (compressed) -+ alternate_streams(tc); -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) -+ return result; -+ -+ align_or_cut_overhead(inode, clust, WRITE_OP); -+ src.page = virt_to_page(tfm_input_data(clust)); -+ src.offset = offset_in_page(tfm_input_data(clust)); -+ src.length = tc->len; -+ -+ dst.page = virt_to_page(tfm_output_data(clust)); -+ dst.offset = offset_in_page(tfm_output_data(clust)); -+ dst.length = tc->len; -+ -+ result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len); -+ if (result) { -+ warning("edward-1405", -+ "encryption failed flags=%x\n", desc.flags); -+ return result; -+ } -+ encrypted = 1; -+ } -+ if (compressed && coplug->checksum != NULL) -+ dc_set_checksum(coplug, tc); -+ if (!compressed && !encrypted) -+ alternate_streams(tc); -+ return result; -+} -+ -+/* Common inflate manager. 
*/ -+int reiser4_inflate_cluster(reiser4_cluster_t * clust, struct inode * inode) -+{ -+ int result = 0; -+ int transformed = 0; -+ tfm_cluster_t * tc = &clust->tc; -+ compression_plugin * coplug; -+ -+ assert("edward-905", inode != NULL); -+ assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER); -+ assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM)); -+ assert("edward-1349", tc->act == TFMA_READ); -+ assert("edward-907", !tfm_cluster_is_uptodate(tc)); -+ -+ /* Handle a checksum (if any) */ -+ coplug = inode_compression_plugin(inode); -+ if (need_inflate(clust, inode, need_cipher(inode)) && -+ coplug->checksum != NULL) { -+ result = dc_check_checksum(coplug, tc); -+ if (unlikely(result)) { -+ warning("edward-1460", -+ "Inode %llu: disk cluster %lu looks corrupted", -+ (unsigned long long)get_inode_oid(inode), -+ clust->index); -+ return RETERR(-EIO); -+ } -+ } -+ if (need_cipher(inode)) { -+ cipher_plugin * ciplug; -+ struct blkcipher_desc desc; -+ struct scatterlist src; -+ struct scatterlist dst; -+ -+ ciplug = inode_cipher_plugin(inode); -+ desc.tfm = info_get_cipher(inode_crypto_stat(inode)); -+ desc.flags = 0; -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) -+ return result; -+ assert("edward-909", tfm_cluster_is_set(tc)); -+ -+ src.page = virt_to_page(tfm_input_data(clust)); -+ src.offset = offset_in_page(tfm_input_data(clust)); -+ src.length = tc->len; -+ -+ dst.page = virt_to_page(tfm_output_data(clust)); -+ dst.offset = offset_in_page(tfm_output_data(clust)); -+ dst.length = tc->len; -+ -+ result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len); -+ if (result) { -+ warning("edward-1600", "decrypt failed flags=%x\n", -+ desc.flags); -+ return result; -+ } -+ align_or_cut_overhead(inode, clust, READ_OP); -+ transformed = 1; -+ } -+ if (need_inflate(clust, inode, 0)) { -+ unsigned dst_len = inode_cluster_size(inode); -+ if(transformed) -+ alternate_streams(tc); -+ -+ result = grab_tfm_stream(inode, tc, 
OUTPUT_STREAM); -+ if (result) -+ return result; -+ assert("edward-1305", coplug->decompress != NULL); -+ assert("edward-910", tfm_cluster_is_set(tc)); -+ -+ coplug->decompress(get_coa(tc, coplug->h.id, tc->act), -+ tfm_input_data(clust), tc->len, -+ tfm_output_data(clust), &dst_len); -+ /* check length */ -+ tc->len = dst_len; -+ assert("edward-157", dst_len == tc->lsize); -+ transformed = 1; -+ } -+ if (!transformed) -+ alternate_streams(tc); -+ return result; -+} -+ -+/* This is implementation of readpage method of struct -+ address_space_operations for cryptcompress plugin. */ -+int readpage_cryptcompress(struct file *file, struct page *page) -+{ -+ reiser4_context *ctx; -+ reiser4_cluster_t clust; -+ item_plugin *iplug; -+ int result; -+ -+ assert("edward-88", PageLocked(page)); -+ assert("vs-976", !PageUptodate(page)); -+ assert("edward-89", page->mapping && page->mapping->host); -+ -+ ctx = reiser4_init_context(page->mapping->host->i_sb); -+ if (IS_ERR(ctx)) { -+ unlock_page(page); -+ return PTR_ERR(ctx); -+ } -+ assert("edward-113", -+ ergo(file != NULL, -+ page->mapping == file->f_dentry->d_inode->i_mapping)); -+ -+ if (PageUptodate(page)) { -+ warning("edward-1338", "page is already uptodate\n"); -+ unlock_page(page); -+ reiser4_exit_context(ctx); -+ return 0; -+ } -+ cluster_init_read(&clust, NULL); -+ clust.file = file; -+ iplug = item_plugin_by_id(CTAIL_ID); -+ if (!iplug->s.file.readpage) { -+ unlock_page(page); -+ put_cluster_handle(&clust); -+ reiser4_exit_context(ctx); -+ return -EINVAL; -+ } -+ result = iplug->s.file.readpage(&clust, page); -+ -+ assert("edward-1459", !PageLocked(page)); -+ assert("edward-64", ergo(result == 0, PageUptodate(page))); -+ put_cluster_handle(&clust); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* how much pages will be captured */ -+static int cluster_nrpages_to_capture(reiser4_cluster_t * clust) -+{ -+ switch (clust->op) { -+ case PCL_APPEND: -+ return clust->nr_pages; -+ case PCL_TRUNCATE: -+ 
assert("edward-1179", clust->win != NULL); -+ return count_to_nrpages(clust->win->off + clust->win->count); -+ default: -+ impossible("edward-1180", "bad page cluster option"); -+ return 0; -+ } -+} -+ -+static void set_cluster_pages_dirty(reiser4_cluster_t * clust) -+{ -+ int i; -+ struct page *pg; -+ int nrpages = cluster_nrpages_to_capture(clust); -+ -+ for (i = 0; i < nrpages; i++) { -+ -+ pg = clust->pages[i]; -+ assert("edward-968", pg != NULL); -+ lock_page(pg); -+ assert("edward-1065", PageUptodate(pg)); -+ reiser4_set_page_dirty_internal(pg); -+ unlock_page(pg); -+ mark_page_accessed(pg); -+ } -+} -+ -+static void clear_cluster_pages_dirty(reiser4_cluster_t * clust) -+{ -+ int i; -+ assert("edward-1275", clust != NULL); -+ -+ for (i = 0; i < clust->nr_pages; i++) { -+ assert("edward-1276", clust->pages[i] != NULL); -+ -+ lock_page(clust->pages[i]); -+ if (PageDirty(clust->pages[i])) { -+ assert("edward-1277", PageUptodate(clust->pages[i])); -+ cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE); -+ } -+#if REISER4_DEBUG -+ else -+ /* Race between flush and write: -+ some pages became clean when write() (or another -+ process which modifies data) capture the cluster. 
*/ -+ warning("edward-985", "Page of index %lu (inode %llu)" -+ " is not dirty\n", clust->pages[i]->index, -+ (unsigned long long)get_inode_oid(clust-> -+ pages[i]-> -+ mapping-> -+ host)); -+#endif -+ unlock_page(clust->pages[i]); -+ } -+} -+ -+/* update i_size by window */ -+static void inode_set_new_size(reiser4_cluster_t * clust, struct inode *inode) -+{ -+ loff_t size; -+ reiser4_slide_t *win; -+ -+ assert("edward-1181", clust != NULL); -+ assert("edward-1182", inode != NULL); -+ -+ win = clust->win; -+ assert("edward-1183", win != NULL); -+ assert("edward-1183", win->count != 0); -+ -+ size = clust_to_off(clust->index, inode) + win->off; -+ -+ switch (clust->op) { -+ case PCL_APPEND: -+ if (size + win->count <= inode->i_size) -+ /* overwrite only */ -+ return; -+ size += win->count; -+ break; -+ case PCL_TRUNCATE: -+ break; -+ default: -+ impossible("edward-1184", "bad page cluster option"); -+ break; -+ } -+ inode_check_scale_nolock(inode, inode->i_size, size); -+ inode->i_size = size; -+ return; -+} -+ -+/* Check in page cluster modifications. -+ . Make jnode dirty, if it wasn't; -+ . Reserve space for a disk cluster update by flush algorithm, if needed; -+ . Clean up old references (if any). -+ . 
Put pages (grabbed in this thread) which will be truncated -+*/ /* make_cluster_jnode_dirty_locked(): check in a modified page cluster. @clust: page cluster being captured; clust->reserved must be 1 on entry (asserted) and is 0 on return on both branches. @node: jnode of the cluster; its guard spin lock must already be held (asserted). @old_isize: file size sampled by the caller before it updated i_size. @inode: owner of the cluster. Returns nothing. */ -+static void -+make_cluster_jnode_dirty_locked(reiser4_cluster_t * clust, jnode * node, -+ loff_t * old_isize, struct inode *inode) -+{ -+ int i; -+ int old_nrpages; -+ int new_nrpages = cluster_nrpages_to_capture(clust); /* number of pages this operation captures */ -+ -+ assert("edward-973", new_nrpages > 0); -+ assert("edward-221", node != NULL); -+ assert("edward-971", clust->reserved == 1); -+ assert_spin_locked(&(node->guard)); -+ assert("edward-972", node->page_count <= cluster_nrpages(inode)); -+ assert("edward-1263", -+ clust->reserved_prepped == estimate_update_cluster(inode)); -+ assert("edward-1264", clust->reserved_unprepped == 0); -+ -+ if (JF_ISSET(node, JNODE_DIRTY)) { -+ /* someone has modified this cluster, but -+ the modifications are not committed yet */ -+ old_nrpages = -+ count_to_nrpages(cnt_to_clcnt(*old_isize, -+ clust->index, inode)); /* page count derived from the pre-update file size */ -+ /* free space which is already reserved */ -+ free_reserved4cluster(inode, clust, -+ estimate_update_cluster(inode)); /* this helper also resets clust->reserved to 0 */ -+ /* put old references */ -+ for (i = 0; i < old_nrpages; i++) { -+ assert("edward-975", clust->pages[i]); -+ assert("edward-1185", PageUptodate(clust->pages[i])); -+ -+ page_cache_release(clust->pages[i]); /* one reference dropped per previously captured page */ -+#if REISER4_DEBUG -+ cryptcompress_inode_data(inode)->pgcount --; /* debug-only page accounting */ -+#endif -+ } -+ } else { -+ /* no captured pages */ -+ assert("edward-1043", node->page_count == 0); -+ jnode_make_dirty_locked(node); -+ clust->reserved = 0; /* NOTE(review): presumably the reservation is now owned by the dirty jnode - confirm */ -+ } -+ /* put pages that will be truncated (if any) */ -+ for (i = new_nrpages; i < clust->nr_pages; i++) { -+ assert("edward-1433", clust->pages[i]); -+ assert("edward-1434", PageUptodate(clust->pages[i])); -+ page_cache_release(clust->pages[i]); -+#if REISER4_DEBUG -+ cryptcompress_inode_data(inode)->pgcount --; /* debug-only page accounting */ -+#endif -+ } -+#if REISER4_DEBUG -+ clust->reserved_prepped -= estimate_update_cluster(inode); /* debug-only: the update estimate is no longer tracked on the cluster handle */ -+ node->page_count = new_nrpages; /* debug-only: remember how many pages the jnode now covers */ -+#endif -+ return; -+} -+ -+/* This function spawns a transaction and -+ is called by any thread as
a final step in page cluster modification. -+*/ /* try_capture_cluster(): capture the cluster jnode for writing and mark the page cluster dirty. @clust: page cluster with at least one grabbed page and disk space already reserved (both asserted). @inode: owner of the cluster. Returns the value of reiser4_try_capture(): 0 on success, non-zero on failure. On both paths the jnode spin lock is released and one jnode reference is put. */ -+static int try_capture_cluster(reiser4_cluster_t * clust, struct inode *inode) -+{ -+ int result = 0; -+ loff_t old_size; -+ jnode *node; -+ -+ assert("edward-1029", clust != NULL); -+ assert("edward-1030", clust->reserved == 1); -+ assert("edward-1031", clust->nr_pages != 0); -+ assert("edward-1032", clust->pages != NULL); -+ assert("edward-1033", clust->pages[0] != NULL); -+ -+ node = jprivate(clust->pages[0]); /* the cluster jnode is reached through the first page */ -+ assert("edward-1035", node != NULL); -+ assert("edward-1446", jnode_is_cluster_page(node)); -+ -+ spin_lock_jnode(node); -+ -+ old_size = inode->i_size; /* remembered so the dirtying step can size the previously captured pages */ -+ if (clust->win) -+ inode_set_new_size(clust, inode); /* NOTE(review): i_size is changed before the capture attempt and is not restored if the capture fails - confirm this is intended */ -+ -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ if (result) -+ goto exit; -+ make_cluster_jnode_dirty_locked(clust, node, &old_size, inode); /* requires the jnode spin lock taken above */ -+ exit: /* common exit: reached on success and on capture failure */ -+ spin_unlock_jnode(node); -+ jput(node); /* presumably balances the reference taken when the jnode was attached in grab_cluster_pages_jnode() - verify */ -+ return result; -+} -+ -+/* Collect unlocked cluster pages for any modifications and attach a jnode. -+ We allocate only one jnode per cluster, this jnode is binded to the first -+ page of this cluster, so we have an extra-reference that will exist with -+ this jnode, other references will be cleaned up in flush time. 
-+*/ -+static int -+grab_cluster_pages_jnode(struct inode *inode, reiser4_cluster_t * clust) -+{ -+ int i; -+ int result = 0; -+ jnode *node = NULL; -+ -+ assert("edward-182", clust != NULL); -+ assert("edward-183", clust->pages != NULL); -+ assert("edward-184", clust->nr_pages <= cluster_nrpages(inode)); -+ -+ if (clust->nr_pages == 0) -+ return 0; -+ -+ for (i = 0; i < clust->nr_pages; i++) { -+ -+ assert("edward-1044", clust->pages[i] == NULL); -+ -+ clust->pages[i] = -+ find_or_create_page(inode->i_mapping, -+ clust_to_pg(clust->index, inode) + i, -+ reiser4_ctx_gfp_mask_get()); -+ if (!clust->pages[i]) { -+ result = RETERR(-ENOMEM); -+ break; -+ } -+ if (i == 0) { -+ node = jnode_of_page(clust->pages[i]); -+ if (IS_ERR(node)) { -+ result = PTR_ERR(node); -+ unlock_page(clust->pages[i]); -+ break; -+ } -+ JF_SET(node, JNODE_CLUSTER_PAGE); -+ unlock_page(clust->pages[i]); -+ assert("edward-919", node); -+ continue; -+ } -+ unlock_page(clust->pages[i]); -+ } -+ if (result) { -+ while (i) -+ page_cache_release(clust->pages[--i]); -+ if (node && !IS_ERR(node)) -+ jput(node); -+ return result; -+ } -+ assert("edward-920", jprivate(clust->pages[0])); -+#if REISER4_DEBUG -+ cryptcompress_inode_data(inode)->pgcount += clust->nr_pages; -+#endif -+ return 0; -+} -+ -+/* Collect unlocked cluster pages only for read (not to modify) */ -+int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust) -+{ -+ int i; -+ int result = 0; -+ -+ assert("edward-1428", inode != NULL); -+ assert("edward-1429", inode->i_mapping != NULL); -+ assert("edward-787", clust != NULL); -+ assert("edward-788", clust->pages != NULL); -+ assert("edward-789", clust->nr_pages != 0); -+ assert("edward-790", clust->nr_pages <= cluster_nrpages(inode)); -+ -+ for (i = 0; i < clust->nr_pages; i++) { -+ clust->pages[i] = -+ find_or_create_page(inode->i_mapping, -+ clust_to_pg(clust->index, inode) + i, -+ reiser4_ctx_gfp_mask_get()); -+ if (!clust->pages[i]) { -+ result = RETERR(-ENOMEM); -+ break; 
-+ } -+ unlock_page(clust->pages[i]); -+ } -+ if (result) -+ while (i) -+ page_cache_release(clust->pages[--i]); -+ return result; -+} -+ -+/* @node might be attached by reiser4_writepage(), not by -+ cryptcompress plugin code, but emergency flush should -+ understand that pages of cryptcompress files are not -+ flushable. -+*/ -+#if 0 -+int jnode_of_cluster(const jnode * node, struct page * page) -+{ -+ assert("edward-1339", node != NULL); -+ assert("edward-1340", page != NULL); -+ assert("edward-1341", page->mapping != NULL); -+ assert("edward-1342", page->mapping->host != NULL); -+ assert("edward-1343", -+ ergo(jnode_is_unformatted(node), -+ get_inode_oid(page->mapping->host) == -+ node->key.j.objectid)); -+ if (inode_file_plugin(page->mapping->host) == -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) { -+#if REISER4_DEBUG -+ if (!jnode_is_cluster_page(node)) -+ warning("edward-1345", -+ "inode %llu: cluster page of index %lu became private", -+ (unsigned long long)get_inode_oid(page->mapping->host), -+ page->index); -+#endif -+ return 1; -+ } -+ return 0; -+} -+#endif /* 0 */ -+ -+/* put cluster pages */ -+void reiser4_release_cluster_pages(reiser4_cluster_t * clust) -+{ -+ int i; -+ -+ assert("edward-447", clust != NULL); -+ for (i = 0; i < clust->nr_pages; i++) { -+ -+ assert("edward-449", clust->pages[i] != NULL); -+ -+ page_cache_release(clust->pages[i]); -+ } -+} -+ -+/* this is called when something is failed */ -+static void reiser4_release_cluster_pages_and_jnode(reiser4_cluster_t * clust) -+{ -+ jnode *node; -+ -+ assert("edward-445", clust != NULL); -+ assert("edward-922", clust->pages != NULL); -+ assert("edward-446", clust->pages[0] != NULL); -+ -+ node = jprivate(clust->pages[0]); -+ -+ assert("edward-447", node != NULL); -+ -+ reiser4_release_cluster_pages(clust); -+ jput(node); -+} -+ -+#if REISER4_DEBUG -+static int window_ok(reiser4_slide_t * win, struct inode *inode) -+{ -+ assert("edward-1115", win != NULL); -+ assert("edward-1116", 
ergo(win->delta, win->stat == HOLE_WINDOW)); -+ -+ return (win->off != inode_cluster_size(inode)) && -+ (win->off + win->count + win->delta <= inode_cluster_size(inode)); -+} -+ -+static int cluster_ok(reiser4_cluster_t * clust, struct inode *inode) -+{ -+ assert("edward-279", clust != NULL); -+ -+ if (!clust->pages) -+ return 0; -+ return (clust->win ? window_ok(clust->win, inode) : 1); -+} -+#endif -+ -+/* guess next window stat */ -+static inline window_stat next_window_stat(reiser4_slide_t * win) -+{ -+ assert("edward-1130", win != NULL); -+ return ((win->stat == HOLE_WINDOW && win->delta == 0) ? -+ HOLE_WINDOW : DATA_WINDOW); -+} -+ -+/* guess next cluster index and window params */ -+static void -+update_cluster(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off, -+ loff_t to_file) -+{ -+ reiser4_slide_t *win; -+ -+ assert("edward-185", clust != NULL); -+ assert("edward-438", clust->pages != NULL); -+ assert("edward-281", cluster_ok(clust, inode)); -+ -+ win = clust->win; -+ if (!win) -+ return; -+ -+ switch (win->stat) { -+ case DATA_WINDOW: -+ /* increment window position */ -+ clust->index++; -+ win->stat = DATA_WINDOW; -+ win->off = 0; -+ win->count = min_count(inode_cluster_size(inode), to_file); -+ break; -+ case HOLE_WINDOW: -+ switch (next_window_stat(win)) { -+ case HOLE_WINDOW: -+ /* set window to fit the offset we start write from */ -+ clust->index = off_to_clust(file_off, inode); -+ win->stat = HOLE_WINDOW; -+ win->off = 0; -+ win->count = off_to_cloff(file_off, inode); -+ win->delta = -+ min_count(inode_cluster_size(inode) - win->count, -+ to_file); -+ break; -+ case DATA_WINDOW: -+ /* do not move the window, just change its state, -+ off+count+delta=inv */ -+ win->stat = DATA_WINDOW; -+ win->off = win->off + win->count; -+ win->count = win->delta; -+ win->delta = 0; -+ break; -+ default: -+ impossible("edward-282", "wrong next window state"); -+ } -+ break; -+ default: -+ impossible("edward-283", "wrong current window state"); -+ } 
-+ assert("edward-1068", cluster_ok(clust, inode)); -+} -+ -+static int update_sd_cryptcompress(struct inode *inode) -+{ -+ int result = 0; -+ -+ assert("edward-978", reiser4_schedulable()); -+ -+ result = reiser4_grab_space_force( /* one for stat data update */ -+ estimate_update_common(inode), -+ BA_CAN_COMMIT); -+ if (result) -+ return result; -+ inode->i_ctime = inode->i_mtime = CURRENT_TIME; -+ result = reiser4_update_sd(inode); -+ -+ return result; -+} -+ -+/* NOTE-Edward: this is too similar to reiser4/txnmgr.c:uncapture_jnode() */ -+static void uncapture_cluster_jnode(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert_spin_locked(&(node->guard)); -+ -+ /*jnode_make_clean(node); */ -+ atom = jnode_get_atom(node); -+ if (atom == NULL) { -+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); -+ spin_unlock_jnode(node); -+ return; -+ } -+ -+ reiser4_uncapture_block(node); -+ spin_unlock_atom(atom); -+ jput(node); -+} -+ -+static void forget_cluster_pages(struct page **pages, int nr) -+{ -+ int i; -+ for (i = 0; i < nr; i++) { -+ -+ assert("edward-1045", pages[i] != NULL); -+ page_cache_release(pages[i]); -+ } -+} -+ -+/* Check out last modifications we are about to commit, -+ and prepare input stream for transform operations. 
-+*/ -+int -+flush_cluster_pages(reiser4_cluster_t * clust, jnode * node, -+ struct inode *inode) -+{ -+ int result = 0; -+ int i; -+ int nr_pages = 0; -+ tfm_cluster_t *tc = &clust->tc; -+#if REISER4_DEBUG -+ int node_pgcount; -+#endif -+ assert("edward-980", node != NULL); -+ assert("edward-236", inode != NULL); -+ assert("edward-237", clust != NULL); -+ assert("edward-240", !clust->win); -+ assert("edward-241", reiser4_schedulable()); -+ assert("edward-718", cryptcompress_inode_ok(inode)); -+ -+ result = grab_tfm_stream(inode, tc, INPUT_STREAM); -+ if (result) { -+ warning("edward-1430", -+ "alloc stream failed with ret=%d", result); -+ return result; -+ } -+ spin_lock_jnode(node); -+#if REISER4_DEBUG -+ node_pgcount = node->page_count; -+#endif -+ if (!JF_ISSET(node, JNODE_DIRTY)) { -+ /* race with another flush */ -+#if REISER4_DEBUG -+ assert("edward-981", node_pgcount == 0); -+ warning("edward-982", "flush_cluster_pages: jnode is not dirty " -+ "clust %lu, inode %llu\n", -+ clust->index, (unsigned long long)get_inode_oid(inode)); -+#endif -+ spin_unlock_jnode(node); -+ return RETERR(-E_REPEAT); -+ } -+ /* Check out a size of logical cluster and -+ set a number of cluster pages to commit. 
*/ -+ tc->len = tc->lsize = fsize_to_count(clust, inode); -+ clust->nr_pages = count_to_nrpages(tc->len); -+ -+#if REISER4_DEBUG -+ node->page_count = 0; -+#endif -+ cluster_reserved2grabbed(estimate_update_cluster(inode)); -+ uncapture_cluster_jnode(node); -+ -+ assert("edward-1224", reiser4_schedulable()); -+ /* Check out page cluster for commit */ -+ nr_pages = -+ find_get_pages(inode->i_mapping, clust_to_pg(clust->index, inode), -+ clust->nr_pages, clust->pages); -+ if (nr_pages != clust->nr_pages) -+ goto checkout_failed; -+ -+ /* Try to construct input stream from the checked out pages */ -+ for (i = 0; i < clust->nr_pages; i++) { -+ char *data; -+ -+ assert("edward-242", clust->pages[i] != NULL); -+ if (clust->pages[i]->index != -+ clust_to_pg(clust->index, inode) + i) -+ goto checkout_failed; -+ BUG_ON(!PageUptodate(clust->pages[i])); -+ -+ /* flush the page into input transform stream */ -+ lock_page(clust->pages[i]); -+ data = kmap(clust->pages[i]); -+ -+ assert("edward-986", cnt_to_pgcnt(tc->len, i) != 0); -+ -+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i), -+ data, cnt_to_pgcnt(tc->len, i)); -+ kunmap(clust->pages[i]); -+ unlock_page(clust->pages[i]); -+ } -+ /* page cluster flushed successfully */ -+ -+ clear_cluster_pages_dirty(clust); -+ reiser4_release_cluster_pages(clust); -+#if REISER4_DEBUG -+ cryptcompress_inode_data(inode)->pgcount -= clust->nr_pages; -+#endif -+ goto out; -+ checkout_failed: -+#if REISER4_DEBUG -+ assert("edward-1282", node_pgcount == 0); -+ warning("edward-1435", "Inode %llu : checkout page cluster" -+ "of index %lu failed\n", -+ (unsigned long long)get_inode_oid(inode), clust->index); -+#endif /* REISER4_DEBUG */ -+ result = RETERR(-E_REPEAT); -+ out: -+ /* put pages that were found here */ -+ forget_cluster_pages(clust->pages, nr_pages); -+ return result; -+} -+ -+/* set hint for the cluster of the index @index */ -+static void set_hint_cluster(struct inode *inode, hint_t * hint, -+ cloff_t index, znode_lock_mode 
mode) -+{ -+ reiser4_key key; -+ assert("edward-722", cryptcompress_inode_ok(inode)); -+ assert("edward-723", -+ inode_file_plugin(inode) == -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ -+ inode_file_plugin(inode)->key_by_inode(inode, -+ clust_to_off(index, inode), -+ &key); -+ -+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key); -+ hint->offset = get_key_offset(&key); -+ hint->mode = mode; -+} -+ -+void invalidate_hint_cluster(reiser4_cluster_t * clust) -+{ -+ assert("edward-1291", clust != NULL); -+ assert("edward-1292", clust->hint != NULL); -+ -+ done_lh(&clust->hint->lh); -+ hint_clr_valid(clust->hint); -+} -+ -+void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode, -+ znode_lock_mode mode) -+{ -+ assert("edward-1286", clust != NULL); -+ assert("edward-1287", clust->hint != NULL); -+ -+ set_hint_cluster(inode, clust->hint, clust->index + 1, mode); -+ invalidate_hint_cluster(clust); -+} -+ -+static int -+balance_dirty_page_cluster(reiser4_cluster_t * clust, struct inode *inode, -+ loff_t off, loff_t to_file) -+{ -+ int result; -+ -+ assert("edward-724", inode != NULL); -+ assert("edward-725", cryptcompress_inode_ok(inode)); -+ -+ /* set next window params */ -+ update_cluster(inode, clust, off, to_file); -+ -+ result = update_sd_cryptcompress(inode); -+ if (result) -+ return result; -+ assert("edward-726", clust->hint->lh.owner == NULL); -+ -+ reiser4_throttle_write(inode); -+ return 0; -+} -+ -+/* set zeroes to the cluster, update it, and maybe, try to capture its pages */ -+static int -+write_hole(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off, -+ loff_t to_file) -+{ -+ char *data; -+ int result = 0; -+ unsigned cl_off, cl_count = 0; -+ unsigned to_pg, pg_off; -+ reiser4_slide_t *win; -+ -+ assert("edward-190", clust != NULL); -+ assert("edward-1069", clust->win != NULL); -+ assert("edward-191", inode != NULL); -+ assert("edward-727", cryptcompress_inode_ok(inode)); -+ assert("edward-1171", clust->dstat 
!= INVAL_DISK_CLUSTER); -+ assert("edward-1154", -+ ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1)); -+ -+ win = clust->win; -+ -+ assert("edward-1070", win != NULL); -+ assert("edward-201", win->stat == HOLE_WINDOW); -+ assert("edward-192", cluster_ok(clust, inode)); -+ -+ if (win->off == 0 && win->count == inode_cluster_size(inode)) { -+ /* the hole will be represented by fake disk cluster */ -+ update_cluster(inode, clust, file_off, to_file); -+ return 0; -+ } -+ cl_count = win->count; /* number of zeroes to write */ -+ cl_off = win->off; -+ pg_off = off_to_pgoff(win->off); -+ -+ while (cl_count) { -+ struct page *page; -+ page = clust->pages[off_to_pg(cl_off)]; -+ -+ assert("edward-284", page != NULL); -+ -+ to_pg = min_count(PAGE_CACHE_SIZE - pg_off, cl_count); -+ lock_page(page); -+ data = kmap_atomic(page, KM_USER0); -+ memset(data + pg_off, 0, to_pg); -+ flush_dcache_page(page); -+ kunmap_atomic(data, KM_USER0); -+ SetPageUptodate(page); -+ unlock_page(page); -+ -+ cl_off += to_pg; -+ cl_count -= to_pg; -+ pg_off = 0; -+ } -+ if (!win->delta) { -+ /* only zeroes, try to capture */ -+ -+ set_cluster_pages_dirty(clust); -+ result = try_capture_cluster(clust, inode); -+ if (result) -+ return result; -+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK); -+ result = -+ balance_dirty_page_cluster(clust, inode, file_off, to_file); -+ } else -+ update_cluster(inode, clust, file_off, to_file); -+ return result; -+} -+ -+/* -+ The main disk search procedure for cryptcompress plugins, which -+ . scans all items of disk cluster with the lock mode @mode -+ . maybe reads each one (if @read) -+ . 
maybe makes its znode dirty (if write lock mode was specified) -+ -+ NOTE-EDWARD: Callers should handle the case when disk cluster -+ is incomplete (-EIO) -+*/ -+int find_disk_cluster(reiser4_cluster_t * clust, -+ struct inode *inode, int read, znode_lock_mode mode) -+{ -+ flow_t f; -+ hint_t *hint; -+ int result = 0; -+ unsigned long cl_idx; -+ ra_info_t ra_info; -+ file_plugin *fplug; -+ item_plugin *iplug; -+ tfm_cluster_t *tc; -+ int was_grabbed; -+ -+ assert("edward-138", clust != NULL); -+ assert("edward-728", clust->hint != NULL); -+ assert("edward-226", reiser4_schedulable()); -+ assert("edward-137", inode != NULL); -+ assert("edward-729", cryptcompress_inode_ok(inode)); -+ -+ hint = clust->hint; -+ cl_idx = clust->index; -+ fplug = inode_file_plugin(inode); -+ was_grabbed = get_current_context()->grabbed_blocks; -+ tc = &clust->tc; -+ -+ assert("edward-462", !tfm_cluster_is_uptodate(tc)); -+ assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM))); -+ -+ dclust_init_extension(hint); -+ -+ /* set key of the first disk cluster item */ -+ fplug->flow_by_inode(inode, -+ (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL), -+ 0 /* kernel space */ , -+ inode_scaled_cluster_size(inode), -+ clust_to_off(cl_idx, inode), READ_OP, &f); -+ if (mode == ZNODE_WRITE_LOCK) { -+ /* reserve for flush to make dirty all the leaf nodes -+ which contain disk cluster */ -+ result = -+ reiser4_grab_space_force(estimate_dirty_cluster(inode), -+ BA_CAN_COMMIT); -+ if (result) -+ goto out; -+ } -+ -+ ra_info.key_to_stop = f.key; -+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key())); -+ -+ while (f.length) { -+ result = find_cluster_item(hint, &f.key, mode, -+ NULL, FIND_EXACT, -+ (mode == ZNODE_WRITE_LOCK ? 
-+ CBK_FOR_INSERT : 0)); -+ switch (result) { -+ case CBK_COORD_NOTFOUND: -+ result = 0; -+ if (inode_scaled_offset -+ (inode, -+ clust_to_off(cl_idx, -+ inode)) == get_key_offset(&f.key)) { -+ /* first item not found, this is treated -+ as disk cluster is absent */ -+ clust->dstat = FAKE_DISK_CLUSTER; -+ goto out; -+ } -+ /* we are outside the cluster, stop search here */ -+ assert("edward-146", -+ f.length != inode_scaled_cluster_size(inode)); -+ goto ok; -+ case CBK_COORD_FOUND: -+ assert("edward-148", -+ hint->ext_coord.coord.between == AT_UNIT); -+ assert("edward-460", -+ hint->ext_coord.coord.unit_pos == 0); -+ -+ coord_clear_iplug(&hint->ext_coord.coord); -+ result = zload_ra(hint->ext_coord.coord.node, &ra_info); -+ if (unlikely(result)) -+ goto out; -+ iplug = item_plugin_by_coord(&hint->ext_coord.coord); -+ assert("edward-147", -+ item_id_by_coord(&hint->ext_coord.coord) == -+ CTAIL_ID); -+ -+ result = iplug->s.file.read(NULL, &f, hint); -+ if (result) { -+ zrelse(hint->ext_coord.coord.node); -+ goto out; -+ } -+ if (mode == ZNODE_WRITE_LOCK) { -+ /* Don't make dirty more nodes then it was -+ estimated (see comments before -+ estimate_dirty_cluster). Missed nodes will be -+ read up in flush time if they are evicted from -+ memory */ -+ if (dclust_get_extension_ncount(hint) <= -+ estimate_dirty_cluster(inode)) -+ znode_make_dirty(hint->ext_coord.coord.node); -+ -+ znode_set_convertible(hint->ext_coord.coord. 
-+ node); -+ } -+ zrelse(hint->ext_coord.coord.node); -+ break; -+ default: -+ goto out; -+ } -+ } -+ ok: -+ /* at least one item was found */ -+ /* NOTE-EDWARD: Callers should handle the case -+ when disk cluster is incomplete (-EIO) */ -+ tc->len = inode_scaled_cluster_size(inode) - f.length; -+ tc->lsize = fsize_to_count(clust, inode); -+ assert("edward-1196", tc->len > 0); -+ assert("edward-1406", tc->lsize > 0); -+ -+ if (hint_is_unprepped_dclust(clust->hint)) -+ clust->dstat = UNPR_DISK_CLUSTER; -+ else { -+ dclust_set_extension_dsize(clust->hint, tc->len); -+ clust->dstat = PREP_DISK_CLUSTER; -+ } -+ out: -+ assert("edward-1339", -+ get_current_context()->grabbed_blocks >= was_grabbed); -+ grabbed2free(get_current_context(), -+ get_current_super_private(), -+ get_current_context()->grabbed_blocks - was_grabbed); -+ return result; -+} -+ -+int -+get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode, -+ znode_lock_mode lock_mode) -+{ -+ reiser4_key key; -+ ra_info_t ra_info; -+ -+ assert("edward-730", reiser4_schedulable()); -+ assert("edward-731", clust != NULL); -+ assert("edward-732", inode != NULL); -+ -+ if (hint_is_valid(clust->hint)) { -+ assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER); -+ assert("edward-1294", -+ znode_is_write_locked(clust->hint->lh.node)); -+ /* already have a valid locked position */ -+ return (clust->dstat == -+ FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND : -+ CBK_COORD_FOUND); -+ } -+ key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode), -+ &key); -+ ra_info.key_to_stop = key; -+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key())); -+ -+ return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT, -+ CBK_FOR_INSERT); -+} -+ -+/* Read needed cluster pages before modifying. -+ If success, @clust->hint contains locked position in the tree. -+ Also: -+ . find and set disk cluster state -+ . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER. 
-+*/ -+static int -+read_some_cluster_pages(struct inode *inode, reiser4_cluster_t * clust) -+{ -+ int i; -+ int result = 0; -+ item_plugin *iplug; -+ reiser4_slide_t *win = clust->win; -+ znode_lock_mode mode = ZNODE_WRITE_LOCK; -+ -+ iplug = item_plugin_by_id(CTAIL_ID); -+ -+ assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc)); -+ -+#if REISER4_DEBUG -+ if (clust->nr_pages == 0) { -+ /* start write hole from fake disk cluster */ -+ assert("edward-1117", win != NULL); -+ assert("edward-1118", win->stat == HOLE_WINDOW); -+ assert("edward-1119", new_cluster(clust, inode)); -+ } -+#endif -+ if (new_cluster(clust, inode)) { -+ /* -+ new page cluster is about to be written, nothing to read, -+ */ -+ assert("edward-734", reiser4_schedulable()); -+ assert("edward-735", clust->hint->lh.owner == NULL); -+ -+ if (clust->nr_pages) { -+ int off; -+ char *data; -+ struct page * pg; -+ assert("edward-1419", clust->pages != NULL); -+ pg = clust->pages[clust->nr_pages - 1]; -+ assert("edward-1420", pg != NULL); -+ off = off_to_pgoff(win->off+win->count+win->delta); -+ if (off) { -+ lock_page(pg); -+ data = kmap_atomic(pg, KM_USER0); -+ memset(data + off, 0, PAGE_CACHE_SIZE - off); -+ flush_dcache_page(pg); -+ kunmap_atomic(data, KM_USER0); -+ unlock_page(pg); -+ } -+ } -+ clust->dstat = FAKE_DISK_CLUSTER; -+ return 0; -+ } -+ /* -+ Here we should search for disk cluster to figure out its real state. 
-+ Also there is one more important reason to do disk search: we need -+ to make disk cluster _dirty_ if it exists -+ */ -+ -+ /* if windows is specified, read the only pages -+ that will be modified partially */ -+ -+ for (i = 0; i < clust->nr_pages; i++) { -+ struct page *pg = clust->pages[i]; -+ -+ lock_page(pg); -+ if (PageUptodate(pg)) { -+ unlock_page(pg); -+ continue; -+ } -+ unlock_page(pg); -+ -+ if (win && -+ i >= count_to_nrpages(win->off) && -+ i < off_to_pg(win->off + win->count + win->delta)) -+ /* page will be completely overwritten */ -+ continue; -+ -+ if (win && (i == clust->nr_pages - 1) && -+ /* the last page is -+ partially modified, -+ not uptodate .. */ -+ (count_to_nrpages(inode->i_size) <= pg->index)) { -+ /* .. and appended, -+ so set zeroes to the rest */ -+ char *data; -+ int offset; -+ lock_page(pg); -+ data = kmap_atomic(pg, KM_USER0); -+ -+ assert("edward-1260", -+ count_to_nrpages(win->off + win->count + -+ win->delta) - 1 == i); -+ -+ offset = -+ off_to_pgoff(win->off + win->count + win->delta); -+ memset(data + offset, 0, PAGE_CACHE_SIZE - offset); -+ flush_dcache_page(pg); -+ kunmap_atomic(data, KM_USER0); -+ unlock_page(pg); -+ /* still not uptodate */ -+ break; -+ } -+ if (!tfm_cluster_is_uptodate(&clust->tc)) { -+ result = ctail_read_disk_cluster(clust, inode, mode); -+ if (result) -+ goto out; -+ assert("edward-925", -+ tfm_cluster_is_uptodate(&clust->tc)); -+ } -+ lock_page(pg); -+ result = do_readpage_ctail(inode, clust, pg, mode); -+ unlock_page(pg); -+ if (result) { -+ impossible("edward-219", -+ "do_readpage_ctail returned crap"); -+ goto out; -+ } -+ } -+ if (!tfm_cluster_is_uptodate(&clust->tc)) { -+ /* disk cluster unclaimed, but we need to make its znodes dirty -+ to make flush update convert its content */ -+ result = find_disk_cluster(clust, inode, 0 /* do not read items */, -+ mode); -+ } -+ out: -+ tfm_cluster_clr_uptodate(&clust->tc); -+ return result; -+} -+ -+static int 
-+should_create_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode) -+{ -+ assert("edward-737", clust != NULL); -+ -+ switch (clust->dstat) { -+ case PREP_DISK_CLUSTER: -+ case UNPR_DISK_CLUSTER: -+ return 0; -+ case FAKE_DISK_CLUSTER: -+ if (clust->win && -+ clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) { -+ assert("edward-1172", new_cluster(clust, inode)); -+ return 0; -+ } -+ return 1; -+ default: -+ impossible("edward-1173", "bad disk cluster state"); -+ return 0; -+ } -+} -+ -+static int -+cryptcompress_make_unprepped_cluster(reiser4_cluster_t * clust, -+ struct inode *inode) -+{ -+ int result; -+ -+ assert("edward-1123", reiser4_schedulable()); -+ assert("edward-737", clust != NULL); -+ assert("edward-738", inode != NULL); -+ assert("edward-739", cryptcompress_inode_ok(inode)); -+ assert("edward-1053", clust->hint != NULL); -+ -+ if (!should_create_unprepped_cluster(clust, inode)) { -+ if (clust->reserved) { -+ cluster_reserved2free(estimate_insert_cluster(inode)); -+#if REISER4_DEBUG -+ assert("edward-1267", -+ clust->reserved_unprepped == -+ estimate_insert_cluster(inode)); -+ clust->reserved_unprepped -= -+ estimate_insert_cluster(inode); -+#endif -+ } -+ return 0; -+ } -+ assert("edward-1268", clust->reserved); -+ cluster_reserved2grabbed(estimate_insert_cluster(inode)); -+#if REISER4_DEBUG -+ assert("edward-1441", -+ clust->reserved_unprepped == estimate_insert_cluster(inode)); -+ clust->reserved_unprepped -= estimate_insert_cluster(inode); -+#endif -+ result = ctail_insert_unprepped_cluster(clust, inode); -+ if (result) -+ return result; -+ -+ inode_add_bytes(inode, inode_cluster_size(inode)); -+ -+ assert("edward-743", cryptcompress_inode_ok(inode)); -+ assert("edward-744", znode_is_write_locked(clust->hint->lh.node)); -+ -+ clust->dstat = UNPR_DISK_CLUSTER; -+ return 0; -+} -+ -+#if REISER4_DEBUG -+static int jnode_truncate_ok(struct inode *inode, cloff_t index) -+{ -+ jnode *node; -+ node = -+ jlookup(current_tree, 
get_inode_oid(inode), -+ clust_to_pg(index, inode)); -+ if (likely(!node)) -+ return 1; -+ /* someone got this jnode */ -+ warning("edward-1315", "jnode %p is untruncated\n", node); -+ jput(node); -+ return (atomic_read(&node->x_count)); -+} -+#endif -+ -+/* Collect unlocked cluster pages and jnode (the last is in the -+ case when the page cluster will be modified and captured) */ -+int -+prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust, -+ int capture) -+{ -+ assert("edward-177", inode != NULL); -+ assert("edward-741", cryptcompress_inode_ok(inode)); -+ assert("edward-740", clust->pages != NULL); -+ -+ set_cluster_nrpages(clust, inode); -+ reset_cluster_pgset(clust, cluster_nrpages(inode)); -+ return (capture ? -+ grab_cluster_pages_jnode(inode, clust) : -+ grab_cluster_pages(inode, clust)); -+} -+ -+/* Truncate all pages of the cluster of index @index. -+ This is called by ->kill_hook() method of item plugin */ -+void truncate_page_cluster_cryptcompress(struct inode *inode, cloff_t index, -+ int even_cows) -+{ -+ int i; -+ int found = 0; -+ int nr_pages; -+ jnode *node; -+ struct page *pages[MAX_CLUSTER_NRPAGES]; -+ -+ node = -+ jlookup(current_tree, get_inode_oid(inode), -+ clust_to_pg(index, inode)); -+ /* jnode is absent, just drop pages which can not -+ acquire jnode because of exclusive access */ -+ if (!node) -+ goto truncate; -+ /* jnode is present and may be dirty */ -+ nr_pages = count_to_nrpages(cnt_to_clcnt(inode->i_size, index, inode)); -+ -+ found = find_get_pages(inode->i_mapping, clust_to_pg(index, inode), -+ nr_pages, pages); -+ spin_lock_jnode(node); -+ -+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) -+ && index == 0) -+ /* converting to unix_file in progress */ -+ JF_CLR(node, JNODE_CLUSTER_PAGE); -+ if (JF_ISSET(node, JNODE_DIRTY)) { -+ /* someone has done modifications which are not -+ yet committed, so we need to release some resources */ -+ -+ /* free disk space grabbed for disk cluster converting */ -+ 
cluster_reserved2grabbed(estimate_update_cluster(inode)); -+ grabbed2free(get_current_context(), -+ get_current_super_private(), -+ estimate_update_cluster(inode)); -+ -+ assert("edward-1198", found == nr_pages); -+ assert("edward-1199", node->page_count == nr_pages); -+#if REISER4_DEBUG -+ node->page_count = 0; -+#endif -+ /* This will clear dirty bit */ -+ uncapture_cluster_jnode(node); -+ -+ /* put pages grabbed for last uncommitted modifications */ -+ for (i = 0; i < nr_pages; i++) { -+ assert("edward-1200", PageUptodate(pages[i])); -+ page_cache_release(pages[i]); -+#if REISER4_DEBUG -+ cryptcompress_inode_data(inode)->pgcount --; -+#endif -+ } -+ } else -+ spin_unlock_jnode(node); -+ /* FIXME-EDWARD: Use truncate_complete_page in the loop above instead */ -+ -+ jput(node); -+ /* put pages found here */ -+ forget_cluster_pages(pages, found); -+ truncate: -+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) && -+ index == 0) -+ return; -+ reiser4_invalidate_pages(inode->i_mapping, -+ clust_to_pg(index, inode), -+ cluster_nrpages(inode), -+ even_cows); -+ assert("edward-1201", -+ ergo(!reiser4_inode_get_flag(inode, -+ REISER4_FILE_CONV_IN_PROGRESS), -+ jnode_truncate_ok(inode, index))); -+ return; -+} -+ -+/* Prepare cluster handle before(after) modifications -+ which are supposed to be committed. -+ -+ . grab cluster pages; -+ . reserve disk space; -+ . maybe read pages from disk and set the disk cluster dirty; -+ . maybe write hole; -+ . maybe create 'unprepped' disk cluster if the last one is fake -+ (i.e. 
is not represenred by any items) -+*/ -+ -+static int -+prepare_cluster(struct inode *inode, -+ loff_t file_off /* write position in the file */ , -+ loff_t to_file, /* bytes of users data to write to the file */ -+ reiser4_cluster_t * clust, page_cluster_op op) -+{ -+ int result = 0; -+ reiser4_slide_t *win = clust->win; -+ -+ reset_cluster_params(clust); -+ cluster_set_tfm_act(&clust->tc, TFMA_READ); -+#if REISER4_DEBUG -+ clust->ctx = get_current_context(); -+#endif -+ assert("edward-1190", op != PCL_UNKNOWN); -+ -+ clust->op = op; -+ -+ result = prepare_page_cluster(inode, clust, 1); -+ if (result) -+ return result; -+ assert("edward-1447", -+ ergo(clust->nr_pages != 0, jprivate(clust->pages[0]))); -+ assert("edward-1448", -+ ergo(clust->nr_pages != 0, -+ jnode_is_cluster_page(jprivate(clust->pages[0])))); -+ -+ result = reserve4cluster(inode, clust); -+ if (result) -+ goto err1; -+ result = read_some_cluster_pages(inode, clust); -+ if (result) { -+ free_reserved4cluster(inode, -+ clust, -+ estimate_update_cluster(inode) + -+ estimate_insert_cluster(inode)); -+ goto err1; -+ } -+ assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER); -+ -+ result = cryptcompress_make_unprepped_cluster(clust, inode); -+ if (result) -+ goto err2; -+ if (win && win->stat == HOLE_WINDOW) { -+ result = write_hole(inode, clust, file_off, to_file); -+ if (result) -+ goto err2; -+ } -+ return 0; -+ err2: -+ free_reserved4cluster(inode, clust, -+ estimate_update_cluster(inode)); -+ err1: -+ reiser4_release_cluster_pages_and_jnode(clust); -+ assert("edward-1125", result == -ENOSPC); -+ return result; -+} -+ -+/* set window by two offsets */ -+static void -+set_window(reiser4_cluster_t * clust, reiser4_slide_t * win, -+ struct inode *inode, loff_t o1, loff_t o2) -+{ -+ assert("edward-295", clust != NULL); -+ assert("edward-296", inode != NULL); -+ assert("edward-1071", win != NULL); -+ assert("edward-297", o1 <= o2); -+ -+ clust->index = off_to_clust(o1, inode); -+ -+ win->off = 
off_to_cloff(o1, inode); -+ win->count = min_count(inode_cluster_size(inode) - win->off, o2 - o1); -+ win->delta = 0; -+ -+ clust->win = win; -+} -+ -+static int -+set_cluster_by_window(struct inode *inode, reiser4_cluster_t * clust, -+ reiser4_slide_t * win, flow_t * f, loff_t file_off) -+{ -+ int result; -+ -+ assert("edward-197", clust != NULL); -+ assert("edward-1072", win != NULL); -+ assert("edward-198", inode != NULL); -+ -+ result = alloc_cluster_pgset(clust, cluster_nrpages(inode)); -+ if (result) -+ return result; -+ -+ if (file_off > inode->i_size) { -+ /* Uhmm, hole in cryptcompress file... */ -+ loff_t hole_size; -+ hole_size = file_off - inode->i_size; -+ -+ set_window(clust, win, inode, inode->i_size, file_off); -+ win->stat = HOLE_WINDOW; -+ if (win->off + hole_size < inode_cluster_size(inode)) -+ /* there is also user's data to append to the hole */ -+ win->delta = -+ min_count(inode_cluster_size(inode) - -+ (win->off + win->count), f->length); -+ return 0; -+ } -+ set_window(clust, win, inode, file_off, file_off + f->length); -+ win->stat = DATA_WINDOW; -+ return 0; -+} -+ -+int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page, -+ int count) -+{ -+ int result = 0; -+ int (*setting_actor)(reiser4_cluster_t * clust, int count); -+ -+ assert("edward-1358", clust != NULL); -+ assert("edward-1359", page != NULL); -+ assert("edward-1360", page->mapping != NULL); -+ assert("edward-1361", page->mapping->host != NULL); -+ -+ setting_actor = (clust->pages ? 
reset_cluster_pgset : alloc_cluster_pgset); -+ result = setting_actor(clust, count); -+ clust->index = pg_to_clust(page->index, page->mapping->host); -+ return result; -+} -+ -+/* reset all the params that not get updated */ -+void reset_cluster_params(reiser4_cluster_t * clust) -+{ -+ assert("edward-197", clust != NULL); -+ -+ clust->dstat = INVAL_DISK_CLUSTER; -+ clust->tc.uptodate = 0; -+ clust->tc.len = 0; -+} -+ -+/* Core write procedure of cryptcompress plugin, which slices user's -+ flow into logical clusters, maps the last ones to the appropriate -+ page clusters, and tries to capture them. -+ If @buf != NULL, returns number of successfully written bytes, -+ otherwise returns error -+*/ -+static loff_t -+write_cryptcompress_flow(struct file *file, struct inode *inode, -+ const char __user *buf, size_t count, loff_t pos, -+ int *conv_occured) -+{ -+ int i; -+ flow_t f; -+ hint_t *hint; -+ int result = 0; -+ size_t to_write = 0; -+ loff_t file_off; -+ reiser4_slide_t win; -+ reiser4_cluster_t clust; -+ -+ assert("edward-161", reiser4_schedulable()); -+ assert("edward-748", cryptcompress_inode_ok(inode)); -+ assert("edward-159", current_blocksize == PAGE_CACHE_SIZE); -+ assert("edward-1274", get_current_context()->grabbed_blocks == 0); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ -+ result = load_file_hint(file, hint); -+ if (result) { -+ kfree(hint); -+ return result; -+ } -+ -+ result = -+ flow_by_inode_cryptcompress(inode, buf, 1 /* user space */ , -+ count, pos, WRITE_OP, &f); -+ if (result) -+ goto out; -+ to_write = f.length; -+ -+ /* current write position in file */ -+ file_off = pos; -+ reiser4_slide_init(&win); -+ cluster_init_read(&clust, &win); -+ clust.hint = hint; -+ -+ result = set_cluster_by_window(inode, &clust, &win, &f, file_off); -+ if (result) -+ goto out; -+ -+ if (next_window_stat(&win) == HOLE_WINDOW) { -+ result = write_conversion_hook(file, inode, pos, &clust, 
NULL); -+ if (result) -+ goto out; -+ result = -+ prepare_cluster(inode, file_off, f.length, &clust, -+ PCL_APPEND); -+ if (result) -+ goto out; -+ } -+ do { -+ char *src; -+ unsigned page_off, page_count; -+ -+ assert("edward-750", reiser4_schedulable()); -+ -+ result = write_conversion_hook(file, inode, pos, &clust, -+ conv_occured); -+ if (result || *conv_occured) -+ goto out; -+ result = -+ prepare_cluster(inode, file_off, f.length, &clust, -+ PCL_APPEND); -+ if (result) -+ goto out; -+ -+ assert("edward-751", cryptcompress_inode_ok(inode)); -+ assert("edward-204", win.stat == DATA_WINDOW); -+ assert("edward-1288", hint_is_valid(clust.hint)); -+ assert("edward-752", -+ znode_is_write_locked(hint->ext_coord.coord.node)); -+ -+ put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK); -+ -+ /* set write position in page */ -+ page_off = off_to_pgoff(win.off); -+ -+ /* copy user's data to cluster pages */ -+ for (i = off_to_pg(win.off), src = f.data; -+ i < count_to_nrpages(win.off + win.count); -+ i++, src += page_count) { -+ page_count = -+ cnt_to_pgcnt(win.off + win.count, i) - page_off; -+ -+ assert("edward-1039", -+ page_off + page_count <= PAGE_CACHE_SIZE); -+ assert("edward-287", clust.pages[i] != NULL); -+ -+ lock_page(clust.pages[i]); -+ result = -+ __copy_from_user((char *)kmap(clust.pages[i]) + -+ page_off, (char __user *)src, page_count); -+ kunmap(clust.pages[i]); -+ if (unlikely(result)) { -+ unlock_page(clust.pages[i]); -+ result = -EFAULT; -+ goto err2; -+ } -+ SetPageUptodate(clust.pages[i]); -+ unlock_page(clust.pages[i]); -+ page_off = 0; -+ } -+ assert("edward-753", cryptcompress_inode_ok(inode)); -+ -+ set_cluster_pages_dirty(&clust); -+ -+ result = try_capture_cluster(&clust, inode); -+ if (result) -+ goto err2; -+ -+ assert("edward-998", f.user == 1); -+ -+ move_flow_forward(&f, win.count); -+ -+ /* disk cluster may be already clean at this point */ -+ -+ /* . update cluster -+ . set hint for new offset -+ . unlock znode -+ . update inode -+ . 
balance dirty pages -+ */ -+ result = balance_dirty_page_cluster(&clust, inode, 0, f.length); -+ if (result) -+ goto err1; -+ assert("edward-755", hint->lh.owner == NULL); -+ reset_cluster_params(&clust); -+ continue; -+ err2: -+ reiser4_release_cluster_pages_and_jnode(&clust); -+ err1: -+ if (clust.reserved) -+ free_reserved4cluster(inode, -+ &clust, -+ estimate_update_cluster(inode)); -+ break; -+ } while (f.length); -+ out: -+ done_lh(&hint->lh); -+ if (result == -EEXIST) -+ warning("edward-1407", "write returns EEXIST!\n"); -+ -+ put_cluster_handle(&clust); -+ save_file_hint(file, hint); -+ kfree(hint); -+ if (buf) { -+ /* if nothing were written - there must be an error */ -+ assert("edward-195", ergo((to_write == f.length), -+ (result < 0 || *conv_occured))); -+ return (to_write - f.length) ? (to_write - f.length) : result; -+ } -+ return result; -+} -+ -+/** -+ * write_cryptcompress - write of struct file_operations -+ * @file: file to write to -+ * @buf: address of user-space buffer -+ * @read_amount: number of bytes to write -+ * @off: position in file to write to -+ * -+ * This is implementation of vfs's write method of struct file_operations for -+ * cryptcompress plugin. 
-+ */ -+ssize_t write_cryptcompress(struct file *file, const char __user *buf, -+ size_t count, loff_t *off, int *conv) -+{ -+ ssize_t result; -+ struct inode *inode; -+ reiser4_context *ctx; -+ loff_t pos = *off; -+ cryptcompress_info_t *info; -+ -+ assert("edward-1449", *conv == 0); -+ -+ inode = file->f_dentry->d_inode; -+ assert("edward-196", cryptcompress_inode_ok(inode)); -+ -+ info = cryptcompress_inode_data(inode); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ mutex_lock(&inode->i_mutex); -+ -+ result = generic_write_checks(file, &pos, &count, 0); -+ if (unlikely(result != 0)) -+ goto out; -+ if (unlikely(count == 0)) -+ goto out; -+ result = remove_suid(file->f_dentry); -+ if (unlikely(result != 0)) -+ goto out; -+ /* remove_suid might create a transaction */ -+ reiser4_txn_restart(ctx); -+ -+ result = write_cryptcompress_flow(file, inode, buf, count, pos, conv); -+ -+ if (result < 0) -+ goto out; -+ /* update position in a file */ -+ *off = pos + result; -+ out: -+ mutex_unlock(&inode->i_mutex); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+int readpages_cryptcompress(struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ reiser4_context * ctx; -+ int ret; -+ -+ ctx = reiser4_init_context(mapping->host->i_sb); -+ if (IS_ERR(ctx)) { -+ ret = PTR_ERR(ctx); -+ goto err; -+ } -+ /* crc files can be built of ctail items only */ -+ ret = readpages_ctail(file, mapping, pages); -+ reiser4_exit_context(ctx); -+ if (ret) { -+err: -+ put_pages_list(pages); -+ } -+ return ret; -+} -+ -+static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode) -+{ -+ /* reserve one block to update stat data item */ -+ assert("edward-1193", -+ inode_file_plugin(inode)->estimate.update == -+ estimate_update_common); -+ return estimate_update_common(inode); -+} -+ -+/** -+ * read_cryptcompress - read of struct file_operations 
-+ * @file: file to read from -+ * @buf: address of user-space buffer -+ * @read_amount: number of bytes to read -+ * @off: position in file to read from -+ * -+ * This is implementation of vfs's read method of struct file_operations for -+ * cryptcompress plugin. -+ */ -+ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size, -+ loff_t * off) -+{ -+ ssize_t result; -+ struct inode *inode; -+ reiser4_context *ctx; -+ cryptcompress_info_t *info; -+ reiser4_block_nr needed; -+ -+ inode = file->f_dentry->d_inode; -+ assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ info = cryptcompress_inode_data(inode); -+ needed = cryptcompress_estimate_read(inode); -+ -+ result = reiser4_grab_space(needed, BA_CAN_COMMIT); -+ if (result != 0) { -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ LOCK_CNT_INC(inode_sem_r); -+ -+ result = do_sync_read(file, buf, size, off); -+ -+ LOCK_CNT_DEC(inode_sem_r); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ -+ return result; -+} -+ -+/* If @index > 0, find real disk cluster of the index (@index - 1), -+ If @index == 0 find the real disk cluster of the object of maximal index. -+ Keep incremented index of the result in @found. -+ It succes was returned: -+ (@index == 0 && @found == 0) means that the object doesn't have real disk -+ clusters. -+ (@index != 0 && @found == 0) means that disk cluster of (@index -1) doesn't -+ exist. 
-+*/ -+static int -+find_real_disk_cluster(struct inode *inode, cloff_t * found, cloff_t index) -+{ -+ int result; -+ reiser4_key key; -+ loff_t offset; -+ hint_t *hint; -+ lock_handle *lh; -+ lookup_bias bias; -+ coord_t *coord; -+ item_plugin *iplug; -+ -+ assert("edward-1131", inode != NULL); -+ assert("edward-95", cryptcompress_inode_ok(inode)); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN); -+ offset = -+ (index ? clust_to_off(index, inode) - -+ 1 : get_key_offset(reiser4_max_key())); -+ -+ key_by_inode_cryptcompress(inode, offset, &key); -+ -+ /* find the last item of this object */ -+ result = -+ find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */, -+ bias, 0); -+ if (cbk_errored(result)) { -+ done_lh(lh); -+ kfree(hint); -+ return result; -+ } -+ if (result == CBK_COORD_NOTFOUND) { -+ /* no real disk clusters */ -+ done_lh(lh); -+ kfree(hint); -+ *found = 0; -+ return 0; -+ } -+ /* disk cluster is found */ -+ coord = &hint->ext_coord.coord; -+ coord_clear_iplug(coord); -+ result = zload(coord->node); -+ if (unlikely(result)) { -+ done_lh(lh); -+ kfree(hint); -+ return result; -+ } -+ iplug = item_plugin_by_coord(coord); -+ assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID)); -+ assert("edward-1202", ctail_ok(coord)); -+ -+ item_key_by_coord(coord, &key); -+ *found = off_to_clust(get_key_offset(&key), inode) + 1; -+ -+ assert("edward-1132", ergo(index, index == *found)); -+ -+ zrelse(coord->node); -+ done_lh(lh); -+ kfree(hint); -+ return 0; -+} -+ -+static int find_fake_appended(struct inode *inode, cloff_t * index) -+{ -+ return find_real_disk_cluster(inode, index, -+ 0 /* find last real one */ ); -+} -+ -+/* Set left coord when unit is not found after node_lookup() -+ This takes into account that there can be holes in a sequence -+ of disk clusters */ -+ -+static void 
adjust_left_coord(coord_t * left_coord) -+{ -+ switch (left_coord->between) { -+ case AFTER_UNIT: -+ left_coord->between = AFTER_ITEM; -+ case AFTER_ITEM: -+ case BEFORE_UNIT: -+ break; -+ default: -+ impossible("edward-1204", "bad left coord to cut"); -+ } -+ return; -+} -+ -+#define CRC_CUT_TREE_MIN_ITERATIONS 64 -+int -+cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed, -+ struct inode *object, int truncate, int *progress) -+{ -+ lock_handle next_node_lock; -+ coord_t left_coord; -+ int result; -+ -+ assert("edward-1158", tap->coord->node != NULL); -+ assert("edward-1159", znode_is_write_locked(tap->coord->node)); -+ assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL); -+ -+ *progress = 0; -+ init_lh(&next_node_lock); -+ -+ while (1) { -+ znode *node; /* node from which items are cut */ -+ node_plugin *nplug; /* node plugin for @node */ -+ -+ node = tap->coord->node; -+ -+ /* Move next_node_lock to the next node on the left. */ -+ result = -+ reiser4_get_left_neighbor(&next_node_lock, node, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result != 0 && result != -E_NO_NEIGHBOR) -+ break; -+ /* FIXME-EDWARD: Check can we delete the node as a whole. 
*/ -+ result = reiser4_tap_load(tap); -+ if (result) -+ return result; -+ -+ /* Prepare the second (right) point for cut_node() */ -+ if (*progress) -+ coord_init_last_unit(tap->coord, node); -+ -+ else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL) -+ /* set rightmost unit for the items without lookup method */ -+ tap->coord->unit_pos = coord_last_unit_pos(tap->coord); -+ -+ nplug = node->nplug; -+ -+ assert("edward-1161", nplug); -+ assert("edward-1162", nplug->lookup); -+ -+ /* left_coord is leftmost unit cut from @node */ -+ result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord); -+ -+ if (IS_CBKERR(result)) -+ break; -+ -+ if (result == CBK_COORD_NOTFOUND) -+ adjust_left_coord(&left_coord); -+ -+ /* adjust coordinates so that they are set to existing units */ -+ if (coord_set_to_right(&left_coord) -+ || coord_set_to_left(tap->coord)) { -+ result = 0; -+ break; -+ } -+ -+ if (coord_compare(&left_coord, tap->coord) == -+ COORD_CMP_ON_RIGHT) { -+ /* keys from @from_key to @to_key are not in the tree */ -+ result = 0; -+ break; -+ } -+ -+ /* cut data from one node */ -+ *smallest_removed = *reiser4_min_key(); -+ result = kill_node_content(&left_coord, -+ tap->coord, -+ from_key, -+ to_key, -+ smallest_removed, -+ next_node_lock.node, -+ object, truncate); -+#if REISER4_DEBUG -+ /*node_check(node, ~0U); */ -+#endif -+ reiser4_tap_relse(tap); -+ -+ if (result) -+ break; -+ -+ ++(*progress); -+ -+ /* Check whether all items with keys >= from_key were removed -+ * from the tree. */ -+ if (keyle(smallest_removed, from_key)) -+ /* result = 0; */ -+ break; -+ -+ if (next_node_lock.node == NULL) -+ break; -+ -+ result = reiser4_tap_move(tap, &next_node_lock); -+ done_lh(&next_node_lock); -+ if (result) -+ break; -+ -+ /* Break long cut_tree operation (deletion of a large file) if -+ * atom requires commit. 
*/ -+ if (*progress > CRC_CUT_TREE_MIN_ITERATIONS -+ && current_atom_should_commit()) { -+ result = -E_REPEAT; -+ break; -+ } -+ } -+ done_lh(&next_node_lock); -+ return result; -+} -+ -+/* Append or expand hole in two steps (exclusive access should be aquired!) -+ 1) write zeroes to the current real cluster, -+ 2) expand hole via fake clusters (just increase i_size) */ -+static int -+cryptcompress_append_hole(struct inode *inode /*contains old i_size */ , -+ loff_t new_size) -+{ -+ int result = 0; -+ hint_t *hint; -+ lock_handle *lh; -+ loff_t hole_size; -+ int nr_zeroes; -+ reiser4_slide_t win; -+ reiser4_cluster_t clust; -+ -+ assert("edward-1133", inode->i_size < new_size); -+ assert("edward-1134", reiser4_schedulable()); -+ assert("edward-1135", cryptcompress_inode_ok(inode)); -+ assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE); -+ assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ reiser4_slide_init(&win); -+ cluster_init_read(&clust, &win); -+ clust.hint = hint; -+ -+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); -+ if (result) -+ goto out; -+ if (off_to_cloff(inode->i_size, inode) == 0) -+ goto fake_append; -+ hole_size = new_size - inode->i_size; -+ nr_zeroes = -+ inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode); -+ if (hole_size < nr_zeroes) -+ nr_zeroes = hole_size; -+ set_window(&clust, &win, inode, inode->i_size, -+ inode->i_size + nr_zeroes); -+ win.stat = HOLE_WINDOW; -+ -+ assert("edward-1137", -+ clust.index == off_to_clust(inode->i_size, inode)); -+ -+ result = prepare_cluster(inode, 0, 0, &clust, PCL_APPEND); -+ -+ assert("edward-1271", !result || result == -ENOSPC); -+ if (result) -+ goto out; -+ assert("edward-1139", -+ clust.dstat == PREP_DISK_CLUSTER || -+ clust.dstat == UNPR_DISK_CLUSTER); -+ -+ assert("edward-1431", hole_size 
>= nr_zeroes); -+ if (hole_size == nr_zeroes) -+ /* nothing to append anymore */ -+ goto out; -+ fake_append: -+ INODE_SET_FIELD(inode, i_size, new_size); -+ out: -+ done_lh(lh); -+ kfree(hint); -+ put_cluster_handle(&clust); -+ return result; -+} -+ -+#if REISER4_DEBUG -+static int -+pages_truncate_ok(struct inode *inode, loff_t old_size, pgoff_t start) -+{ -+ struct pagevec pvec; -+ int i; -+ int count; -+ int rest; -+ -+ rest = count_to_nrpages(old_size) - start; -+ -+ pagevec_init(&pvec, 0); -+ count = min_count(pagevec_space(&pvec), rest); -+ -+ while (rest) { -+ count = min_count(pagevec_space(&pvec), rest); -+ pvec.nr = find_get_pages(inode->i_mapping, start, -+ count, pvec.pages); -+ for (i = 0; i < pagevec_count(&pvec); i++) { -+ if (PageUptodate(pvec.pages[i])) { -+ warning("edward-1205", -+ "truncated page of index %lu is uptodate", -+ pvec.pages[i]->index); -+ return 0; -+ } -+ } -+ start += count; -+ rest -= count; -+ pagevec_release(&pvec); -+ } -+ return 1; -+} -+ -+static int body_truncate_ok(struct inode *inode, cloff_t aidx) -+{ -+ int result; -+ cloff_t raidx; -+ -+ result = find_fake_appended(inode, &raidx); -+ return !result && (aidx == raidx); -+} -+#endif -+ -+static int -+update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd) -+{ -+ return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1) -+ ? 0 : reiser4_update_file_size(inode, key, update_sd)); -+} -+ -+/* prune cryptcompress file in two steps (exclusive access should be acquired!) 
-+ 1) cut all disk clusters but the last one partially truncated, -+ 2) set zeroes and capture last partially truncated page cluster if the last -+ one exists, otherwise truncate via prune fake cluster (just decrease i_size) -+*/ -+static int -+prune_cryptcompress(struct inode *inode, loff_t new_size, int update_sd, -+ cloff_t aidx) -+{ -+ int result = 0; -+ unsigned nr_zeroes; -+ loff_t to_prune; -+ loff_t old_size; -+ cloff_t ridx; -+ -+ hint_t *hint; -+ lock_handle *lh; -+ reiser4_slide_t win; -+ reiser4_cluster_t clust; -+ -+ assert("edward-1140", inode->i_size >= new_size); -+ assert("edward-1141", reiser4_schedulable()); -+ assert("edward-1142", cryptcompress_inode_ok(inode)); -+ assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE); -+ -+ old_size = inode->i_size; -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ reiser4_slide_init(&win); -+ cluster_init_read(&clust, &win); -+ clust.hint = hint; -+ -+ /* rightmost completely truncated cluster */ -+ ridx = count_to_nrclust(new_size, inode); -+ -+ assert("edward-1174", ridx <= aidx); -+ old_size = inode->i_size; -+ if (ridx != aidx) { -+ result = cut_file_items(inode, -+ clust_to_off(ridx, inode), -+ update_sd, -+ clust_to_off(aidx, inode), -+ update_cryptcompress_size); -+ if (result) -+ goto out; -+ } -+ if (!off_to_cloff(new_size, inode)) { -+ /* no partially truncated clusters */ -+ assert("edward-1145", inode->i_size == new_size); -+ goto finish; -+ } -+ assert("edward-1146", new_size < inode->i_size); -+ -+ to_prune = inode->i_size - new_size; -+ -+ /* partial truncate of leftmost cluster, -+ first check if it is fake */ -+ result = find_real_disk_cluster(inode, &aidx, ridx); -+ if (result) -+ goto out; -+ if (!aidx) -+ /* yup, this is fake one */ -+ goto finish; -+ -+ assert("edward-1148", aidx == ridx); -+ -+ /* do partial truncate of the leftmost page cluster, -+ then try to capture 
this one */ -+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); -+ if (result) -+ goto out; -+ nr_zeroes = (off_to_pgoff(new_size) ? -+ PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0); -+ set_window(&clust, &win, inode, new_size, new_size + nr_zeroes); -+ win.stat = HOLE_WINDOW; -+ -+ assert("edward-1149", clust.index == ridx - 1); -+ -+ result = prepare_cluster(inode, 0, 0, &clust, PCL_TRUNCATE); -+ if (result) -+ goto out; -+ assert("edward-1151", -+ clust.dstat == PREP_DISK_CLUSTER || -+ clust.dstat == UNPR_DISK_CLUSTER); -+ -+ assert("edward-1191", inode->i_size == new_size); -+ assert("edward-1206", body_truncate_ok(inode, ridx)); -+ finish: -+ /* drop all the pages that don't have jnodes (i.e. pages -+ which can not be truncated by cut_file_items() because -+ of holes represented by fake disk clusters) including -+ the pages of partially truncated cluster which was -+ released by prepare_cluster() */ -+ truncate_inode_pages(inode->i_mapping, new_size); -+ INODE_SET_FIELD(inode, i_size, new_size); -+ out: -+ assert("edward-1334", !result || result == -ENOSPC); -+ assert("edward-1209", -+ pages_truncate_ok(inode, old_size, count_to_nrpages(new_size))); -+ done_lh(lh); -+ kfree(hint); -+ put_cluster_handle(&clust); -+ return result; -+} -+ -+/* Prepare cryptcompress file for truncate: -+ prune or append rightmost fake logical clusters (if any) -+*/ -+static int -+start_truncate_fake(struct inode *inode, cloff_t aidx, loff_t new_size, -+ int update_sd) -+{ -+ int result = 0; -+ int bytes; -+ -+ if (new_size > inode->i_size) { -+ /* append */ -+ if (inode->i_size < clust_to_off(aidx, inode)) -+ /* no fake bytes */ -+ return 0; -+ bytes = new_size - inode->i_size; -+ INODE_SET_FIELD(inode, i_size, inode->i_size + bytes); -+ } else { -+ /* prune */ -+ if (inode->i_size <= clust_to_off(aidx, inode)) -+ /* no fake bytes */ -+ return 0; -+ bytes = -+ inode->i_size - max_count(new_size, -+ clust_to_off(aidx, inode)); -+ if (!bytes) -+ return 0; -+ 
INODE_SET_FIELD(inode, i_size, inode->i_size - bytes); -+ /* In the case of fake prune we need to drop page cluster. -+ There are only 2 cases for partially truncated page: -+ 1. If is is dirty, therefore it is anonymous -+ (was dirtied via mmap), and will be captured -+ later via ->capture(). -+ 2. If is clean, therefore it is filled by zeroes. -+ In both cases we don't need to make it dirty and -+ capture here. -+ */ -+ truncate_inode_pages(inode->i_mapping, inode->i_size); -+ } -+ if (update_sd) -+ result = update_sd_cryptcompress(inode); -+ return result; -+} -+ -+/* This is called in setattr_cryptcompress when it is used to truncate, -+ and in delete_cryptcompress */ -+static int cryptcompress_truncate(struct inode *inode, /* old size */ -+ loff_t new_size, /* new size */ -+ int update_sd) -+{ -+ int result; -+ cloff_t aidx; -+ -+ result = find_fake_appended(inode, &aidx); -+ if (result) -+ return result; -+ assert("edward-1208", -+ ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode))); -+ -+ result = start_truncate_fake(inode, aidx, new_size, update_sd); -+ if (result) -+ return result; -+ if (inode->i_size == new_size) -+ /* nothing to truncate anymore */ -+ return 0; -+ result = (inode->i_size < new_size ? -+ cryptcompress_append_hole(inode, new_size) : -+ prune_cryptcompress(inode, new_size, update_sd, aidx)); -+ if (!result && update_sd) -+ result = update_sd_cryptcompress(inode); -+ return result; -+} -+ -+static void clear_moved_tag_cluster(struct address_space * mapping, -+ reiser4_cluster_t * clust) -+{ -+ int i; -+ void * ret; -+ read_lock_irq(&mapping->tree_lock); -+ for (i = 0; i < clust->nr_pages; i++) { -+ assert("edward-1438", clust->pages[i] != NULL); -+ ret = radix_tree_tag_clear(&mapping->page_tree, -+ clust->pages[i]->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ assert("edward-1439", ret == clust->pages[i]); -+ } -+ read_unlock_irq(&mapping->tree_lock); -+} -+ -+/* Capture an anonymous pager cluster. 
(Page cluser is -+ anonymous if it contains at least one anonymous page */ -+static int -+capture_page_cluster(reiser4_cluster_t * clust, struct inode *inode) -+{ -+ int result; -+ -+ assert("edward-1073", clust != NULL); -+ assert("edward-1074", inode != NULL); -+ assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER); -+ -+ result = prepare_cluster(inode, 0, 0, clust, PCL_APPEND); -+ if (result) -+ return result; -+ set_cluster_pages_dirty(clust); -+ clear_moved_tag_cluster(inode->i_mapping, clust); -+ -+ result = try_capture_cluster(clust, inode); -+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK); -+ if (unlikely(result)) { -+ /* set cleared tag back, so it will be -+ possible to capture it again later */ -+ read_lock_irq(&inode->i_mapping->tree_lock); -+ radix_tree_tag_set(&inode->i_mapping->page_tree, -+ clust_to_pg(clust->index, inode), -+ PAGECACHE_TAG_REISER4_MOVED); -+ read_unlock_irq(&inode->i_mapping->tree_lock); -+ -+ reiser4_release_cluster_pages_and_jnode(clust); -+ } -+ return result; -+} -+ -+#define MAX_CLUSTERS_TO_CAPTURE(inode) (1024 >> cluster_nrpages_shift(inode)) -+ -+/* read lock should be acquired */ -+static int -+capture_anonymous_clusters(struct address_space *mapping, pgoff_t * index, -+ int to_capture) -+{ -+ int result = 0; -+ int found; -+ struct page *page = NULL; -+ hint_t *hint; -+ lock_handle *lh; -+ reiser4_cluster_t clust; -+ -+ assert("edward-1127", mapping != NULL); -+ assert("edward-1128", mapping->host != NULL); -+ assert("edward-1440", mapping->host->i_mapping == mapping); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ cluster_init_read(&clust, NULL); -+ clust.hint = hint; -+ -+ result = alloc_cluster_pgset(&clust, cluster_nrpages(mapping->host)); -+ if (result) -+ goto out; -+ -+ while (to_capture > 0) { -+ found = -+ find_get_pages_tag(mapping, index, -+ PAGECACHE_TAG_REISER4_MOVED, 1, &page); -+ if 
(!found) { -+ *index = (pgoff_t) - 1; -+ break; -+ } -+ assert("edward-1109", page != NULL); -+ -+ move_cluster_forward(&clust, mapping->host, page->index); -+ result = capture_page_cluster(&clust, mapping->host); -+ page_cache_release(page); -+ if (result) -+ break; -+ to_capture -= clust.nr_pages; -+ } -+ if (result) { -+ warning("edward-1077", -+ "Cannot capture anon pages: result=%i (captured=%d)\n", -+ result, -+ ((__u32) MAX_CLUSTERS_TO_CAPTURE(mapping->host)) - -+ to_capture); -+ } else { -+ /* something had to be found */ -+ assert("edward-1078", -+ to_capture <= MAX_CLUSTERS_TO_CAPTURE(mapping->host)); -+ if (to_capture <= 0) -+ /* there may be left more pages */ -+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); -+ } -+ out: -+ done_lh(lh); -+ kfree(hint); -+ put_cluster_handle(&clust); -+ return result; -+} -+ -+/* Check mapping for existence of not captured dirty pages. -+ This returns !0 if either page tree contains pages tagged -+ PAGECACHE_TAG_REISER4_MOVED */ -+static int cryptcompress_inode_has_anon_pages(struct inode *inode) -+{ -+ return mapping_tagged(inode->i_mapping, PAGECACHE_TAG_REISER4_MOVED); -+} -+ -+/* this is implementation of vfs's writepages method of struct -+ address_space_operations */ -+int -+writepages_cryptcompress(struct address_space *mapping, -+ struct writeback_control *wbc) -+{ -+ int result; -+ int to_capture; -+ pgoff_t nrpages; -+ pgoff_t index = 0; -+ cryptcompress_info_t *info; -+ struct inode *inode; -+ -+ inode = mapping->host; -+ if (!cryptcompress_inode_has_anon_pages(inode)) { -+ result = 0; -+ goto end; -+ } -+ -+ info = cryptcompress_inode_data(inode); -+ nrpages = count_to_nrpages(i_size_read(inode)); -+ -+ if (wbc->sync_mode != WB_SYNC_ALL) -+ to_capture = -+ min_count(wbc->nr_to_write, MAX_CLUSTERS_TO_CAPTURE(inode)); -+ else -+ to_capture = MAX_CLUSTERS_TO_CAPTURE(inode); -+ do { -+ reiser4_context *ctx; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ result = PTR_ERR(ctx); -+ 
break; -+ } -+ ctx->nobalance = 1; -+ -+ assert("edward-1079", -+ lock_stack_isclean(get_current_lock_stack())); -+ -+ LOCK_CNT_INC(inode_sem_r); -+ -+ result = -+ capture_anonymous_clusters(inode->i_mapping, &index, -+ to_capture); -+ -+ if (result != 0 || wbc->sync_mode != WB_SYNC_ALL) { -+ reiser4_exit_context(ctx); -+ break; -+ } -+ result = txnmgr_force_commit_all(inode->i_sb, 0); -+ reiser4_exit_context(ctx); -+ } while (result == 0 && index < nrpages); -+ -+ end: -+ if (is_in_reiser4_context()) { -+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) { -+ /* there are already pages to flush, flush them out, do -+ not delay until end of reiser4_sync_inodes */ -+ reiser4_writeout(inode->i_sb, wbc); -+ get_current_context()->nr_captured = 0; -+ } -+ } -+ return result; -+} -+ -+/* plugin->u.file.mmap */ -+int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma) -+{ -+ int result; -+ struct inode *inode; -+ reiser4_context *ctx; -+ -+ inode = file->f_dentry->d_inode; -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ /* -+ * generic_file_mmap will do update_atime. Grab space for stat data -+ * update. 
-+ */ -+ result = reiser4_grab_space_force -+ (inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+ if (result) { -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ result = generic_file_mmap(file, vma); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* plugin->u.file.release */ -+/* plugin->u.file.get_block */ -+ -+/* this is implementation of delete method of file plugin for -+ cryptcompress objects */ -+int delete_object_cryptcompress(struct inode *inode) -+{ -+ int result; -+ -+ assert("edward-429", inode->i_nlink == 0); -+ -+ reiser4_txn_restart_current(); -+ -+ result = cryptcompress_truncate(inode, 0, 0); -+ if (result) { -+ warning("edward-430", -+ "cannot truncate cryptcompress file %lli: %i", -+ (unsigned long long)get_inode_oid(inode), -+ result); -+ } -+ truncate_inode_pages(inode->i_mapping, 0); -+ /* and remove stat data */ -+ return reiser4_delete_object_common(inode); -+} -+ -+/* plugin->u.file.setattr method -+ This implements actual truncate (see comments in reiser4/page_cache.c) */ -+int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr) -+{ -+ int result; -+ struct inode *inode; -+ -+ inode = dentry->d_inode; -+ if (attr->ia_valid & ATTR_SIZE) { -+ if (inode->i_size != attr->ia_size) { -+ reiser4_context *ctx; -+ loff_t old_size; -+ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ inode_check_scale(inode, inode->i_size, attr->ia_size); -+ -+ old_size = inode->i_size; -+ -+ result = -+ cryptcompress_truncate(inode, attr->ia_size, -+ 1 /* update stat data */ ); -+ if (result) { -+ warning("edward-1192", -+ "truncate_cryptcompress failed: oid %lli, " -+ "old size %lld, new size %lld, retval %d", -+ (unsigned long long) -+ get_inode_oid(inode), old_size, -+ attr->ia_size, result); -+ } -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ } else -+ result = 0; -+ } else -+ result = reiser4_setattr_common(dentry, attr); -+ return 
result; -+} -+ -+/* sendfile_cryptcompress - sendfile of struct file_operations */ -+ssize_t -+sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count, -+ read_actor_t actor, void *target) -+{ -+ reiser4_context *ctx; -+ ssize_t result; -+ struct inode *inode; -+ cryptcompress_info_t *info; -+ -+ inode = file->f_dentry->d_inode; -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ /* -+ * generic_file_sndfile may want to call update_atime. Grab space for -+ * stat data update -+ */ -+ result = reiser4_grab_space(estimate_update_common(inode), -+ BA_CAN_COMMIT); -+ if (result) -+ goto exit; -+ info = cryptcompress_inode_data(inode); -+ -+ result = generic_file_sendfile(file, ppos, count, actor, target); -+ exit: -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* -+ * release_cryptcompress - release of struct file_operations -+ * @inode: inode of released file -+ * @file: file to release -+ */ -+int release_cryptcompress(struct inode *inode, struct file *file) -+{ -+ reiser4_context *ctx = reiser4_init_context(inode->i_sb); -+ -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ reiser4_free_file_fsdata(file); -+ reiser4_exit_context(ctx); -+ return 0; -+} -+ -+#if 0 -+int prepare_write_cryptcompress(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ return prepare_write_common(file, page, from, to); -+} -+#endif /* 0 */ -+ -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.20/fs/reiser4/plugin/file/cryptcompress.h ---- linux-2.6.20.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/file/cryptcompress.h 2007-05-06 14:50:43.774999471 +0400 -@@ -0,0 +1,554 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ 
-+/* See http://www.namesys.com/cryptcompress_design.html */ -+ -+#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ ) -+#define __FS_REISER4_CRYPTCOMPRESS_H__ -+ -+#include "../../page_cache.h" -+#include "../compress/compress.h" -+#include "../crypto/cipher.h" -+ -+#include -+ -+#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT -+#define MAX_CLUSTER_SHIFT 16 -+#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT) -+#define DC_CHECKSUM_SIZE 4 -+ -+#define MIN_LATTICE_FACTOR 1 -+#define MAX_LATTICE_FACTOR 32 -+ -+/* this mask contains all non-standard plugins that might -+ be present in reiser4-specific part of inode managed by -+ cryptcompress file plugin */ -+#define cryptcompress_mask \ -+ ((1 << PSET_FILE) | \ -+ (1 << PSET_CLUSTER) | \ -+ (1 << PSET_CIPHER) | \ -+ (1 << PSET_DIGEST) | \ -+ (1 << PSET_COMPRESSION) | \ -+ (1 << PSET_COMPRESSION_MODE)) -+ -+static inline loff_t min_count(loff_t a, loff_t b) -+{ -+ return (a < b ? a : b); -+} -+ -+static inline loff_t max_count(loff_t a, loff_t b) -+{ -+ return (a > b ? 
a : b); -+} -+ -+#if REISER4_DEBUG -+static inline int cluster_shift_ok(int shift) -+{ -+ return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT); -+} -+#endif -+ -+typedef struct tfm_stream { -+ __u8 *data; -+ size_t size; -+} tfm_stream_t; -+ -+typedef enum { -+ INPUT_STREAM, -+ OUTPUT_STREAM, -+ LAST_STREAM -+} tfm_stream_id; -+ -+typedef tfm_stream_t *tfm_unit[LAST_STREAM]; -+ -+static inline __u8 *ts_data(tfm_stream_t * stm) -+{ -+ assert("edward-928", stm != NULL); -+ return stm->data; -+} -+ -+static inline size_t ts_size(tfm_stream_t * stm) -+{ -+ assert("edward-929", stm != NULL); -+ return stm->size; -+} -+ -+static inline void set_ts_size(tfm_stream_t * stm, size_t size) -+{ -+ assert("edward-930", stm != NULL); -+ -+ stm->size = size; -+} -+ -+static inline int alloc_ts(tfm_stream_t ** stm) -+{ -+ assert("edward-931", stm); -+ assert("edward-932", *stm == NULL); -+ -+ *stm = kmalloc(sizeof **stm, reiser4_ctx_gfp_mask_get()); -+ if (*stm == NULL) -+ return -ENOMEM; -+ memset(*stm, 0, sizeof **stm); -+ return 0; -+} -+ -+static inline void free_ts(tfm_stream_t * stm) -+{ -+ assert("edward-933", !ts_data(stm)); -+ assert("edward-934", !ts_size(stm)); -+ -+ kfree(stm); -+} -+ -+static inline int alloc_ts_data(tfm_stream_t * stm, size_t size) -+{ -+ assert("edward-935", !ts_data(stm)); -+ assert("edward-936", !ts_size(stm)); -+ assert("edward-937", size != 0); -+ -+ stm->data = reiser4_vmalloc(size); -+ if (!stm->data) -+ return -ENOMEM; -+ set_ts_size(stm, size); -+ return 0; -+} -+ -+static inline void free_ts_data(tfm_stream_t * stm) -+{ -+ assert("edward-938", equi(ts_data(stm), ts_size(stm))); -+ -+ if (ts_data(stm)) -+ vfree(ts_data(stm)); -+ memset(stm, 0, sizeof *stm); -+} -+ -+/* Write modes for item conversion in flush convert phase */ -+typedef enum { -+ CRC_APPEND_ITEM = 1, -+ CRC_OVERWRITE_ITEM = 2, -+ CRC_CUT_ITEM = 3 -+} cryptcompress_write_mode_t; -+ -+typedef enum { -+ PCL_UNKNOWN = 0, /* invalid option */ -+ PCL_APPEND = 1, /* 
append and/or overwrite */ -+ PCL_TRUNCATE = 2 /* truncate */ -+} page_cluster_op; -+ -+/* Reiser4 file write/read transforms page cluster into disk cluster (and back) -+ using crypto/compression transforms implemented by reiser4 transform plugins. -+ Before each transform we allocate a pair of streams (tfm_unit) and assemble -+ page cluster into the input one. After transform we split output stream into -+ a set of items (disk cluster). -+*/ -+typedef struct tfm_cluster { -+ coa_set coa; -+ tfm_unit tun; -+ tfm_action act; -+ int uptodate; -+ int lsize; /* size of the logical cluster */ -+ int len; /* length of the transform stream */ -+} tfm_cluster_t; -+ -+static inline coa_t get_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act) -+{ -+ return tc->coa[id][act]; -+} -+ -+static inline void -+set_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act, coa_t coa) -+{ -+ tc->coa[id][act] = coa; -+} -+ -+static inline int -+alloc_coa(tfm_cluster_t * tc, compression_plugin * cplug) -+{ -+ coa_t coa; -+ -+ coa = cplug->alloc(tc->act); -+ if (IS_ERR(coa)) -+ return PTR_ERR(coa); -+ set_coa(tc, cplug->h.id, tc->act, coa); -+ return 0; -+} -+ -+static inline int -+grab_coa(tfm_cluster_t * tc, compression_plugin * cplug) -+{ -+ return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ? 
-+ alloc_coa(tc, cplug) : 0); -+} -+ -+static inline void free_coa_set(tfm_cluster_t * tc) -+{ -+ tfm_action j; -+ reiser4_compression_id i; -+ compression_plugin *cplug; -+ -+ assert("edward-810", tc != NULL); -+ -+ for (j = 0; j < TFMA_LAST; j++) -+ for (i = 0; i < LAST_COMPRESSION_ID; i++) { -+ if (!get_coa(tc, i, j)) -+ continue; -+ cplug = compression_plugin_by_id(i); -+ assert("edward-812", cplug->free != NULL); -+ cplug->free(get_coa(tc, i, j), j); -+ set_coa(tc, i, j, 0); -+ } -+ return; -+} -+ -+static inline tfm_stream_t *tfm_stream(tfm_cluster_t * tc, tfm_stream_id id) -+{ -+ return tc->tun[id]; -+} -+ -+static inline void -+set_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id, tfm_stream_t * ts) -+{ -+ tc->tun[id] = ts; -+} -+ -+static inline __u8 *tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id) -+{ -+ return ts_data(tfm_stream(tc, id)); -+} -+ -+static inline void -+set_tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id, __u8 * data) -+{ -+ tfm_stream(tc, id)->data = data; -+} -+ -+static inline size_t tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id) -+{ -+ return ts_size(tfm_stream(tc, id)); -+} -+ -+static inline void -+set_tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id, size_t size) -+{ -+ tfm_stream(tc, id)->size = size; -+} -+ -+static inline int -+alloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id) -+{ -+ assert("edward-939", tc != NULL); -+ assert("edward-940", !tfm_stream(tc, id)); -+ -+ tc->tun[id] = kmalloc(sizeof(tfm_stream_t), reiser4_ctx_gfp_mask_get()); -+ if (!tc->tun[id]) -+ return -ENOMEM; -+ memset(tfm_stream(tc, id), 0, sizeof(tfm_stream_t)); -+ return alloc_ts_data(tfm_stream(tc, id), size); -+} -+ -+static inline int -+realloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id) -+{ -+ assert("edward-941", tfm_stream_size(tc, id) < size); -+ free_ts_data(tfm_stream(tc, id)); -+ return alloc_ts_data(tfm_stream(tc, id), size); -+} -+ -+static inline void free_tfm_stream(tfm_cluster_t * tc, 
tfm_stream_id id) -+{ -+ free_ts_data(tfm_stream(tc, id)); -+ free_ts(tfm_stream(tc, id)); -+ set_tfm_stream(tc, id, 0); -+} -+ -+static inline unsigned coa_overrun(compression_plugin * cplug, int ilen) -+{ -+ return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0); -+} -+ -+static inline void free_tfm_unit(tfm_cluster_t * tc) -+{ -+ tfm_stream_id id; -+ for (id = 0; id < LAST_STREAM; id++) { -+ if (!tfm_stream(tc, id)) -+ continue; -+ free_tfm_stream(tc, id); -+ } -+} -+ -+static inline void put_tfm_cluster(tfm_cluster_t * tc) -+{ -+ assert("edward-942", tc != NULL); -+ free_coa_set(tc); -+ free_tfm_unit(tc); -+} -+ -+static inline int tfm_cluster_is_uptodate(tfm_cluster_t * tc) -+{ -+ assert("edward-943", tc != NULL); -+ assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1); -+ return (tc->uptodate == 1); -+} -+ -+static inline void tfm_cluster_set_uptodate(tfm_cluster_t * tc) -+{ -+ assert("edward-945", tc != NULL); -+ assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1); -+ tc->uptodate = 1; -+ return; -+} -+ -+static inline void tfm_cluster_clr_uptodate(tfm_cluster_t * tc) -+{ -+ assert("edward-947", tc != NULL); -+ assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1); -+ tc->uptodate = 0; -+ return; -+} -+ -+static inline int tfm_stream_is_set(tfm_cluster_t * tc, tfm_stream_id id) -+{ -+ return (tfm_stream(tc, id) && -+ tfm_stream_data(tc, id) && tfm_stream_size(tc, id)); -+} -+ -+static inline int tfm_cluster_is_set(tfm_cluster_t * tc) -+{ -+ int i; -+ for (i = 0; i < LAST_STREAM; i++) -+ if (!tfm_stream_is_set(tc, i)) -+ return 0; -+ return 1; -+} -+ -+static inline void alternate_streams(tfm_cluster_t * tc) -+{ -+ tfm_stream_t *tmp = tfm_stream(tc, INPUT_STREAM); -+ -+ set_tfm_stream(tc, INPUT_STREAM, tfm_stream(tc, OUTPUT_STREAM)); -+ set_tfm_stream(tc, OUTPUT_STREAM, tmp); -+} -+ -+/* a kind of data that we can write to the window */ -+typedef enum { -+ DATA_WINDOW, /* the data we copy form user space */ -+ HOLE_WINDOW /* zeroes 
if we write hole */ -+} window_stat; -+ -+/* Sliding window of cluster size which should be set to the approprite position -+ (defined by cluster index) in a file before page cluster modification by -+ file_write. Then we translate file size, offset to write from, number of -+ bytes to write, etc.. to the following configuration needed to estimate -+ number of pages to read before write, etc... -+*/ -+typedef struct reiser4_slide { -+ unsigned off; /* offset we start to write/truncate from */ -+ unsigned count; /* number of bytes (zeroes) to write/truncate */ -+ unsigned delta; /* number of bytes to append to the hole */ -+ window_stat stat; /* a kind of data to write to the window */ -+} reiser4_slide_t; -+ -+/* The following is a set of possible disk cluster states */ -+typedef enum { -+ INVAL_DISK_CLUSTER, /* unknown state */ -+ PREP_DISK_CLUSTER, /* disk cluster got converted by flush -+ at least 1 time */ -+ UNPR_DISK_CLUSTER, /* disk cluster just created and should be -+ converted by flush */ -+ FAKE_DISK_CLUSTER /* disk cluster doesn't exist neither in memory -+ nor on disk */ -+} disk_cluster_stat; -+ -+/* -+ While implementing all transforms (from page to disk cluster, and back) -+ reiser4 cluster manager fills the following structure incapsulating pointers -+ to all the clusters for the same index including the sliding window above -+*/ -+typedef struct reiser4_cluster { -+ tfm_cluster_t tc; /* transform cluster */ -+ int nr_pages; /* number of pages */ -+ struct page **pages; /* page cluster */ -+ page_cluster_op op; /* page cluster operation */ -+ struct file *file; -+ hint_t *hint; /* disk cluster item for traversal */ -+ disk_cluster_stat dstat; /* state of the current disk cluster */ -+ cloff_t index; /* offset in the units of cluster size */ -+ int index_valid; /* to validate the index above, if needed */ -+ reiser4_slide_t *win; /* sliding window of cluster size */ -+ int reserved; /* this indicates that space for disk -+ cluster modification is 
reserved */ -+#if REISER4_DEBUG -+ reiser4_context *ctx; -+ int reserved_prepped; -+ int reserved_unprepped; -+#endif -+ -+} reiser4_cluster_t; -+ -+static inline __u8 * tfm_input_data (reiser4_cluster_t * clust) -+{ -+ return tfm_stream_data(&clust->tc, INPUT_STREAM); -+} -+ -+static inline __u8 * tfm_output_data (reiser4_cluster_t * clust) -+{ -+ return tfm_stream_data(&clust->tc, OUTPUT_STREAM); -+} -+ -+static inline int reset_cluster_pgset(reiser4_cluster_t * clust, int nrpages) -+{ -+ assert("edward-1057", clust->pages != NULL); -+ memset(clust->pages, 0, sizeof(*clust->pages) * nrpages); -+ return 0; -+} -+ -+static inline int alloc_cluster_pgset(reiser4_cluster_t * clust, int nrpages) -+{ -+ assert("edward-949", clust != NULL); -+ assert("edward-1362", clust->pages == NULL); -+ assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES); -+ -+ clust->pages = -+ kmalloc(sizeof(*clust->pages) * nrpages, -+ reiser4_ctx_gfp_mask_get()); -+ if (!clust->pages) -+ return RETERR(-ENOMEM); -+ reset_cluster_pgset(clust, nrpages); -+ return 0; -+} -+ -+static inline void free_cluster_pgset(reiser4_cluster_t * clust) -+{ -+ assert("edward-951", clust->pages != NULL); -+ kfree(clust->pages); -+ clust->pages = NULL; -+} -+ -+static inline void put_cluster_handle(reiser4_cluster_t * clust) -+{ -+ assert("edward-435", clust != NULL); -+ -+ put_tfm_cluster(&clust->tc); -+ if (clust->pages) -+ free_cluster_pgset(clust); -+ memset(clust, 0, sizeof *clust); -+} -+ -+static inline void inc_keyload_count(crypto_stat_t * data) -+{ -+ assert("edward-1410", data != NULL); -+ data->keyload_count++; -+} -+ -+static inline void dec_keyload_count(crypto_stat_t * data) -+{ -+ assert("edward-1411", data != NULL); -+ assert("edward-1412", data->keyload_count > 0); -+ data->keyload_count--; -+} -+ -+/* cryptcompress specific part of reiser4_inode */ -+typedef struct cryptcompress_info { -+ crypto_stat_t *crypt; -+ /* the following 2 fields are controlled by compression mode plugin 
*/ -+ int compress_toggle; /* current status of compressibility */ -+ int lattice_factor; /* factor of dynamic lattice. FIXME: Have a -+ compression_toggle to keep the factor */ -+#if REISER4_DEBUG -+ int pgcount; /* number of captured pages */ -+#endif -+} cryptcompress_info_t; -+ -+static inline void set_compression_toggle (cryptcompress_info_t * info, int val) -+{ -+ info->compress_toggle = val; -+} -+ -+static inline int get_compression_toggle (cryptcompress_info_t * info) -+{ -+ return info->compress_toggle; -+} -+ -+static inline int compression_is_on(cryptcompress_info_t * info) -+{ -+ return get_compression_toggle(info) == 1; -+} -+ -+static inline void turn_on_compression(cryptcompress_info_t * info) -+{ -+ set_compression_toggle(info, 1); -+} -+ -+static inline void turn_off_compression(cryptcompress_info_t * info) -+{ -+ set_compression_toggle(info, 0); -+} -+ -+static inline void set_lattice_factor(cryptcompress_info_t * info, int val) -+{ -+ info->lattice_factor = val; -+} -+ -+static inline int get_lattice_factor(cryptcompress_info_t * info) -+{ -+ return info->lattice_factor; -+} -+ -+cryptcompress_info_t *cryptcompress_inode_data(const struct inode *); -+int equal_to_rdk(znode *, const reiser4_key *); -+int goto_right_neighbor(coord_t *, lock_handle *); -+int cryptcompress_inode_ok(struct inode *inode); -+int coord_is_unprepped_ctail(const coord_t * coord); -+extern int ctail_read_disk_cluster (reiser4_cluster_t *, struct inode *, -+ znode_lock_mode mode); -+extern int do_readpage_ctail(struct inode *, reiser4_cluster_t *, -+ struct page * page, znode_lock_mode mode); -+extern int ctail_insert_unprepped_cluster(reiser4_cluster_t * clust, -+ struct inode * inode); -+extern int readpages_cryptcompress(struct file*, struct address_space*, -+ struct list_head*, unsigned); -+int bind_cryptcompress(struct inode *child, struct inode *parent); -+void destroy_inode_cryptcompress(struct inode * inode); -+int grab_cluster_pages(struct inode *inode, 
reiser4_cluster_t * clust); -+int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos, -+ reiser4_cluster_t * clust, int * progress); -+crypto_stat_t * inode_crypto_stat (struct inode * inode); -+void inherit_crypto_stat_common(struct inode * parent, struct inode * object, -+ int (*can_inherit)(struct inode * child, -+ struct inode * parent)); -+void reiser4_attach_crypto_stat(struct inode * inode, crypto_stat_t * info); -+void change_crypto_stat(struct inode * inode, crypto_stat_t * new); -+crypto_stat_t * reiser4_alloc_crypto_stat (struct inode * inode); -+ -+static inline struct crypto_blkcipher * info_get_cipher(crypto_stat_t * info) -+{ -+ return info->cipher; -+} -+ -+static inline void info_set_cipher(crypto_stat_t * info, -+ struct crypto_blkcipher * tfm) -+{ -+ info->cipher = tfm; -+} -+ -+static inline struct crypto_hash * info_get_digest(crypto_stat_t * info) -+{ -+ return info->digest; -+} -+ -+static inline void info_set_digest(crypto_stat_t * info, -+ struct crypto_hash * tfm) -+{ -+ info->digest = tfm; -+} -+ -+#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/file.c linux-2.6.20/fs/reiser4/plugin/file/file.c ---- linux-2.6.20.orig/fs/reiser4/plugin/file/file.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/file/file.c 2007-05-06 14:50:43.779000721 +0400 -@@ -0,0 +1,2821 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ * this file contains implementations of inode/file/address_space/file plugin -+ * operations specific for "unix file plugin" (plugin id is -+ * UNIX_FILE_PLUGIN_ID). 
"Unix file" is either built of tail items only -+ * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have -+ * no items but stat data) -+ */ -+ -+#include "../../inode.h" -+#include "../../super.h" -+#include "../../tree_walk.h" -+#include "../../carry.h" -+#include "../../page_cache.h" -+#include "../../ioctl.h" -+#include "../object.h" -+#include "../../safe_link.h" -+ -+#include -+#include -+#include -+ -+ -+static int unpack(struct file *file, struct inode *inode, int forever); -+static void drop_access(unix_file_info_t *); -+static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key, -+ znode_lock_mode lock_mode); -+ -+/* Get exclusive access and make sure that file is not partially -+ * converted (It may happen that another process is doing tail -+ * conversion. If so, wait until it completes) -+ */ -+static inline void get_exclusive_access_careful(unix_file_info_t * uf_info, -+ struct inode *inode) -+{ -+ do { -+ get_exclusive_access(uf_info); -+ if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)) -+ break; -+ drop_exclusive_access(uf_info); -+ schedule(); -+ } while (1); -+} -+ -+/* get unix file plugin specific portion of inode */ -+unix_file_info_t *unix_file_inode_data(const struct inode *inode) -+{ -+ return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info; -+} -+ -+/** -+ * equal_to_rdk - compare key and znode's right delimiting key -+ * @node: node whose right delimiting key to compare with @key -+ * @key: key to compare with @node's right delimiting key -+ * -+ * Returns true if @key is equal to right delimiting key of @node. 
-+ */ -+int equal_to_rdk(znode *node, const reiser4_key *key) -+{ -+ int result; -+ -+ read_lock_dk(znode_get_tree(node)); -+ result = keyeq(key, znode_get_rd_key(node)); -+ read_unlock_dk(znode_get_tree(node)); -+ return result; -+} -+ -+#if REISER4_DEBUG -+ -+/** -+ * equal_to_ldk - compare key and znode's left delimiting key -+ * @node: node whose left delimiting key to compare with @key -+ * @key: key to compare with @node's left delimiting key -+ * -+ * Returns true if @key is equal to left delimiting key of @node. -+ */ -+int equal_to_ldk(znode *node, const reiser4_key *key) -+{ -+ int result; -+ -+ read_lock_dk(znode_get_tree(node)); -+ result = keyeq(key, znode_get_ld_key(node)); -+ read_unlock_dk(znode_get_tree(node)); -+ return result; -+} -+ -+/** -+ * check_coord - check whether coord corresponds to key -+ * @coord: coord to check -+ * @key: key @coord has to correspond to -+ * -+ * Returns true if @coord is set as if it was set as result of lookup with @key -+ * in coord->node. -+ */ -+static int check_coord(const coord_t *coord, const reiser4_key *key) -+{ -+ coord_t twin; -+ -+ node_plugin_by_node(coord->node)->lookup(coord->node, key, -+ FIND_MAX_NOT_MORE_THAN, &twin); -+ return coords_equal(coord, &twin); -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/** -+ * init_uf_coord - initialize extended coord -+ * @uf_coord: -+ * @lh: -+ * -+ * -+ */ -+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh) -+{ -+ coord_init_zero(&uf_coord->coord); -+ coord_clear_iplug(&uf_coord->coord); -+ uf_coord->lh = lh; -+ init_lh(lh); -+ memset(&uf_coord->extension, 0, sizeof(uf_coord->extension)); -+ uf_coord->valid = 0; -+} -+ -+static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset) -+{ -+ assert("vs-1333", uf_coord->valid == 0); -+ -+ if (coord_is_between_items(&uf_coord->coord)) -+ return; -+ -+ assert("vs-1348", -+ item_plugin_by_coord(&uf_coord->coord)->s.file. 
-+ init_coord_extension); -+ -+ item_body_by_coord(&uf_coord->coord); -+ item_plugin_by_coord(&uf_coord->coord)->s.file. -+ init_coord_extension(uf_coord, offset); -+} -+ -+/** -+ * goto_right_neighbor - lock right neighbor, drop current node lock -+ * @coord: -+ * @lh: -+ * -+ * Obtain lock on right neighbor and drop lock on current node. -+ */ -+int goto_right_neighbor(coord_t *coord, lock_handle *lh) -+{ -+ int result; -+ lock_handle lh_right; -+ -+ assert("vs-1100", znode_is_locked(coord->node)); -+ -+ init_lh(&lh_right); -+ result = reiser4_get_right_neighbor(&lh_right, coord->node, -+ znode_is_wlocked(coord->node) ? -+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result) { -+ done_lh(&lh_right); -+ return result; -+ } -+ -+ /* -+ * we hold two longterm locks on neighboring nodes. Unlock left of -+ * them -+ */ -+ done_lh(lh); -+ -+ coord_init_first_unit_nocheck(coord, lh_right.node); -+ move_lh(lh, &lh_right); -+ -+ return 0; -+ -+} -+ -+/** -+ * set_file_state -+ * @uf_info: -+ * @cbk_result: -+ * @level: -+ * -+ * This is to be used by find_file_item and in find_file_state to -+ * determine real state of file -+ */ -+static void set_file_state(unix_file_info_t *uf_info, int cbk_result, -+ tree_level level) -+{ -+ if (cbk_errored(cbk_result)) -+ /* error happened in find_file_item */ -+ return; -+ -+ assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL); -+ -+ if (uf_info->container == UF_CONTAINER_UNKNOWN) { -+ /* -+ * container is unknown, therefore conversion can not be in -+ * progress -+ */ -+ assert("", -+ !reiser4_inode_get_flag(unix_file_info_to_inode(uf_info), -+ REISER4_PART_IN_CONV)); -+ if (cbk_result == CBK_COORD_NOTFOUND) -+ uf_info->container = UF_CONTAINER_EMPTY; -+ else if (level == LEAF_LEVEL) -+ uf_info->container = UF_CONTAINER_TAILS; -+ else -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ } else { -+ /* -+ * file state is known, check whether it is set correctly if -+ * file is not being tail converted 
-+ */ -+ if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info), -+ REISER4_PART_IN_CONV)) { -+ assert("vs-1162", -+ ergo(level == LEAF_LEVEL && -+ cbk_result == CBK_COORD_FOUND, -+ uf_info->container == UF_CONTAINER_TAILS)); -+ assert("vs-1165", -+ ergo(level == TWIG_LEVEL && -+ cbk_result == CBK_COORD_FOUND, -+ uf_info->container == UF_CONTAINER_EXTENTS)); -+ } -+ } -+} -+ -+int find_file_item_nohint(coord_t *coord, lock_handle *lh, -+ const reiser4_key *key, znode_lock_mode lock_mode, -+ struct inode *inode) -+{ -+ return reiser4_object_lookup(inode, key, coord, lh, lock_mode, -+ FIND_MAX_NOT_MORE_THAN, -+ TWIG_LEVEL, LEAF_LEVEL, -+ (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE : -+ (CBK_UNIQUE | CBK_FOR_INSERT), -+ NULL /* ra_info */ ); -+} -+ -+/** -+ * find_file_item - look for file item in the tree -+ * @hint: provides coordinate, lock handle, seal -+ * @key: key for search -+ * @mode: mode of lock to put on returned node -+ * @ra_info: -+ * @inode: -+ * -+ * This finds position in the tree corresponding to @key. It first tries to use -+ * @hint's seal if it is set. -+ */ -+int find_file_item(hint_t *hint, const reiser4_key *key, -+ znode_lock_mode lock_mode, -+ struct inode *inode) -+{ -+ int result; -+ coord_t *coord; -+ lock_handle *lh; -+ -+ assert("nikita-3030", reiser4_schedulable()); -+ assert("vs-1707", hint != NULL); -+ assert("vs-47", inode != NULL); -+ -+ coord = &hint->ext_coord.coord; -+ lh = hint->ext_coord.lh; -+ init_lh(lh); -+ -+ result = hint_validate(hint, key, 1 /* check key */, lock_mode); -+ if (!result) { -+ if (coord->between == AFTER_UNIT && -+ equal_to_rdk(coord->node, key)) { -+ result = goto_right_neighbor(coord, lh); -+ if (result == -E_NO_NEIGHBOR) -+ return RETERR(-EIO); -+ if (result) -+ return result; -+ assert("vs-1152", equal_to_ldk(coord->node, key)); -+ /* -+ * we moved to different node. 
Invalidate coord -+ * extension, zload is necessary to init it again -+ */ -+ hint->ext_coord.valid = 0; -+ } -+ -+ set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND, -+ znode_get_level(coord->node)); -+ -+ return CBK_COORD_FOUND; -+ } -+ -+ coord_init_zero(coord); -+ result = find_file_item_nohint(coord, lh, key, lock_mode, inode); -+ set_file_state(unix_file_inode_data(inode), result, -+ znode_get_level(coord->node)); -+ -+ /* FIXME: we might already have coord extension initialized */ -+ hint->ext_coord.valid = 0; -+ return result; -+} -+ -+/* plugin->u.file.write_flowom = NULL -+ plugin->u.file.read_flow = NULL */ -+ -+void hint_init_zero(hint_t * hint) -+{ -+ memset(hint, 0, sizeof(*hint)); -+ init_lh(&hint->lh); -+ hint->ext_coord.lh = &hint->lh; -+} -+ -+static int find_file_state(struct inode *inode, unix_file_info_t *uf_info) -+{ -+ int result; -+ reiser4_key key; -+ coord_t coord; -+ lock_handle lh; -+ -+ assert("vs-1628", ea_obtained(uf_info)); -+ -+ if (uf_info->container == UF_CONTAINER_UNKNOWN) { -+ key_by_inode_and_offset_common(inode, 0, &key); -+ init_lh(&lh); -+ result = find_file_item_nohint(&coord, &lh, &key, -+ ZNODE_READ_LOCK, inode); -+ set_file_state(uf_info, result, znode_get_level(coord.node)); -+ done_lh(&lh); -+ if (!cbk_errored(result)) -+ result = 0; -+ } else -+ result = 0; -+ assert("vs-1074", -+ ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN)); -+ reiser4_txn_restart_current(); -+ return result; -+} -+ -+/* estimate and reserve space needed to truncate page which gets partially truncated: one block for page itself, stat -+ data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item) which may happen -+ if page corresponds to hole extent and unallocated one will have to be created */ -+static int reserve_partial_page(reiser4_tree * tree) -+{ -+ grab_space_enable(); -+ return reiser4_grab_reserved(reiser4_get_current_sb(), -+ 1 + -+ 2 * estimate_one_insert_into_item(tree), 
-+ BA_CAN_COMMIT); -+} -+ -+/* estimate and reserve space needed to cut one item and update one stat data */ -+static int reserve_cut_iteration(reiser4_tree * tree) -+{ -+ __u64 estimate = estimate_one_item_removal(tree) -+ + estimate_one_insert_into_item(tree); -+ -+ assert("nikita-3172", lock_stack_isclean(get_current_lock_stack())); -+ -+ grab_space_enable(); -+ /* We need to double our estimate now that we can delete more than one -+ node. */ -+ return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2, -+ BA_CAN_COMMIT); -+} -+ -+int reiser4_update_file_size(struct inode *inode, reiser4_key * key, -+ int update_sd) -+{ -+ int result = 0; -+ -+ INODE_SET_FIELD(inode, i_size, get_key_offset(key)); -+ if (update_sd) { -+ inode->i_ctime = inode->i_mtime = CURRENT_TIME; -+ result = reiser4_update_sd(inode); -+ } -+ return result; -+} -+ -+/* cut file items one by one starting from the last one until new file size (inode->i_size) is reached. Reserve space -+ and update file stat data on every single cut from the tree */ -+int -+cut_file_items(struct inode *inode, loff_t new_size, int update_sd, -+ loff_t cur_size, int (*update_actor) (struct inode *, -+ reiser4_key *, int)) -+{ -+ reiser4_key from_key, to_key; -+ reiser4_key smallest_removed; -+ file_plugin *fplug = inode_file_plugin(inode); -+ int result; -+ int progress = 0; -+ -+ assert("vs-1248", -+ fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) || -+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ -+ fplug->key_by_inode(inode, new_size, &from_key); -+ to_key = from_key; -+ set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ ); -+ /* this loop normally runs just once */ -+ while (1) { -+ result = reserve_cut_iteration(reiser4_tree_by_inode(inode)); -+ if (result) -+ break; -+ -+ result = reiser4_cut_tree_object(current_tree, &from_key, &to_key, -+ &smallest_removed, inode, 1, -+ &progress); -+ if (result == -E_REPEAT) { -+ /* -E_REPEAT is a signal to interrupt a 
long file truncation process */ -+ if (progress) { -+ result = -+ update_actor(inode, &smallest_removed, -+ update_sd); -+ if (result) -+ break; -+ } -+ -+ /* the below does up(sbinfo->delete_mutex). Do not get folled */ -+ reiser4_release_reserved(inode->i_sb); -+ -+ /* reiser4_cut_tree_object() was interrupted probably because -+ * current atom requires commit, we have to release -+ * transaction handle to allow atom commit. */ -+ reiser4_txn_restart_current(); -+ continue; -+ } -+ if (result -+ && !(result == CBK_COORD_NOTFOUND && new_size == 0 -+ && inode->i_size == 0)) -+ break; -+ -+ set_key_offset(&smallest_removed, new_size); -+ /* Final sd update after the file gets its correct size */ -+ result = update_actor(inode, &smallest_removed, update_sd); -+ break; -+ } -+ -+ /* the below does up(sbinfo->delete_mutex). Do not get folled */ -+ reiser4_release_reserved(inode->i_sb); -+ -+ return result; -+} -+ -+int find_or_create_extent(struct page *page); -+ -+/* part of truncate_file_body: it is called when truncate is used to make file -+ shorter */ -+static int shorten_file(struct inode *inode, loff_t new_size) -+{ -+ int result; -+ struct page *page; -+ int padd_from; -+ unsigned long index; -+ char *kaddr; -+ unix_file_info_t *uf_info; -+ -+ /* -+ * all items of ordinary reiser4 file are grouped together. That is why -+ * we can use reiser4_cut_tree. 
Plan B files (for instance) can not be -+ * truncated that simply -+ */ -+ result = cut_file_items(inode, new_size, 1 /*update_sd */ , -+ get_key_offset(reiser4_max_key()), -+ reiser4_update_file_size); -+ if (result) -+ return result; -+ -+ uf_info = unix_file_inode_data(inode); -+ assert("vs-1105", new_size == inode->i_size); -+ if (new_size == 0) { -+ uf_info->container = UF_CONTAINER_EMPTY; -+ return 0; -+ } -+ -+ result = find_file_state(inode, uf_info); -+ if (result) -+ return result; -+ if (uf_info->container == UF_CONTAINER_TAILS) -+ /* -+ * No need to worry about zeroing last page after new file -+ * end -+ */ -+ return 0; -+ -+ padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1); -+ if (!padd_from) -+ /* file is truncated to page boundary */ -+ return 0; -+ -+ result = reserve_partial_page(reiser4_tree_by_inode(inode)); -+ if (result) { -+ reiser4_release_reserved(inode->i_sb); -+ return result; -+ } -+ -+ /* last page is partially truncated - zero its content */ -+ index = (inode->i_size >> PAGE_CACHE_SHIFT); -+ page = read_mapping_page(inode->i_mapping, index, NULL); -+ if (IS_ERR(page)) { -+ /* -+ * the below does up(sbinfo->delete_mutex). Do not get -+ * confused -+ */ -+ reiser4_release_reserved(inode->i_sb); -+ if (likely(PTR_ERR(page) == -EINVAL)) { -+ /* looks like file is built of tail items */ -+ return 0; -+ } -+ return PTR_ERR(page); -+ } -+ wait_on_page_locked(page); -+ if (!PageUptodate(page)) { -+ page_cache_release(page); -+ /* -+ * the below does up(sbinfo->delete_mutex). Do not get -+ * confused -+ */ -+ reiser4_release_reserved(inode->i_sb); -+ return RETERR(-EIO); -+ } -+ -+ /* -+ * if page correspons to hole extent unit - unallocated one will be -+ * created here. This is not necessary -+ */ -+ result = find_or_create_extent(page); -+ -+ /* -+ * FIXME: cut_file_items has already updated inode. 
Probably it would -+ * be better to update it here when file is really truncated -+ */ -+ if (result) { -+ page_cache_release(page); -+ /* -+ * the below does up(sbinfo->delete_mutex). Do not get -+ * confused -+ */ -+ reiser4_release_reserved(inode->i_sb); -+ return result; -+ } -+ -+ lock_page(page); -+ assert("vs-1066", PageLocked(page)); -+ kaddr = kmap_atomic(page, KM_USER0); -+ memset(kaddr + padd_from, 0, PAGE_CACHE_SIZE - padd_from); -+ flush_dcache_page(page); -+ kunmap_atomic(kaddr, KM_USER0); -+ unlock_page(page); -+ page_cache_release(page); -+ /* the below does up(sbinfo->delete_mutex). Do not get confused */ -+ reiser4_release_reserved(inode->i_sb); -+ return 0; -+} -+ -+/** -+ * should_have_notail -+ * @uf_info: -+ * @new_size: -+ * -+ * Calls formatting plugin to see whether file of size @new_size has to be -+ * stored in unformatted nodes or in tail items. 0 is returned for later case. -+ */ -+static int should_have_notail(const unix_file_info_t *uf_info, loff_t new_size) -+{ -+ if (!uf_info->tplug) -+ return 1; -+ return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info), -+ new_size); -+ -+} -+ -+/** -+ * truncate_file_body - change length of file -+ * @inode: inode of file -+ * @new_size: new file length -+ * -+ * Adjusts items file @inode is built of to match @new_size. It may either cut -+ * items or add them to represent a hole at the end of file. The caller has to -+ * obtain exclusive access to the file. 
-+ */ -+static int truncate_file_body(struct inode *inode, loff_t new_size) -+{ -+ int result; -+ -+ if (inode->i_size < new_size) { -+ /* expanding truncate */ -+ struct dentry dentry; -+ struct file file; -+ unix_file_info_t *uf_info; -+ -+ dentry.d_inode = inode; -+ file.f_dentry = &dentry; -+ file.private_data = NULL; -+ file.f_pos = new_size; -+ file.private_data = NULL; -+ uf_info = unix_file_inode_data(inode); -+ result = find_file_state(inode, uf_info); -+ if (result) -+ return result; -+ -+ if (should_have_notail(uf_info, new_size)) { -+ /* -+ * file of size @new_size has to be built of -+ * extents. If it is built of tails - convert to -+ * extents -+ */ -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ /* -+ * if file is being convered by another process -+ * - wait until it completes -+ */ -+ while (1) { -+ if (reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)) { -+ drop_exclusive_access(uf_info); -+ schedule(); -+ get_exclusive_access(uf_info); -+ continue; -+ } -+ break; -+ } -+ -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ result = tail2extent(uf_info); -+ if (result) -+ return result; -+ } -+ } -+ result = reiser4_write_extent(&file, NULL, 0, -+ &new_size); -+ if (result) -+ return result; -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ } else { -+ if (uf_info->container == UF_CONTAINER_EXTENTS) { -+ result = reiser4_write_extent(&file, NULL, 0, -+ &new_size); -+ if (result) -+ return result; -+ } else { -+ result = reiser4_write_tail(&file, NULL, 0, -+ &new_size); -+ if (result) -+ return result; -+ uf_info->container = UF_CONTAINER_TAILS; -+ } -+ } -+ BUG_ON(result > 0); -+ INODE_SET_FIELD(inode, i_size, new_size); -+ file_update_time(&file); -+ result = reiser4_update_sd(inode); -+ BUG_ON(result != 0); -+ reiser4_free_file_fsdata(&file); -+ } else -+ result = shorten_file(inode, new_size); -+ return result; -+} -+ -+/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */ -+ -+/** -+ * load_file_hint - copy hint from struct 
file to local variable -+ * @file: file to get hint from -+ * @hint: structure to fill -+ * -+ * Reiser4 specific portion of struct file may contain information (hint) -+ * stored on exiting from previous read or write. That information includes -+ * seal of znode and coord within that znode where previous read or write -+ * stopped. This function copies that information to @hint if it was stored or -+ * initializes @hint by 0s otherwise. -+ */ -+int load_file_hint(struct file *file, hint_t *hint) -+{ -+ reiser4_file_fsdata *fsdata; -+ -+ if (file) { -+ fsdata = reiser4_get_file_fsdata(file); -+ if (IS_ERR(fsdata)) -+ return PTR_ERR(fsdata); -+ -+ spin_lock_inode(file->f_dentry->d_inode); -+ if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) { -+ *hint = fsdata->reg.hint; -+ init_lh(&hint->lh); -+ hint->ext_coord.lh = &hint->lh; -+ spin_unlock_inode(file->f_dentry->d_inode); -+ /* -+ * force re-validation of the coord on the first -+ * iteration of the read/write loop. -+ */ -+ hint->ext_coord.valid = 0; -+ assert("nikita-19892", coords_equal(&hint->seal.coord1, -+ &hint->ext_coord. -+ coord)); -+ return 0; -+ } -+ memset(&fsdata->reg.hint, 0, sizeof(hint_t)); -+ spin_unlock_inode(file->f_dentry->d_inode); -+ } -+ hint_init_zero(hint); -+ return 0; -+} -+ -+/** -+ * save_file_hint - copy hint to reiser4 private struct file's part -+ * @file: file to save hint in -+ * @hint: hint to save -+ * -+ * This copies @hint to reiser4 private part of struct file. It can help -+ * speedup future accesses to the file. 
-+ */ -+void save_file_hint(struct file *file, const hint_t *hint) -+{ -+ reiser4_file_fsdata *fsdata; -+ -+ assert("edward-1337", hint != NULL); -+ -+ if (!file || !reiser4_seal_is_set(&hint->seal)) -+ return; -+ fsdata = reiser4_get_file_fsdata(file); -+ assert("vs-965", !IS_ERR(fsdata)); -+ assert("nikita-19891", -+ coords_equal(&hint->seal.coord1, &hint->ext_coord.coord)); -+ assert("vs-30", hint->lh.owner == NULL); -+ spin_lock_inode(file->f_dentry->d_inode); -+ fsdata->reg.hint = *hint; -+ spin_unlock_inode(file->f_dentry->d_inode); -+ return; -+} -+ -+void reiser4_unset_hint(hint_t * hint) -+{ -+ assert("vs-1315", hint); -+ hint->ext_coord.valid = 0; -+ reiser4_seal_done(&hint->seal); -+ done_lh(&hint->lh); -+} -+ -+/* coord must be set properly. So, that reiser4_set_hint -+ has nothing to do */ -+void reiser4_set_hint(hint_t * hint, const reiser4_key * key, -+ znode_lock_mode mode) -+{ -+ ON_DEBUG(coord_t * coord = &hint->ext_coord.coord); -+ assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key))); -+ -+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key); -+ hint->offset = get_key_offset(key); -+ hint->mode = mode; -+ done_lh(&hint->lh); -+} -+ -+int hint_is_set(const hint_t * hint) -+{ -+ return reiser4_seal_is_set(&hint->seal); -+} -+ -+#if REISER4_DEBUG -+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2) -+{ -+ return (get_key_locality(k1) == get_key_locality(k2) && -+ get_key_type(k1) == get_key_type(k2) && -+ get_key_band(k1) == get_key_band(k2) && -+ get_key_ordering(k1) == get_key_ordering(k2) && -+ get_key_objectid(k1) == get_key_objectid(k2)); -+} -+#endif -+ -+static int -+hint_validate(hint_t * hint, const reiser4_key * key, int check_key, -+ znode_lock_mode lock_mode) -+{ -+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode) -+ /* hint either not set or set by different operation */ -+ return RETERR(-E_REPEAT); -+ -+ assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key)); -+ -+ 
if (check_key && get_key_offset(key) != hint->offset) -+ /* hint is set for different key */ -+ return RETERR(-E_REPEAT); -+ -+ assert("vs-31", hint->ext_coord.lh == &hint->lh); -+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key, -+ hint->ext_coord.lh, lock_mode, -+ ZNODE_LOCK_LOPRI); -+} -+ -+/** -+ * find_or_create_extent - -+ * @page: -+ * -+ * -+ */ -+/* look for place at twig level for extent corresponding to page, call extent's writepage method to create -+ unallocated extent if it does not exist yet, initialize jnode, capture page */ -+int find_or_create_extent(struct page *page) -+{ -+ int result; -+ struct inode *inode; -+ int plugged_hole; -+ -+ jnode *node; -+ -+ assert("vs-1065", page->mapping && page->mapping->host); -+ inode = page->mapping->host; -+ -+ lock_page(page); -+ node = jnode_of_page(page); -+ if (IS_ERR(node)) { -+ unlock_page(page); -+ return PTR_ERR(node); -+ } -+ JF_SET(node, JNODE_WRITE_PREPARED); -+ unlock_page(page); -+ -+ if (node->blocknr == 0) { -+ plugged_hole = 0; -+ result = reiser4_update_extent(inode, node, page_offset(page), -+ &plugged_hole); -+ if (result) { -+ JF_CLR(node, JNODE_WRITE_PREPARED); -+ jput(node); -+ warning("", "reiser4_update_extent failed: %d", result); -+ return result; -+ } -+ if (plugged_hole) -+ reiser4_update_sd(inode); -+ } else { -+ spin_lock_jnode(node); -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ BUG_ON(result != 0); -+ jnode_make_dirty_locked(node); -+ spin_unlock_jnode(node); -+ } -+ -+ BUG_ON(node->atom == NULL); -+ JF_CLR(node, JNODE_WRITE_PREPARED); -+ jput(node); -+ -+ if (get_current_context()->entd) { -+ entd_context *ent = get_entd_context(node->tree->super); -+ -+ if (ent->cur_request->page == page) -+ ent->cur_request->node = node; -+ } -+ return 0; -+} -+ -+/** -+ * has_anonymous_pages - check whether inode has pages dirtied via mmap -+ * @inode: inode to check -+ * -+ * Returns true if inode's mapping has dirty pages which do not belong to any 
-+ * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page -+ * tree or were eflushed and can be found via jnodes tagged -+ * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes. -+ */ -+static int has_anonymous_pages(struct inode *inode) -+{ -+ int result; -+ -+ read_lock_irq(&inode->i_mapping->tree_lock); -+ result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED); -+ read_unlock_irq(&inode->i_mapping->tree_lock); -+ return result; -+} -+ -+/** -+ * capture_page_and_create_extent - -+ * @page: page to be captured -+ * -+ * Grabs space for extent creation and stat data update and calls function to -+ * do actual work. -+ */ -+static int capture_page_and_create_extent(struct page *page) -+{ -+ int result; -+ struct inode *inode; -+ -+ assert("vs-1084", page->mapping && page->mapping->host); -+ inode = page->mapping->host; -+ assert("vs-1139", -+ unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS); -+ /* page belongs to file */ -+ assert("vs-1393", -+ inode->i_size > page_offset(page)); -+ -+ /* page capture may require extent creation (if it does not exist yet) -+ and stat data's update (number of blocks changes on extent -+ creation) */ -+ grab_space_enable(); -+ result = reiser4_grab_space(2 * estimate_one_insert_into_item -+ (reiser4_tree_by_inode(inode)), -+ BA_CAN_COMMIT); -+ if (likely(!result)) -+ result = find_or_create_extent(page); -+ -+ if (result != 0) -+ SetPageError(page); -+ return result; -+} -+ -+/* this is implementation of method commit_write of struct -+ address_space_operations for unix file plugin */ -+int -+commit_write_unix_file(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ reiser4_context *ctx; -+ struct inode *inode; -+ int result; -+ -+ assert("umka-3101", file != NULL); -+ assert("umka-3102", page != NULL); -+ assert("umka-3093", PageLocked(page)); -+ -+ SetPageUptodate(page); -+ -+ inode = page->mapping->host; -+ ctx = 
reiser4_init_context(page->mapping->host->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ page_cache_get(page); -+ unlock_page(page); -+ result = capture_page_and_create_extent(page); -+ lock_page(page); -+ page_cache_release(page); -+ -+ /* don't commit transaction under inode semaphore */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* -+ * Support for "anonymous" pages and jnodes. -+ * -+ * When file is write-accessed through mmap pages can be dirtied from the user -+ * level. In this case kernel is not notified until one of following happens: -+ * -+ * (1) msync() -+ * -+ * (2) truncate() (either explicit or through unlink) -+ * -+ * (3) VM scanner starts reclaiming mapped pages, dirtying them before -+ * starting write-back. -+ * -+ * As a result of (3) ->writepage may be called on a dirty page without -+ * jnode. Such page is called "anonymous" in reiser4. Certain work-loads -+ * (iozone) generate huge number of anonymous pages. Emergency flush handles -+ * this situation by creating jnode for anonymous page, starting IO on the -+ * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of -+ * memory. Such jnode is also called anonymous. -+ * -+ * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into -+ * tree. This is done by capture_anonymous_*() functions below. -+ */ -+ -+/** -+ * capture_anonymous_page - involve page into transaction -+ * @pg: page to deal with -+ * -+ * Takes care that @page has corresponding metadata in the tree, creates jnode -+ * for @page and captures it. On success 1 is returned. -+ */ -+static int capture_anonymous_page(struct page *page) -+{ -+ int result; -+ -+ if (PageWriteback(page)) -+ /* FIXME: do nothing? 
*/ -+ return 0; -+ -+ result = capture_page_and_create_extent(page); -+ if (result == 0) { -+ result = 1; -+ } else -+ warning("nikita-3329", -+ "Cannot capture anon page: %i", result); -+ -+ return result; -+} -+ -+/** -+ * capture_anonymous_pages - find and capture pages dirtied via mmap -+ * @mapping: address space where to look for pages -+ * @index: start index -+ * @to_capture: maximum number of pages to capture -+ * -+ * Looks for pages tagged REISER4_MOVED starting from the *@index-th page, -+ * captures (involves into atom) them, returns number of captured pages, -+ * updates @index to next page after the last captured one. -+ */ -+static int -+capture_anonymous_pages(struct address_space *mapping, pgoff_t *index, -+ unsigned int to_capture) -+{ -+ int result; -+ struct pagevec pvec; -+ unsigned int i, count; -+ int nr; -+ -+ pagevec_init(&pvec, 0); -+ count = min(pagevec_space(&pvec), to_capture); -+ nr = 0; -+ -+ /* find pages tagged MOVED */ -+ write_lock_irq(&mapping->tree_lock); -+ pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree, -+ (void **)pvec.pages, *index, count, -+ PAGECACHE_TAG_REISER4_MOVED); -+ if (pagevec_count(&pvec) == 0) { -+ /* -+ * there are no pages tagged MOVED in mapping->page_tree -+ * starting from *index -+ */ -+ write_unlock_irq(&mapping->tree_lock); -+ *index = (pgoff_t)-1; -+ return 0; -+ } -+ -+ /* clear MOVED tag for all found pages */ -+ for (i = 0; i < pagevec_count(&pvec); i++) { -+ void *p; -+ -+ page_cache_get(pvec.pages[i]); -+ p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ assert("vs-49", p == pvec.pages[i]); -+ } -+ write_unlock_irq(&mapping->tree_lock); -+ -+ -+ *index = pvec.pages[i - 1]->index + 1; -+ -+ for (i = 0; i < pagevec_count(&pvec); i++) { -+ /* -+ * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by -+ * reiser4_set_page_dirty_internal which is called when jnode is -+ * captured -+ */ -+ result = capture_anonymous_page(pvec.pages[i]); -+ if 
(result == 1) -+ nr++; -+ else { -+ if (result < 0) { -+ warning("vs-1454", -+ "failed to capture page: " -+ "result=%d, captured=%d)\n", -+ result, i); -+ -+ /* -+ * set MOVED tag to all pages which left not -+ * captured -+ */ -+ write_lock_irq(&mapping->tree_lock); -+ for (; i < pagevec_count(&pvec); i ++) { -+ radix_tree_tag_set(&mapping->page_tree, -+ pvec.pages[i]->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ } -+ write_unlock_irq(&mapping->tree_lock); -+ -+ pagevec_release(&pvec); -+ return result; -+ } else { -+ /* -+ * result == 0. capture_anonymous_page returns -+ * 0 for Writeback-ed page. Set MOVED tag on -+ * that page -+ */ -+ write_lock_irq(&mapping->tree_lock); -+ radix_tree_tag_set(&mapping->page_tree, -+ pvec.pages[i]->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ write_unlock_irq(&mapping->tree_lock); -+ if (i == 0) -+ *index = pvec.pages[0]->index; -+ else -+ *index = pvec.pages[i - 1]->index + 1; -+ } -+ } -+ } -+ pagevec_release(&pvec); -+ return nr; -+} -+ -+/** -+ * capture_anonymous_jnodes - find and capture anonymous jnodes -+ * @mapping: address space where to look for jnodes -+ * @from: start index -+ * @to: end index -+ * @to_capture: maximum number of jnodes to capture -+ * -+ * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in -+ * the range of indexes @from-@to and captures them, returns number of captured -+ * jnodes, updates @from to next jnode after the last captured one. -+ */ -+static int -+capture_anonymous_jnodes(struct address_space *mapping, -+ pgoff_t *from, pgoff_t to, int to_capture) -+{ -+ *from = to; -+ return 0; -+} -+ -+/* -+ * Commit atom of the jnode of a page. 
-+ */ -+static int sync_page(struct page *page) -+{ -+ int result; -+ do { -+ jnode *node; -+ txn_atom *atom; -+ -+ lock_page(page); -+ node = jprivate(page); -+ if (node != NULL) { -+ spin_lock_jnode(node); -+ atom = jnode_get_atom(node); -+ spin_unlock_jnode(node); -+ } else -+ atom = NULL; -+ unlock_page(page); -+ result = reiser4_sync_atom(atom); -+ } while (result == -E_REPEAT); -+ /* -+ * ZAM-FIXME-HANS: document the logic of this loop, is it just to -+ * handle the case where more pages get added to the atom while we are -+ * syncing it? -+ */ -+ assert("nikita-3485", ergo(result == 0, -+ get_current_context()->trans->atom == NULL)); -+ return result; -+} -+ -+/* -+ * Commit atoms of pages on @pages list. -+ * call sync_page for each page from mapping's page tree -+ */ -+static int sync_page_list(struct inode *inode) -+{ -+ int result; -+ struct address_space *mapping; -+ unsigned long from; /* start index for radix_tree_gang_lookup */ -+ unsigned int found; /* return value for radix_tree_gang_lookup */ -+ -+ mapping = inode->i_mapping; -+ from = 0; -+ result = 0; -+ read_lock_irq(&mapping->tree_lock); -+ while (result == 0) { -+ struct page *page; -+ -+ found = -+ radix_tree_gang_lookup(&mapping->page_tree, (void **)&page, -+ from, 1); -+ assert("", found < 2); -+ if (found == 0) -+ break; -+ -+ /* page may not leave radix tree because it is protected from truncating by inode->i_mutex locked by -+ sys_fsync */ -+ page_cache_get(page); -+ read_unlock_irq(&mapping->tree_lock); -+ -+ from = page->index + 1; -+ -+ result = sync_page(page); -+ -+ page_cache_release(page); -+ read_lock_irq(&mapping->tree_lock); -+ } -+ -+ read_unlock_irq(&mapping->tree_lock); -+ return result; -+} -+ -+static int commit_file_atoms(struct inode *inode) -+{ -+ int result; -+ unix_file_info_t *uf_info; -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ get_exclusive_access(uf_info); -+ /* -+ * find what items file is made from -+ */ -+ result = find_file_state(inode, uf_info); -+ 
drop_exclusive_access(uf_info); -+ if (result != 0) -+ return result; -+ -+ /* -+ * file state cannot change because we are under ->i_mutex -+ */ -+ switch (uf_info->container) { -+ case UF_CONTAINER_EXTENTS: -+ /* find_file_state might open join an atom */ -+ reiser4_txn_restart_current(); -+ result = -+ /* -+ * when we are called by -+ * filemap_fdatawrite-> -+ * do_writepages()-> -+ * reiser4_writepages() -+ * -+ * inode->i_mapping->dirty_pages are spices into -+ * ->io_pages, leaving ->dirty_pages dirty. -+ * -+ * When we are called from -+ * reiser4_fsync()->sync_unix_file(), we have to -+ * commit atoms of all pages on the ->dirty_list. -+ * -+ * So for simplicity we just commit ->io_pages and -+ * ->dirty_pages. -+ */ -+ sync_page_list(inode); -+ break; -+ case UF_CONTAINER_TAILS: -+ /* -+ * NOTE-NIKITA probably we can be smarter for tails. For now -+ * just commit all existing atoms. -+ */ -+ result = txnmgr_force_commit_all(inode->i_sb, 0); -+ break; -+ case UF_CONTAINER_EMPTY: -+ result = 0; -+ break; -+ case UF_CONTAINER_UNKNOWN: -+ default: -+ result = -EIO; -+ break; -+ } -+ -+ /* -+ * commit current transaction: there can be captured nodes from -+ * find_file_state() and finish_conversion(). -+ */ -+ reiser4_txn_restart_current(); -+ return result; -+} -+ -+/** -+ * writepages_unix_file - writepages of struct address_space_operations -+ * @mapping: -+ * @wbc: -+ * -+ * This captures anonymous pages and anonymous jnodes. Anonymous pages are -+ * pages which are dirtied via mmapping. Anonymous jnodes are ones which were -+ * created by reiser4_writepage. 
-+ */ -+int writepages_unix_file(struct address_space *mapping, -+ struct writeback_control *wbc) -+{ -+ int result; -+ unix_file_info_t *uf_info; -+ pgoff_t pindex, jindex, nr_pages; -+ long to_capture; -+ struct inode *inode; -+ -+ inode = mapping->host; -+ if (!has_anonymous_pages(inode)) { -+ result = 0; -+ goto end; -+ } -+ jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT; -+ result = 0; -+ nr_pages = -+ (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -+ uf_info = unix_file_inode_data(inode); -+ -+ do { -+ reiser4_context *ctx; -+ -+ if (wbc->sync_mode != WB_SYNC_ALL) -+ to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST); -+ else -+ to_capture = CAPTURE_APAGE_BURST; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ result = PTR_ERR(ctx); -+ break; -+ } -+ /* avoid recursive calls to ->sync_inodes */ -+ ctx->nobalance = 1; -+ assert("zam-760", lock_stack_isclean(get_current_lock_stack())); -+ assert("", LOCK_CNT_NIL(inode_sem_w)); -+ assert("", LOCK_CNT_NIL(inode_sem_r)); -+ -+ reiser4_txn_restart_current(); -+ -+ /* we have to get nonexclusive access to the file */ -+ if (get_current_context()->entd) { -+ /* -+ * use nonblocking version of nonexclusive_access to -+ * avoid deadlock which might look like the following: -+ * process P1 holds NEA on file F1 and called entd to -+ * reclaim some memory. Entd works for P1 and is going -+ * to capture pages of file F2. To do that entd has to -+ * get NEA to F2. F2 is held by process P2 which also -+ * called entd. But entd is serving P1 at the moment -+ * and P2 has to wait. Process P3 trying to get EA to -+ * file F2. Existence of pending EA request to file F2 -+ * makes impossible for entd to get NEA to file -+ * F2. Neither of these process can continue. Using -+ * nonblocking version of gettign NEA is supposed to -+ * avoid this deadlock. 
-+ */ -+ if (try_to_get_nonexclusive_access(uf_info) == 0) { -+ result = RETERR(-EBUSY); -+ reiser4_exit_context(ctx); -+ break; -+ } -+ } else -+ get_nonexclusive_access(uf_info); -+ -+ while (to_capture > 0) { -+ pgoff_t start; -+ -+ assert("vs-1727", jindex <= pindex); -+ if (pindex == jindex) { -+ start = pindex; -+ result = -+ capture_anonymous_pages(inode->i_mapping, -+ &pindex, -+ to_capture); -+ if (result <= 0) -+ break; -+ to_capture -= result; -+ wbc->nr_to_write -= result; -+ if (start + result == pindex) { -+ jindex = pindex; -+ continue; -+ } -+ if (to_capture <= 0) -+ break; -+ } -+ /* deal with anonymous jnodes between jindex and pindex */ -+ result = -+ capture_anonymous_jnodes(inode->i_mapping, &jindex, -+ pindex, to_capture); -+ if (result < 0) -+ break; -+ to_capture -= result; -+ get_current_context()->nr_captured += result; -+ -+ if (jindex == (pgoff_t) - 1) { -+ assert("vs-1728", pindex == (pgoff_t) - 1); -+ break; -+ } -+ } -+ if (to_capture <= 0) -+ /* there may be left more pages */ -+ __mark_inode_dirty(inode, I_DIRTY_PAGES); -+ -+ drop_nonexclusive_access(uf_info); -+ if (result < 0) { -+ /* error happened */ -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ if (wbc->sync_mode != WB_SYNC_ALL) { -+ reiser4_exit_context(ctx); -+ return 0; -+ } -+ result = commit_file_atoms(inode); -+ reiser4_exit_context(ctx); -+ if (pindex >= nr_pages && jindex == pindex) -+ break; -+ } while (1); -+ -+ end: -+ if (is_in_reiser4_context()) { -+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) { -+ /* -+ * there are already pages to flush, flush them out, do -+ * not delay until end of reiser4_sync_inodes -+ */ -+ reiser4_writeout(inode->i_sb, wbc); -+ get_current_context()->nr_captured = 0; -+ } -+ } -+ return result; -+} -+ -+/* -+ * ->sync() method for unix file. -+ * -+ * We are trying to be smart here. 
Instead of committing all atoms (original -+ * solution), we scan dirty pages of this file and commit all atoms they are -+ * part of. -+ * -+ * Situation is complicated by anonymous pages: i.e., extent-less pages -+ * dirtied through mmap. Fortunately sys_fsync() first calls -+ * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert -+ * all missing extents and capture anonymous pages. -+ */ -+int sync_unix_file(struct file *file, struct dentry *dentry, int datasync) -+{ -+ reiser4_context *ctx; -+ txn_atom *atom; -+ reiser4_block_nr reserve; -+ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ reserve = estimate_update_common(dentry->d_inode); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOSPC); -+ } -+ write_sd_by_inode_common(dentry->d_inode); -+ -+ atom = get_current_atom_locked(); -+ spin_lock_txnh(ctx->trans); -+ force_commit_atom(ctx->trans); -+ reiser4_exit_context(ctx); -+ return 0; -+} -+ -+/** -+ * readpage_unix_file_nolock - readpage of struct address_space_operations -+ * @file: -+ * @page: -+ * -+ * Compose a key and search for item containing information about @page -+ * data. If item is found - its readpage method is called. 
-+ */ -+int readpage_unix_file(struct file *file, struct page *page) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *inode; -+ reiser4_key key; -+ item_plugin *iplug; -+ hint_t *hint; -+ lock_handle *lh; -+ coord_t *coord; -+ -+ assert("vs-1062", PageLocked(page)); -+ assert("vs-976", !PageUptodate(page)); -+ assert("vs-1061", page->mapping && page->mapping->host); -+ -+ if (page->mapping->host->i_size <= page_offset(page)) { -+ /* page is out of file already */ -+ unlock_page(page); -+ return -EINVAL; -+ } -+ -+ inode = page->mapping->host; -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ unlock_page(page); -+ return PTR_ERR(ctx); -+ } -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) { -+ unlock_page(page); -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOMEM); -+ } -+ -+ result = load_file_hint(file, hint); -+ if (result) { -+ kfree(hint); -+ unlock_page(page); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ lh = &hint->lh; -+ -+ /* get key of first byte of the page */ -+ key_by_inode_and_offset_common(inode, page_offset(page), &key); -+ -+ /* look for file metadata corresponding to first byte of page */ -+ page_cache_get(page); -+ unlock_page(page); -+ result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode); -+ lock_page(page); -+ page_cache_release(page); -+ -+ if (page->mapping == NULL) { -+ /* -+ * readpage allows truncate to run concurrently. 
Page was -+ * truncated while it was not locked -+ */ -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return -EINVAL; -+ } -+ -+ if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) { -+ if (result == CBK_COORD_FOUND && -+ hint->ext_coord.coord.between != AT_UNIT) -+ /* file is truncated */ -+ result = -EINVAL; -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ /* -+ * item corresponding to page is found. It can not be removed because -+ * znode lock is held -+ */ -+ if (PageUptodate(page)) { -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return 0; -+ } -+ -+ coord = &hint->ext_coord.coord; -+ result = zload(coord->node); -+ if (result) { -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ validate_extended_coord(&hint->ext_coord, page_offset(page)); -+ -+ if (!coord_is_existing_unit(coord)) { -+ /* this indicates corruption */ -+ warning("vs-280", -+ "Looking for page %lu of file %llu (size %lli). " -+ "No file items found (%d). 
File is corrupted?\n", -+ page->index, (unsigned long long)get_inode_oid(inode), -+ inode->i_size, result); -+ zrelse(coord->node); -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-EIO); -+ } -+ -+ /* -+ * get plugin of found item or use plugin if extent if there are no -+ * one -+ */ -+ iplug = item_plugin_by_coord(coord); -+ if (iplug->s.file.readpage) -+ result = iplug->s.file.readpage(coord, page); -+ else -+ result = RETERR(-EINVAL); -+ -+ if (!result) { -+ set_key_offset(&key, -+ (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT); -+ /* FIXME should call reiser4_set_hint() */ -+ reiser4_unset_hint(hint); -+ } else { -+ unlock_page(page); -+ reiser4_unset_hint(hint); -+ } -+ assert("vs-979", -+ ergo(result == 0, (PageLocked(page) || PageUptodate(page)))); -+ assert("vs-9791", ergo(result != 0, !PageLocked(page))); -+ -+ zrelse(coord->node); -+ done_lh(lh); -+ -+ save_file_hint(file, hint); -+ kfree(hint); -+ -+ /* -+ * FIXME: explain why it is needed. HINT: page allocation in write can -+ * not be done when atom is not NULL because reiser4_writepage can not -+ * kick entd and have to eflush -+ */ -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+struct uf_readpages_context { -+ lock_handle lh; -+ coord_t coord; -+}; -+ -+/* A callback function for readpages_unix_file/read_cache_pages. -+ * If the file is build of tails, then return error (-ENOENT). -+ * -+ * @data -- a pointer to reiser4_readpages_context object, -+ * to save the twig lock and the coord between -+ * read_cache_page iterations. -+ * @page -- page to start read. 
-+ */ -+static int uf_readpages_filler(void * data, struct page * page) -+{ -+ struct uf_readpages_context *rc = data; -+ jnode * node; -+ int ret = 0; -+ reiser4_extent *ext; -+ __u64 ext_index; -+ int cbk_done = 0; -+ struct address_space * mapping = page->mapping; -+ -+ if (PageUptodate(page)) { -+ unlock_page(page); -+ return 0; -+ } -+ if (rc->lh.node == 0) { -+ /* no twig lock - have to do tree search. */ -+ reiser4_key key; -+ repeat: -+ unlock_page(page); -+ key_by_inode_and_offset_common( -+ mapping->host, page_offset(page), &key); -+ ret = coord_by_key( -+ &get_super_private(mapping->host->i_sb)->tree, -+ &key, &rc->coord, &rc->lh, -+ ZNODE_READ_LOCK, FIND_EXACT, -+ TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL); -+ if (ret) -+ return ret; -+ lock_page(page); -+ cbk_done = 1; -+ } -+ ret = zload(rc->coord.node); -+ if (ret) { -+ unlock_page(page); -+ return ret; -+ } -+ if (!coord_is_existing_item(&rc->coord) || -+ !item_is_extent(&rc->coord)) { -+ zrelse(rc->coord.node); -+ unlock_page(page); -+ return RETERR(-EIO); -+ } -+ ext = extent_by_coord(&rc->coord); -+ ext_index = extent_unit_index(&rc->coord); -+ if (page->index < ext_index || -+ page->index >= ext_index + extent_get_width(ext)) { -+ /* the page index doesn't belong to the extent unit -+ which the coord points to - release the lock and -+ repeat with tree search. */ -+ zrelse(rc->coord.node); -+ done_lh(&rc->lh); -+ /* we can be here after a CBK call only in case of -+ corruption of the tree or the tree lookup algorithm bug. 
*/ -+ if (unlikely(cbk_done)) { -+ unlock_page(page); -+ return RETERR(-EIO); -+ } -+ goto repeat; -+ } -+ node = jnode_of_page(page); -+ if (unlikely(IS_ERR(node))) { -+ zrelse(rc->coord.node); -+ unlock_page(page); -+ return PTR_ERR(node); -+ } -+ ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page); -+ jput(node); -+ zrelse(rc->coord.node); -+ if (ret) -+ unlock_page(page); -+ return ret; -+} -+ -+/** -+ * readpages_unix_file - called by the readahead code, starts reading for each -+ * page of given list of pages -+ */ -+int readpages_unix_file( -+ struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ reiser4_context *ctx; -+ struct uf_readpages_context rc; -+ int ret; -+ -+ ctx = reiser4_init_context(mapping->host->i_sb); -+ if (IS_ERR(ctx)) { -+ put_pages_list(pages); -+ return PTR_ERR(ctx); -+ } -+ init_lh(&rc.lh); -+ ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc); -+ done_lh(&rc.lh); -+ context_set_commit_async(ctx); -+ /* close the transaction to protect further page allocation from deadlocks */ -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return ret; -+} -+ -+static reiser4_block_nr unix_file_estimate_read(struct inode *inode, -+ loff_t count UNUSED_ARG) -+{ -+ /* We should reserve one block, because of updating of the stat data -+ item */ -+ assert("vs-1249", -+ inode_file_plugin(inode)->estimate.update == -+ estimate_update_common); -+ return estimate_update_common(inode); -+} -+ -+/* this is called with nonexclusive access obtained, file's container can not change */ -+static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from to */ -+ char __user *buf, /* address of user-space buffer */ -+ size_t count, /* number of bytes to read */ -+ loff_t *off) -+{ -+ int result; -+ struct inode *inode; -+ flow_t flow; -+ int (*read_f) (struct file *, flow_t *, hint_t *); -+ coord_t *coord; -+ znode *loaded; -+ -+ inode = file->f_dentry->d_inode; 
-+ -+ /* build flow */ -+ assert("vs-1250", -+ inode_file_plugin(inode)->flow_by_inode == -+ flow_by_inode_unix_file); -+ result = -+ flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count, -+ *off, READ_OP, &flow); -+ if (unlikely(result)) -+ return result; -+ -+ /* get seal and coord sealed with it from reiser4 private data -+ of struct file. The coord will tell us where our last read -+ of this file finished, and the seal will help to determine -+ if that location is still valid. -+ */ -+ coord = &hint->ext_coord.coord; -+ while (flow.length && result == 0) { -+ result = -+ find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode); -+ if (cbk_errored(result)) -+ /* error happened */ -+ break; -+ -+ if (coord->between != AT_UNIT) { -+ /* there were no items corresponding to given offset */ -+ done_lh(hint->ext_coord.lh); -+ break; -+ } -+ -+ loaded = coord->node; -+ result = zload(loaded); -+ if (unlikely(result)) { -+ done_lh(hint->ext_coord.lh); -+ break; -+ } -+ -+ if (hint->ext_coord.valid == 0) -+ validate_extended_coord(&hint->ext_coord, -+ get_key_offset(&flow.key)); -+ -+ assert("vs-4", hint->ext_coord.valid == 1); -+ assert("vs-33", hint->ext_coord.lh == &hint->lh); -+ /* call item's read method */ -+ read_f = item_plugin_by_coord(coord)->s.file.read; -+ result = read_f(file, &flow, hint); -+ zrelse(loaded); -+ done_lh(hint->ext_coord.lh); -+ } -+ -+ return (count - flow.length) ? (count - flow.length) : result; -+} -+ -+static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*); -+ -+/** -+ * read_unix_file - read of struct file_operations -+ * @file: file to read from -+ * @buf: address of user-space buffer -+ * @read_amount: number of bytes to read -+ * @off: position in file to read from -+ * -+ * This is implementation of vfs's read method of struct file_operations for -+ * unix file plugin. 
-+ */ -+ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount, -+ loff_t *off) -+{ -+ reiser4_context *ctx; -+ ssize_t result; -+ struct inode *inode; -+ unix_file_info_t *uf_info; -+ -+ if (unlikely(read_amount == 0)) -+ return 0; -+ -+ assert("umka-072", file != NULL); -+ assert("umka-074", off != NULL); -+ inode = file->f_dentry->d_inode; -+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ uf_info = unix_file_inode_data(inode); -+ if (uf_info->container == UF_CONTAINER_UNKNOWN) { -+ get_exclusive_access(uf_info); -+ result = find_file_state(inode, uf_info); -+ if (unlikely(result != 0)) -+ goto out; -+ } else -+ get_nonexclusive_access(uf_info); -+ result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount), -+ BA_CAN_COMMIT); -+ if (unlikely(result != 0)) -+ goto out; -+ if (uf_info->container == UF_CONTAINER_EXTENTS){ -+ result = do_sync_read(file, buf, read_amount, off); -+ } else if (uf_info->container == UF_CONTAINER_TAILS || -+ reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) || -+ reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { -+ result = read_unix_file_container_tails(file, buf, read_amount, off); -+ } else { -+ assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY); -+ result = 0; -+ } -+out: -+ drop_access(uf_info); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+static ssize_t read_unix_file_container_tails( -+ struct file *file, char __user *buf, size_t read_amount, loff_t *off) -+{ -+ int result; -+ struct inode *inode; -+ hint_t *hint; -+ unix_file_info_t *uf_info; -+ size_t count, read, left; -+ loff_t size; -+ -+ assert("umka-072", file != NULL); -+ assert("umka-074", off != NULL); -+ inode = file->f_dentry->d_inode; -+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ hint = kmalloc(sizeof(*hint), 
reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ -+ result = load_file_hint(file, hint); -+ if (result) { -+ kfree(hint); -+ return result; -+ } -+ -+ left = read_amount; -+ count = 0; -+ uf_info = unix_file_inode_data(inode); -+ while (left > 0) { -+ reiser4_txn_restart_current(); -+ size = i_size_read(inode); -+ if (*off >= size) -+ /* position to read from is past the end of file */ -+ break; -+ if (*off + left > size) -+ left = size - *off; -+ /* faultin user page */ -+ result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left); -+ if (result) -+ return RETERR(-EFAULT); -+ -+ read = read_file(hint, file, buf, -+ left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left, -+ off); -+ if (read < 0) { -+ result = read; -+ break; -+ } -+ left -= read; -+ buf += read; -+ -+ /* update position in a file */ -+ *off += read; -+ /* total number of read bytes */ -+ count += read; -+ } -+ done_lh(&hint->lh); -+ save_file_hint(file, hint); -+ kfree(hint); -+ if (count) -+ file_accessed(file); -+ /* return number of read bytes or error code if nothing is read */ -+ return count ? count : result; -+} -+ -+/* This function takes care about @file's pages. First of all it checks if -+ filesystems readonly and if so gets out. Otherwise, it throws out all -+ pages of file if it was mapped for read and going to be mapped for write -+ and consists of tails. This is done in order to not manage few copies -+ of the data (first in page cache and second one in tails them selves) -+ for the case of mapping files consisting tails. -+ -+ Here also tail2extent conversion is performed if it is allowed and file -+ is going to be written or mapped for write. This functions may be called -+ from write_unix_file() or mmap_unix_file(). 
*/ -+static int check_pages_unix_file(struct file *file, struct inode *inode) -+{ -+ reiser4_invalidate_pages(inode->i_mapping, 0, -+ (inode->i_size + PAGE_CACHE_SIZE - -+ 1) >> PAGE_CACHE_SHIFT, 0); -+ return unpack(file, inode, 0 /* not forever */ ); -+} -+ -+/** -+ * mmap_unix_file - mmap of struct file_operations -+ * @file: file to mmap -+ * @vma: -+ * -+ * This is implementation of vfs's mmap method of struct file_operations for -+ * unix file plugin. It converts file to extent if necessary. Sets -+ * reiser4_inode's flag - REISER4_HAS_MMAP. -+ */ -+int mmap_unix_file(struct file *file, struct vm_area_struct *vma) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *inode; -+ unix_file_info_t *uf_info; -+ reiser4_block_nr needed; -+ -+ inode = file->f_dentry->d_inode; -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ get_exclusive_access_careful(uf_info, inode); -+ -+ if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) { -+ /* -+ * we need file built of extent items. If it is still built of -+ * tail items we have to convert it. Find what items the file -+ * is built of -+ */ -+ result = find_file_state(inode, uf_info); -+ if (result != 0) { -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS || -+ uf_info->container == UF_CONTAINER_EXTENTS || -+ uf_info->container == UF_CONTAINER_EMPTY)); -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ /* -+ * invalidate all pages and convert file from tails to -+ * extents -+ */ -+ result = check_pages_unix_file(file, inode); -+ if (result) { -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ } -+ } -+ -+ /* -+ * generic_file_mmap will do update_atime. Grab space for stat data -+ * update. 
-+ */ -+ needed = inode_file_plugin(inode)->estimate.update(inode); -+ result = reiser4_grab_space_force(needed, BA_CAN_COMMIT); -+ if (result) { -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = generic_file_mmap(file, vma); -+ if (result == 0) { -+ /* mark file as having mapping. */ -+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP); -+ } -+ -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * find_first_item -+ * @inode: -+ * -+ * Finds file item which is responsible for first byte in the file. -+ */ -+static int find_first_item(struct inode *inode) -+{ -+ coord_t coord; -+ lock_handle lh; -+ reiser4_key key; -+ int result; -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ inode_file_plugin(inode)->key_by_inode(inode, 0, &key); -+ result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, -+ inode); -+ if (result == CBK_COORD_FOUND) { -+ if (coord.between == AT_UNIT) { -+ result = zload(coord.node); -+ if (result == 0) { -+ result = item_id_by_coord(&coord); -+ zrelse(coord.node); -+ if (result != EXTENT_POINTER_ID && -+ result != FORMATTING_ID) -+ result = RETERR(-EIO); -+ } -+ } else -+ result = RETERR(-EIO); -+ } -+ done_lh(&lh); -+ return result; -+} -+ -+/** -+ * open_unix_file -+ * @inode: -+ * @file: -+ * -+ * If filesystem is not readonly - complete uncompleted tail conversion if -+ * there was one -+ */ -+int open_unix_file(struct inode *inode, struct file *file) -+{ -+ int result; -+ reiser4_context *ctx; -+ unix_file_info_t *uf_info; -+ -+ if (IS_RDONLY(inode)) -+ return 0; -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) -+ return 0; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ get_exclusive_access_careful(uf_info, inode); -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { -+ /* -+ * other process completed the conversion 
-+ */ -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return 0; -+ } -+ -+ /* -+ * file left in semi converted state after unclean shutdown or another -+ * thread is doing conversion and dropped exclusive access which doing -+ * balance dirty pages. Complete the conversion -+ */ -+ result = find_first_item(inode); -+ if (result == EXTENT_POINTER_ID) -+ /* -+ * first item is extent, therefore there was incomplete -+ * tail2extent conversion. Complete it -+ */ -+ result = tail2extent(unix_file_inode_data(inode)); -+ else if (result == FORMATTING_ID) -+ /* -+ * first item is formatting item, therefore there was -+ * incomplete extent2tail conversion. Complete it -+ */ -+ result = extent2tail(unix_file_inode_data(inode)); -+ else -+ result = -EIO; -+ -+ assert("vs-1712", -+ ergo(result == 0, -+ (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) && -+ !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)))); -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+#define NEITHER_OBTAINED 0 -+#define EA_OBTAINED 1 -+#define NEA_OBTAINED 2 -+ -+static void drop_access(unix_file_info_t *uf_info) -+{ -+ if (uf_info->exclusive_use) -+ drop_exclusive_access(uf_info); -+ else -+ drop_nonexclusive_access(uf_info); -+} -+ -+#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \ -+ __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) -+ -+/** -+ * write_unix_file - write of struct file_operations -+ * @file: file to write to -+ * @buf: address of user-space buffer -+ * @write_amount: number of bytes to write -+ * @off: position in file to write to -+ * -+ * This is implementation of vfs's write method of struct file_operations for -+ * unix file plugin. 
-+ */ -+ssize_t write_unix_file(struct file *file, const char __user *buf, -+ size_t count, loff_t *pos) -+{ -+ int result; -+ reiser4_context *ctx; -+ struct inode *inode; -+ unix_file_info_t *uf_info; -+ ssize_t written; -+ int try_free_space; -+ int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY; -+ size_t left; -+ ssize_t (*write_op)(struct file *, const char __user *, size_t, -+ loff_t *pos); -+ int ea; -+ loff_t new_size; -+ -+ inode = file->f_dentry->d_inode; -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ mutex_lock(&inode->i_mutex); -+ -+ assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))); -+ -+ /* check amount of bytes to write and writing position */ -+ result = generic_write_checks(file, pos, &count, 0); -+ if (result) { -+ mutex_unlock(&inode->i_mutex); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = remove_suid(file->f_dentry); -+ if (result) { -+ mutex_unlock(&inode->i_mutex); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ current->backing_dev_info = inode->i_mapping->backing_dev_info; -+ written = 0; -+ try_free_space = 0; -+ left = count; -+ ea = NEITHER_OBTAINED; -+ -+ new_size = i_size_read(inode); -+ if (*pos + count > new_size) -+ new_size = *pos + count; -+ -+ while (left) { -+ if (left < to_write) -+ to_write = left; -+ -+ if (uf_info->container == UF_CONTAINER_EMPTY) { -+ get_exclusive_access(uf_info); -+ ea = EA_OBTAINED; -+ if (uf_info->container != UF_CONTAINER_EMPTY) { -+ /* file is made not empty by another process */ -+ drop_exclusive_access(uf_info); -+ ea = NEITHER_OBTAINED; -+ continue; -+ } -+ } else if (uf_info->container == UF_CONTAINER_UNKNOWN) { -+ /* -+ * get exclusive access directly just to not have to -+ * re-obtain it if file will appear empty -+ 
*/ -+ get_exclusive_access(uf_info); -+ ea = EA_OBTAINED; -+ result = find_file_state(inode, uf_info); -+ if (result) { -+ drop_exclusive_access(uf_info); -+ ea = NEITHER_OBTAINED; -+ break; -+ } -+ } else { -+ get_nonexclusive_access(uf_info); -+ ea = NEA_OBTAINED; -+ } -+ -+ /* either EA or NEA is obtained. Choose item write method */ -+ if (uf_info->container == UF_CONTAINER_EXTENTS) { -+ /* file is built of extent items */ -+ write_op = reiser4_write_extent; -+ } else if (uf_info->container == UF_CONTAINER_EMPTY) { -+ /* file is empty */ -+ if (should_have_notail(uf_info, new_size)) -+ write_op = reiser4_write_extent; -+ else -+ write_op = reiser4_write_tail; -+ } else { -+ /* file is built of tail items */ -+ if (should_have_notail(uf_info, new_size)) { -+ if (ea == NEA_OBTAINED) { -+ drop_nonexclusive_access(uf_info); -+ get_exclusive_access(uf_info); -+ ea = EA_OBTAINED; -+ } -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ /* -+ * if file is being convered by another -+ * process - wait until it completes -+ */ -+ while (1) { -+ if (reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)) { -+ drop_exclusive_access(uf_info); -+ schedule(); -+ get_exclusive_access(uf_info); -+ continue; -+ } -+ break; -+ } -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ result = tail2extent(uf_info); -+ if (result) -+ break; -+ } -+ } -+ drop_exclusive_access(uf_info); -+ ea = NEITHER_OBTAINED; -+ continue; -+ } -+ write_op = reiser4_write_tail; -+ } -+ -+ written = write_op(file, buf, to_write, pos); -+ if (written == -ENOSPC && try_free_space) { -+ drop_access(uf_info); -+ txnmgr_force_commit_all(inode->i_sb, 0); -+ try_free_space = 0; -+ continue; -+ } -+ if (written < 0) { -+ drop_access(uf_info); -+ result = written; -+ break; -+ } -+ /* something is written. */ -+ if (uf_info->container == UF_CONTAINER_EMPTY) { -+ assert("", ea == EA_OBTAINED); -+ uf_info->container = -+ (write_op == reiser4_write_extent) ? 
-+ UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS; -+ } else { -+ assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS, -+ write_op == reiser4_write_extent)); -+ assert("", ergo(uf_info->container == UF_CONTAINER_TAILS, -+ write_op == reiser4_write_tail)); -+ } -+ if (*pos + written > inode->i_size) -+ INODE_SET_FIELD(inode, i_size, *pos + written); -+ file_update_time(file); -+ result = reiser4_update_sd(inode); -+ if (result) { -+ mutex_unlock(&inode->i_mutex); -+ current->backing_dev_info = NULL; -+ drop_access(uf_info); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ drop_access(uf_info); -+ ea = NEITHER_OBTAINED; -+ reiser4_txn_restart(ctx); -+ current->journal_info = NULL; -+ /* -+ * tell VM how many pages were dirtied. Maybe number of pages -+ * which were dirty already should not be counted -+ */ -+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, -+ (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE); -+ current->journal_info = ctx; -+ -+ left -= written; -+ buf += written; -+ *pos += written; -+ } -+ -+ mutex_unlock(&inode->i_mutex); -+ -+ if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { -+ reiser4_txn_restart_current(); -+ grab_space_enable(); -+ result = sync_unix_file(file, file->f_dentry, -+ 0 /* data and stat data */ ); -+ if (result) -+ warning("reiser4-7", "failed to sync file %llu", -+ (unsigned long long)get_inode_oid(inode)); -+ } -+ -+ current->backing_dev_info = NULL; -+ -+ reiser4_exit_context(ctx); -+ -+ /* -+ * return number of written bytes or error code if nothing is -+ * written. Note, that it does not work correctly in case when -+ * sync_unix_file returns error -+ */ -+ return (count - left) ? (count - left) : result; -+} -+ -+/** -+ * release_unix_file - release of struct file_operations -+ * @inode: inode of released file -+ * @file: file to release -+ * -+ * Implementation of release method of struct file_operations for unix file -+ * plugin. 
If last reference to indode is released - convert all extent items -+ * into tail items if necessary. Frees reiser4 specific file data. -+ */ -+int release_unix_file(struct inode *inode, struct file *file) -+{ -+ reiser4_context *ctx; -+ unix_file_info_t *uf_info; -+ int result; -+ int in_reiser4; -+ -+ in_reiser4 = is_in_reiser4_context(); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ result = 0; -+ if (in_reiser4 == 0) { -+ uf_info = unix_file_inode_data(inode); -+ -+ get_exclusive_access_careful(uf_info, inode); -+ if (atomic_read(&file->f_dentry->d_count) == 1 && -+ uf_info->container == UF_CONTAINER_EXTENTS && -+ !should_have_notail(uf_info, inode->i_size) && -+ !rofs_inode(inode)) { -+ result = extent2tail(uf_info); -+ if (result != 0) { -+ warning("nikita-3233", -+ "Failed (%d) to convert in %s (%llu)", -+ result, __FUNCTION__, -+ (unsigned long long) -+ get_inode_oid(inode)); -+ } -+ } -+ drop_exclusive_access(uf_info); -+ } else { -+ /* -+ we are within reiser4 context already. How latter is -+ possible? 
Simple: -+ -+ (gdb) bt -+ #0 get_exclusive_access () -+ #2 0xc01e56d3 in release_unix_file () -+ #3 0xc01c3643 in reiser4_release () -+ #4 0xc014cae0 in __fput () -+ #5 0xc013ffc3 in remove_vm_struct () -+ #6 0xc0141786 in exit_mmap () -+ #7 0xc0118480 in mmput () -+ #8 0xc0133205 in oom_kill () -+ #9 0xc01332d1 in out_of_memory () -+ #10 0xc013bc1d in try_to_free_pages () -+ #11 0xc013427b in __alloc_pages () -+ #12 0xc013f058 in do_anonymous_page () -+ #13 0xc013f19d in do_no_page () -+ #14 0xc013f60e in handle_mm_fault () -+ #15 0xc01131e5 in do_page_fault () -+ #16 0xc0104935 in error_code () -+ #17 0xc025c0c6 in __copy_to_user_ll () -+ #18 0xc01d496f in reiser4_read_tail () -+ #19 0xc01e4def in read_unix_file () -+ #20 0xc01c3504 in reiser4_read () -+ #21 0xc014bd4f in vfs_read () -+ #22 0xc014bf66 in sys_read () -+ */ -+ warning("vs-44", "out of memory?"); -+ } -+ -+ reiser4_free_file_fsdata(file); -+ -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+static void set_file_notail(struct inode *inode) -+{ -+ reiser4_inode *state; -+ formatting_plugin *tplug; -+ -+ state = reiser4_inode_data(inode); -+ tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID); -+ force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug); -+} -+ -+/* if file is built of tails - convert it to extents */ -+static int unpack(struct file *filp, struct inode *inode, int forever) -+{ -+ int result = 0; -+ unix_file_info_t *uf_info; -+ -+ uf_info = unix_file_inode_data(inode); -+ assert("vs-1628", ea_obtained(uf_info)); -+ -+ result = find_file_state(inode, uf_info); -+ if (result) -+ return result; -+ assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN); -+ -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ /* -+ * if file is being convered by another process - wait until it -+ * completes -+ */ -+ while (1) { -+ if (reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)) { -+ drop_exclusive_access(uf_info); -+ schedule(); -+ get_exclusive_access(uf_info); 
-+ continue; -+ } -+ break; -+ } -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ result = tail2extent(uf_info); -+ if (result) -+ return result; -+ } -+ } -+ if (forever) { -+ /* safe new formatting plugin in stat data */ -+ __u64 tograb; -+ -+ set_file_notail(inode); -+ -+ grab_space_enable(); -+ tograb = inode_file_plugin(inode)->estimate.update(inode); -+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT); -+ result = reiser4_update_sd(inode); -+ } -+ -+ return result; -+} -+ -+/* implentation of vfs' ioctl method of struct file_operations for unix file -+ plugin -+*/ -+int -+ioctl_unix_file(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg UNUSED_ARG) -+{ -+ reiser4_context *ctx; -+ int result; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ switch (cmd) { -+ case REISER4_IOC_UNPACK: -+ get_exclusive_access(unix_file_inode_data(inode)); -+ result = unpack(filp, inode, 1 /* forever */ ); -+ drop_exclusive_access(unix_file_inode_data(inode)); -+ break; -+ -+ default: -+ result = RETERR(-ENOSYS); -+ break; -+ } -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* implentation of vfs' bmap method of struct address_space_operations for unix -+ file plugin -+*/ -+sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock) -+{ -+ reiser4_context *ctx; -+ sector_t result; -+ reiser4_key key; -+ coord_t coord; -+ lock_handle lh; -+ struct inode *inode; -+ item_plugin *iplug; -+ sector_t block; -+ -+ inode = mapping->host; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ key_by_inode_and_offset_common(inode, -+ (loff_t) lblock * current_blocksize, -+ &key); -+ -+ init_lh(&lh); -+ result = -+ find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode); -+ if (cbk_errored(result)) { -+ done_lh(&lh); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = zload(coord.node); -+ if (result) { -+ done_lh(&lh); -+ 
reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ iplug = item_plugin_by_coord(&coord); -+ if (iplug->s.file.get_block) { -+ result = iplug->s.file.get_block(&coord, lblock, &block); -+ if (result == 0) -+ result = block; -+ } else -+ result = RETERR(-EINVAL); -+ -+ zrelse(coord.node); -+ done_lh(&lh); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * flow_by_inode_unix_file - initizlize structure flow -+ * @inode: inode of file for which read or write is abou -+ * @buf: buffer to perform read to or write from -+ * @user: flag showing whether @buf is user space or kernel space -+ * @size: size of buffer @buf -+ * @off: start offset fro read or write -+ * @op: READ or WRITE -+ * @flow: -+ * -+ * Initializes fields of @flow: key, size of data, i/o mode (read or write). -+ */ -+int flow_by_inode_unix_file(struct inode *inode, -+ const char __user *buf, int user, -+ loff_t size, loff_t off, -+ rw_op op, flow_t *flow) -+{ -+ assert("nikita-1100", inode != NULL); -+ -+ flow->length = size; -+ memcpy(&flow->data, &buf, sizeof(buf)); -+ flow->user = user; -+ flow->op = op; -+ assert("nikita-1931", inode_file_plugin(inode) != NULL); -+ assert("nikita-1932", -+ inode_file_plugin(inode)->key_by_inode == -+ key_by_inode_and_offset_common); -+ /* calculate key of write position and insert it into flow->key */ -+ return key_by_inode_and_offset_common(inode, off, &flow->key); -+} -+ -+/* plugin->u.file.set_plug_in_sd = NULL -+ plugin->u.file.set_plug_in_inode = NULL -+ plugin->u.file.create_blank_sd = NULL */ -+/* plugin->u.file.delete */ -+/* -+ plugin->u.file.add_link = reiser4_add_link_common -+ plugin->u.file.rem_link = NULL */ -+ -+/* plugin->u.file.owns_item -+ this is common_file_owns_item with assertion */ -+/* Audited by: green(2002.06.15) */ -+int -+owns_item_unix_file(const struct inode *inode /* object to check against */ , -+ const coord_t * coord /* coord to check */ ) -+{ -+ int result; -+ -+ result = owns_item_common(inode, coord); -+ if 
(!result) -+ return 0; -+ if (!plugin_of_group(item_plugin_by_coord(coord), -+ UNIX_FILE_METADATA_ITEM_TYPE)) -+ return 0; -+ assert("vs-547", -+ item_id_by_coord(coord) == EXTENT_POINTER_ID || -+ item_id_by_coord(coord) == FORMATTING_ID); -+ return 1; -+} -+ -+static int setattr_truncate(struct inode *inode, struct iattr *attr) -+{ -+ int result; -+ int s_result; -+ loff_t old_size; -+ reiser4_tree *tree; -+ -+ inode_check_scale(inode, inode->i_size, attr->ia_size); -+ -+ old_size = inode->i_size; -+ tree = reiser4_tree_by_inode(inode); -+ -+ result = safe_link_grab(tree, BA_CAN_COMMIT); -+ if (result == 0) -+ result = safe_link_add(inode, SAFE_TRUNCATE); -+ if (result == 0) -+ result = truncate_file_body(inode, attr->ia_size); -+ if (result) -+ warning("vs-1588", "truncate_file failed: oid %lli, " -+ "old size %lld, new size %lld, retval %d", -+ (unsigned long long)get_inode_oid(inode), -+ old_size, attr->ia_size, result); -+ -+ s_result = safe_link_grab(tree, BA_CAN_COMMIT); -+ if (s_result == 0) -+ s_result = -+ safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE); -+ if (s_result != 0) { -+ warning("nikita-3417", "Cannot kill safelink %lli: %i", -+ (unsigned long long)get_inode_oid(inode), s_result); -+ } -+ safe_link_release(tree); -+ return result; -+} -+ -+/* plugin->u.file.setattr method */ -+/* This calls inode_setattr and if truncate is in effect it also takes -+ exclusive inode access to avoid races */ -+int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */ -+ struct iattr *attr /* change description */ ) -+{ -+ int result; -+ -+ if (attr->ia_valid & ATTR_SIZE) { -+ reiser4_context *ctx; -+ unix_file_info_t *uf_info; -+ -+ /* truncate does reservation itself and requires exclusive -+ access obtained */ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ uf_info = unix_file_inode_data(dentry->d_inode); -+ get_exclusive_access_careful(uf_info, dentry->d_inode); -+ result = 
setattr_truncate(dentry->d_inode, attr); -+ drop_exclusive_access(uf_info); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ } else -+ result = reiser4_setattr_common(dentry, attr); -+ -+ return result; -+} -+ -+/* plugin->u.file.init_inode_data */ -+void -+init_inode_data_unix_file(struct inode *inode, -+ reiser4_object_create_data * crd, int create) -+{ -+ unix_file_info_t *data; -+ -+ data = unix_file_inode_data(inode); -+ data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN; -+ init_rwsem(&data->latch); -+ data->tplug = inode_formatting_plugin(inode); -+ data->exclusive_use = 0; -+ -+#if REISER4_DEBUG -+ data->ea_owner = NULL; -+ atomic_set(&data->nr_neas, 0); -+#endif -+ init_inode_ordering(inode, crd, create); -+} -+ -+/** -+ * delete_object_unix_file - delete_object of file_plugin -+ * @inode: inode to be deleted -+ * -+ * Truncates file to length 0, removes stat data and safe link. -+ */ -+int delete_object_unix_file(struct inode *inode) -+{ -+ unix_file_info_t *uf_info; -+ int result; -+ -+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD)) -+ return 0; -+ -+ /* truncate file bogy first */ -+ uf_info = unix_file_inode_data(inode); -+ get_exclusive_access(uf_info); -+ result = truncate_file_body(inode, 0 /* size */ ); -+ drop_exclusive_access(uf_info); -+ -+ if (result) -+ warning("", "failed to truncate file (%llu) on removal: %d", -+ get_inode_oid(inode), result); -+ -+ /* remove stat data and safe link */ -+ return reiser4_delete_object_common(inode); -+} -+ -+/** -+ * sendfile_unix_file - sendfile of struct file_operations -+ * @file: file to be sent -+ * @ppos: position to start from -+ * @count: number of bytes to send -+ * @actor: function to copy data -+ * @target: where to copy read data -+ * -+ * Reads @count bytes from @file and calls @actor for every page read. This is -+ * needed for loop back devices support. 
-+ */ -+ssize_t -+sendfile_unix_file(struct file *file, loff_t *ppos, size_t count, -+ read_actor_t actor, void *target) -+{ -+ reiser4_context *ctx; -+ ssize_t result; -+ struct inode *inode; -+ unix_file_info_t *uf_info; -+ -+ inode = file->f_dentry->d_inode; -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ /* -+ * generic_file_sndfile may want to call update_atime. Grab space for -+ * stat data update -+ */ -+ result = reiser4_grab_space(estimate_update_common(inode), -+ BA_CAN_COMMIT); -+ if (result) -+ goto error; -+ mutex_lock(&inode->i_mutex); -+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP); -+ mutex_unlock(&inode->i_mutex); -+ -+ uf_info = unix_file_inode_data(inode); -+ get_nonexclusive_access(uf_info); -+ result = generic_file_sendfile(file, ppos, count, actor, target); -+ drop_nonexclusive_access(uf_info); -+ error: -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+int -+prepare_write_unix_file(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ reiser4_context *ctx; -+ unix_file_info_t *uf_info; -+ int ret; -+ -+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ uf_info = unix_file_inode_data(file->f_dentry->d_inode); -+ get_exclusive_access(uf_info); -+ ret = find_file_state(file->f_dentry->d_inode, uf_info); -+ if (ret == 0) { -+ if (uf_info->container == UF_CONTAINER_TAILS) -+ ret = -EINVAL; -+ else -+ ret = do_prepare_write(file, page, from, to); -+ } -+ drop_exclusive_access(uf_info); -+ -+ /* don't commit transaction under inode semaphore */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return ret; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/file_conversion.c 
linux-2.6.20/fs/reiser4/plugin/file/file_conversion.c ---- linux-2.6.20.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/file/file_conversion.c 2007-05-06 14:50:43.783001971 +0400 -@@ -0,0 +1,594 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, -+ licensing governed by reiser4/README */ -+ -+/* This file contains hooks that converts (*) cryptcompress files to unix-files, -+ and a set of protected (**) methods of a cryptcompress file plugin to perform -+ such conversion. -+ -+(*) -+ The conversion is performed for incompressible files to reduce cpu and memory -+ usage. If first logical cluster (64K by default) of a file is incompressible, -+ then we make a desicion, that the whole file is incompressible. -+ The conversion can be enabled via installing a special compression mode -+ plugin (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for -+ details). -+ -+(**) -+ The protection means serialization of critical sections (readers and writers -+ of @pset->file) -+*/ -+ -+#include "../../inode.h" -+#include "../cluster.h" -+#include "file.h" -+ -+#define conversion_enabled(inode) \ -+ (inode_compression_mode_plugin(inode) == \ -+ compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID)) -+ -+ -+/* Located sections (readers and writers of @pset->file) are not -+ permanently critical: cryptcompress file can be converted only -+ if the conversion is enabled (see the macrio above). And we don't -+ convert unix files at all. -+ The following helper macro is a sanity check to decide if we -+ need to protect a located section. -+*/ -+#define should_protect(inode) \ -+ (inode_file_plugin(inode) == \ -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \ -+ conversion_enabled(inode)) -+ -+/* All protected methods have prefix "prot" in their names. -+ It is convenient to construct them by usual (unprotected) ones -+ using the following common macros: -+*/ -+ -+/* Macro for passive protection. 
-+ method_cryptcompress contains only readers */ -+#define PROT_PASSIVE(type, method, args) \ -+({ \ -+ type _result; \ -+ struct rw_semaphore * guard = \ -+ &reiser4_inode_data(inode)->conv_sem; \ -+ \ -+ if (should_protect(inode)) { \ -+ down_read(guard); \ -+ if (!should_protect(inode)) \ -+ up_read(guard); \ -+ } \ -+ if (inode_file_plugin(inode) == \ -+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \ -+ _result = method ## _unix_file args; \ -+ else \ -+ _result = method ## _cryptcompress args; \ -+ if (should_protect(inode)) \ -+ up_read(guard); \ -+ _result; \ -+}) -+ -+#define PROT_PASSIVE_VOID(method, args) \ -+({ \ -+ struct rw_semaphore * guard = \ -+ &reiser4_inode_data(inode)->conv_sem; \ -+ \ -+ if (should_protect(inode)) { \ -+ down_read(guard); \ -+ if (!should_protect(inode)) \ -+ up_read(guard); \ -+ } \ -+ if (inode_file_plugin(inode) == \ -+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \ -+ method ## _unix_file args; \ -+ else \ -+ method ## _cryptcompress args; \ -+ if (should_protect(inode)) \ -+ up_read(guard); \ -+}) -+ -+/* Macro for active protection. 
-+ active_expr contains readers and writers; after its -+ evaluation conversion should be disabled */ -+#define PROT_ACTIVE(type, method, args, active_expr) \ -+({ \ -+ type _result = 0; \ -+ struct rw_semaphore * guard = \ -+ &reiser4_inode_data(inode)->conv_sem; \ -+ reiser4_context * ctx = reiser4_init_context(inode->i_sb); \ -+ if (IS_ERR(ctx)) \ -+ return PTR_ERR(ctx); \ -+ \ -+ if (should_protect(inode)) { \ -+ down_write(guard); \ -+ if (should_protect(inode)) \ -+ _result = active_expr; \ -+ up_write(guard); \ -+ } \ -+ if (_result == 0) { \ -+ if (inode_file_plugin(inode) == \ -+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \ -+ _result = method ## _unix_file args; \ -+ else \ -+ _result = method ## _cryptcompress args; \ -+ } \ -+ reiser4_exit_context(ctx); \ -+ _result; \ -+}) -+ -+/* Pass management to the unix-file plugin with "notail" policy */ -+static int __cryptcompress2unixfile(struct file *file, struct inode * inode) -+{ -+ int result; -+ reiser4_inode *info; -+ unix_file_info_t * uf; -+ info = reiser4_inode_data(inode); -+ -+ result = aset_set_unsafe(&info->pset, -+ PSET_FILE, -+ (reiser4_plugin *) -+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)); -+ if (result) -+ return result; -+ result = aset_set_unsafe(&info->pset, -+ PSET_FORMATTING, -+ (reiser4_plugin *) -+ formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID)); -+ if (result) -+ return result; -+ /* get rid of non-standard plugins */ -+ info->plugin_mask &= ~cryptcompress_mask; -+ /* get rid of plugin stat-data extension */ -+ info->extmask &= ~(1 << PLUGIN_STAT); -+ -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+ -+ /* FIXME use init_inode_data_unix_file() instead, -+ but aviod init_inode_ordering() */ -+ /* Init unix-file specific part of inode */ -+ uf = unix_file_inode_data(inode); -+ uf->container = UF_CONTAINER_UNKNOWN; -+ init_rwsem(&uf->latch); -+ uf->tplug = inode_formatting_plugin(inode); -+ uf->exclusive_use = 0; -+#if REISER4_DEBUG -+ uf->ea_owner = NULL; -+ 
atomic_set(&uf->nr_neas, 0); -+#endif -+ inode->i_op = -+ &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->inode_ops; -+ inode->i_fop = -+ &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->file_ops; -+ inode->i_mapping->a_ops = -+ &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->as_ops; -+ file->f_op = inode->i_fop; -+ return 0; -+} -+ -+#if REISER4_DEBUG -+static int disabled_conversion_inode_ok(struct inode * inode) -+{ -+ __u64 extmask = reiser4_inode_data(inode)->extmask; -+ __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask; -+ -+ return ((extmask & (1 << LIGHT_WEIGHT_STAT)) && -+ (extmask & (1 << UNIX_STAT)) && -+ (extmask & (1 << LARGE_TIMES_STAT)) && -+ (extmask & (1 << PLUGIN_STAT)) && -+ (plugin_mask & (1 << PSET_COMPRESSION_MODE))); -+} -+#endif -+ -+/* Assign another mode that will control -+ compression at flush time only */ -+static int disable_conversion_no_update_sd(struct inode * inode) -+{ -+ int result; -+ result = -+ force_plugin_pset(inode, -+ PSET_COMPRESSION_MODE, -+ (reiser4_plugin *)compression_mode_plugin_by_id -+ (LATTD_COMPRESSION_MODE_ID)); -+ assert("edward-1500", -+ ergo(!result, disabled_conversion_inode_ok(inode))); -+ return result; -+} -+ -+/* Disable future attempts to check/convert. This function is called by -+ conversion hooks. */ -+static int disable_conversion(struct inode * inode) -+{ -+ return disable_conversion_no_update_sd(inode); -+} -+ -+static int check_position(struct inode * inode, -+ loff_t pos /* initial position in the file */, -+ reiser4_cluster_t * clust, -+ int * check_compress) -+{ -+ assert("edward-1505", conversion_enabled(inode)); -+ assert("edward-1506", inode->i_size <= inode_cluster_size(inode)); -+ /* if file size is more then cluster size, then compressible -+ status must be figured out (i.e. 
compression was disabled, -+ or file plugin was converted to unix_file) */ -+ -+ if (pos > inode->i_size) -+ /* first logical cluster will contain a (partial) hole */ -+ return disable_conversion(inode); -+ if (inode->i_size == inode_cluster_size(inode)) -+ *check_compress = 1; -+ return 0; -+} -+ -+static void start_check_compressibility(struct inode * inode, -+ reiser4_cluster_t * clust, -+ hint_t * hint) -+{ -+ assert("edward-1507", clust->index == 1); -+ assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc)); -+ assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ); -+ -+ hint_init_zero(hint); -+ clust->hint = hint; -+ clust->index --; -+ clust->nr_pages = count_to_nrpages(fsize_to_count(clust, inode)); -+ -+ /* first logical cluster (of index #0) must be complete */ -+ assert("edward-1510", fsize_to_count(clust, inode) == -+ inode_cluster_size(inode)); -+} -+ -+static void finish_check_compressibility(struct inode * inode, -+ reiser4_cluster_t * clust, -+ hint_t * hint) -+{ -+ reiser4_unset_hint(clust->hint); -+ clust->hint = hint; -+ clust->index ++; -+} -+ -+#if REISER4_DEBUG -+static int prepped_dclust_ok(hint_t * hint) -+{ -+ reiser4_key key; -+ coord_t * coord = &hint->ext_coord.coord; -+ -+ item_key_by_coord(coord, &key); -+ return (item_id_by_coord(coord) == CTAIL_ID && -+ !coord_is_unprepped_ctail(coord) && -+ (get_key_offset(&key) + nr_units_ctail(coord) == -+ dclust_get_extension_dsize(hint))); -+} -+#endif -+ -+#define fifty_persent(size) (size >> 1) -+/* evaluation of data compressibility */ -+#define data_is_compressible(osize, isize) \ -+ (osize < fifty_persent(isize)) -+ -+/* This is called only once per file life. -+ Read first logical cluster (of index #0) and estimate its compressibility. 
-+ Save estimation result in @compressible */ -+static int read_check_compressibility(struct inode * inode, -+ reiser4_cluster_t * clust, -+ int * compressible) -+{ -+ int i; -+ int result; -+ __u32 dst_len; -+ hint_t tmp_hint; -+ hint_t * cur_hint = clust->hint; -+ -+ start_check_compressibility(inode, clust, &tmp_hint); -+ -+ result = grab_cluster_pages(inode, clust); -+ if (result) -+ return result; -+ /* Read page cluster here */ -+ for (i = 0; i < clust->nr_pages; i++) { -+ struct page *page = clust->pages[i]; -+ lock_page(page); -+ result = do_readpage_ctail(inode, clust, page, -+ ZNODE_READ_LOCK); -+ unlock_page(page); -+ if (result) -+ goto error; -+ } -+ tfm_cluster_clr_uptodate(&clust->tc); -+ -+ cluster_set_tfm_act(&clust->tc, TFMA_WRITE); -+ -+ if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) { -+ /* lenght of compressed data is known, no need to compress */ -+ assert("edward-1511", -+ znode_is_write_locked(tmp_hint.ext_coord.coord.node)); -+ assert("edward-1512", -+ WITH_DATA(tmp_hint.ext_coord.coord.node, -+ prepped_dclust_ok(&tmp_hint))); -+ dst_len = dclust_get_extension_dsize(&tmp_hint); -+ } -+ else { -+ tfm_cluster_t * tc = &clust->tc; -+ compression_plugin * cplug = inode_compression_plugin(inode); -+ result = grab_tfm_stream(inode, tc, INPUT_STREAM); -+ if (result) -+ goto error; -+ for (i = 0; i < clust->nr_pages; i++) { -+ char *data; -+ lock_page(clust->pages[i]); -+ BUG_ON(!PageUptodate(clust->pages[i])); -+ data = kmap(clust->pages[i]); -+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i), -+ data, PAGE_CACHE_SIZE); -+ kunmap(clust->pages[i]); -+ unlock_page(clust->pages[i]); -+ } -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) -+ goto error; -+ result = grab_coa(tc, cplug); -+ if (result) -+ goto error; -+ tc->len = tc->lsize = fsize_to_count(clust, inode); -+ assert("edward-1513", tc->len == inode_cluster_size(inode)); -+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM); -+ 
cplug->compress(get_coa(tc, cplug->h.id, tc->act), -+ tfm_input_data(clust), tc->len, -+ tfm_output_data(clust), &dst_len); -+ assert("edward-1514", -+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM)); -+ } -+ finish_check_compressibility(inode, clust, cur_hint); -+ *compressible = data_is_compressible(dst_len, -+ inode_cluster_size(inode)); -+ return 0; -+ error: -+ reiser4_release_cluster_pages(clust); -+ return result; -+} -+ -+/* Cut disk cluster of index @idx */ -+static int cut_disk_cluster(struct inode * inode, cloff_t idx) -+{ -+ reiser4_key from, to; -+ assert("edward-1515", inode_file_plugin(inode) == -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from); -+ to = from; -+ set_key_offset(&to, -+ get_key_offset(&from) + inode_cluster_size(inode) - 1); -+ return reiser4_cut_tree(reiser4_tree_by_inode(inode), -+ &from, &to, inode, 0); -+} -+ -+static int reserve_cryptcompress2unixfile(struct inode *inode) -+{ -+ reiser4_block_nr unformatted_nodes; -+ reiser4_tree *tree; -+ -+ tree = reiser4_tree_by_inode(inode); -+ -+ /* number of unformatted nodes which will be created */ -+ unformatted_nodes = cluster_nrpages(inode); /* N */ -+ -+ /* -+ * space required for one iteration of extent->tail conversion: -+ * -+ * 1. kill ctail items -+ * -+ * 2. insert N unformatted nodes -+ * -+ * 3. insert N (worst-case single-block -+ * extents) extent units. -+ * -+ * 4. drilling to the leaf level by coord_by_key() -+ * -+ * 5. 
possible update of stat-data -+ * -+ */ -+ grab_space_enable(); -+ return reiser4_grab_space -+ (2 * tree->height + -+ unformatted_nodes + -+ unformatted_nodes * estimate_one_insert_into_item(tree) + -+ 1 + estimate_one_insert_item(tree) + -+ inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+} -+ -+/* clear flag that indicated conversion and update -+ stat-data with new (unix-file - specific) info */ -+static int complete_file_conversion(struct inode *inode) -+{ -+ int result; -+ -+ grab_space_enable(); -+ result = -+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+ if (result == 0) { -+ reiser4_inode_clr_flag(inode, REISER4_FILE_CONV_IN_PROGRESS); -+ result = reiser4_update_sd(inode); -+ } -+ if (result) -+ warning("edward-1452", -+ "Converting %llu to unix-file: update sd failed (%i)", -+ (unsigned long long)get_inode_oid(inode), result); -+ return 0; -+} -+ -+ -+/* do conversion */ -+static int cryptcompress2unixfile(struct file *file, struct inode * inode, -+ reiser4_cluster_t * clust) -+{ -+ int i; -+ int result = 0; -+ cryptcompress_info_t *cr_info; -+ unix_file_info_t *uf_info; -+ -+ assert("edward-1516", clust->pages[0]->index == 0); -+ assert("edward-1517", clust->hint != NULL); -+ -+ /* release all cryptcompress-specific recources */ -+ cr_info = cryptcompress_inode_data(inode); -+ result = reserve_cryptcompress2unixfile(inode); -+ if (result) -+ goto out; -+ reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS); -+ reiser4_unset_hint(clust->hint); -+ result = cut_disk_cluster(inode, 0); -+ if (result) -+ goto out; -+ /* captured jnode of cluster and assotiated resources (pages, -+ reserved disk space) were released by ->kill_hook() method -+ of the item plugin */ -+ -+ result = __cryptcompress2unixfile(file, inode); -+ if (result) -+ goto out; -+ /* At this point file is managed by unix file plugin */ -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ assert("edward-1518", -+ 
ergo(jprivate(clust->pages[0]), -+ !jnode_is_cluster_page(jprivate(clust->pages[0])))); -+ for(i = 0; i < clust->nr_pages; i++) { -+ assert("edward-1519", clust->pages[i]); -+ assert("edward-1520", PageUptodate(clust->pages[i])); -+ -+ result = find_or_create_extent(clust->pages[i]); -+ if (result) -+ break; -+ } -+ if (!result) { -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ complete_file_conversion(inode); -+ } -+ out: -+ all_grabbed2free(); -+ if (result) -+ warning("edward-1453", "Failed to convert file %llu: %i", -+ (unsigned long long)get_inode_oid(inode), result); -+ return result; -+} -+ -+/* Check, then perform or disable conversion if needed */ -+int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos, -+ reiser4_cluster_t * clust, int * progress) -+{ -+ int result; -+ int check_compress = 0; -+ int compressible = 0; -+ -+ if (!conversion_enabled(inode)) -+ return 0; -+ result = check_position(inode, pos, clust, &check_compress); -+ if (result || !check_compress) -+ return result; -+ result = read_check_compressibility(inode, clust, &compressible); -+ if (result) -+ return result; -+ -+ /* At this point page cluster is grabbed and uptodate */ -+ if (!compressible) { -+ result = cryptcompress2unixfile(file, inode, clust); -+ if (result == 0) -+ *progress = 1; -+ } -+ else -+ result = disable_conversion(inode); -+ -+ reiser4_release_cluster_pages(clust); -+ return result; -+} -+ -+static int setattr_conversion_hook(struct inode * inode, struct iattr *attr) -+{ -+ return (attr->ia_valid & ATTR_SIZE ? disable_conversion(inode) : 0); -+} -+ -+/* Protected methods of cryptcompress file plugin constructed -+ by the macros above */ -+ -+/* Wrappers with active protection for: -+ . write_cryptcompress; -+ . 
setattr_cryptcompress; -+*/ -+ -+ssize_t prot_write_cryptcompress(struct file *file, const char __user *buf, -+ size_t count, loff_t *off) -+{ -+ int prot = 0; -+ int conv = 0; -+ ssize_t written_cr = 0; -+ ssize_t written_uf = 0; -+ struct inode * inode = file->f_dentry->d_inode; -+ struct rw_semaphore * guard = &reiser4_inode_data(inode)->conv_sem; -+ -+ if (should_protect(inode)) { -+ prot = 1; -+ down_write(guard); -+ } -+ written_cr = write_cryptcompress(file, buf, count, off, &conv); -+ if (prot) -+ up_write(guard); -+ if (written_cr < 0) -+ return written_cr; -+ if (conv) -+ written_uf = write_unix_file(file, buf + written_cr, -+ count - written_cr, off); -+ return written_cr + (written_uf < 0 ? 0 : written_uf); -+} -+ -+int prot_setattr_cryptcompress(struct dentry *dentry, struct iattr *attr) -+{ -+ struct inode * inode = dentry->d_inode; -+ return PROT_ACTIVE(int, setattr, (dentry, attr), -+ setattr_conversion_hook(inode, attr)); -+} -+ -+/* Wrappers with passive protection for: -+ . read_cryptcomperess; -+ . mmap_cryptcompress; -+ . release_cryptcompress; -+ . sendfile_cryptcompress; -+ . delete_object_cryptcompress. 
-+*/ -+ssize_t prot_read_cryptcompress(struct file * file, char __user * buf, -+ size_t size, loff_t * off) -+{ -+ struct inode * inode = file->f_dentry->d_inode; -+ return PROT_PASSIVE(ssize_t, read, (file, buf, size, off)); -+} -+ -+int prot_mmap_cryptcompress(struct file *file, struct vm_area_struct *vma) -+{ -+ struct inode *inode = file->f_dentry->d_inode; -+ return PROT_PASSIVE(int, mmap, (file, vma)); -+} -+ -+int prot_release_cryptcompress(struct inode *inode, struct file *file) -+{ -+ return PROT_PASSIVE(int, release, (inode, file)); -+} -+ -+ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos, -+ size_t count, read_actor_t actor, -+ void *target) -+{ -+ struct inode * inode = file->f_dentry->d_inode; -+ return PROT_PASSIVE(ssize_t, sendfile, -+ (file, ppos, count, actor, target)); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/file.h linux-2.6.20/fs/reiser4/plugin/file/file.h ---- linux-2.6.20.orig/fs/reiser4/plugin/file/file.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/file/file.h 2007-05-06 14:50:43.783001971 +0400 -@@ -0,0 +1,272 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* this file contains declarations of methods implementing -+ file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID -+ and SYMLINK_FILE_PLUGIN_ID) */ -+ -+#if !defined( __REISER4_FILE_H__ ) -+#define __REISER4_FILE_H__ -+ -+/* declarations of functions implementing UNIX_FILE_PLUGIN_ID file plugin */ -+ -+/* inode operations */ -+int setattr_unix_file(struct dentry *, struct iattr *); -+ -+/* file operations */ -+ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount, -+ loff_t *off); -+ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount, -+ loff_t 
* off); -+int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd, -+ unsigned long arg); -+int mmap_unix_file(struct file *, struct vm_area_struct *); -+int open_unix_file(struct inode *, struct file *); -+int release_unix_file(struct inode *, struct file *); -+int sync_unix_file(struct file *, struct dentry *, int datasync); -+ssize_t sendfile_unix_file(struct file *, loff_t *ppos, size_t count, -+ read_actor_t, void *target); -+ -+/* address space operations */ -+int readpage_unix_file(struct file *, struct page *); -+int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned); -+int writepages_unix_file(struct address_space *, struct writeback_control *); -+int prepare_write_unix_file(struct file *, struct page *, unsigned from, -+ unsigned to); -+int commit_write_unix_file(struct file *, struct page *, unsigned from, -+ unsigned to); -+sector_t bmap_unix_file(struct address_space *, sector_t lblock); -+ -+/* file plugin operations */ -+int flow_by_inode_unix_file(struct inode *, const char __user *buf, -+ int user, loff_t, loff_t, rw_op, flow_t *); -+int owns_item_unix_file(const struct inode *, const coord_t *); -+void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *, -+ int create); -+int delete_object_unix_file(struct inode *); -+ -+/* -+ * all the write into unix file is performed by item write method. 
Write method -+ * of unix file plugin only decides which item plugin (extent or tail) and in -+ * which mode (one from the enum below) to call -+ */ -+typedef enum { -+ FIRST_ITEM = 1, -+ APPEND_ITEM = 2, -+ OVERWRITE_ITEM = 3 -+} write_mode_t; -+ -+/* unix file may be in one the following states */ -+typedef enum { -+ UF_CONTAINER_UNKNOWN = 0, -+ UF_CONTAINER_TAILS = 1, -+ UF_CONTAINER_EXTENTS = 2, -+ UF_CONTAINER_EMPTY = 3 -+} file_container_t; -+ -+struct formatting_plugin; -+struct inode; -+ -+/* unix file plugin specific part of reiser4 inode */ -+typedef struct unix_file_info { -+ /* -+ * this read-write lock protects file containerization change. Accesses -+ * which do not change file containerization (see file_container_t) -+ * (read, readpage, writepage, write (until tail conversion is -+ * involved)) take read-lock. Accesses which modify file -+ * containerization (truncate, conversion from tail to extent and back) -+ * take write-lock. -+ */ -+ struct rw_semaphore latch; -+ /* this enum specifies which items are used to build the file */ -+ file_container_t container; -+ /* -+ * plugin which controls when file is to be converted to extents and -+ * back to tail -+ */ -+ struct formatting_plugin *tplug; -+ /* if this is set, file is in exclusive use */ -+ int exclusive_use; -+#if REISER4_DEBUG -+ /* pointer to task struct of thread owning exclusive access to file */ -+ void *ea_owner; -+ atomic_t nr_neas; -+ void *last_reader; -+#endif -+} unix_file_info_t; -+ -+struct unix_file_info *unix_file_inode_data(const struct inode *inode); -+void get_exclusive_access(unix_file_info_t *); -+void drop_exclusive_access(unix_file_info_t *); -+void get_nonexclusive_access(unix_file_info_t *); -+void drop_nonexclusive_access(unix_file_info_t *); -+int try_to_get_nonexclusive_access(unix_file_info_t *); -+int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode, -+ struct inode *); -+int find_file_item_nohint(coord_t *, lock_handle *, -+ const reiser4_key *, 
znode_lock_mode, -+ struct inode *); -+ -+int load_file_hint(struct file *, hint_t *); -+void save_file_hint(struct file *, const hint_t *); -+ -+#include "../item/extent.h" -+#include "../item/tail.h" -+#include "../item/ctail.h" -+ -+struct uf_coord { -+ coord_t coord; -+ lock_handle *lh; -+ int valid; -+ union { -+ extent_coord_extension_t extent; -+ tail_coord_extension_t tail; -+ ctail_coord_extension_t ctail; -+ } extension; -+}; -+ -+#include "../../forward.h" -+#include "../../seal.h" -+#include "../../lock.h" -+ -+/* -+ * This structure is used to speed up file operations (reads and writes). A -+ * hint is a suggestion about where a key resolved to last time. A seal -+ * indicates whether a node has been modified since a hint was last recorded. -+ * You check the seal, and if the seal is still valid, you can use the hint -+ * without traversing the tree again. -+ */ -+struct hint { -+ seal_t seal; /* a seal over last file item accessed */ -+ uf_coord_t ext_coord; -+ loff_t offset; -+ znode_lock_mode mode; -+ lock_handle lh; -+}; -+ -+static inline int hint_is_valid(hint_t * hint) -+{ -+ return hint->ext_coord.valid; -+} -+ -+static inline void hint_set_valid(hint_t * hint) -+{ -+ hint->ext_coord.valid = 1; -+} -+ -+static inline void hint_clr_valid(hint_t * hint) -+{ -+ hint->ext_coord.valid = 0; -+} -+ -+int load_file_hint(struct file *, hint_t *); -+void save_file_hint(struct file *, const hint_t *); -+void hint_init_zero(hint_t *); -+void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode); -+int hint_is_set(const hint_t *); -+void reiser4_unset_hint(hint_t *); -+ -+int reiser4_update_file_size(struct inode *, reiser4_key *, int update_sd); -+int cut_file_items(struct inode *, loff_t new_size, int update_sd, -+ loff_t cur_size, int (*update_actor) (struct inode *, -+ reiser4_key *, int)); -+#if REISER4_DEBUG -+ -+/* return 1 is exclusive access is obtained, 0 - otherwise */ -+static inline int ea_obtained(unix_file_info_t * uf_info) -+{ -+ 
int ret; -+ -+ ret = down_read_trylock(&uf_info->latch); -+ if (ret) -+ up_read(&uf_info->latch); -+ return !ret; -+} -+ -+#endif -+ -+/* declarations of functions implementing SYMLINK_FILE_PLUGIN_ID file plugin */ -+int reiser4_create_symlink(struct inode *symlink, struct inode *dir, -+ reiser4_object_create_data *); -+void destroy_inode_symlink(struct inode *); -+ -+/* declarations of functions implementing CRYPTCOMPRESS_FILE_PLUGIN_ID -+ file plugin */ -+ -+/* inode operations */ -+int setattr_cryptcompress(struct dentry *, struct iattr *); -+int prot_setattr_cryptcompress(struct dentry *, struct iattr *); -+ -+/* file operations */ -+ssize_t read_cryptcompress(struct file *, char __user *buf, size_t read_amount, -+ loff_t * off); -+ssize_t prot_read_cryptcompress(struct file *, char __user *buf, -+ size_t read_amount, loff_t * off); -+ -+ssize_t write_cryptcompress(struct file *, const char __user *buf, size_t write_amount, -+ loff_t * off, int * conv); -+ssize_t prot_write_cryptcompress(struct file *, const char __user *buf, size_t write_amount, -+ loff_t * off); -+int mmap_cryptcompress(struct file *, struct vm_area_struct *); -+int prot_mmap_cryptcompress(struct file *, struct vm_area_struct *); -+ssize_t sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count, -+ read_actor_t actor, void *target); -+ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count, -+ read_actor_t actor, void *target); -+ -+int release_cryptcompress(struct inode *, struct file *); -+int prot_release_cryptcompress(struct inode *, struct file *); -+ -+/* address space operations */ -+extern int readpage_cryptcompress(struct file *, struct page *); -+extern int writepages_cryptcompress(struct address_space *, -+ struct writeback_control *); -+/* file plugin operations */ -+int flow_by_inode_cryptcompress(struct inode *, const char __user *buf, -+ int user, loff_t, loff_t, rw_op, flow_t *); -+int key_by_inode_cryptcompress(struct inode *, loff_t 
off, reiser4_key *); -+int create_cryptcompress(struct inode *, struct inode *, -+ reiser4_object_create_data *); -+int delete_object_cryptcompress(struct inode *); -+void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *, -+ int create); -+int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed, -+ struct inode *object, int truncate, -+ int *progress); -+void destroy_inode_cryptcompress(struct inode *); -+int open_object_cryptcompress(struct inode * inode, struct file * file); -+ -+extern reiser4_plugin_ops cryptcompress_plugin_ops; -+ -+#define WRITE_GRANULARITY 32 -+ -+int tail2extent(unix_file_info_t *); -+int extent2tail(unix_file_info_t *); -+ -+int goto_right_neighbor(coord_t *, lock_handle *); -+int find_or_create_extent(struct page *); -+int equal_to_ldk(znode *, const reiser4_key *); -+ -+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh); -+ -+static inline int cbk_errored(int cbk_result) -+{ -+ return (cbk_result != CBK_COORD_NOTFOUND -+ && cbk_result != CBK_COORD_FOUND); -+} -+ -+/* __REISER4_FILE_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/invert.c linux-2.6.20/fs/reiser4/plugin/file/invert.c ---- linux-2.6.20.orig/fs/reiser4/plugin/file/invert.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/file/invert.c 2007-05-06 14:50:43.783001971 +0400 -@@ -0,0 +1,493 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Suppose you want to conveniently read and write a large variety of small files conveniently within a single emacs -+ buffer, without having a separate buffer for each 8 byte or so file. Inverts are the way to do that. 
An invert
   provides you with the contents of a set of subfiles plus its own contents.  It is a file which inherits other files
   when you read it, and allows you to write to it and through it to the files that it inherits from.  In order for it
   to know which subfiles each part of your write should go into, there must be delimiters indicating that.  It tries to
   make that easy for you by providing those delimiters in what you read from it.

  When you read it, an invert performs an inverted assignment.  Instead of taking an assignment command and writing a
  bunch of files, it takes a bunch of files and composes an assignment command for you to read from it that, if executed,
  would create those files.  But which files?  Well, that must be specified in the body of the invert using a special
  syntax, and that specification is called the invert of the assignment.

  When written to, an invert performs the assignment command that is written
  to it, and modifies its own body to contain the invert of that
  assignment.

  In other words, writing to an invert file what you have read from it
  is the identity operation.

  Malformed assignments cause write errors.  Partial writes are not
  supported in v4.0, but will be.

  Example:

    If an invert contains:

    /filenameA/<>+"(some text stored in the invert)+/filenameB/<>

======================
Each element in this definition should be an invert, and all files
should be called recursively, too. This is bad. If one of the
included files is not a regular or invert file, then we can't read
the main file.

I think it is possible to make this easier:

the internal structure of an invert file should be like that of a symlink
file. But the read and write method should be explicitly indicated in the
i/o operation.

By default we read and write (if possible) as a symlink, and if we
specify ..invert at reading time then we can also specify it at write time.

example:
/my_invert_file/..invert<- ( (/filenameA<-"(The contents of filenameA))+"(some text stored in the invert)+(/filenameB<-"(The contents of filenameB) ) )
will create /my_invert_file as an invert, and will create /filenameA and /filenameB with the specified body.

read of /my_invert_file/..invert will be
/filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)

but read of /my_invert_file/ will be
The contents of filenameAsome text stored in the invertThe contents of filenameB

we can also create this file as
/my_invert_file/<-/filenameA+"(some text stored in the invert)+/filenameB
which will create /my_invert_file and use the existing files /filenameA and /filenameB.

and when we read it, it will behave like the previous invert file.

Is this correct?

 vv
DEMIDOV-FIXME-HANS:

Maybe you are right, but then you must disable writes to /my_invert_file/ and only allow writes to /my_invert_file/..invert

Do you agree?  Discuss it on reiserfs-list....

-Hans
=======================

  Then a read will return:

    /filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)

    and a write of the line above to the invert will set the contents of
    the invert and filenameA and filenameB to their original values.

  Note that the contents of an invert have no influence on the effect
  of a write unless the write is a partial write (and a write of a
  shorter file without using truncate first is a partial write).

  truncate() has no effect on filenameA and filenameB, it merely
  resets the value of the invert.

  Writes to subfiles via the invert are implemented by preceding them
  with truncates.

  Parse failures cause write failures.

  Questions to ponder: should the invert be acted on prior to file
  close when writing to an open file descriptor?

  Example:

    If an invert contains:

    "(This text and a pair of quotes are all that is here.)

Then a read will return:

    "(This text and a pair of quotes are all that is here.)

*/

/* OPEN method places a struct file in memory associated with the invert body
  and returns something like a file descriptor to the user for future access
  to the invert file.
  During opening we parse the body of the invert and get a list of the
  'entries' (that describe all its subfiles) and place a pointer to the first
  struct in the reiserfs-specific part of the invert inode (arbitrary decision).

  Each subfile is described by the struct inv_entry that has a pointer @sd to
  in-core based stat-data and a pointer to struct file @f (if we find that the
  subfile uses more than one unformatted node (arbitrary decision), we load
  struct file in memory, otherwise we load base stat-data (and maybe 1-2 bytes
  of some other information we need)).

  Since READ and WRITE methods for inverts were formulated in assignment
  language, they don't contain arguments 'size' and 'offset' that make sense
  only in ordinary read/write methods.

  READ method is a combination of two methods:
  1) ordinary read method (with offset=0, length = @f->...->i_size) for entries
  with @f != 0, this method uses a pointer to struct file as an argument
  2) read method for inode-less files with @sd != 0, this method uses
  in-core based stat-data instead of struct file as an argument.
  in the first case we don't use pagecache, just copy data that we got after
  cbk() into userspace.

  WRITE method for invert files is more complex.
  Besides the WRITE-interface declared in assignment language above we need
  to have an opportunity to edit the unwrapped body of an invert file with some
  text editor, which means we need a GENERIC WRITE METHOD for invert files:

  my_invert_file/..invert <- "string"

  this method parses "string" and looks for correct subfile signatures, also
  the parsing process splits this "string" into a set of flows in accordance
  with the set of subfiles specified by this signature.
  The found list of signatures #S is compared with the opened one #I of the
  invert file. If it doesn't have this one (#I==0, it will be so for instance
  if we have just created this invert file) the write method assigns the found
  signature (#I=#S;) to the invert file. Then if #I==#S, the generic write
  method splits itself into several write methods for ordinary or light-weight
  files, or calls itself recursively for invert files with corresponding flows.
  I am not sure, but the list of signatures looks like what mr.Demidov means
  by 'delimiters'.

  The cases when #S<#I (#I<#S) (in the sense of set-theory) are also available
  and cause deletion (creation) of subfiles (arbitrary decision - it may look
  too complex, but this interface will be the most complete). The order of
  entries of list #S (#I) and inherited order on #I (#S) must coincide.
  The other parsing results give a malformed signature that aborts READ method
  and releases all resources.

  Format of subfile (entry) signature:

  "START_MAGIC"<>(TYPE="...",LOOKUP_ARG="...")SUBFILE_BODY"END_MAGIC"

  Legend:

    START_MAGIC - keyword indicates the start of subfile signature;

    <> indicates the start of 'subfile metadata', that is the pair
  (TYPE="...",LOOKUP_ARG="...") in parentheses separated by comma.

    TYPE - the string "type" indicates the start of one of the three words:
    - ORDINARY_FILE,
    - LIGHT_WEIGHT_FILE,
    - INVERT_FILE;

    LOOKUP_ARG - lookup argument depends on previous type:
*/

 /************************************************************/
 /*       TYPE        *          LOOKUP ARGUMENT             */
 /************************************************************/
 /* LIGHT_WEIGHT_FILE *           stat-data key              */
 /************************************************************/
 /*  ORDINARY_FILE    *             filename                 */
 /************************************************************/
 /*   INVERT_FILE     *             filename                 */
 /************************************************************/

 /* where:
  *stat-data key - the string contains the stat-data key of this subfile, it
  will be passed to the fast-access lookup method for light-weight files;
  *filename - pathname of this subfile, it will be passed to VFS lookup methods
  for ordinary and invert files;

  SUBFILE_BODY - data of this subfile (it will go to the flow)
  END_MAGIC - the keyword indicates the end of subfile signature.

  The other symbols inside the signature are interpreted as 'unformatted
  content', which is available with VFS's read_link() (arbitrary decision).

  NOTE: Parse method for a body of invert file uses mentioned signatures _without_
  subfile bodies.

  Now the only unclear thing is WRITE in regular light-weight subfile A that we
  can describe only in assignment language:

  A <- "some_string"

  I guess we don't want to change stat-data and body items of file A
  if this file exists, and size(A) != size("some_string") because this operation
  is expensive, so we only do the partial write if size(A) > size("some_string")
  and do truncate of the "some_string", and then do A <- "truncated string", if
  size(A) < size("some_string"). This decision is also arbitrary..
-+ */ -+ -+/* here is infrastructure for formated flows */ -+ -+#define SUBFILE_HEADER_MAGIC 0x19196605 -+#define FLOW_HEADER_MAGIC 0x01194304 -+ -+#include "../plugin.h" -+#include "../../debug.h" -+#include "../../forward.h" -+#include "../object.h" -+#include "../item/item.h" -+#include "../item/static_stat.h" -+#include "../../dformat.h" -+#include "../znode.h" -+#include "../inode.h" -+ -+#include -+#include /* for struct file */ -+#include /* for struct list_head */ -+ -+typedef enum { -+ LIGHT_WEIGHT_FILE, -+ ORDINARY_FILE, -+ INVERT_FILE -+} inv_entry_type; -+ -+typedef struct flow_header { -+ d32 fl_magic; -+ d16 fl_nr; /* number of subfiles in the flow */ -+}; -+ -+typedef struct subfile_header { -+ d32 sh_magic; /* subfile magic */ -+ d16 sh_type; /* type of subfile: light-weight, ordinary, invert */ -+ d16 sh_arg_len; /* lenght of lookup argument (filename, key) */ -+ d32 sh_body_len; /* lenght of subfile body */ -+}; -+ -+/* functions to get/set fields of flow header */ -+ -+static void fl_set_magic(flow_header * fh, __u32 value) -+{ -+ cputod32(value, &fh->fh_magic); -+} -+ -+static __u32 fl_get_magic(flow_header * fh) -+{ -+ return d32tocpu(&fh->fh_magic); -+} -+static void fl_set_number(flow_header * fh, __u16 value) -+{ -+ cputod16(value, &fh->fh_nr); -+} -+static unsigned fl_get_number(flow_header * fh) -+{ -+ return d16tocpu(&fh->fh_nr); -+} -+ -+/* functions to get/set fields of subfile header */ -+ -+static void sh_set_magic(subfile_header * sh, __u32 value) -+{ -+ cputod32(value, &sh->sh_magic); -+} -+ -+static __u32 sh_get_magic(subfile_header * sh) -+{ -+ return d32tocpu(&sh->sh_magic); -+} -+static void sh_set_type(subfile_header * sh, __u16 value) -+{ -+ cputod16(value, &sh->sh_magic); -+} -+static unsigned sh_get_type(subfile_header * sh) -+{ -+ return d16tocpu(&sh->sh_magic); -+} -+static void sh_set_arg_len(subfile_header * sh, __u16 value) -+{ -+ cputod16(value, &sh->sh_arg_len); -+} -+static unsigned sh_get_arg_len(subfile_header * 
sh) -+{ -+ return d16tocpu(&sh->sh_arg_len); -+} -+static void sh_set_body_len(subfile_header * sh, __u32 value) -+{ -+ cputod32(value, &sh->sh_body_len); -+} -+ -+static __u32 sh_get_body_len(subfile_header * sh) -+{ -+ return d32tocpu(&sh->sh_body_len); -+} -+ -+/* in-core minimal stat-data, light-weight analog of inode */ -+ -+struct incore_sd_base { -+ umode_t isd_mode; -+ nlink_t isd_nlink; -+ loff_t isd_size; -+ char *isd_data; /* 'subflow' to write */ -+}; -+ -+/* open invert create a list of invert entries, -+ every entry is represented by structure inv_entry */ -+ -+struct inv_entry { -+ struct list_head *ie_list; -+ struct file *ie_file; /* this is NULL if the file doesn't -+ have unformated nodes */ -+ struct incore_sd_base *ie_sd; /* inode-less analog of struct file */ -+}; -+ -+/* allocate and init invert entry */ -+ -+static struct inv_entry *allocate_inv_entry(void) -+{ -+ struct inv_entry *inv_entry; -+ -+ inv_entry = reiser4_kmalloc(sizeof(struct inv_entry), GFP_KERNEL); -+ if (!inv_entry) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ inv_entry->ie_file = NULL; -+ inv_entry->ie_sd = NULL; -+ INIT_LIST_HEAD(&inv_entry->ie_list); -+ return inv_entry; -+} -+ -+static int put_inv_entry(struct inv_entry *ientry) -+{ -+ int result = 0; -+ -+ assert("edward-96", ientry != NULL); -+ assert("edward-97", ientry->ie_list != NULL); -+ -+ list_del(ientry->ie_list); -+ if (ientry->ie_sd != NULL) { -+ kfree(ientry->ie_sd); -+ kfree(ientry); -+ } -+ if (ientry->ie_file != NULL) -+ result = filp_close(ientry->file, NULL); -+ return result; -+} -+ -+static int allocate_incore_sd_base(struct inv_entry *inv_entry) -+{ -+ struct incore_sd_base *isd_base assert("edward-98", inv_entry != NULL); -+ assert("edward-99", inv_entry->ie_inode = NULL); -+ assert("edward-100", inv_entry->ie_sd = NULL); -+ -+ isd_base = reiser4_kmalloc(sizeof(struct incore_sd_base), GFP_KERNEL); -+ if (!isd_base) -+ return RETERR(-ENOMEM); -+ inv_entry->ie_sd = isd_base; -+ return 0; -+} -+ -+/* this 
can be installed as ->init_inv_entry () method of -+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). -+ Copies data from on-disk stat-data format into light-weight analog of inode . -+ Doesn't hanlde stat-data extensions. */ -+ -+static void sd_base_load(struct inv_entry *inv_entry, char *sd) -+{ -+ reiser4_stat_data_base *sd_base; -+ -+ assert("edward-101", inv_entry != NULL); -+ assert("edward-101", inv_entry->ie_sd != NULL); -+ assert("edward-102", sd != NULL); -+ -+ sd_base = (reiser4_stat_data_base *) sd; -+ inv_entry->incore_sd_base->isd_mode = d16tocpu(&sd_base->mode); -+ inv_entry->incore_sd_base->isd_nlink = d32tocpu(&sd_base->nlink); -+ inv_entry->incore_sd_base->isd_size = d64tocpu(&sd_base->size); -+ inv_entry->incore_sd_base->isd_data = NULL; -+} -+ -+/* initialise incore stat-data */ -+ -+static void init_incore_sd_base(struct inv_entry *inv_entry, coord_t * coord) -+{ -+ reiser4_plugin *plugin = item_plugin_by_coord(coord); -+ void *body = item_body_by_coord(coord); -+ -+ assert("edward-103", inv_entry != NULL); -+ assert("edward-104", plugin != NULL); -+ assert("edward-105", body != NULL); -+ -+ sd_base_load(inv_entry, body); -+} -+ -+/* takes a key or filename and allocates new invert_entry, -+ init and adds it into the list, -+ we use lookup_sd_by_key() for light-weight files and VFS lookup by filename */ -+ -+int get_inv_entry(struct inode *invert_inode, /* inode of invert's body */ -+ inv_entry_type type, /* LIGHT-WEIGHT or ORDINARY */ -+ const reiser4_key * key, /* key of invert entry stat-data */ -+ char *filename, /* filename of the file to be opened */ -+ int flags, int mode) -+{ -+ int result; -+ struct inv_entry *ientry; -+ -+ assert("edward-107", invert_inode != NULL); -+ -+ ientry = allocate_inv_entry(); -+ if (IS_ERR(ientry)) -+ return (PTR_ERR(ientry)); -+ -+ if (type == LIGHT_WEIGHT_FILE) { -+ coord_t coord; -+ lock_handle lh; -+ -+ assert("edward-108", key != NULL); -+ -+ init_coord(&coord); -+ init_lh(&lh); -+ 
result = -+ lookup_sd_by_key(reiser4_tree_by_inode(invert_inode), -+ ZNODE_READ_LOCK, &coord, &lh, key); -+ if (result == 0) -+ init_incore_sd_base(ientry, coord); -+ -+ done_lh(&lh); -+ done_coord(&coord); -+ return (result); -+ } else { -+ struct file *file = filp_open(filename, flags, mode); -+ /* FIXME_EDWARD here we need to check if we -+ did't follow to any mount point */ -+ -+ assert("edward-108", filename != NULL); -+ -+ if (IS_ERR(file)) -+ return (PTR_ERR(file)); -+ ientry->ie_file = file; -+ return 0; -+ } -+} -+ -+/* takes inode of invert, reads the body of this invert, parses it, -+ opens all invert entries and return pointer on the first inv_entry */ -+ -+struct inv_entry *open_invert(struct file *invert_file) -+{ -+ -+} -+ -+ssize_t subfile_read(struct *invert_entry, flow * f) -+{ -+ -+} -+ -+ssize_t subfile_write(struct *invert_entry, flow * f) -+{ -+ -+} -+ -+ssize_t invert_read(struct *file, flow * f) -+{ -+ -+} -+ -+ssize_t invert_write(struct *file, flow * f) -+{ -+ -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/Makefile linux-2.6.20/fs/reiser4/plugin/file/Makefile ---- linux-2.6.20.orig/fs/reiser4/plugin/file/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/file/Makefile 2007-05-06 14:50:43.783001971 +0400 -@@ -0,0 +1,7 @@ -+obj-$(CONFIG_REISER4_FS) += file_plugins.o -+ -+file_plugins-objs := \ -+ file.o \ -+ tail_conversion.o \ -+ symlink.o \ -+ cryptcompress.o -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.20/fs/reiser4/plugin/file/symfile.c ---- linux-2.6.20.orig/fs/reiser4/plugin/file/symfile.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/file/symfile.c 2007-05-06 14:50:43.787003221 +0400 -@@ -0,0 +1,87 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Symfiles are a generalization of Unix symlinks. -+ -+ A symfile when read behaves as though you took its contents and -+ substituted them into the reiser4 naming system as the right hand side -+ of an assignment, and then read that which you had assigned to it. -+ -+ A key issue for symfiles is how to implement writes through to -+ subfiles. In general, one must have some method of determining what -+ of that which is written to the symfile is written to what subfile. -+ This can be done by use of custom plugin methods written by users, or -+ by using a few general methods we provide for those willing to endure -+ the insertion of delimiters into what is read. -+ -+ Writing to symfiles without delimiters to denote what is written to -+ what subfile is not supported by any plugins we provide in this -+ release. Our most sophisticated support for writes is that embodied -+ by the invert plugin (see invert.c). 
-+ -+ A read only version of the /etc/passwd file might be -+ constructed as a symfile whose contents are as follows: -+ -+ /etc/passwd/userlines/* -+ -+ or -+ -+ /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root -+ -+ or -+ -+ /etc/passwd/userlines/(demidov+edward+reiser+root) -+ -+ A symfile with contents -+ -+ /filenameA+"(some text stored in the uninvertable symfile)+/filenameB -+ -+ will return when read -+ -+ The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB -+ -+ and write of what has been read will not be possible to implement as -+ an identity operation because there are no delimiters denoting the -+ boundaries of what is to be written to what subfile. -+ -+ Note that one could make this a read/write symfile if one specified -+ delimiters, and the write method understood those delimiters delimited -+ what was written to subfiles. -+ -+ So, specifying the symfile in a manner that allows writes: -+ -+ /etc/passwd/userlines/demidov+"( -+ )+/etc/passwd/userlines/edward+"( -+ )+/etc/passwd/userlines/reiser+"( -+ )+/etc/passwd/userlines/root+"( -+ ) -+ -+ or -+ -+ /etc/passwd/userlines/(demidov+"( -+ )+edward+"( -+ )+reiser+"( -+ )+root+"( -+ )) -+ -+ and the file demidov might be specified as: -+ -+ /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell -+ -+ or -+ -+ /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell) -+ -+ Notice that if the file demidov has a carriage return in it, the -+ parsing fails, but then if you put carriage returns in the wrong place -+ in a normal /etc/passwd file it breaks things also. 
-+ -+ Note that it is forbidden to have no text between two interpolations -+ if one wants to be able to define what parts of a write go to what -+ subfiles referenced in an interpolation. -+ -+ If one wants to be able to add new lines by writing to the file, one -+ must either write a custom plugin for /etc/passwd that knows how to -+ name an added line, or one must use an invert, or one must use a more -+ sophisticated symfile syntax that we are not planning to write for -+ version 4.0. -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.20/fs/reiser4/plugin/file/symlink.c ---- linux-2.6.20.orig/fs/reiser4/plugin/file/symlink.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/file/symlink.c 2007-05-06 14:50:43.787003221 +0400 -@@ -0,0 +1,95 @@ -+/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../inode.h" -+ -+#include -+#include -+ -+/* file plugin methods specific for symlink files -+ (SYMLINK_FILE_PLUGIN_ID) */ -+ -+/* this is implementation of create_object method of file plugin for -+ SYMLINK_FILE_PLUGIN_ID -+ */ -+ -+/** -+ * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID -+ * @symlink: inode of symlink object -+ * @dir: inode of parent directory -+ * @info: parameters of new object -+ * -+ * Inserts stat data with symlink extension where into the tree. 
-+ */ -+int reiser4_create_symlink(struct inode *symlink, -+ struct inode *dir UNUSED_ARG, -+ reiser4_object_create_data *data /* info passed to us -+ * this is filled by -+ * reiser4() syscall -+ * in particular */) -+{ -+ int result; -+ -+ assert("nikita-680", symlink != NULL); -+ assert("nikita-681", S_ISLNK(symlink->i_mode)); -+ assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD)); -+ assert("nikita-682", dir != NULL); -+ assert("nikita-684", data != NULL); -+ assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID); -+ -+ /* -+ * stat data of symlink has symlink extension in which we store -+ * symlink content, that is, path symlink is pointing to. -+ */ -+ reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT); -+ -+ assert("vs-838", symlink->i_private == NULL); -+ symlink->i_private = (void *)data->name; -+ -+ assert("vs-843", symlink->i_size == 0); -+ INODE_SET_FIELD(symlink, i_size, strlen(data->name)); -+ -+ /* insert stat data appended with data->name */ -+ result = inode_file_plugin(symlink)->write_sd_by_inode(symlink); -+ if (result) { -+ /* FIXME-VS: Make sure that symlink->i_private is not attached -+ to kmalloced data */ -+ INODE_SET_FIELD(symlink, i_size, 0); -+ } else { -+ assert("vs-849", symlink->i_private -+ && reiser4_inode_get_flag(symlink, -+ REISER4_GENERIC_PTR_USED)); -+ assert("vs-850", -+ !memcmp((char *)symlink->i_private, data->name, -+ (size_t) symlink->i_size + 1)); -+ } -+ return result; -+} -+ -+/* this is implementation of destroy_inode method of file plugin for -+ SYMLINK_FILE_PLUGIN_ID -+ */ -+void destroy_inode_symlink(struct inode *inode) -+{ -+ assert("edward-799", -+ inode_file_plugin(inode) == -+ file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID)); -+ assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode)); -+ assert("edward-801", reiser4_inode_get_flag(inode, -+ REISER4_GENERIC_PTR_USED)); -+ assert("vs-839", S_ISLNK(inode->i_mode)); -+ -+ kfree(inode->i_private); -+ inode->i_private = NULL; -+ 
reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.20/fs/reiser4/plugin/file/tail_conversion.c ---- linux-2.6.20.orig/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/file/tail_conversion.c 2007-05-06 14:50:43.787003221 +0400 -@@ -0,0 +1,729 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../inode.h" -+#include "../../super.h" -+#include "../../page_cache.h" -+#include "../../carry.h" -+#include "../../safe_link.h" -+#include "../../vfs_ops.h" -+ -+#include -+ -+/* this file contains: -+ tail2extent and extent2tail */ -+ -+/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */ -+void get_exclusive_access(unix_file_info_t * uf_info) -+{ -+ assert("nikita-3028", reiser4_schedulable()); -+ assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w)); -+ assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r)); -+ /* -+ * "deadlock avoidance": sometimes we commit a transaction under -+ * rw-semaphore on a file. Such commit can deadlock with another -+ * thread that captured some block (hence preventing atom from being -+ * committed) and waits on rw-semaphore. 
-+ */ -+ reiser4_txn_restart_current(); -+ LOCK_CNT_INC(inode_sem_w); -+ down_write(&uf_info->latch); -+ uf_info->exclusive_use = 1; -+ assert("vs-1713", uf_info->ea_owner == NULL); -+ assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0); -+ ON_DEBUG(uf_info->ea_owner = current); -+} -+ -+void drop_exclusive_access(unix_file_info_t * uf_info) -+{ -+ assert("vs-1714", uf_info->ea_owner == current); -+ assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0); -+ ON_DEBUG(uf_info->ea_owner = NULL); -+ uf_info->exclusive_use = 0; -+ up_write(&uf_info->latch); -+ assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r)); -+ assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w)); -+ LOCK_CNT_DEC(inode_sem_w); -+ reiser4_txn_restart_current(); -+} -+ -+/** -+ * nea_grabbed - do something when file semaphore is down_read-ed -+ * @uf_info: -+ * -+ * This is called when nonexclisive access is obtained on file. All it does is -+ * for debugging purposes. -+ */ -+static void nea_grabbed(unix_file_info_t *uf_info) -+{ -+#if REISER4_DEBUG -+ LOCK_CNT_INC(inode_sem_r); -+ assert("vs-1716", uf_info->ea_owner == NULL); -+ atomic_inc(&uf_info->nr_neas); -+ uf_info->last_reader = current; -+#endif -+} -+ -+/** -+ * get_nonexclusive_access - get nonexclusive access to a file -+ * @uf_info: unix file specific part of inode to obtain access to -+ * -+ * Nonexclusive access is obtained on a file before read, write, readpage. -+ */ -+void get_nonexclusive_access(unix_file_info_t *uf_info) -+{ -+ assert("nikita-3029", reiser4_schedulable()); -+ assert("nikita-3361", get_current_context()->trans->atom == NULL); -+ -+ down_read(&uf_info->latch); -+ nea_grabbed(uf_info); -+} -+ -+/** -+ * try_to_get_nonexclusive_access - try to get nonexclusive access to a file -+ * @uf_info: unix file specific part of inode to obtain access to -+ * -+ * Non-blocking version of nonexclusive access obtaining. 
-+ */ -+int try_to_get_nonexclusive_access(unix_file_info_t *uf_info) -+{ -+ int result; -+ -+ result = down_read_trylock(&uf_info->latch); -+ if (result) -+ nea_grabbed(uf_info); -+ return result; -+} -+ -+void drop_nonexclusive_access(unix_file_info_t * uf_info) -+{ -+ assert("vs-1718", uf_info->ea_owner == NULL); -+ assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0); -+ ON_DEBUG(atomic_dec(&uf_info->nr_neas)); -+ -+ up_read(&uf_info->latch); -+ -+ LOCK_CNT_DEC(inode_sem_r); -+ reiser4_txn_restart_current(); -+} -+ -+/* part of tail2extent. Cut all items covering @count bytes starting from -+ @offset */ -+/* Audited by: green(2002.06.15) */ -+static int cut_formatting_items(struct inode *inode, loff_t offset, int count) -+{ -+ reiser4_key from, to; -+ -+ /* AUDIT: How about putting an assertion here, what would check -+ all provided range is covered by tail items only? */ -+ /* key of first byte in the range to be cut */ -+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from); -+ -+ /* key of last byte in that range */ -+ to = from; -+ set_key_offset(&to, (__u64) (offset + count - 1)); -+ -+ /* cut everything between those keys */ -+ return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to, -+ inode, 0); -+} -+ -+static void release_all_pages(struct page **pages, unsigned nr_pages) -+{ -+ unsigned i; -+ -+ for (i = 0; i < nr_pages; i++) { -+ if (pages[i] == NULL) { -+ unsigned j; -+ for (j = i + 1; j < nr_pages; j++) -+ assert("vs-1620", pages[j] == NULL); -+ break; -+ } -+ page_cache_release(pages[i]); -+ pages[i] = NULL; -+ } -+} -+ -+/* part of tail2extent. replace tail items with extent one. Content of tail -+ items (@count bytes) being cut are copied already into -+ pages. 
extent_writepage method is called to create extents corresponding to -+ those pages */ -+static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count) -+{ -+ int result; -+ unsigned i; -+ STORE_COUNTERS; -+ -+ if (nr_pages == 0) -+ return 0; -+ -+ assert("vs-596", pages[0]); -+ -+ /* cut copied items */ -+ result = cut_formatting_items(inode, page_offset(pages[0]), count); -+ if (result) -+ return result; -+ -+ CHECK_COUNTERS; -+ -+ /* put into tree replacement for just removed items: extent item, namely */ -+ for (i = 0; i < nr_pages; i++) { -+ result = add_to_page_cache_lru(pages[i], inode->i_mapping, -+ pages[i]->index, -+ mapping_gfp_mask(inode-> -+ i_mapping)); -+ if (result) -+ break; -+ unlock_page(pages[i]); -+ result = find_or_create_extent(pages[i]); -+ if (result) -+ break; -+ SetPageUptodate(pages[i]); -+ } -+ return result; -+} -+ -+#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail -+ * items */ -+ -+static int reserve_tail2extent_iteration(struct inode *inode) -+{ -+ reiser4_block_nr unformatted_nodes; -+ reiser4_tree *tree; -+ -+ tree = reiser4_tree_by_inode(inode); -+ -+ /* number of unformatted nodes which will be created */ -+ unformatted_nodes = TAIL2EXTENT_PAGE_NUM; -+ -+ /* -+ * space required for one iteration of extent->tail conversion: -+ * -+ * 1. kill N tail items -+ * -+ * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes -+ * -+ * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block -+ * extents) extent units. -+ * -+ * 4. drilling to the leaf level by coord_by_key() -+ * -+ * 5. 
possible update of stat-data -+ * -+ */ -+ grab_space_enable(); -+ return reiser4_grab_space -+ (2 * tree->height + -+ TAIL2EXTENT_PAGE_NUM + -+ TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) + -+ 1 + estimate_one_insert_item(tree) + -+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT); -+} -+ -+/* clear stat data's flag indicating that conversion is being converted */ -+static int complete_conversion(struct inode *inode) -+{ -+ int result; -+ -+ grab_space_enable(); -+ result = -+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+ if (result == 0) { -+ reiser4_inode_clr_flag(inode, REISER4_PART_MIXED); -+ result = reiser4_update_sd(inode); -+ } -+ if (result) -+ warning("vs-1696", "Failed to clear converting bit of %llu: %i", -+ (unsigned long long)get_inode_oid(inode), result); -+ return 0; -+} -+ -+/** -+ * find_start -+ * @inode: -+ * @id: -+ * @offset: -+ * -+ * this is used by tail2extent and extent2tail to detect where previous -+ * uncompleted conversion stopped -+ */ -+static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset) -+{ -+ int result; -+ lock_handle lh; -+ coord_t coord; -+ unix_file_info_t *ufo; -+ int found; -+ reiser4_key key; -+ -+ ufo = unix_file_inode_data(inode); -+ init_lh(&lh); -+ result = 0; -+ found = 0; -+ inode_file_plugin(inode)->key_by_inode(inode, *offset, &key); -+ do { -+ init_lh(&lh); -+ result = find_file_item_nohint(&coord, &lh, &key, -+ ZNODE_READ_LOCK, inode); -+ -+ if (result == CBK_COORD_FOUND) { -+ if (coord.between == AT_UNIT) { -+ /*coord_clear_iplug(&coord); */ -+ result = zload(coord.node); -+ if (result == 0) { -+ if (item_id_by_coord(&coord) == id) -+ found = 1; -+ else -+ item_plugin_by_coord(&coord)->s. 
-+ file.append_key(&coord, -+ &key); -+ zrelse(coord.node); -+ } -+ } else -+ result = RETERR(-ENOENT); -+ } -+ done_lh(&lh); -+ } while (result == 0 && !found); -+ *offset = get_key_offset(&key); -+ return result; -+} -+ -+/** -+ * tail2extent -+ * @uf_info: -+ * -+ * -+ */ -+int tail2extent(unix_file_info_t *uf_info) -+{ -+ int result; -+ reiser4_key key; /* key of next byte to be moved to page */ -+ char *p_data; /* data of page */ -+ unsigned page_off = 0, /* offset within the page where to copy data */ -+ count; /* number of bytes of item which can be -+ * copied to page */ -+ struct page *pages[TAIL2EXTENT_PAGE_NUM]; -+ struct page *page; -+ int done; /* set to 1 when all file is read */ -+ char *item; -+ int i; -+ struct inode *inode; -+ int first_iteration; -+ int bytes; -+ __u64 offset; -+ -+ assert("nikita-3362", ea_obtained(uf_info)); -+ inode = unix_file_info_to_inode(uf_info); -+ assert("nikita-3412", !IS_RDONLY(inode)); -+ assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS); -+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)); -+ -+ offset = 0; -+ first_iteration = 1; -+ result = 0; -+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { -+ /* -+ * file is marked on disk as there was a conversion which did -+ * not complete due to either crash or some error. 
Find which -+ * offset tail conversion stopped at -+ */ -+ result = find_start(inode, FORMATTING_ID, &offset); -+ if (result == -ENOENT) { -+ /* no tail items found, everything is converted */ -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ complete_conversion(inode); -+ return 0; -+ } else if (result != 0) -+ /* some other error */ -+ return result; -+ first_iteration = 0; -+ } -+ -+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV); -+ -+ /* get key of first byte of a file */ -+ inode_file_plugin(inode)->key_by_inode(inode, offset, &key); -+ -+ done = 0; -+ while (done == 0) { -+ memset(pages, 0, sizeof(pages)); -+ result = reserve_tail2extent_iteration(inode); -+ if (result != 0) -+ goto out; -+ if (first_iteration) { -+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED); -+ reiser4_update_sd(inode); -+ first_iteration = 0; -+ } -+ bytes = 0; -+ for (i = 0; i < sizeof_array(pages) && done == 0; i++) { -+ assert("vs-598", -+ (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0); -+ page = alloc_page(reiser4_ctx_gfp_mask_get()); -+ if (!page) { -+ result = RETERR(-ENOMEM); -+ goto error; -+ } -+ -+ page->index = -+ (unsigned long)(get_key_offset(&key) >> -+ PAGE_CACHE_SHIFT); -+ /* -+ * usually when one is going to longterm lock znode (as -+ * find_file_item does, for instance) he must not hold -+ * locked pages. However, there is an exception for -+ * case tail2extent. 
Pages appearing here are not -+ * reachable to everyone else, they are clean, they do -+ * not have jnodes attached so keeping them locked do -+ * not risk deadlock appearance -+ */ -+ assert("vs-983", !PagePrivate(page)); -+ reiser4_invalidate_pages(inode->i_mapping, page->index, -+ 1, 0); -+ -+ for (page_off = 0; page_off < PAGE_CACHE_SIZE;) { -+ coord_t coord; -+ lock_handle lh; -+ -+ /* get next item */ -+ /* FIXME: we might want to readahead here */ -+ init_lh(&lh); -+ result = -+ find_file_item_nohint(&coord, &lh, &key, -+ ZNODE_READ_LOCK, -+ inode); -+ if (result != CBK_COORD_FOUND) { -+ /* -+ * error happened of not items of file -+ * were found -+ */ -+ done_lh(&lh); -+ page_cache_release(page); -+ goto error; -+ } -+ -+ if (coord.between == AFTER_UNIT) { -+ /* -+ * end of file is reached. Padd page -+ * with zeros -+ */ -+ done_lh(&lh); -+ done = 1; -+ p_data = kmap_atomic(page, KM_USER0); -+ memset(p_data + page_off, 0, -+ PAGE_CACHE_SIZE - page_off); -+ kunmap_atomic(p_data, KM_USER0); -+ break; -+ } -+ -+ result = zload(coord.node); -+ if (result) { -+ page_cache_release(page); -+ done_lh(&lh); -+ goto error; -+ } -+ assert("vs-856", coord.between == AT_UNIT); -+ item = ((char *)item_body_by_coord(&coord)) + -+ coord.unit_pos; -+ -+ /* how many bytes to copy */ -+ count = -+ item_length_by_coord(&coord) - -+ coord.unit_pos; -+ /* limit length of copy to end of page */ -+ if (count > PAGE_CACHE_SIZE - page_off) -+ count = PAGE_CACHE_SIZE - page_off; -+ -+ /* -+ * copy item (as much as will fit starting from -+ * the beginning of the item) into the page -+ */ -+ p_data = kmap_atomic(page, KM_USER0); -+ memcpy(p_data + page_off, item, count); -+ kunmap_atomic(p_data, KM_USER0); -+ -+ page_off += count; -+ bytes += count; -+ set_key_offset(&key, -+ get_key_offset(&key) + count); -+ -+ zrelse(coord.node); -+ done_lh(&lh); -+ } /* end of loop which fills one page by content of -+ * formatting items */ -+ -+ if (page_off) { -+ /* something was copied into 
page */ -+ pages[i] = page; -+ } else { -+ page_cache_release(page); -+ assert("vs-1648", done == 1); -+ break; -+ } -+ } /* end of loop through pages of one conversion iteration */ -+ -+ if (i > 0) { -+ result = replace(inode, pages, i, bytes); -+ release_all_pages(pages, sizeof_array(pages)); -+ if (result) -+ goto error; -+ /* -+ * We have to drop exclusive access to avoid deadlock -+ * which may happen because called by reiser4_writepages -+ * capture_unix_file requires to get non-exclusive -+ * access to a file. It is safe to drop EA in the middle -+ * of tail2extent conversion because write_unix_file, -+ * setattr_unix_file(truncate), mmap_unix_file, -+ * release_unix_file(extent2tail) checks if conversion -+ * is not in progress (see comments before -+ * get_exclusive_access_careful(). -+ * Other processes that acquire non-exclusive access -+ * (read_unix_file, reiser4_writepages, etc) should work -+ * on partially converted files. -+ */ -+ drop_exclusive_access(uf_info); -+ /* throttle the conversion */ -+ reiser4_throttle_write(inode); -+ get_exclusive_access(uf_info); -+ -+ /* -+ * nobody is allowed to complete conversion but a -+ * process which started it -+ */ -+ assert("", reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED)); -+ } -+ } -+ -+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); -+ -+ if (result == 0) { -+ /* file is converted to extent items */ -+ assert("vs-1697", reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED)); -+ -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ complete_conversion(inode); -+ } else { -+ /* -+ * conversion is not complete. Inode was already marked as -+ * REISER4_PART_CONV and stat-data were updated at the first -+ * iteration of the loop above. 
-+ */ -+ error: -+ release_all_pages(pages, sizeof_array(pages)); -+ warning("nikita-2282", "Partial conversion of %llu: %i", -+ (unsigned long long)get_inode_oid(inode), result); -+ } -+ -+ out: -+ return result; -+} -+ -+static int reserve_extent2tail_iteration(struct inode *inode) -+{ -+ reiser4_tree *tree; -+ -+ tree = reiser4_tree_by_inode(inode); -+ /* -+ * reserve blocks for (in this order): -+ * -+ * 1. removal of extent item -+ * -+ * 2. insertion of tail by insert_flow() -+ * -+ * 3. drilling to the leaf level by coord_by_key() -+ * -+ * 4. possible update of stat-data -+ */ -+ grab_space_enable(); -+ return reiser4_grab_space -+ (estimate_one_item_removal(tree) + -+ estimate_insert_flow(tree->height) + -+ 1 + estimate_one_insert_item(tree) + -+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT); -+} -+ -+/* for every page of file: read page, cut part of extent pointing to this page, -+ put data of page tree by tail item */ -+int extent2tail(unix_file_info_t *uf_info) -+{ -+ int result; -+ struct inode *inode; -+ struct page *page; -+ unsigned long num_pages, i; -+ unsigned long start_page; -+ reiser4_key from; -+ reiser4_key to; -+ unsigned count; -+ __u64 offset; -+ -+ assert("nikita-3362", ea_obtained(uf_info)); -+ inode = unix_file_info_to_inode(uf_info); -+ assert("nikita-3412", !IS_RDONLY(inode)); -+ assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS); -+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)); -+ -+ offset = 0; -+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { -+ /* -+ * file is marked on disk as there was a conversion which did -+ * not complete due to either crash or some error. 
Find which -+ * offset tail conversion stopped at -+ */ -+ result = find_start(inode, EXTENT_POINTER_ID, &offset); -+ if (result == -ENOENT) { -+ /* no extent found, everything is converted */ -+ uf_info->container = UF_CONTAINER_TAILS; -+ complete_conversion(inode); -+ return 0; -+ } else if (result != 0) -+ /* some other error */ -+ return result; -+ } -+ -+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV); -+ -+ /* number of pages in the file */ -+ num_pages = -+ (inode->i_size + - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -+ start_page = offset >> PAGE_CACHE_SHIFT; -+ -+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from); -+ to = from; -+ -+ result = 0; -+ for (i = 0; i < num_pages; i++) { -+ __u64 start_byte; -+ -+ result = reserve_extent2tail_iteration(inode); -+ if (result != 0) -+ break; -+ if (i == 0 && offset == 0) { -+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED); -+ reiser4_update_sd(inode); -+ } -+ -+ page = read_mapping_page(inode->i_mapping, -+ (unsigned)(i + start_page), NULL); -+ if (IS_ERR(page)) { -+ result = PTR_ERR(page); -+ break; -+ } -+ -+ wait_on_page_locked(page); -+ -+ if (!PageUptodate(page)) { -+ page_cache_release(page); -+ result = RETERR(-EIO); -+ break; -+ } -+ -+ /* cut part of file we have read */ -+ start_byte = (__u64) (i << PAGE_CACHE_SHIFT); -+ set_key_offset(&from, start_byte); -+ set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1); -+ /* -+ * reiser4_cut_tree_object() returns -E_REPEAT to allow atom -+ * commits during over-long truncates. But -+ * extent->tail conversion should be performed in one -+ * transaction. 
-+ */ -+ result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, -+ &to, inode, 0); -+ -+ if (result) { -+ page_cache_release(page); -+ break; -+ } -+ -+ /* put page data into tree via tail_write */ -+ count = PAGE_CACHE_SIZE; -+ if ((i == (num_pages - 1)) && -+ (inode->i_size & ~PAGE_CACHE_MASK)) -+ /* last page can be incompleted */ -+ count = (inode->i_size & ~PAGE_CACHE_MASK); -+ while (count) { -+ struct dentry dentry; -+ struct file file; -+ loff_t pos; -+ -+ dentry.d_inode = inode; -+ file.f_dentry = &dentry; -+ file.private_data = NULL; -+ file.f_pos = start_byte; -+ file.private_data = NULL; -+ pos = start_byte; -+ result = reiser4_write_tail(&file, -+ (char __user *)kmap(page), -+ count, &pos); -+ reiser4_free_file_fsdata(&file); -+ if (result <= 0) { -+ warning("", "reiser4_write_tail failed"); -+ page_cache_release(page); -+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); -+ return result; -+ } -+ count -= result; -+ } -+ -+ /* release page */ -+ lock_page(page); -+ /* page is already detached from jnode and mapping. */ -+ assert("vs-1086", page->mapping == NULL); -+ assert("nikita-2690", -+ (!PagePrivate(page) && jprivate(page) == 0)); -+ /* waiting for writeback completion with page lock held is -+ * perfectly valid. 
*/ -+ wait_on_page_writeback(page); -+ reiser4_drop_page(page); -+ /* release reference taken by read_cache_page() above */ -+ page_cache_release(page); -+ -+ drop_exclusive_access(uf_info); -+ /* throttle the conversion */ -+ reiser4_throttle_write(inode); -+ get_exclusive_access(uf_info); -+ /* -+ * nobody is allowed to complete conversion but a process which -+ * started it -+ */ -+ assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED)); -+ } -+ -+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); -+ -+ if (i == num_pages) { -+ /* file is converted to formatted items */ -+ assert("vs-1698", reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED)); -+ assert("vs-1260", -+ inode_has_no_jnodes(reiser4_inode_data(inode))); -+ -+ uf_info->container = UF_CONTAINER_TAILS; -+ complete_conversion(inode); -+ return 0; -+ } -+ /* -+ * conversion is not complete. Inode was already marked as -+ * REISER4_PART_MIXED and stat-data were updated at the first * -+ * iteration of the loop above. 
-+ */ -+ warning("nikita-2282", -+ "Partial conversion of %llu: %lu of %lu: %i", -+ (unsigned long long)get_inode_oid(inode), i, -+ num_pages, result); -+ -+ return result; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file_ops.c linux-2.6.20/fs/reiser4/plugin/file_ops.c ---- linux-2.6.20.orig/fs/reiser4/plugin/file_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/file_ops.c 2007-05-06 14:50:43.787003221 +0400 -@@ -0,0 +1,168 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* this file contains typical implementations for some of methods of -+ struct file_operations and of struct address_space_operations -+*/ -+ -+#include "../inode.h" -+#include "object.h" -+ -+/* file operations */ -+ -+/* implementation of vfs's llseek method of struct file_operations for -+ typical directory can be found in readdir_common.c -+*/ -+loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin); -+ -+/* implementation of vfs's readdir method of struct file_operations for -+ typical directory can be found in readdir_common.c -+*/ -+int reiser4_readdir_common(struct file *, void *dirent, filldir_t); -+ -+/** -+ * reiser4_release_dir_common - release of struct file_operations -+ * @inode: inode of released file -+ * @file: file to release -+ * -+ * Implementation of release method of struct file_operations for typical -+ * directory. All it does is freeing of reiser4 specific file data. 
-+*/
-+int reiser4_release_dir_common(struct inode *inode, struct file *file)
-+{
-+	reiser4_context *ctx;
-+
-+	ctx = reiser4_init_context(inode->i_sb);
-+	if (IS_ERR(ctx))
-+		return PTR_ERR(ctx);
-+	reiser4_free_file_fsdata(file);
-+	reiser4_exit_context(ctx);
-+	return 0;
-+}
-+
-+/* this is common implementation of vfs's fsync method of struct
-+   file_operations
-+
-+   @file and @datasync are not used. Returns the result of
-+   txnmgr_force_commit_all() for the super block of @dentry's inode, or the
-+   error encoded in the context pointer if the context cannot be set up.
-+*/
-+int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
-+{
-+	reiser4_context *ctx;
-+	int result;
-+
-+	ctx = reiser4_init_context(dentry->d_inode->i_sb);
-+	if (IS_ERR(ctx))
-+		return PTR_ERR(ctx);
-+	result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
-+
-+	context_set_commit_async(ctx);
-+	reiser4_exit_context(ctx);
-+	return result;
-+}
-+
-+/* this is common implementation of vfs's sendfile method of struct
-+   file_operations
-+
-+   Reads @count bytes from @file and calls @actor for every page read. This is
-+   needed for loop back devices support.
-+
-+   The implementation is compiled out (#if 0).
-+*/
-+#if 0
-+ssize_t
-+sendfile_common(struct file *file, loff_t *ppos, size_t count,
-+		read_actor_t actor, void *target)
-+{
-+	reiser4_context *ctx;
-+	ssize_t result;
-+
-+	ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
-+	if (IS_ERR(ctx))
-+		return PTR_ERR(ctx);
-+	result = generic_file_sendfile(file, ppos, count, actor, target);
-+	reiser4_exit_context(ctx);
-+	return result;
-+}
-+#endif /* 0 */
-+
-+/* address space operations */
-+
-+/* this is common implementation of vfs's prepare_write method of struct
-+   address_space_operations
-+
-+   Wraps do_prepare_write() in a reiser4 context. Returns the result of
-+   do_prepare_write(), or the error encoded in the context pointer if the
-+   context cannot be set up.
-+*/
-+int
-+prepare_write_common(struct file *file, struct page *page, unsigned from,
-+		     unsigned to)
-+{
-+	reiser4_context *ctx;
-+	int result;
-+
-+	ctx = reiser4_init_context(page->mapping->host->i_sb);
-+	/*
-+	 * like every other entry point in this file: do not touch the context
-+	 * (context_set_commit_async(), reiser4_exit_context()) if it is an
-+	 * error pointer
-+	 */
-+	if (IS_ERR(ctx))
-+		return PTR_ERR(ctx);
-+	result = do_prepare_write(file, page, from, to);
-+
-+	/* don't commit transaction under inode semaphore */
-+	context_set_commit_async(ctx);
-+	reiser4_exit_context(ctx);
-+
-+	return result;
-+}
-+
-+/* this is helper for prepare_write_common and
prepare_write_unix_file
-+ */
-+int
-+do_prepare_write(struct file *file, struct page *page, unsigned from,
-+		 unsigned to)
-+{
-+	int result;
-+
-+	assert("umka-3099", file != NULL);
-+	assert("umka-3100", page != NULL);
-+	assert("umka-3095", PageLocked(page));
-+
-+	/*
-+	 * nothing has to be read in when [@from, @to) spans the whole page or
-+	 * the page is already up to date
-+	 */
-+	if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
-+		return 0;
-+
-+	if (page->mapping->a_ops->readpage == NULL)
-+		return RETERR(-EINVAL);
-+
-+	result = page->mapping->a_ops->readpage(file, page);
-+	if (result != 0) {
-+		SetPageError(page);
-+		ClearPageUptodate(page);
-+		/* All reiser4 readpage() implementations should return the
-+		 * page locked in case of error. */
-+		assert("nikita-3472", PageLocked(page));
-+	} else {
-+		/*
-+		 * ->readpage() either:
-+		 *
-+		 *     1. starts IO against @page. @page is locked for IO in
-+		 *     this case.
-+		 *
-+		 *     2. doesn't start IO. @page is unlocked.
-+		 *
-+		 * In either case, page should be locked.
-+		 */
-+		lock_page(page);
-+		/*
-+		 * IO (if any) is completed at this point. Check for IO
-+		 * errors.
-+		 */
-+		if (!PageUptodate(page))
-+			result = RETERR(-EIO);
-+	}
-+	/* on every path that went through ->readpage() the page is locked */
-+	assert("umka-3098", PageLocked(page));
-+	return result;
-+}
-+
-+/*
-+ * Local variables:
-+ * c-indentation-style: "K&R"
-+ * mode-name: "LC"
-+ * c-basic-offset: 8
-+ * tab-width: 8
-+ * fill-column: 79
-+ * scroll-step: 1
-+ * End:
-+ */
-diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.20/fs/reiser4/plugin/file_ops_readdir.c
---- linux-2.6.20.orig/fs/reiser4/plugin/file_ops_readdir.c	1970-01-01 03:00:00.000000000 +0300
-+++ linux-2.6.20/fs/reiser4/plugin/file_ops_readdir.c	2007-05-06 14:50:43.791004471 +0400
-@@ -0,0 +1,657 @@
-+/* Copyright 2005 by Hans Reiser, licensing governed by
-+ * reiser4/README */
-+
-+#include "../inode.h"
-+
-+/* return true, iff @coord points to the valid directory item that is part of
-+ * @inode directory.
*/
-+static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
-+{
-+	return plugin_of_group(item_plugin_by_coord(coord),
-+			       DIR_ENTRY_ITEM_TYPE) &&
-+	    inode_file_plugin(inode)->owns_item(inode, coord);
-+}
-+
-+/* compare two logical positions within the same directory: first by
-+ * directory entry key, then, for equal keys, by ->pos */
-+static cmp_t dir_pos_cmp(const dir_pos * p1, const dir_pos * p2)
-+{
-+	cmp_t result;
-+
-+	assert("nikita-2534", p1 != NULL);
-+	assert("nikita-2535", p2 != NULL);
-+
-+	result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
-+	if (result == EQUAL_TO) {
-+		/*
-+		 * compare ->pos directly: going through an int difference
-+		 * gives the wrong sign when the difference does not fit
-+		 * into int
-+		 */
-+		if (p1->pos < p2->pos)
-+			result = LESS_THAN;
-+		else if (p1->pos > p2->pos)
-+			result = GREATER_THAN;
-+	}
-+	return result;
-+}
-+
-+/* see comment before reiser4_readdir_common() for overview of why "adjustment" is
-+ * necessary. */
-+static void
-+adjust_dir_pos(struct file *dir,
-+	       readdir_pos * readdir_spot, const dir_pos * mod_point, int adj)
-+{
-+	dir_pos *pos;
-+
-+	/*
-+	 * new directory entry was added (adj == +1) or removed (adj == -1) at
-+	 * the @mod_point. Directory file descriptor @dir is doing readdir and
-+	 * is currently positioned at @readdir_spot. Latter has to be updated
-+	 * to maintain stable readdir.
-+	 */
-+	/* directory is positioned to the beginning. */
-+	if (readdir_spot->entry_no == 0)
-+		return;
-+
-+	pos = &readdir_spot->position;
-+	switch (dir_pos_cmp(mod_point, pos)) {
-+	case LESS_THAN:
-+		/* @mod_point is _before_ @readdir_spot, that is, entry was
-+		 * added/removed on the left (in key order) of current
-+		 * position. */
-+		/* logical number of directory entry readdir is "looking" at
-+		 * changes */
-+		readdir_spot->entry_no += adj;
-+		assert("nikita-2577",
-+		       ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
-+		if (de_id_cmp(&pos->dir_entry_key,
-+			      &mod_point->dir_entry_key) == EQUAL_TO) {
-+			assert("nikita-2575", mod_point->pos < pos->pos);
-+			/*
-+			 * if entry added/removed has the same key as current
-+			 * for readdir, update counter of duplicate keys in
-+			 * @readdir_spot.
-+			 */
-+			pos->pos += adj;
-+		}
-+		break;
-+	case GREATER_THAN:
-+		/* directory is modified after @pos: nothing to do. */
-+		break;
-+	case EQUAL_TO:
-+		/* cannot insert an entry readdir is looking at, because it
-+		   already exists. */
-+		assert("nikita-2576", adj < 0);
-+		/* directory entry to which @pos points to is being
-+		   removed.
-+
-+		   NOTE-NIKITA: Right thing to do is to update @pos to point
-+		   to the next entry. This is complex (we are under spin-lock
-+		   for one thing). Just rewind it to the beginning. Next
-+		   readdir will have to scan the beginning of
-+		   directory. Proper solution is to use semaphore in
-+		   spin lock's stead and use rewind_right() here.
-+
-+		   NOTE-NIKITA: now, semaphore is used, so...
-+
-+		   NOTE(review): the only caller visible here,
-+		   reiser4_adjust_dir_file(), still calls this function
-+		   between spin_lock_inode() and spin_unlock_inode().
-+		 */
-+		memset(readdir_spot, 0, sizeof *readdir_spot);
-+		break;
-+	}
-+}
-+
-+/* scan all file-descriptors for this directory and adjust their
-+   positions respectively. Should be used by implementations of
-+   add_entry and rem_entry of dir plugin
-+
-+   @dir: directory that was modified
-+   @de: dentry whose name was added or removed
-+   @offset: stored as ->pos of the modification point
-+   @adj: must be non-zero; passed through to adjust_dir_pos() */
-+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
-+			     int offset, int adj)
-+{
-+	reiser4_file_fsdata *scan;
-+	dir_pos mod_point;
-+
-+	assert("nikita-2536", dir != NULL);
-+	assert("nikita-2538", de != NULL);
-+	assert("nikita-2539", adj != 0);
-+
-+	build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
-+	mod_point.pos = offset;
-+
-+	spin_lock_inode(dir);
-+
-+	/*
-+	 * new entry was added/removed in directory @dir. Scan all file
-+	 * descriptors for @dir that are currently involved into @readdir and
-+	 * update them.
-+	 */
-+
-+	list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
-+		adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
-+
-+	spin_unlock_inode(dir);
-+}
-+
-+/*
-+ * traverse tree to start/continue readdir from the readdir position @pos.
-+ *
-+ * Any lookup result other than CBK_COORD_FOUND is reported as -EIO, after
-+ * the coord's node is cleared and the lock handle is released.
-+ */
-+static int dir_go_to(struct file *dir, readdir_pos * pos, tap_t * tap)
-+{
-+	reiser4_key key;
-+	int result;
-+	struct inode *inode;
-+
-+	assert("nikita-2554", pos != NULL);
-+
-+	inode = dir->f_dentry->d_inode;
-+	result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
-+	if (result != 0)
-+		return result;
-+	result = reiser4_object_lookup(inode,
-+				       &key,
-+				       tap->coord,
-+				       tap->lh,
-+				       tap->mode,
-+				       FIND_EXACT,
-+				       LEAF_LEVEL, LEAF_LEVEL,
-+				       0, &tap->ra_info);
-+	if (result == CBK_COORD_FOUND)
-+		/* skip over entries with the same key that precede @pos */
-+		result = rewind_right(tap, (int)pos->position.pos);
-+	else {
-+		tap->coord->node = NULL;
-+		done_lh(tap->lh);
-+		result = RETERR(-EIO);
-+	}
-+	return result;
-+}
-+
-+/*
-+ * handling of non-unique keys: calculate at what ordinal position within
-+ * sequence of directory items with identical keys @pos is.
-+ */
-+static int set_pos(struct inode *inode, readdir_pos * pos, tap_t * tap)
-+{
-+	int result;
-+	coord_t coord;
-+	lock_handle lh;
-+	tap_t scan;
-+	de_id *did;
-+	reiser4_key de_key;
-+
-+	/* work on a private copy of @tap, so that @tap itself is not moved */
-+	coord_init_zero(&coord);
-+	init_lh(&lh);
-+	reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
-+	reiser4_tap_copy(&scan, tap);
-+	result = reiser4_tap_load(&scan);
-+	pos->position.pos = 0;
-+	/*
-+	 * do not look at the coord if the load failed. Cleanup is the same
-+	 * relse/done pair that reiser4_readdir_common() uses after a failed
-+	 * reiser4_tap_load().
-+	 */
-+	if (result != 0)
-+		goto out;
-+
-+	did = &pos->position.dir_entry_key;
-+
-+	if (is_valid_dir_coord(inode, scan.coord)) {
-+
-+		build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
-+
-+		/* walk to the left, counting entries that share the key */
-+		while (1) {
-+
-+			result = go_prev_unit(&scan);
-+			if (result != 0)
-+				break;
-+
-+			if (!is_valid_dir_coord(inode, scan.coord)) {
-+				result = RETERR(-EINVAL);
-+				break;
-+			}
-+
-+			/* get key of directory entry */
-+			unit_key_by_coord(scan.coord, &de_key);
-+			if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
-+				/* duplicate-sequence is over */
-+				break;
-+			}
-+			pos->position.pos++;
-+		}
-+	} else
-+		result = RETERR(-ENOENT);
-+      out:
-+	reiser4_tap_relse(&scan);
-+	reiser4_tap_done(&scan);
-+	return result;
-+}
-+
-+/*
-+ * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
-+ */
-+static int dir_rewind(struct file *dir, readdir_pos * pos, tap_t * tap)
-+{
-+	__u64 destination;
-+	__s64 shift;
-+	int result;
-+	struct inode *inode;
-+	loff_t dirpos;
-+
-+	assert("nikita-2553", dir != NULL);
-+	assert("nikita-2548", pos != NULL);
-+	assert("nikita-2551", tap->coord != NULL);
-+	assert("nikita-2552", tap->lh != NULL);
-+
-+	dirpos = reiser4_get_dir_fpos(dir);
-+	/* signed distance from the file position recorded in @pos */
-+	shift = dirpos - pos->fpos;
-+	/* this is logical directory entry within @dir which we are rewinding
-+	 * to */
-+	destination = pos->entry_no + shift;
-+
-+	inode = dir->f_dentry->d_inode;
-+	if (dirpos < 0)
-+		return RETERR(-EINVAL);
-+	else if (destination == 0ll || dirpos == 0) {
-+		/* rewind to the beginning of directory */
-+		memset(pos, 0, sizeof *pos);
-+		return dir_go_to(dir, pos, tap);
-+	} else if (destination >= inode->i_size)
-+		return RETERR(-ENOENT);
-+
-+	if (shift < 0) {
-+		/* I am afraid of negative numbers */
-+		shift = -shift;
-+		/* rewinding to the left */
-+		if (shift <= (int)pos->position.pos) {
-+			/* destination is within sequence of entries with
-+			   duplicate keys. */
-+			/* NOTE(review): dir_go_to() positions @tap at the
-+			   current @pos; @shift is not applied on this
-+			   branch - confirm this is intended. */
-+			result = dir_go_to(dir, pos, tap);
-+		} else {
-+			shift -= pos->position.pos;
-+			while (1) {
-+				/* repetitions: deadlock is possible when
-+				   going to the left. */
-+				result = dir_go_to(dir, pos, tap);
-+				if (result == 0) {
-+					result = rewind_left(tap, shift);
-+					if (result == -E_DEADLOCK) {
-+						reiser4_tap_done(tap);
-+						continue;
-+					}
-+				}
-+				break;
-+			}
-+		}
-+	} else {
-+		/* rewinding to the right */
-+		result = dir_go_to(dir, pos, tap);
-+		if (result == 0)
-+			result = rewind_right(tap, shift);
-+	}
-+	if (result == 0) {
-+		/* set_pos() recomputes pos->position from where @tap is */
-+		result = set_pos(inode, pos, tap);
-+		if (result == 0) {
-+			/* record the logical and the file position reached */
-+			pos->entry_no = destination;
-+			pos->fpos = dirpos;
-+		}
-+	}
-+	return result;
-+}
-+
-+/*
-+ * Function that is called by common_readdir() on each directory entry while
-+ * doing readdir. ->filldir callback may block, so we had to release long term
-+ * lock while calling it. To avoid repeating tree traversal, seal is used. If
-+ * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
-+ *
-+ * Whether node is unlocked in case of any other error is undefined. It is
-+ * guaranteed to be still locked if success (0) is returned.
-+ *
-+ * When ->filldir() wants no more, feed_entry() returns 1, and node is
-+ * unlocked.
-+ *
-+ * @pos is not used by this function.
-+ */
-+static int
-+feed_entry(struct file *f,
-+	   readdir_pos * pos, tap_t * tap, filldir_t filldir, void *dirent)
-+{
-+	item_plugin *iplug;
-+	char *name;
-+	size_t name_len;
-+	reiser4_key sd_key;
-+	int result;
-+	char buf[DE_NAME_BUF_LEN];
-+	char name_buf[32];
-+	char *local_name;
-+	unsigned file_type;
-+	seal_t seal;
-+	coord_t *coord;
-+	reiser4_key entry_key;
-+
-+	coord = tap->coord;
-+	iplug = item_plugin_by_coord(coord);
-+
-+	/* pointer to name within the node */
-+	name = iplug->s.dir.extract_name(coord, buf);
-+	assert("nikita-1371", name != NULL);
-+
-+	/* key of object the entry points to */
-+	if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
-+		return RETERR(-EIO);
-+
-+	/* we must release longterm znode lock before calling filldir to avoid
-+	   deadlock which may happen if filldir causes page fault. So, copy
-+	   name to intermediate buffer */
-+	name_len = strlen(name);
-+	if (name_len + 1 > sizeof(name_buf)) {
-+		local_name = kmalloc(name_len + 1,
-+				     reiser4_ctx_gfp_mask_get());
-+		if (local_name == NULL)
-+			return RETERR(-ENOMEM);
-+	} else
-+		local_name = name_buf;
-+
-+	memcpy(local_name, name, name_len + 1);
-+	file_type = iplug->s.dir.extract_file_type(coord);
-+
-+	unit_key_by_coord(coord, &entry_key);
-+	reiser4_seal_init(&seal, coord, &entry_key);
-+
-+	longterm_unlock_znode(tap->lh);
-+
-+	/*
-+	 * send information about directory entry to the ->filldir() filler
-+	 * supplied to us by caller (VFS).
-+	 *
-+	 * ->filldir is entitled to do weird things. For example, ->filldir
-+	 * supplied by knfsd re-enters file system. Make sure no locks are
-+	 * held.
-+	 */
-+	assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
-+
-+	reiser4_txn_restart_current();
-+	/*
-+	 * @name may point into the node that was just unlocked; only the
-+	 * private copy and the length taken under the lock are used from
-+	 * here on
-+	 */
-+	result = filldir(dirent, local_name, (int)name_len,
-+			 /* offset of this entry */
-+			 f->f_pos,
-+			 /* inode number of object bounden by this entry */
-+			 oid_to_uino(get_key_objectid(&sd_key)), file_type);
-+	if (local_name != name_buf)
-+		kfree(local_name);
-+	if (result < 0)
-+		/* ->filldir() is satisfied. (no space in buffer, IOW) */
-+		result = 1;
-+	else
-+		result = reiser4_seal_validate(&seal, coord, &entry_key,
-+					       tap->lh, tap->mode,
-+					       ZNODE_LOCK_HIPRI);
-+	return result;
-+}
-+
-+/* advance @pos by one entry; @coord is the directory entry moved to */
-+static void move_entry(readdir_pos * pos, coord_t * coord)
-+{
-+	reiser4_key de_key;
-+	de_id *did;
-+
-+	/* update @pos */
-+	++pos->entry_no;
-+	did = &pos->position.dir_entry_key;
-+
-+	/* get key of directory entry */
-+	unit_key_by_coord(coord, &de_key);
-+
-+	if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
-+		/* we are within sequence of directory entries
-+		   with duplicate keys. */
-+		++pos->position.pos;
-+	else {
-+		pos->position.pos = 0;
-+		build_de_id_by_key(&de_key, did);
-+	}
-+	++pos->fpos;
-+}
-+
-+/*
-+ * STATELESS READDIR
-+ *
-+ * readdir support in reiser4 relies on ability to update readdir_pos embedded
-+ * into reiser4_file_fsdata on each directory modification (name insertion and
-+ * removal), see reiser4_readdir_common() function below. This obviously doesn't
-+ * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
-+ * across client READDIR requests for the same directory.
-+ *
-+ * To address this we maintain a "pool" of detached reiser4_file_fsdata
-+ * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
-+ * find detached reiser4_file_fsdata corresponding to previous readdir
-+ * request. In other words, additional state is maintained on the
-+ * server. (This is somewhat contrary to the design goals of NFS protocol.)
-+ *
-+ * To efficiently detect when our ->readdir() method is called by NFS server,
-+ * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
-+ * file_is_stateless() function).
-+ *
-+ * To find out d_cursor in the pool, we encode client id (cid) in the highest
-+ * bits of NFS readdir cookie: when first readdir request comes to the given
-+ * directory from the given client, cookie is set to 0. This situation is
-+ * detected, global cid_counter is incremented, and stored in highest bits of
-+ * all direntry offsets returned to the client, including last one. As the
-+ * only valid readdir cookie is one obtained as direntry->offset, we are
-+ * guaranteed that next readdir request (continuing current one) will have
-+ * current cid in the highest bits of starting readdir cookie. All d_cursors
-+ * are hashed into per-super-block hash table by (oid, cid) key.
-+ *
-+ * In addition d_cursors are placed into per-super-block radix tree where they
-+ * are keyed by oid alone. This is necessary to efficiently remove them during
-+ * rmdir.
-+ *
-+ * At last, currently unused d_cursors are linked into special list. This list
-+ * is used by d_cursor_shrink to reclaim d_cursors on memory pressure.
-+ *
-+ */
-+
-+/*
-+ * prepare for readdir.
-+ *
-+ * Attaches reiser4 file data to @f, links it into the readdir list of the
-+ * directory inode if it is not linked yet, stores a pointer to its
-+ * readdir_pos in *@pos and rewinds @tap to that position. The callers in
-+ * this file call reiser4_detach_fsdata() afterwards, whatever the result.
-+ */
-+static int dir_readdir_init(struct file *f, tap_t * tap, readdir_pos ** pos)
-+{
-+	struct inode *inode;
-+	reiser4_file_fsdata *fsdata;
-+	int result;
-+
-+	assert("nikita-1359", f != NULL);
-+	inode = f->f_dentry->d_inode;
-+	assert("nikita-1360", inode != NULL);
-+
-+	if (!S_ISDIR(inode->i_mode))
-+		return RETERR(-ENOTDIR);
-+
-+	/* try to find detached readdir state */
-+	result = reiser4_attach_fsdata(f, inode);
-+	if (result != 0)
-+		return result;
-+
-+	fsdata = reiser4_get_file_fsdata(f);
-+	assert("nikita-2571", fsdata != NULL);
-+	if (IS_ERR(fsdata))
-+		return PTR_ERR(fsdata);
-+
-+	/* add file descriptor to the readdir list hanging of directory
-+	 * inode. This list is used to scan "readdirs-in-progress" while
-+	 * inserting or removing names in the directory. */
-+	spin_lock_inode(inode);
-+	if (list_empty_careful(&fsdata->dir.linkage))
-+		list_add(&fsdata->dir.linkage, get_readdir_list(inode));
-+	*pos = &fsdata->dir.readdir;
-+	spin_unlock_inode(inode);
-+
-+	/* move @tap to the current position */
-+	return dir_rewind(f, *pos, tap);
-+}
-+
-+/* this is implementation of vfs's llseek method of struct file_operations for
-+   typical directory
-+   See comment before reiser4_readdir_common() for explanation.
-+
-+   Returns the new position reported by default_llseek(), or a negative error
-+   from default_llseek() or from dir_readdir_init().
-+*/
-+loff_t reiser4_llseek_dir_common(struct file * file, loff_t off, int origin)
-+{
-+	reiser4_context *ctx;
-+	loff_t result;
-+	struct inode *inode;
-+
-+	inode = file->f_dentry->d_inode;
-+
-+	ctx = reiser4_init_context(inode->i_sb);
-+	if (IS_ERR(ctx))
-+		return PTR_ERR(ctx);
-+
-+	mutex_lock(&inode->i_mutex);
-+
-+	/* update ->f_pos */
-+	result = default_llseek(file, off, origin);
-+	if (result >= 0) {
-+		int ff;
-+		coord_t coord;
-+		lock_handle lh;
-+		tap_t tap;
-+		readdir_pos *pos;
-+
-+		coord_init_zero(&coord);
-+		init_lh(&lh);
-+		reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
-+
-+		/* bring readdir state in sync with the new ->f_pos */
-+		ff = dir_readdir_init(file, &tap, &pos);
-+		if (ff != 0)
-+			result = (loff_t) ff;
-+		reiser4_tap_done(&tap);
-+	}
-+	/*
-+	 * single detach for all paths, after the tap is done, in the same
-+	 * order as in reiser4_readdir_common()
-+	 */
-+	reiser4_detach_fsdata(file);
-+	mutex_unlock(&inode->i_mutex);
-+
-+	reiser4_exit_context(ctx);
-+	return result;
-+}
-+
-+/* this is common implementation of vfs's readdir method of struct
-+   file_operations
-+
-+   readdir problems:
-+
-+   readdir(2)/getdents(2) interface is based on implicit assumption that
-+   readdir can be restarted from any particular point by supplying file system
-+   with off_t-full of data. That is, file system fills ->d_off field in struct
-+   dirent and later user passes ->d_off to the seekdir(3), which is, actually,
-+   implemented by glibc as lseek(2) on directory.
-+ -+ Reiser4 cannot restart readdir from 64 bits of data, because two last -+ components of the key of directory entry are unknown, which gives 128 bits: -+ locality and type fields in the key of directory entry are always known, to -+ start readdir() from given point objectid and offset fields have to be -+ filled. -+ -+ Traditional UNIX API for scanning through directory -+ (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on the -+ assumption that directory is structured very much like regular file, in -+ particular, it is implied that each name within given directory (directory -+ entry) can be uniquely identified by scalar offset and that such offset is -+ stable across the life-time of the name it identifies. -+ -+ This is manifestly not so for reiser4. In reiser4 the only stable unique -+ identifier for the directory entry is its key that doesn't fit into -+ seekdir/telldir API. -+ -+ solution: -+ -+ Within each file descriptor participating in readdir-ing of directory -+ plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of -+ the "current" directory entry that file descriptor looks at. It contains a -+ key of directory entry (plus some additional info to deal with non-unique -+ keys that we won't dwell on here) and a logical position of this -+ directory entry starting from the beginning of the directory, that is -+ ordinal number of this entry in the readdir order. -+ -+ Obviously this logical position is not stable in the face of directory -+ modifications. To work around this, on each addition or removal of directory -+ entry all file descriptors for directory inode are scanned and their -+ readdir_pos are updated accordingly (adjust_dir_pos()).
-+*/
-+int reiser4_readdir_common(struct file *f /* directory file being read */,
-+			   void *dirent /* opaque data passed to us by VFS */,
-+			   filldir_t filld /* filler function passed to us
-+					    * by VFS */)
-+{
-+	reiser4_context *ctx;
-+	int result;
-+	struct inode *inode;
-+	coord_t coord;
-+	lock_handle lh;
-+	tap_t tap;
-+	readdir_pos *pos;
-+
-+	assert("nikita-1359", f != NULL);
-+	inode = f->f_dentry->d_inode;
-+	assert("nikita-1360", inode != NULL);
-+
-+	if (!S_ISDIR(inode->i_mode))
-+		return RETERR(-ENOTDIR);
-+
-+	ctx = reiser4_init_context(inode->i_sb);
-+	if (IS_ERR(ctx))
-+		return PTR_ERR(ctx);
-+
-+	coord_init_zero(&coord);
-+	init_lh(&lh);
-+	reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
-+
-+	reiser4_readdir_readahead_init(inode, &tap);
-+
-+      repeat:
-+	result = dir_readdir_init(f, &tap, &pos);
-+	if (result == 0) {
-+		result = reiser4_tap_load(&tap);
-+		/* scan entries one by one feeding them to @filld */
-+		while (result == 0) {
-+			coord_t *coord;
-+
-+			coord = tap.coord;
-+			assert("nikita-2572", coord_is_existing_unit(coord));
-+			assert("nikita-3227", is_valid_dir_coord(inode, coord));
-+
-+			result = feed_entry(f, pos, &tap, filld, dirent);
-+			if (result > 0) {
-+				/* @filld wants no more entries */
-+				break;
-+			} else if (result == 0) {
-+				++f->f_pos;
-+				result = go_next_unit(&tap);
-+				if (result == -E_NO_NEIGHBOR ||
-+				    result == -ENOENT) {
-+					/* end of directory */
-+					result = 0;
-+					break;
-+				} else if (result == 0) {
-+					if (is_valid_dir_coord(inode, coord))
-+						move_entry(pos, coord);
-+					else
-+						break;
-+				}
-+			} else if (result == -E_REPEAT) {
-+				/* feed_entry() had to restart. */
-+				++f->f_pos;
-+				reiser4_tap_relse(&tap);
-+				goto repeat;
-+			} else
-+				warning("vs-1617",
-+					"reiser4_readdir_common: unexpected error %d",
-+					result);
-+		}
-+		reiser4_tap_relse(&tap);
-+
-+		if (result >= 0)
-+			f->f_version = inode->i_version;
-+	} else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
-+		result = 0;
-+	reiser4_tap_done(&tap);
-+	reiser4_detach_fsdata(f);
-+
-+	/* try to update directory's atime */
-+	if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
-+				     BA_CAN_COMMIT) != 0)
-+		/* cast as the other %llu warnings do: the argument must match
-+		 * the format exactly */
-+		warning("", "failed to update atime on readdir: %llu",
-+			(unsigned long long)get_inode_oid(inode));
-+	else
-+		file_accessed(f);
-+
-+	context_set_commit_async(ctx);
-+	reiser4_exit_context(ctx);
-+
-+	/* positive result only means that @filld was satisfied */
-+	return (result <= 0) ? result : 0;
-+}
-+
-+/*
-+ * Local variables:
-+ * c-indentation-style: "K&R"
-+ * mode-name: "LC"
-+ * c-basic-offset: 8
-+ * tab-width: 8
-+ * fill-column: 79
-+ * End:
-+ */
-diff -urN linux-2.6.20.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.20/fs/reiser4/plugin/file_plugin_common.c
---- linux-2.6.20.orig/fs/reiser4/plugin/file_plugin_common.c	1970-01-01 03:00:00.000000000 +0300
-+++ linux-2.6.20/fs/reiser4/plugin/file_plugin_common.c	2007-05-06 14:50:43.791004471 +0400
-@@ -0,0 +1,1007 @@
-+/* Copyright 2005 by Hans Reiser, licensing governed by
-+   reiser4/README */
-+
-+/* this file contains typical implementations for most of methods of
-+   file plugin
-+*/
-+
-+#include "../inode.h"
-+#include "object.h"
-+#include "../safe_link.h"
-+
-+#include
-+
-+static int insert_new_sd(struct inode *inode);
-+static int update_sd(struct inode *inode);
-+
-+/* this is common implementation of write_sd_by_inode method of file plugin
-+   either insert stat data or update it
-+ */
-+int write_sd_by_inode_common(struct inode *inode /* object to save */ )
-+{
-+	int result;
-+
-+	assert("nikita-730", inode != NULL);
-+
-+	if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
-+		/* object doesn't have stat-data yet */
-+		result = insert_new_sd(inode);
-+	else
-+		result = update_sd(inode);
-+	if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
-+		/* Don't issue warnings about "name is too long" and about
-+		 * -ENOMEM */
-+		warning("nikita-2221", "Failed to save sd for %llu: %i",
-+			(unsigned long long)get_inode_oid(inode), result);
-+	return result;
-+}
-+
-+/* this is common implementation of key_by_inode method of file plugin
-+
-+   Builds in @key the key of the body of @inode at offset @off. Always
-+   returns 0.
-+ */
-+int
-+key_by_inode_and_offset_common(struct inode *inode, loff_t off,
-+			       reiser4_key * key)
-+{
-+	reiser4_key_init(key);
-+	set_key_locality(key, reiser4_inode_data(inode)->locality_id);
-+	set_key_ordering(key, get_inode_ordering(inode));
-+	set_key_objectid(key, get_inode_oid(inode));	/*FIXME: inode->i_ino */
-+	set_key_type(key, KEY_BODY_MINOR);
-+	set_key_offset(key, (__u64) off);
-+	return 0;
-+}
-+
-+/* this is common implementation of set_plug_in_inode method of file plugin
-+
-+   Initializes mode, owner, group, times, link count and stat-data extension
-+   mask of the new @object and marks it REISER4_NO_SD. Always returns 0.
-+ */
-+int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
-+			     struct inode *parent /* parent object */ ,
-+			     reiser4_object_create_data * data	/* creational
-+								 * data */ )
-+{
-+	__u64 mask;
-+
-+	object->i_mode = data->mode;
-+	/* this should be plugin decision */
-+	object->i_uid = current->fsuid;
-+	object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
-+
-+	/* support for BSD style group-id assignment. See mount's manual page
-+	   description of bsdgroups ext2 mount options for more details */
-+	if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
-+		object->i_gid = parent->i_gid;
-+	else if (parent->i_mode & S_ISGID) {
-+		/* parent directory has sguid bit */
-+		object->i_gid = parent->i_gid;
-+		if (S_ISDIR(object->i_mode))
-+			/* sguid is inherited by sub-directories */
-+			object->i_mode |= S_ISGID;
-+	} else
-+		object->i_gid = current->fsgid;
-+
-+	/* this object doesn't have stat-data yet */
-+	reiser4_inode_set_flag(object, REISER4_NO_SD);
-+#if 0
-+	/* this is now called after all inode plugins are initialized:
-+	   do_create_vfs_child after adjust_to_parent */
-+	/* setup inode and file-operations for this inode */
-+	setup_inode_ops(object, data);
-+#endif
-+	object->i_nlink = 0;
-+	reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
-+	mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
-+	if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
-+		mask |= (1 << LARGE_TIMES_STAT);
-+
-+	reiser4_inode_data(object)->extmask = mask;
-+	return 0;
-+}
-+
-+/* this is common implementation of adjust_to_parent method of file plugin for
-+   regular files
-+
-+   @root is used in place of @parent when @parent is NULL. Always returns 0.
-+
-+   NOTE(review): results of grab_plugin_pset() are ignored here, while
-+   adjust_to_parent_common_dir() stops at the first failure - confirm that
-+   this is deliberate.
-+ */
-+int adjust_to_parent_common(struct inode *object /* new object */ ,
-+			    struct inode *parent /* parent directory */ ,
-+			    struct inode *root /* root directory */ )
-+{
-+	assert("nikita-2165", object != NULL);
-+	if (parent == NULL)
-+		parent = root;
-+	assert("nikita-2069", parent != NULL);
-+
-+	/*
-+	 * inherit missing plugins from parent
-+	 */
-+
-+	grab_plugin_pset(object, parent, PSET_FILE);
-+	grab_plugin_pset(object, parent, PSET_SD);
-+	grab_plugin_pset(object, parent, PSET_FORMATTING);
-+	grab_plugin_pset(object, parent, PSET_PERM);
-+	return 0;
-+}
-+
-+/* this is common implementation of adjust_to_parent method of file plugin for
-+   typical directories
-+
-+   @root is used in place of @parent when @parent is NULL. Returns 0, or the
-+   first non-zero result of grab_plugin_pset().
-+ */
-+int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
-+				struct inode *parent /* parent directory */ ,
-+				struct inode *root /* root directory */ )
-+{
-+	int result = 0;
-+	pset_member memb;
-+
-+	assert("nikita-2166", object != NULL);
-+	if (parent == NULL)
-+		parent = root;
-+	assert("nikita-2167", parent != NULL);
-+
-+	/*
-+	 * inherit missing plugins from parent
-+	 */
-+	for (memb = 0; memb < PSET_LAST; ++memb) {
-+		result = grab_plugin_pset(object, parent, memb);
-+		if (result != 0)
-+			break;
-+	}
-+	return result;
-+}
-+
-+/* adjust_to_parent for cryptcompress files: everything that
-+   adjust_to_parent_common() does, plus the cluster, cipher, digest,
-+   compression and compression mode members of the plugin set.
-+
-+   @root is used in place of @parent when @parent is NULL.
-+ */
-+int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
-+				   struct inode *parent /* parent directory */,
-+				   struct inode *root /* root directory */)
-+{
-+	int result;
-+	result = adjust_to_parent_common(object, parent, root);
-+	if (result)
-+		return result;
-+	/*
-+	 * adjust_to_parent_common() substituted @root only in its own copy
-+	 * of @parent: repeat the fallback before @parent is used here
-+	 */
-+	if (parent == NULL)
-+		parent = root;
-+	assert("edward-1416", parent != NULL);
-+
-+	grab_plugin_pset(object, parent, PSET_CLUSTER);
-+	grab_plugin_pset(object, parent, PSET_CIPHER);
-+	grab_plugin_pset(object, parent, PSET_DIGEST);
-+	grab_plugin_pset(object, parent, PSET_COMPRESSION);
-+	grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
-+
-+	return 0;
-+}
-+
-+/* this is common implementation of create_object method of file plugin
-+
-+   Grabs the space estimated by estimate_create_common() and writes the
-+   stat data of @object. Returns -ENOSPC if the space cannot be grabbed,
-+   otherwise the result of write_sd_by_inode_common().
-+ */
-+int reiser4_create_object_common(struct inode *object, struct inode *parent,
-+				 reiser4_object_create_data * data)
-+{
-+	reiser4_block_nr reserve;
-+	assert("nikita-744", object != NULL);
-+	assert("nikita-745", parent != NULL);
-+	assert("nikita-747", data != NULL);
-+	assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
-+
-+	reserve = estimate_create_common(object);
-+	if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
-+		return RETERR(-ENOSPC);
-+	return write_sd_by_inode_common(object);
-+}
-+
-+static int common_object_delete_no_reserve(struct inode *inode);
-+
-+/**
-+ * reiser4_delete_object_common - delete_object of file_plugin
-+ * @inode: inode to be deleted
-+ *
-+ * This is common implementation of delete_object method of file_plugin. It
-+ * applies to object its deletion consists of removing two items - stat data
-+ * and safe-link.
-+ */ -+int reiser4_delete_object_common(struct inode *inode) -+{ -+ int result; -+ -+ assert("nikita-1477", inode != NULL); -+ /* FIXME: if file body deletion failed (i/o error, for instance), -+ inode->i_size can be != 0 here */ -+ assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode)); -+ assert("nikita-3421", inode->i_nlink == 0); -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) { -+ reiser4_block_nr reserve; -+ -+ /* grab space which is needed to remove 2 items from the tree: -+ stat data and safe-link */ -+ reserve = 2 * -+ estimate_one_item_removal(reiser4_tree_by_inode(inode)); -+ if (reiser4_grab_space_force(reserve, -+ BA_RESERVED | BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ result = common_object_delete_no_reserve(inode); -+ } else -+ result = 0; -+ return result; -+} -+ -+/** -+ * reiser4_delete_dir_common - delete_object of file_plugin -+ * @inode: inode to be deleted -+ * -+ * This is common implementation of delete_object method of file_plugin for -+ * typical directory. It calls done method of dir_plugin to remove "." and -+ * removes stat data and safe-link. 
-+ */ -+int reiser4_delete_dir_common(struct inode *inode) -+{ -+ int result; -+ dir_plugin *dplug; -+ -+ assert("", (get_current_context() && -+ get_current_context()->trans->atom == NULL)); -+ -+ dplug = inode_dir_plugin(inode); -+ assert("vs-1101", dplug && dplug->done); -+ -+ /* kill cursors which might be attached to inode */ -+ reiser4_kill_cursors(inode); -+ -+ /* grab space enough for removing two items */ -+ if (reiser4_grab_space -+ (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)), -+ BA_RESERVED | BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ -+ result = dplug->done(inode); -+ if (!result) -+ result = common_object_delete_no_reserve(inode); -+ return result; -+} -+ -+/* this is common implementation of add_link method of file plugin -+ */ -+int reiser4_add_link_common(struct inode *object, struct inode *parent) -+{ -+ /* -+ * increment ->i_nlink and update ->i_ctime -+ */ -+ -+ INODE_INC_FIELD(object, i_nlink); -+ object->i_ctime = CURRENT_TIME; -+ return 0; -+} -+ -+/* this is common implementation of rem_link method of file plugin -+ */ -+int reiser4_rem_link_common(struct inode *object, struct inode *parent) -+{ -+ assert("nikita-2021", object != NULL); -+ assert("nikita-2163", object->i_nlink > 0); -+ -+ /* -+ * decrement ->i_nlink and update ->i_ctime -+ */ -+ -+ INODE_DEC_FIELD(object, i_nlink); -+ object->i_ctime = CURRENT_TIME; -+ return 0; -+} -+ -+/* this is common implementation of rem_link method of file plugin for typical -+ directory -+*/ -+int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG) -+{ -+ assert("nikita-20211", object != NULL); -+ assert("nikita-21631", object->i_nlink > 0); -+ -+ /* -+ * decrement ->i_nlink and update ->i_ctime -+ */ -+ INODE_DEC_FIELD(object, i_nlink); -+ if (object->i_nlink == 1) -+ INODE_DEC_FIELD(object, i_nlink); -+ object->i_ctime = CURRENT_TIME; -+ return 0; -+} -+ -+/* this is common implementation of owns_item method of file plugin -+ compare objectids of keys in 
inode and coord */ -+int owns_item_common(const struct inode *inode, /* object to check -+ * against */ -+ const coord_t * coord /* coord to check */ ) -+{ -+ reiser4_key item_key; -+ reiser4_key file_key; -+ -+ assert("nikita-760", inode != NULL); -+ assert("nikita-761", coord != NULL); -+ -+ return coord_is_existing_item(coord) && -+ (get_key_objectid(build_sd_key(inode, &file_key)) == -+ get_key_objectid(item_key_by_coord(coord, &item_key))); -+} -+ -+/* this is common implementation of owns_item method of file plugin -+ for typical directory -+*/ -+int owns_item_common_dir(const struct inode *inode, /* object to check against */ -+ const coord_t * coord /* coord of item to check */ ) -+{ -+ reiser4_key item_key; -+ -+ assert("nikita-1335", inode != NULL); -+ assert("nikita-1334", coord != NULL); -+ -+ if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE)) -+ return get_key_locality(item_key_by_coord(coord, &item_key)) == -+ get_inode_oid(inode); -+ else -+ return owns_item_common(inode, coord); -+} -+ -+/* this is common implementation of can_add_link method of file plugin -+ checks whether yet another hard links to this object can be added -+*/ -+int can_add_link_common(const struct inode *object /* object to check */ ) -+{ -+ assert("nikita-732", object != NULL); -+ -+ /* inode->i_nlink is unsigned int, so just check for integer -+ overflow */ -+ return object->i_nlink + 1 != 0; -+} -+ -+/* this is common implementation of can_rem_link method of file plugin for -+ typical directory -+*/ -+int can_rem_link_common_dir(const struct inode *inode) -+{ -+ /* is_dir_empty() returns 0 is dir is empty */ -+ return !is_dir_empty(inode); -+} -+ -+/* this is common implementation of detach method of file plugin for typical -+ directory -+*/ -+int reiser4_detach_common_dir(struct inode *child, struct inode *parent) -+{ -+ dir_plugin *dplug; -+ -+ dplug = inode_dir_plugin(child); -+ assert("nikita-2883", dplug != NULL); -+ assert("nikita-2884", 
dplug->detach != NULL); -+ return dplug->detach(child, parent); -+} -+ -+/* this is common implementation of bind method of file plugin for typical -+ directory -+*/ -+int reiser4_bind_common_dir(struct inode *child, struct inode *parent) -+{ -+ dir_plugin *dplug; -+ -+ dplug = inode_dir_plugin(child); -+ assert("nikita-2646", dplug != NULL); -+ return dplug->attach(child, parent); -+} -+ -+static int process_truncate(struct inode *, __u64 size); -+ -+/* this is common implementation of safelink method of file plugin -+ */ -+int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value) -+{ -+ int result; -+ -+ assert("vs-1705", get_current_context()->trans->atom == NULL); -+ if (link == SAFE_UNLINK) -+ /* nothing to do. iput() in the caller (process_safelink) will -+ * finish with file */ -+ result = 0; -+ else if (link == SAFE_TRUNCATE) -+ result = process_truncate(object, value); -+ else { -+ warning("nikita-3438", "Unrecognized safe-link type: %i", link); -+ result = RETERR(-EIO); -+ } -+ return result; -+} -+ -+/* this is common implementation of estimate.create method of file plugin -+ can be used when object creation involves insertion of one item (usually stat -+ data) into tree -+*/ -+reiser4_block_nr estimate_create_common(const struct inode * object) -+{ -+ return estimate_one_insert_item(reiser4_tree_by_inode(object)); -+} -+ -+/* this is common implementation of estimate.create method of file plugin for -+ typical directory -+ can be used when directory creation involves insertion of two items (usually -+ stat data and item containing "." 
and "..") into tree -+*/ -+reiser4_block_nr estimate_create_common_dir(const struct inode * object) -+{ -+ return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object)); -+} -+ -+/* this is common implementation of estimate.update method of file plugin -+ can be used when stat data update does not do more than inserting a unit -+ into a stat data item which is probably true for most cases -+*/ -+reiser4_block_nr estimate_update_common(const struct inode * inode) -+{ -+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode)); -+} -+ -+/* this is common implementation of estimate.unlink method of file plugin -+ */ -+reiser4_block_nr -+estimate_unlink_common(const struct inode * object UNUSED_ARG, -+ const struct inode * parent UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* this is common implementation of estimate.unlink method of file plugin for -+ typical directory -+*/ -+reiser4_block_nr -+estimate_unlink_common_dir(const struct inode * object, -+ const struct inode * parent) -+{ -+ dir_plugin *dplug; -+ -+ dplug = inode_dir_plugin(object); -+ assert("nikita-2888", dplug != NULL); -+ assert("nikita-2887", dplug->estimate.unlink != NULL); -+ return dplug->estimate.unlink(object, parent); -+} -+ -+char *wire_write_common(struct inode *inode, char *start) -+{ -+ return build_inode_onwire(inode, start); -+} -+ -+char *wire_read_common(char *addr, reiser4_object_on_wire * obj) -+{ -+ return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id); -+} -+ -+struct dentry *wire_get_common(struct super_block *sb, -+ reiser4_object_on_wire * obj) -+{ -+ struct inode *inode; -+ struct dentry *dentry; -+ reiser4_key key; -+ -+ extract_key_from_id(&obj->u.std.key_id, &key); -+ inode = reiser4_iget(sb, &key, 1); -+ if (!IS_ERR(inode)) { -+ reiser4_iget_complete(inode); -+ dentry = d_alloc_anon(inode); -+ if (dentry == NULL) { -+ iput(inode); -+ dentry = ERR_PTR(-ENOMEM); -+ } else -+ dentry->d_op = &get_super_private(sb)->ops.dentry; -+ } else if (PTR_ERR(inode) == 
-ENOENT) -+ /* -+ * inode wasn't found at the key encoded in the file -+ * handle. Hence, file handle is stale. -+ */ -+ dentry = ERR_PTR(RETERR(-ESTALE)); -+ else -+ dentry = (void *)inode; -+ return dentry; -+} -+ -+int wire_size_common(struct inode *inode) -+{ -+ return inode_onwire_size(inode); -+} -+ -+void wire_done_common(reiser4_object_on_wire * obj) -+{ -+ /* nothing to do */ -+} -+ -+/* helper function to print errors */ -+static void key_warning(const reiser4_key * key /* key to print */ , -+ const struct inode *inode, -+ int code /* error code to print */ ) -+{ -+ assert("nikita-716", key != NULL); -+ -+ if (code != -ENOMEM) { -+ warning("nikita-717", "Error for inode %llu (%i)", -+ (unsigned long long)get_key_objectid(key), code); -+ reiser4_print_key("for key", key); -+ } -+} -+ -+/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */ -+#if REISER4_DEBUG -+static void -+check_inode_seal(const struct inode *inode, -+ const coord_t * coord, const reiser4_key * key) -+{ -+ reiser4_key unit_key; -+ -+ unit_key_by_coord(coord, &unit_key); -+ assert("nikita-2752", -+ WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key))); -+ assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key)); -+} -+ -+static void check_sd_coord(coord_t * coord, const reiser4_key * key) -+{ -+ reiser4_key ukey; -+ -+ coord_clear_iplug(coord); -+ if (zload(coord->node)) -+ return; -+ -+ if (!coord_is_existing_unit(coord) || -+ !item_plugin_by_coord(coord) || -+ !keyeq(unit_key_by_coord(coord, &ukey), key) || -+ (znode_get_level(coord->node) != LEAF_LEVEL) || -+ !item_is_statdata(coord)) { -+ warning("nikita-1901", "Conspicuous seal"); -+ reiser4_print_key("key", key); -+ print_coord("coord", coord, 1); -+ impossible("nikita-2877", "no way"); -+ } -+ zrelse(coord->node); -+} -+ -+#else -+#define check_inode_seal(inode, coord, key) noop -+#define check_sd_coord(coord, key) noop -+#endif -+ -+/* insert new stat-data into tree. Called with inode state -+ locked. 
Return inode state locked. */ -+static int insert_new_sd(struct inode *inode /* inode to create sd for */ ) -+{ -+ int result; -+ reiser4_key key; -+ coord_t coord; -+ reiser4_item_data data; -+ char *area; -+ reiser4_inode *ref; -+ lock_handle lh; -+ oid_t oid; -+ -+ assert("nikita-723", inode != NULL); -+ assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ ref = reiser4_inode_data(inode); -+ spin_lock_inode(inode); -+ -+ if (ref->plugin_mask != 0) -+ /* inode has non-standard plugins */ -+ inode_set_extension(inode, PLUGIN_STAT); -+ /* -+ * prepare specification of new item to be inserted -+ */ -+ -+ data.iplug = inode_sd_plugin(inode); -+ data.length = data.iplug->s.sd.save_len(inode); -+ spin_unlock_inode(inode); -+ -+ data.data = NULL; -+ data.user = 0; -+/* could be optimized for case where there is only one node format in -+ * use in the filesystem, probably there are lots of such -+ * places we could optimize for only one node layout.... -Hans */ -+ if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()){ -+ /* This is silly check, but we don't know actual node where -+ insertion will go into. */ -+ return RETERR(-ENAMETOOLONG); -+ } -+ oid = oid_allocate(inode->i_sb); -+/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */ -+ if (oid == ABSOLUTE_MAX_OID) -+ return RETERR(-EOVERFLOW); -+ -+ set_inode_oid(inode, oid); -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ -+ result = insert_by_key(reiser4_tree_by_inode(inode), -+ build_sd_key(inode, &key), &data, &coord, &lh, -+ /* stat data lives on a leaf level */ -+ LEAF_LEVEL, CBK_UNIQUE); -+ -+ /* we don't want to re-check that somebody didn't insert -+ stat-data while we were doing io, because if it did, -+ insert_by_key() returned error. 
*/ -+ /* but what _is_ possible is that plugin for inode's stat-data, -+ list of non-standard plugins or their state would change -+ during io, so that stat-data wouldn't fit into sd. To avoid -+ this race we keep inode_state lock. This lock has to be -+ taken each time you access inode in a way that would cause -+ changes in sd size: changing plugins etc. -+ */ -+ -+ if (result == IBK_INSERT_OK) { -+ coord_clear_iplug(&coord); -+ result = zload(coord.node); -+ if (result == 0) { -+ /* have we really inserted stat data? */ -+ assert("nikita-725", item_is_statdata(&coord)); -+ -+ /* inode was just created. It is inserted into hash -+ table, but no directory entry was yet inserted into -+ parent. So, inode is inaccessible through -+ ->lookup(). All places that directly grab inode -+ from hash-table (like old knfsd), should check -+ IMMUTABLE flag that is set by common_create_child. -+ */ -+ assert("nikita-3240", data.iplug != NULL); -+ assert("nikita-3241", data.iplug->s.sd.save != NULL); -+ area = item_body_by_coord(&coord); -+ result = data.iplug->s.sd.save(inode, &area); -+ znode_make_dirty(coord.node); -+ if (result == 0) { -+ /* object has stat-data now */ -+ reiser4_inode_clr_flag(inode, REISER4_NO_SD); -+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN); -+ /* initialise stat-data seal */ -+ reiser4_seal_init(&ref->sd_seal, &coord, &key); -+ ref->sd_coord = coord; -+ check_inode_seal(inode, &coord, &key); -+ } else if (result != -ENOMEM) -+ /* -+ * convert any other error code to -EIO to -+ * avoid confusing user level with unexpected -+ * errors. 
-+ */ -+ result = RETERR(-EIO); -+ zrelse(coord.node); -+ } -+ } -+ done_lh(&lh); -+ -+ if (result != 0) -+ key_warning(&key, inode, result); -+ else -+ oid_count_allocated(); -+ -+ return result; -+} -+ -+/* find sd of inode in a tree, deal with errors */ -+int lookup_sd(struct inode *inode /* inode to look sd for */ , -+ znode_lock_mode lock_mode /* lock mode */ , -+ coord_t * coord /* resulting coord */ , -+ lock_handle * lh /* resulting lock handle */ , -+ const reiser4_key * key /* resulting key */ , -+ int silent) -+{ -+ int result; -+ __u32 flags; -+ -+ assert("nikita-1692", inode != NULL); -+ assert("nikita-1693", coord != NULL); -+ assert("nikita-1694", key != NULL); -+ -+ /* look for the object's stat data in a tree. -+ This returns in "node" pointer to a locked znode and in "pos" -+ position of an item found in node. Both are only valid if -+ coord_found is returned. */ -+ flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0; -+ flags |= CBK_UNIQUE; -+ /* -+ * traverse tree to find stat data. We cannot use vroot here, because -+ * it only covers _body_ of the file, and stat data don't belong -+ * there. 
-+ */ -+ result = coord_by_key(reiser4_tree_by_inode(inode), -+ key, -+ coord, -+ lh, -+ lock_mode, -+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL); -+ if (REISER4_DEBUG && result == 0) -+ check_sd_coord(coord, key); -+ -+ if (result != 0 && !silent) -+ key_warning(key, inode, result); -+ return result; -+} -+ -+static int -+locate_inode_sd(struct inode *inode, -+ reiser4_key * key, coord_t * coord, lock_handle * lh) -+{ -+ reiser4_inode *state; -+ seal_t seal; -+ int result; -+ -+ assert("nikita-3483", inode != NULL); -+ -+ state = reiser4_inode_data(inode); -+ spin_lock_inode(inode); -+ *coord = state->sd_coord; -+ coord_clear_iplug(coord); -+ seal = state->sd_seal; -+ spin_unlock_inode(inode); -+ -+ build_sd_key(inode, key); -+ if (reiser4_seal_is_set(&seal)) { -+ /* first, try to use seal */ -+ result = reiser4_seal_validate(&seal, -+ coord, -+ key, -+ lh, ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_LOPRI); -+ if (result == 0) -+ check_sd_coord(coord, key); -+ } else -+ result = -E_REPEAT; -+ -+ if (result != 0) { -+ coord_init_zero(coord); -+ result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0); -+ } -+ return result; -+} -+ -+#if REISER4_DEBUG -+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2) -+{ -+ return (get_key_locality(k1) == get_key_locality(k2) && -+ get_key_type(k1) == get_key_type(k2) && -+ get_key_band(k1) == get_key_band(k2) && -+ get_key_ordering(k1) == get_key_ordering(k2) && -+ get_key_objectid(k1) == get_key_objectid(k2)); -+} -+ -+#include "../tree_walk.h" -+ -+/* make some checks before and after stat-data resize operation */ -+static int check_sd_resize(struct inode * inode, coord_t * coord, -+ int length, int progress /* 1 means after resize */) -+{ -+ int ret = 0; -+ lock_handle left_lock; -+ coord_t left_coord; -+ reiser4_key left_key; -+ reiser4_key key; -+ -+ if (inode_file_plugin(inode) != -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) -+ return 0; -+ if (!length) -+ return 0; -+ if 
(coord->item_pos != 0) -+ return 0; -+ -+ init_lh(&left_lock); -+ ret = reiser4_get_left_neighbor(&left_lock, -+ coord->node, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR || -+ ret == -ENOENT || ret == -EINVAL -+ || ret == -E_DEADLOCK) { -+ ret = 0; -+ goto exit; -+ } -+ ret = zload(left_lock.node); -+ if (ret) -+ goto exit; -+ coord_init_last_unit(&left_coord, left_lock.node); -+ item_key_by_coord(&left_coord, &left_key); -+ item_key_by_coord(coord, &key); -+ -+ if (all_but_offset_key_eq(&key, &left_key)) -+ /* corruption occured */ -+ ret = 1; -+ zrelse(left_lock.node); -+ exit: -+ done_lh(&left_lock); -+ return ret; -+} -+#endif -+ -+/* update stat-data at @coord */ -+static int -+update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key, -+ lock_handle * lh) -+{ -+ int result; -+ reiser4_item_data data; -+ char *area; -+ reiser4_inode *state; -+ znode *loaded; -+ -+ state = reiser4_inode_data(inode); -+ -+ coord_clear_iplug(coord); -+ result = zload(coord->node); -+ if (result != 0) -+ return result; -+ loaded = coord->node; -+ -+ spin_lock_inode(inode); -+ assert("nikita-728", inode_sd_plugin(inode) != NULL); -+ data.iplug = inode_sd_plugin(inode); -+ -+ /* if inode has non-standard plugins, add appropriate stat data -+ * extension */ -+ if (state->extmask & (1 << PLUGIN_STAT)) { -+ if (state->plugin_mask == 0) -+ inode_clr_extension(inode, PLUGIN_STAT); -+ } else if (state->plugin_mask != 0) -+ inode_set_extension(inode, PLUGIN_STAT); -+ -+ if (state->extmask & (1 << HEIR_STAT)) { -+ if (state->heir_mask == 0) -+ inode_clr_extension(inode, HEIR_STAT); -+ } else if (state->heir_mask != 0) -+ inode_set_extension(inode, HEIR_STAT); -+ -+ /* data.length is how much space to add to (or remove -+ from if negative) sd */ -+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) { -+ /* recalculate stat-data length */ -+ data.length = -+ data.iplug->s.sd.save_len(inode) - -+ 
item_length_by_coord(coord); -+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN); -+ } else -+ data.length = 0; -+ spin_unlock_inode(inode); -+ -+ /* if on-disk stat data is of different length than required -+ for this inode, resize it */ -+ -+ if (data.length != 0) { -+ data.data = NULL; -+ data.user = 0; -+ -+ assert("edward-1441", -+ !check_sd_resize(inode, coord, -+ data.length, 0/* before resize */)); -+ -+ /* insertion code requires that insertion point (coord) was -+ * between units. */ -+ coord->between = AFTER_UNIT; -+ result = reiser4_resize_item(coord, &data, key, lh, -+ COPI_DONT_SHIFT_LEFT); -+ if (result != 0) { -+ key_warning(key, inode, result); -+ zrelse(loaded); -+ return result; -+ } -+ if (loaded != coord->node) { -+ /* reiser4_resize_item moved coord to another node. -+ Zload it */ -+ zrelse(loaded); -+ coord_clear_iplug(coord); -+ result = zload(coord->node); -+ if (result != 0) -+ return result; -+ loaded = coord->node; -+ } -+ assert("edward-1442", -+ !check_sd_resize(inode, coord, -+ data.length, 1/* after resize */)); -+ } -+ area = item_body_by_coord(coord); -+ spin_lock_inode(inode); -+ result = data.iplug->s.sd.save(inode, &area); -+ znode_make_dirty(coord->node); -+ -+ /* re-initialise stat-data seal */ -+ -+ /* -+ * coord.between was possibly skewed from AT_UNIT when stat-data size -+ * was changed and new extensions were pasted into item. -+ */ -+ coord->between = AT_UNIT; -+ reiser4_seal_init(&state->sd_seal, coord, key); -+ state->sd_coord = *coord; -+ spin_unlock_inode(inode); -+ check_inode_seal(inode, coord, key); -+ zrelse(loaded); -+ return result; -+} -+ -+/* Update existing stat-data in a tree. Called with inode state locked. Return -+ inode state locked. */ -+static int update_sd(struct inode *inode /* inode to update sd for */ ) -+{ -+ int result; -+ reiser4_key key; -+ coord_t coord; -+ lock_handle lh; -+ -+ assert("nikita-726", inode != NULL); -+ -+ /* no stat-data, nothing to update?! 
*/ -+ assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ init_lh(&lh); -+ -+ result = locate_inode_sd(inode, &key, &coord, &lh); -+ if (result == 0) -+ result = update_sd_at(inode, &coord, &key, &lh); -+ done_lh(&lh); -+ -+ return result; -+} -+ -+/* helper for reiser4_delete_object_common and reiser4_delete_dir_common. -+ Remove object stat data. Space for that must be reserved by caller before -+*/ -+static int -+common_object_delete_no_reserve(struct inode *inode /* object to remove */ ) -+{ -+ int result; -+ -+ assert("nikita-1477", inode != NULL); -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) { -+ reiser4_key sd_key; -+ -+ DQUOT_FREE_INODE(inode); -+ DQUOT_DROP(inode); -+ -+ build_sd_key(inode, &sd_key); -+ result = -+ reiser4_cut_tree(reiser4_tree_by_inode(inode), -+ &sd_key, &sd_key, NULL, 0); -+ if (result == 0) { -+ reiser4_inode_set_flag(inode, REISER4_NO_SD); -+ result = oid_release(inode->i_sb, get_inode_oid(inode)); -+ if (result == 0) { -+ oid_count_released(); -+ -+ result = safe_link_del(reiser4_tree_by_inode(inode), -+ get_inode_oid(inode), -+ SAFE_UNLINK); -+ } -+ } -+ } else -+ result = 0; -+ return result; -+} -+ -+/* helper for safelink_common */ -+static int process_truncate(struct inode *inode, __u64 size) -+{ -+ int result; -+ struct iattr attr; -+ file_plugin *fplug; -+ reiser4_context *ctx; -+ struct dentry dentry; -+ -+ assert("vs-21", is_in_reiser4_context()); -+ ctx = reiser4_init_context(inode->i_sb); -+ assert("vs-22", !IS_ERR(ctx)); -+ -+ attr.ia_size = size; -+ attr.ia_valid = ATTR_SIZE | ATTR_CTIME; -+ fplug = inode_file_plugin(inode); -+ -+ mutex_lock(&inode->i_mutex); -+ assert("vs-1704", get_current_context()->trans->atom == NULL); -+ dentry.d_inode = inode; -+ result = inode->i_op->setattr(&dentry, &attr); -+ mutex_unlock(&inode->i_mutex); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ -+ return result; -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ 
mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/hash.c linux-2.6.20/fs/reiser4/plugin/hash.c ---- linux-2.6.20.orig/fs/reiser4/plugin/hash.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/hash.c 2007-05-06 14:50:43.791004471 +0400 -@@ -0,0 +1,353 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Hash functions */ -+ -+#include "../debug.h" -+#include "plugin_header.h" -+#include "plugin.h" -+#include "../super.h" -+#include "../inode.h" -+ -+#include -+ -+/* old rupasov (yura) hash */ -+static __u64 hash_rupasov(const unsigned char *name /* name to hash */ , -+ int len /* @name's length */ ) -+{ -+ int i; -+ int j; -+ int pow; -+ __u64 a; -+ __u64 c; -+ -+ assert("nikita-672", name != NULL); -+ assert("nikita-673", len >= 0); -+ -+ for (pow = 1, i = 1; i < len; ++i) -+ pow = pow * 10; -+ -+ if (len == 1) -+ a = name[0] - 48; -+ else -+ a = (name[0] - 48) * pow; -+ -+ for (i = 1; i < len; ++i) { -+ c = name[i] - 48; -+ for (pow = 1, j = i; j < len - 1; ++j) -+ pow = pow * 10; -+ a = a + c * pow; -+ } -+ for (; i < 40; ++i) { -+ c = '0' - 48; -+ for (pow = 1, j = i; j < len - 1; ++j) -+ pow = pow * 10; -+ a = a + c * pow; -+ } -+ -+ for (; i < 256; ++i) { -+ c = i; -+ for (pow = 1, j = i; j < len - 1; ++j) -+ pow = pow * 10; -+ a = a + c * pow; -+ } -+ -+ a = a << 7; -+ return a; -+} -+ -+/* r5 hash */ -+static __u64 hash_r5(const unsigned char *name /* name to hash */ , -+ int len UNUSED_ARG /* @name's length */ ) -+{ -+ __u64 a = 0; -+ -+ assert("nikita-674", name != NULL); -+ assert("nikita-675", len >= 0); -+ -+ while (*name) { -+ a += *name << 4; -+ a += *name >> 4; -+ a *= 11; -+ name++; -+ } -+ return a; -+} -+ -+/* Keyed 32-bit hash function using TEA in a Davis-Meyer function -+ H0 = Key -+ Hi = E Mi(Hi-1) + Hi-1 -+ -+ (see Applied Cryptography, 2nd edition, p448). 
-+ -+ Jeremy Fitzhardinge 1998 -+ -+ Jeremy has agreed to the contents of reiserfs/README. -Hans -+ -+ This code was blindly upgraded to __u64 by s/__u32/__u64/g. -+*/ -+static __u64 hash_tea(const unsigned char *name /* name to hash */ , -+ int len /* @name's length */ ) -+{ -+ __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u }; -+ -+ __u64 h0 = k[0], h1 = k[1]; -+ __u64 a, b, c, d; -+ __u64 pad; -+ int i; -+ -+ assert("nikita-676", name != NULL); -+ assert("nikita-677", len >= 0); -+ -+#define DELTA 0x9E3779B9u -+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ -+#define PARTROUNDS 6 /* 6 gets complete mixing */ -+ -+/* a, b, c, d - data; h0, h1 - accumulated hash */ -+#define TEACORE(rounds) \ -+ do { \ -+ __u64 sum = 0; \ -+ int n = rounds; \ -+ __u64 b0, b1; \ -+ \ -+ b0 = h0; \ -+ b1 = h1; \ -+ \ -+ do \ -+ { \ -+ sum += DELTA; \ -+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ -+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ -+ } while(--n); \ -+ \ -+ h0 += b0; \ -+ h1 += b1; \ -+ } while(0) -+ -+ pad = (__u64) len | ((__u64) len << 8); -+ pad |= pad << 16; -+ -+ while (len >= 16) { -+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << -+ 16 | (__u64) name[3] << 24; -+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << -+ 16 | (__u64) name[7] << 24; -+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << -+ 16 | (__u64) name[11] << 24; -+ d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14] -+ << 16 | (__u64) name[15] << 24; -+ -+ TEACORE(PARTROUNDS); -+ -+ len -= 16; -+ name += 16; -+ } -+ -+ if (len >= 12) { -+ //assert(len < 16); -+ if (len >= 16) -+ *(int *)0 = 0; -+ -+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << -+ 16 | (__u64) name[3] << 24; -+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << -+ 16 | (__u64) name[7] << 24; -+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << -+ 16 | (__u64) name[11] << 24; -+ -+ d = 
pad; -+ for (i = 12; i < len; i++) { -+ d <<= 8; -+ d |= name[i]; -+ } -+ } else if (len >= 8) { -+ //assert(len < 12); -+ if (len >= 12) -+ *(int *)0 = 0; -+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << -+ 16 | (__u64) name[3] << 24; -+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << -+ 16 | (__u64) name[7] << 24; -+ -+ c = d = pad; -+ for (i = 8; i < len; i++) { -+ c <<= 8; -+ c |= name[i]; -+ } -+ } else if (len >= 4) { -+ //assert(len < 8); -+ if (len >= 8) -+ *(int *)0 = 0; -+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << -+ 16 | (__u64) name[3] << 24; -+ -+ b = c = d = pad; -+ for (i = 4; i < len; i++) { -+ b <<= 8; -+ b |= name[i]; -+ } -+ } else { -+ //assert(len < 4); -+ if (len >= 4) -+ *(int *)0 = 0; -+ a = b = c = d = pad; -+ for (i = 0; i < len; i++) { -+ a <<= 8; -+ a |= name[i]; -+ } -+ } -+ -+ TEACORE(FULLROUNDS); -+ -+/* return 0;*/ -+ return h0 ^ h1; -+ -+} -+ -+/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash. -+ -+ See http://www.isthe.com/chongo/tech/comp/fnv/ for details. -+ -+ Excerpts: -+ -+ FNV hashes are designed to be fast while maintaining a low collision -+ rate. -+ -+ [This version also seems to preserve lexicographical order locally.] -+ -+ FNV hash algorithms and source code have been released into the public -+ domain. 
-+ -+*/ -+static __u64 hash_fnv1(const unsigned char *name /* name to hash */ , -+ int len UNUSED_ARG /* @name's length */ ) -+{ -+ unsigned long long a = 0xcbf29ce484222325ull; -+ const unsigned long long fnv_64_prime = 0x100000001b3ull; -+ -+ assert("nikita-678", name != NULL); -+ assert("nikita-679", len >= 0); -+ -+ /* FNV-1 hash each octet in the buffer */ -+ for (; *name; ++name) { -+ /* multiply by the 32 bit FNV magic prime mod 2^64 */ -+ a *= fnv_64_prime; -+ /* xor the bottom with the current octet */ -+ a ^= (unsigned long long)(*name); -+ } -+ /* return our new hash value */ -+ return a; -+} -+ -+/* degenerate hash function used to simplify testing of non-unique key -+ handling */ -+static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ , -+ int len UNUSED_ARG /* @name's length */ ) -+{ -+ return 0xc0c0c0c010101010ull; -+} -+ -+static int change_hash(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ int result; -+ -+ assert("nikita-3503", inode != NULL); -+ assert("nikita-3504", plugin != NULL); -+ -+ assert("nikita-3505", is_reiser4_inode(inode)); -+ assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE); -+ -+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE)) -+ return RETERR(-EINVAL); -+ -+ result = 0; -+ if (inode_hash_plugin(inode) == NULL || -+ inode_hash_plugin(inode)->h.id != plugin->h.id) { -+ if (is_dir_empty(inode) == 0) -+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset, -+ PSET_HASH, plugin); -+ else -+ result = RETERR(-ENOTEMPTY); -+ -+ } -+ return result; -+} -+ -+static reiser4_plugin_ops hash_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = change_hash -+}; -+ -+/* hash plugins */ -+hash_plugin hash_plugins[LAST_HASH_ID] = { -+ [RUPASOV_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = RUPASOV_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "rupasov", -+ .desc = 
"Original Yura's hash", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_rupasov -+ }, -+ [R5_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = R5_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "r5", -+ .desc = "r5 hash", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_r5 -+ }, -+ [TEA_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = TEA_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "tea", -+ .desc = "tea hash", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_tea -+ }, -+ [FNV1_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = FNV1_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "fnv1", -+ .desc = "fnv1 hash", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_fnv1 -+ }, -+ [DEGENERATE_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = DEGENERATE_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "degenerate hash", -+ .desc = "Degenerate hash: only for testing", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_deg -+ } -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.20/fs/reiser4/plugin/inode_ops.c ---- linux-2.6.20.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/inode_ops.c 2007-05-06 14:50:43.795005721 +0400 -@@ -0,0 +1,897 @@ -+/* -+ * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README -+ */ -+ -+/* -+ * this file contains typical implementations for most of methods of struct -+ * inode_operations -+ */ -+ -+#include "../inode.h" -+#include "../safe_link.h" -+ -+#include -+#include -+ -+static int create_vfs_object(struct inode *parent, struct dentry *dentry, -+ reiser4_object_create_data *data); -+ -+/** -+ * reiser4_create_common - create of inode operations -+ * @parent: inode of parent directory -+ * @dentry: dentry of new object to create -+ * @mode: the permissions to use -+ * @nameidata: -+ * -+ * This is common implementation of vfs's create method of struct -+ * inode_operations. -+ * Creates regular file using file plugin from parent directory plugin set. -+ */ -+int reiser4_create_common(struct inode *parent, struct dentry *dentry, -+ int mode, struct nameidata *nameidata) -+{ -+ reiser4_object_create_data data; -+ file_plugin *fplug; -+ -+ memset(&data, 0, sizeof data); -+ data.mode = S_IFREG | mode; -+ fplug = child_create_plugin(parent) ? 
: inode_create_plugin(parent); -+ if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) { -+ warning("vpf-1900", "'%s' is not a regular file plugin.", -+ fplug->h.label); -+ return RETERR(-EIO); -+ } -+ data.id = fplug->h.id; -+ return create_vfs_object(parent, dentry, &data); -+} -+ -+int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *); -+void check_light_weight(struct inode *inode, struct inode *parent); -+ -+/** -+ * reiser4_lookup_common - lookup of inode operations -+ * @parent: inode of directory to lookup into -+ * @dentry: name to look for -+ * @nameidata: -+ * -+ * This is common implementation of vfs's lookup method of struct -+ * inode_operations. -+ */ -+struct dentry *reiser4_lookup_common(struct inode *parent, -+ struct dentry *dentry, -+ struct nameidata *nameidata) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct dentry *new; -+ struct inode *inode; -+ reiser4_dir_entry_desc entry; -+ -+ ctx = reiser4_init_context(parent->i_sb); -+ if (IS_ERR(ctx)) -+ return (struct dentry *)ctx; -+ -+ /* set up operations on dentry. */ -+ dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry; -+ -+ result = reiser4_lookup_name(parent, dentry, &entry.key); -+ if (result) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ if (result == -ENOENT) { -+ /* object not found */ -+ if (!IS_DEADDIR(parent)) -+ d_add(dentry, NULL); -+ return NULL; -+ } -+ return ERR_PTR(result); -+ } -+ -+ inode = reiser4_iget(parent->i_sb, &entry.key, 0); -+ if (IS_ERR(inode)) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return ERR_PTR(PTR_ERR(inode)); -+ } -+ -+ /* success */ -+ check_light_weight(inode, parent); -+ new = d_splice_alias(inode, dentry); -+ reiser4_iget_complete(inode); -+ -+ /* prevent balance_dirty_pages() from being called: we don't want to -+ * do this under directory i_mutex. 
*/ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return new; -+} -+ -+static reiser4_block_nr common_estimate_link(struct inode *parent, -+ struct inode *object); -+int reiser4_update_dir(struct inode *); -+ -+/** -+ * reiser4_link_common - link of inode operations -+ * @existing: dentry of object which is to get new name -+ * @parent: directory where new name is to be created -+ * @newname: new name -+ * -+ * This is common implementation of vfs's link method of struct -+ * inode_operations. -+ */ -+int reiser4_link_common(struct dentry *existing, struct inode *parent, -+ struct dentry *newname) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *object; -+ dir_plugin *parent_dplug; -+ reiser4_dir_entry_desc entry; -+ reiser4_object_create_data data; -+ reiser4_block_nr reserve; -+ -+ ctx = reiser4_init_context(parent->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ assert("nikita-1431", existing != NULL); -+ assert("nikita-1432", parent != NULL); -+ assert("nikita-1433", newname != NULL); -+ -+ object = existing->d_inode; -+ assert("nikita-1434", object != NULL); -+ -+ /* check for race with create_object() */ -+ if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-E_REPEAT); -+ } -+ -+ parent_dplug = inode_dir_plugin(parent); -+ -+ memset(&entry, 0, sizeof entry); -+ entry.obj = object; -+ -+ data.mode = object->i_mode; -+ data.id = inode_file_plugin(object)->h.id; -+ -+ reserve = common_estimate_link(parent, existing->d_inode); -+ if ((__s64) reserve < 0) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return reserve; -+ } -+ -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOSPC); -+ } -+ -+ /* -+ * Subtle race handling: sys_link() doesn't take i_mutex on @parent. 
It -+ * means that link(2) can race against unlink(2) or rename(2), and -+ * inode is dead (->i_nlink == 0) when reiser4_link() is entered. -+ * -+ * For such inode we have to undo special processing done in -+ * reiser4_unlink() viz. creation of safe-link. -+ */ -+ if (unlikely(object->i_nlink == 0)) { -+ result = safe_link_del(reiser4_tree_by_inode(object), -+ get_inode_oid(object), SAFE_UNLINK); -+ if (result != 0) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ } -+ -+ /* increment nlink of @existing and update its stat data */ -+ result = reiser4_add_nlink(object, parent, 1); -+ if (result == 0) { -+ /* add entry to the parent */ -+ result = -+ parent_dplug->add_entry(parent, newname, &data, &entry); -+ if (result != 0) { -+ /* failed to add entry to the parent, decrement nlink -+ of @existing */ -+ reiser4_del_nlink(object, parent, 1); -+ /* -+ * now, if that failed, we have a file with too big -+ * nlink---space leak, much better than directory -+ * entry pointing to nowhere -+ */ -+ } -+ } -+ if (result == 0) { -+ atomic_inc(&object->i_count); -+ /* -+ * Upon successful completion, link() shall mark for update -+ * the st_ctime field of the file. Also, the st_ctime and -+ * st_mtime fields of the directory that contains the new -+ * entry shall be marked for update. --SUS -+ */ -+ result = reiser4_update_dir(parent); -+ } -+ if (result == 0) -+ d_instantiate(newname, existing->d_inode); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim); -+ -+/** -+ * reiser4_unlink_common - unlink of inode operations -+ * @parent: inode of directory to remove name from -+ * @victim: name to be removed -+ * -+ * This is common implementation of vfs's unlink method of struct -+ * inode_operations. 
-+ */ -+int reiser4_unlink_common(struct inode *parent, struct dentry *victim) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *object; -+ file_plugin *fplug; -+ -+ ctx = reiser4_init_context(parent->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ object = victim->d_inode; -+ fplug = inode_file_plugin(object); -+ assert("nikita-2882", fplug->detach != NULL); -+ -+ result = unlink_check_and_grab(parent, victim); -+ if (result != 0) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = fplug->detach(object, parent); -+ if (result == 0) { -+ dir_plugin *parent_dplug; -+ reiser4_dir_entry_desc entry; -+ -+ parent_dplug = inode_dir_plugin(parent); -+ memset(&entry, 0, sizeof entry); -+ -+ /* first, delete directory entry */ -+ result = parent_dplug->rem_entry(parent, victim, &entry); -+ if (result == 0) { -+ /* -+ * if name was removed successfully, we _have_ to -+ * return 0 from this function, because upper level -+ * caller (vfs_{rmdir,unlink}) expect this. -+ * -+ * now that directory entry is removed, update -+ * stat-data -+ */ -+ reiser4_del_nlink(object, parent, 1); -+ /* -+ * Upon successful completion, unlink() shall mark for -+ * update the st_ctime and st_mtime fields of the -+ * parent directory. Also, if the file's link count is -+ * not 0, the st_ctime field of the file shall be -+ * marked for update. --SUS -+ */ -+ reiser4_update_dir(parent); -+ /* add safe-link for this file */ -+ if (object->i_nlink == 0) -+ safe_link_add(object, SAFE_UNLINK); -+ } -+ } -+ -+ if (unlikely(result != 0)) { -+ if (result != -ENOMEM) -+ warning("nikita-3398", "Cannot unlink %llu (%i)", -+ (unsigned long long)get_inode_oid(object), -+ result); -+ /* if operation failed commit pending inode modifications to -+ * the stat-data */ -+ reiser4_update_sd(object); -+ reiser4_update_sd(parent); -+ } -+ -+ reiser4_release_reserved(object->i_sb); -+ -+ /* @object's i_ctime was updated by ->rem_link() method(). 
*/ -+ -+ /* @victim can be already removed from the disk by this time. Inode is -+ then marked so that iput() wouldn't try to remove stat data. But -+ inode itself is still there. -+ */ -+ -+ /* -+ * we cannot release directory semaphore here, because name has -+ * already been deleted, but dentry (@victim) still exists. Prevent -+ * balance_dirty_pages() from being called on exiting this context: we -+ * don't want to do this under directory i_mutex. -+ */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * reiser4_symlink_common - symlink of inode operations -+ * @parent: inode of parent directory -+ * @dentry: dentry of object to be created -+ * @linkname: string symlink is to contain -+ * -+ * This is common implementation of vfs's symlink method of struct -+ * inode_operations. -+ * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID. -+ */ -+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry, -+ const char *linkname) -+{ -+ reiser4_object_create_data data; -+ -+ memset(&data, 0, sizeof data); -+ data.name = linkname; -+ data.id = SYMLINK_FILE_PLUGIN_ID; -+ data.mode = S_IFLNK | S_IRWXUGO; -+ return create_vfs_object(parent, dentry, &data); -+} -+ -+/** -+ * reiser4_mkdir_common - mkdir of inode operations -+ * @parent: inode of parent directory -+ * @dentry: dentry of object to be created -+ * @mode: the permissions to use -+ * -+ * This is common implementation of vfs's mkdir method of struct -+ * inode_operations. -+ * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID. 
-+ */ -+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode) -+{ -+ reiser4_object_create_data data; -+ -+ memset(&data, 0, sizeof data); -+ data.mode = S_IFDIR | mode; -+ data.id = DIRECTORY_FILE_PLUGIN_ID; -+ return create_vfs_object(parent, dentry, &data); -+} -+ -+/** -+ * reiser4_mknod_common - mknod of inode operations -+ * @parent: inode of parent directory -+ * @dentry: dentry of object to be created -+ * @mode: the permissions to use and file type -+ * @rdev: minor and major of new device file -+ * -+ * This is common implementation of vfs's mknod method of struct -+ * inode_operations. -+ * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID. -+ */ -+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry, -+ int mode, dev_t rdev) -+{ -+ reiser4_object_create_data data; -+ -+ memset(&data, 0, sizeof data); -+ data.mode = mode; -+ data.rdev = rdev; -+ data.id = SPECIAL_FILE_PLUGIN_ID; -+ return create_vfs_object(parent, dentry, &data); -+} -+ -+/* -+ * implementation of vfs's rename method of struct inode_operations for typical -+ * directory is in inode_ops_rename.c -+ */ -+ -+/** -+ * reiser4_follow_link_common - follow_link of inode operations -+ * @dentry: dentry of symlink -+ * @data: -+ * -+ * This is common implementation of vfs's followlink method of struct -+ * inode_operations. -+ * Assumes that inode's i_private points to the content of symbolic link. 
-+ */ -+void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd) -+{ -+ assert("vs-851", S_ISLNK(dentry->d_inode->i_mode)); -+ -+ if (!dentry->d_inode->i_private -+ || !reiser4_inode_get_flag(dentry->d_inode, -+ REISER4_GENERIC_PTR_USED)) -+ return ERR_PTR(RETERR(-EINVAL)); -+ nd_set_link(nd, dentry->d_inode->i_private); -+ return NULL; -+} -+ -+/** -+ * reiser4_permission_common - permission of inode operations -+ * @inode: inode to check permissions for -+ * @mask: mode bits to check permissions for -+ * @nameidata: -+ * -+ * Uses generic function to check for rwx permissions. -+ */ -+int reiser4_permission_common(struct inode *inode, int mask, -+ struct nameidata *nameidata) -+{ -+ return generic_permission(inode, mask, NULL); -+} -+ -+static int setattr_reserve(reiser4_tree *); -+ -+/* this is common implementation of vfs's setattr method of struct -+ inode_operations -+*/ -+int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr) -+{ -+ reiser4_context *ctx; -+ struct inode *inode; -+ int result; -+ -+ inode = dentry->d_inode; -+ result = inode_change_ok(inode, attr); -+ if (result) -+ return result; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE)); -+ -+ /* -+ * grab disk space and call standard inode_setattr(). -+ */ -+ result = setattr_reserve(reiser4_tree_by_inode(inode)); -+ if (!result) { -+ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) -+ || (attr->ia_valid & ATTR_GID -+ && attr->ia_gid != inode->i_gid)) { -+ result = DQUOT_TRANSFER(inode, attr) ? 
-EDQUOT : 0; -+ if (result) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ } -+ result = inode_setattr(inode, attr); -+ if (!result) -+ reiser4_update_sd(inode); -+ } -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* this is common implementation of vfs's getattr method of struct -+ inode_operations -+*/ -+int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG, -+ struct dentry *dentry, struct kstat *stat) -+{ -+ struct inode *obj; -+ -+ assert("nikita-2298", dentry != NULL); -+ assert("nikita-2299", stat != NULL); -+ assert("nikita-2300", dentry->d_inode != NULL); -+ -+ obj = dentry->d_inode; -+ -+ stat->dev = obj->i_sb->s_dev; -+ stat->ino = oid_to_uino(get_inode_oid(obj)); -+ stat->mode = obj->i_mode; -+ /* don't confuse userland with huge nlink. This is not entirely -+ * correct, because nlink_t is not necessary 16 bit signed. */ -+ stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff); -+ stat->uid = obj->i_uid; -+ stat->gid = obj->i_gid; -+ stat->rdev = obj->i_rdev; -+ stat->atime = obj->i_atime; -+ stat->mtime = obj->i_mtime; -+ stat->ctime = obj->i_ctime; -+ stat->size = obj->i_size; -+ stat->blocks = -+ (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS; -+ /* "preferred" blocksize for efficient file system I/O */ -+ stat->blksize = get_super_private(obj->i_sb)->optimal_io_size; -+ -+ return 0; -+} -+ -+/* Estimate the maximum amount of nodes which might be allocated or changed on -+ typical new object creation. Typical creation consists of calling create -+ method of file plugin, adding directory entry to parent and update parent -+ directory's stat data. 
-+*/ -+static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */ -+ struct inode *object -+ /* object */ ) -+{ -+ assert("vpf-309", parent != NULL); -+ assert("vpf-307", object != NULL); -+ -+ return -+ /* object creation estimation */ -+ inode_file_plugin(object)->estimate.create(object) + -+ /* stat data of parent directory estimation */ -+ inode_file_plugin(parent)->estimate.update(parent) + -+ /* adding entry estimation */ -+ inode_dir_plugin(parent)->estimate.add_entry(parent) + -+ /* to undo in the case of failure */ -+ inode_dir_plugin(parent)->estimate.rem_entry(parent); -+} -+ -+/* Create child in directory. -+ -+ . get object's plugin -+ . get fresh inode -+ . initialize inode -+ . add object's stat-data -+ . initialize object's directory -+ . add entry to the parent -+ . instantiate dentry -+ -+*/ -+static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new -+ object */ -+ struct inode **retobj) -+{ -+ int result; -+ -+ struct dentry *dentry; /* parent object */ -+ struct inode *parent; /* new name */ -+ -+ dir_plugin *par_dir; /* directory plugin on the parent */ -+ dir_plugin *obj_dir; /* directory plugin on the new object */ -+ file_plugin *obj_plug; /* object plugin on the new object */ -+ struct inode *object; /* new object */ -+ reiser4_block_nr reserve; -+ -+ reiser4_dir_entry_desc entry; /* new directory entry */ -+ -+ assert("nikita-1420", data != NULL); -+ parent = data->parent; -+ dentry = data->dentry; -+ -+ assert("nikita-1418", parent != NULL); -+ assert("nikita-1419", dentry != NULL); -+ -+ /* check, that name is acceptable for parent */ -+ par_dir = inode_dir_plugin(parent); -+ if (par_dir->is_name_acceptable && -+ !par_dir->is_name_acceptable(parent, -+ dentry->d_name.name, -+ (int)dentry->d_name.len)) -+ return RETERR(-ENAMETOOLONG); -+ -+ result = 0; -+ obj_plug = file_plugin_by_id((int)data->id); -+ if (obj_plug == NULL) { -+ warning("nikita-430", "Cannot find plugin %i", 
data->id); -+ return RETERR(-ENOENT); -+ } -+ object = new_inode(parent->i_sb); -+ if (object == NULL) -+ return RETERR(-ENOMEM); -+ /* we'll update i_nlink below */ -+ object->i_nlink = 0; -+ /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0, -+ * to simplify error handling: if some error occurs before i_ino is -+ * initialized with oid, i_ino should already be set to some -+ * distinguished value. */ -+ object->i_ino = 0; -+ -+ /* So that on error iput will be called. */ -+ *retobj = object; -+ -+ if (DQUOT_ALLOC_INODE(object)) { -+ DQUOT_DROP(object); -+ object->i_flags |= S_NOQUOTA; -+ return RETERR(-EDQUOT); -+ } -+ -+ memset(&entry, 0, sizeof entry); -+ entry.obj = object; -+ -+ set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE, -+ file_plugin_to_plugin(obj_plug)); -+ result = obj_plug->set_plug_in_inode(object, parent, data); -+ if (result) { -+ warning("nikita-431", "Cannot install plugin %i on %llx", -+ data->id, (unsigned long long)get_inode_oid(object)); -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ return result; -+ } -+ -+ /* reget plugin after installation */ -+ obj_plug = inode_file_plugin(object); -+ -+ if (obj_plug->create_object == NULL) { -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ return RETERR(-EPERM); -+ } -+ -+ /* if any of hash, tail, sd or permission plugins for newly created -+ object are not set yet set them here inheriting them from parent -+ directory -+ */ -+ assert("nikita-2070", obj_plug->adjust_to_parent != NULL); -+ result = obj_plug->adjust_to_parent(object, -+ parent, -+ object->i_sb->s_root->d_inode); -+ if (result == 0) -+ result = finish_pset(object); -+ if (result != 0) { -+ warning("nikita-432", "Cannot inherit from %llx to %llx", -+ (unsigned long long)get_inode_oid(parent), -+ (unsigned long long)get_inode_oid(object)); -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ return result; -+ } -+ -+ /* setup inode and file-operations for this inode 
*/ -+ setup_inode_ops(object, data); -+ -+ /* call file plugin's method to initialize plugin specific part of -+ * inode */ -+ if (obj_plug->init_inode_data) -+ obj_plug->init_inode_data(object, data, 1 /*create */ ); -+ -+ /* obtain directory plugin (if any) for new object. */ -+ obj_dir = inode_dir_plugin(object); -+ if (obj_dir != NULL && obj_dir->init == NULL) { -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ return RETERR(-EPERM); -+ } -+ -+ reiser4_inode_data(object)->locality_id = get_inode_oid(parent); -+ -+ reserve = estimate_create_vfs_object(parent, object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ return RETERR(-ENOSPC); -+ } -+ -+ /* mark inode `immutable'. We disable changes to the file being -+ created until valid directory entry for it is inserted. Otherwise, -+ if file were expanded and insertion of directory entry fails, we -+ have to remove file, but we only alloted enough space in -+ transaction to remove _empty_ file. 3.x code used to remove stat -+ data in different transaction thus possibly leaking disk space on -+ crash. This all only matters if it's possible to access file -+ without name, for example, by inode number -+ */ -+ reiser4_inode_set_flag(object, REISER4_IMMUTABLE); -+ -+ /* create empty object, this includes allocation of new objectid. For -+ directories this implies creation of dot and dotdot */ -+ assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD)); -+ -+ /* mark inode as `loaded'. From this point onward -+ reiser4_delete_inode() will try to remove its stat-data. 
*/ -+ reiser4_inode_set_flag(object, REISER4_LOADED); -+ -+ result = obj_plug->create_object(object, parent, data); -+ if (result != 0) { -+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE); -+ if (result != -ENAMETOOLONG && result != -ENOMEM) -+ warning("nikita-2219", -+ "Failed to create sd for %llu", -+ (unsigned long long)get_inode_oid(object)); -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ return result; -+ } -+ -+ if (obj_dir != NULL) -+ result = obj_dir->init(object, parent, data); -+ if (result == 0) { -+ assert("nikita-434", !reiser4_inode_get_flag(object, -+ REISER4_NO_SD)); -+ /* insert inode into VFS hash table */ -+ insert_inode_hash(object); -+ /* create entry */ -+ result = par_dir->add_entry(parent, dentry, data, &entry); -+ if (result == 0) { -+ result = reiser4_add_nlink(object, parent, 0); -+ /* If O_CREAT is set and the file did not previously -+ exist, upon successful completion, open() shall -+ mark for update the st_atime, st_ctime, and -+ st_mtime fields of the file and the st_ctime and -+ st_mtime fields of the parent directory. --SUS -+ */ -+ /* @object times are already updated by -+ reiser4_add_nlink() */ -+ if (result == 0) -+ reiser4_update_dir(parent); -+ if (result != 0) -+ /* cleanup failure to add nlink */ -+ par_dir->rem_entry(parent, dentry, &entry); -+ } -+ if (result != 0) -+ /* cleanup failure to add entry */ -+ obj_plug->detach(object, parent); -+ } else if (result != -ENOMEM) -+ warning("nikita-2219", "Failed to initialize dir for %llu: %i", -+ (unsigned long long)get_inode_oid(object), result); -+ -+ /* -+ * update stat-data, committing all pending modifications to the inode -+ * fields. 
-+ */ -+ reiser4_update_sd(object); -+ if (result != 0) { -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ /* if everything was ok (result == 0), parent stat-data is -+ * already updated above (update_parent_dir()) */ -+ reiser4_update_sd(parent); -+ /* failure to create entry, remove object */ -+ obj_plug->delete_object(object); -+ } -+ -+ /* file has name now, clear immutable flag */ -+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE); -+ -+ /* on error, iput() will call ->delete_inode(). We should keep track -+ of the existence of stat-data for this inode and avoid attempt to -+ remove it in reiser4_delete_inode(). This is accomplished through -+ REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags -+ */ -+ return result; -+} -+ -+/* this is helper for common implementations of reiser4_mkdir, reiser4_create, -+ reiser4_mknod and reiser4_symlink -+*/ -+static int -+create_vfs_object(struct inode *parent, -+ struct dentry *dentry, reiser4_object_create_data * data) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *child; -+ -+ ctx = reiser4_init_context(parent->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ context_set_commit_async(ctx); -+ -+ data->parent = parent; -+ data->dentry = dentry; -+ child = NULL; -+ result = do_create_vfs_child(data, &child); -+ if (unlikely(result != 0)) { -+ if (child != NULL) { -+ reiser4_make_bad_inode(child); -+ iput(child); -+ } -+ } else -+ d_instantiate(dentry, child); -+ -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* helper for link_common. 
Estimate disk space necessary to add a link -+ from @parent to @object -+*/ -+static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */ -+ struct inode *object -+ /* object to which new link is being cerated */ -+ ) -+{ -+ reiser4_block_nr res = 0; -+ file_plugin *fplug; -+ dir_plugin *dplug; -+ -+ assert("vpf-317", object != NULL); -+ assert("vpf-318", parent != NULL); -+ -+ fplug = inode_file_plugin(object); -+ dplug = inode_dir_plugin(parent); -+ /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */ -+ /* reiser4_add_nlink(object) */ -+ res += fplug->estimate.update(object); -+ /* add_entry(parent) */ -+ res += dplug->estimate.add_entry(parent); -+ /* reiser4_del_nlink(object) */ -+ res += fplug->estimate.update(object); -+ /* update_dir(parent) */ -+ res += inode_file_plugin(parent)->estimate.update(parent); -+ /* safe-link */ -+ res += estimate_one_item_removal(reiser4_tree_by_inode(object)); -+ -+ return res; -+} -+ -+/* Estimate disk space necessary to remove a link between @parent and -+ @object. -+*/ -+static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */ -+ struct inode *object -+ /* object to which new link is being cerated */ -+ ) -+{ -+ reiser4_block_nr res = 0; -+ file_plugin *fplug; -+ dir_plugin *dplug; -+ -+ assert("vpf-317", object != NULL); -+ assert("vpf-318", parent != NULL); -+ -+ fplug = inode_file_plugin(object); -+ dplug = inode_dir_plugin(parent); -+ -+ /* rem_entry(parent) */ -+ res += dplug->estimate.rem_entry(parent); -+ /* reiser4_del_nlink(object) */ -+ res += fplug->estimate.update(object); -+ /* update_dir(parent) */ -+ res += inode_file_plugin(parent)->estimate.update(parent); -+ /* fplug->unlink */ -+ res += fplug->estimate.unlink(object, parent); -+ /* safe-link */ -+ res += estimate_one_insert_item(reiser4_tree_by_inode(object)); -+ -+ return res; -+} -+ -+/* helper for reiser4_unlink_common. 
Estimate and grab space for unlink. */ -+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim) -+{ -+ file_plugin *fplug; -+ struct inode *child; -+ int result; -+ -+ result = 0; -+ child = victim->d_inode; -+ fplug = inode_file_plugin(child); -+ -+ /* check for race with create_object() */ -+ if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE)) -+ return RETERR(-E_REPEAT); -+ /* object being deleted should have stat data */ -+ assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD)); -+ -+ /* ask object plugin */ -+ if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child)) -+ return RETERR(-ENOTEMPTY); -+ -+ result = (int)estimate_unlink(parent, child); -+ if (result < 0) -+ return result; -+ -+ return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT); -+} -+ -+/* helper for reiser4_setattr_common */ -+static int setattr_reserve(reiser4_tree * tree) -+{ -+ assert("vs-1096", is_grab_enabled(get_current_context())); -+ return reiser4_grab_space(estimate_one_insert_into_item(tree), -+ BA_CAN_COMMIT); -+} -+ -+/* helper function. Standards require that for many file-system operations -+ on success ctime and mtime of parent directory is to be updated. 
*/ -+int reiser4_update_dir(struct inode *dir) -+{ -+ assert("nikita-2525", dir != NULL); -+ -+ dir->i_ctime = dir->i_mtime = CURRENT_TIME; -+ return reiser4_update_sd(dir); -+} -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.20/fs/reiser4/plugin/inode_ops_rename.c ---- linux-2.6.20.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/inode_ops_rename.c 2007-05-06 14:50:43.795005721 +0400 -@@ -0,0 +1,914 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "../inode.h" -+#include "../safe_link.h" -+ -+static const char *possible_leak = "Possible disk space leak."; -+ -+/* re-bind existing name at @from_coord in @from_dir to point to @to_inode. -+ -+ Helper function called from hashed_rename() */ -+static int replace_name(struct inode *to_inode, /* inode where @from_coord is -+ * to be re-targeted at */ -+ struct inode *from_dir, /* directory where @from_coord -+ * lives */ -+ struct inode *from_inode, /* inode @from_coord -+ * originally point to */ -+ coord_t * from_coord, /* where directory entry is in -+ * the tree */ -+ lock_handle * from_lh /* lock handle on @from_coord */ ) -+{ -+ item_plugin *from_item; -+ int result; -+ znode *node; -+ -+ coord_clear_iplug(from_coord); -+ node = from_coord->node; -+ result = zload(node); -+ if (result != 0) -+ return result; -+ from_item = item_plugin_by_coord(from_coord); -+ if (plugin_of_group(item_plugin_by_coord(from_coord), -+ DIR_ENTRY_ITEM_TYPE)) -+ { -+ reiser4_key to_key; -+ -+ build_sd_key(to_inode, &to_key); -+ -+ /* everything is found and prepared to change directory entry -+ at @from_coord to point to @to_inode. -+ -+ @to_inode is just about to get new name, so bump its link -+ counter. 
-+ -+ */ -+ result = reiser4_add_nlink(to_inode, from_dir, 0); -+ if (result != 0) { -+ /* Don't issue warning: this may be plain -EMLINK */ -+ zrelse(node); -+ return result; -+ } -+ -+ result = -+ from_item->s.dir.update_key(from_coord, &to_key, from_lh); -+ if (result != 0) { -+ reiser4_del_nlink(to_inode, from_dir, 0); -+ zrelse(node); -+ return result; -+ } -+ -+ /* @from_inode just lost its name, he-he. -+ -+ If @from_inode was directory, it contained dotdot pointing -+ to @from_dir. @from_dir i_nlink will be decreased when -+ iput() will be called on @from_inode. -+ -+ If file-system is not ADG (hard-links are -+ supported on directories), iput(from_inode) will not remove -+ @from_inode, and thus above is incorrect, but hard-links on -+ directories are problematic in many other respects. -+ */ -+ result = reiser4_del_nlink(from_inode, from_dir, 0); -+ if (result != 0) { -+ warning("nikita-2330", -+ "Cannot remove link from source: %i. %s", -+ result, possible_leak); -+ } -+ /* Has to return success, because entry is already -+ * modified. */ -+ result = 0; -+ -+ /* NOTE-NIKITA consider calling plugin method in stead of -+ accessing inode fields directly. */ -+ from_dir->i_mtime = CURRENT_TIME; -+ } else { -+ warning("nikita-2326", "Unexpected item type"); -+ result = RETERR(-EIO); -+ } -+ zrelse(node); -+ return result; -+} -+ -+/* add new entry pointing to @inode into @dir at @coord, locked by @lh -+ -+ Helper function used by hashed_rename(). 
*/ -+static int add_name(struct inode *inode, /* inode where @coord is to be -+ * re-targeted at */ -+ struct inode *dir, /* directory where @coord lives */ -+ struct dentry *name, /* new name */ -+ coord_t * coord, /* where directory entry is in the tree */ -+ lock_handle * lh, /* lock handle on @coord */ -+ int is_dir /* true, if @inode is directory */ ) -+{ -+ int result; -+ reiser4_dir_entry_desc entry; -+ -+ assert("nikita-2333", lh->node == coord->node); -+ assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode)); -+ -+ memset(&entry, 0, sizeof entry); -+ entry.obj = inode; -+ /* build key of directory entry description */ -+ inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key); -+ -+ /* ext2 does this in different order: first inserts new entry, -+ then increases directory nlink. We don't want do this, -+ because reiser4_add_nlink() calls ->add_link() plugin -+ method that can fail for whatever reason, leaving as with -+ cleanup problems. -+ */ -+ /* @inode is getting new name */ -+ reiser4_add_nlink(inode, dir, 0); -+ /* create @new_name in @new_dir pointing to -+ @old_inode */ -+ result = WITH_COORD(coord, -+ inode_dir_item_plugin(dir)->s.dir.add_entry(dir, -+ coord, -+ lh, -+ name, -+ &entry)); -+ if (result != 0) { -+ int result2; -+ result2 = reiser4_del_nlink(inode, dir, 0); -+ if (result2 != 0) { -+ warning("nikita-2327", -+ "Cannot drop link on %lli %i. 
%s", -+ (unsigned long long)get_inode_oid(inode), -+ result2, possible_leak); -+ } -+ } else -+ INODE_INC_FIELD(dir, i_size); -+ return result; -+} -+ -+static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */ -+ struct dentry *old_name, /* old name */ -+ struct inode *new_dir, /* directory where @new is located */ -+ struct dentry *new_name /* new name */ ) -+{ -+ reiser4_block_nr res1, res2; -+ dir_plugin *p_parent_old, *p_parent_new; -+ file_plugin *p_child_old, *p_child_new; -+ -+ assert("vpf-311", old_dir != NULL); -+ assert("vpf-312", new_dir != NULL); -+ assert("vpf-313", old_name != NULL); -+ assert("vpf-314", new_name != NULL); -+ -+ p_parent_old = inode_dir_plugin(old_dir); -+ p_parent_new = inode_dir_plugin(new_dir); -+ p_child_old = inode_file_plugin(old_name->d_inode); -+ if (new_name->d_inode) -+ p_child_new = inode_file_plugin(new_name->d_inode); -+ else -+ p_child_new = NULL; -+ -+ /* find_entry - can insert one leaf. */ -+ res1 = res2 = 1; -+ -+ /* replace_name */ -+ { -+ /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */ -+ res1 += 2 * p_child_old->estimate.update(old_name->d_inode); -+ /* update key */ -+ res1 += 1; -+ /* reiser4_del_nlink(p_child_new) */ -+ if (p_child_new) -+ res1 += p_child_new->estimate.update(new_name->d_inode); -+ } -+ -+ /* else add_name */ -+ { -+ /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */ -+ res2 += -+ 2 * inode_file_plugin(new_dir)->estimate.update(new_dir); -+ /* reiser4_add_nlink(p_parent_old) */ -+ res2 += p_child_old->estimate.update(old_name->d_inode); -+ /* add_entry(p_parent_new) */ -+ res2 += p_parent_new->estimate.add_entry(new_dir); -+ /* reiser4_del_nlink(p_parent_old) */ -+ res2 += p_child_old->estimate.update(old_name->d_inode); -+ } -+ -+ res1 = res1 < res2 ? 
res2 : res1; -+ -+ /* reiser4_write_sd(p_parent_new) */ -+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); -+ -+ /* reiser4_write_sd(p_child_new) */ -+ if (p_child_new) -+ res1 += p_child_new->estimate.update(new_name->d_inode); -+ -+ /* hashed_rem_entry(p_parent_old) */ -+ res1 += p_parent_old->estimate.rem_entry(old_dir); -+ -+ /* reiser4_del_nlink(p_child_old) */ -+ res1 += p_child_old->estimate.update(old_name->d_inode); -+ -+ /* replace_name */ -+ { -+ /* reiser4_add_nlink(p_parent_dir_new) */ -+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); -+ /* update_key */ -+ res1 += 1; -+ /* reiser4_del_nlink(p_parent_new) */ -+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); -+ /* reiser4_del_nlink(p_parent_old) */ -+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir); -+ } -+ -+ /* reiser4_write_sd(p_parent_old) */ -+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir); -+ -+ /* reiser4_write_sd(p_child_old) */ -+ res1 += p_child_old->estimate.update(old_name->d_inode); -+ -+ return res1; -+} -+ -+static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */ -+ struct dentry *old_name, /* old name */ -+ struct inode *new_dir, /* directory where @new is located */ -+ struct dentry *new_name -+ /* new name */ ) -+{ -+ reiser4_block_nr reserve; -+ -+ reserve = estimate_rename(old_dir, old_name, new_dir, new_name); -+ -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ -+ return 0; -+} -+ -+/* check whether @old_inode and @new_inode can be moved within file system -+ * tree. This singles out attempts to rename pseudo-files, for example. 
*/ -+static int can_rename(struct inode *old_dir, struct inode *old_inode, -+ struct inode *new_dir, struct inode *new_inode) -+{ -+ file_plugin *fplug; -+ dir_plugin *dplug; -+ -+ assert("nikita-3370", old_inode != NULL); -+ -+ dplug = inode_dir_plugin(new_dir); -+ fplug = inode_file_plugin(old_inode); -+ -+ if (dplug == NULL) -+ return RETERR(-ENOTDIR); -+ else if (new_dir->i_op->create == NULL) -+ return RETERR(-EPERM); -+ else if (!fplug->can_add_link(old_inode)) -+ return RETERR(-EMLINK); -+ else if (new_inode != NULL) { -+ fplug = inode_file_plugin(new_inode); -+ if (fplug->can_rem_link != NULL && -+ !fplug->can_rem_link(new_inode)) -+ return RETERR(-EBUSY); -+ } -+ return 0; -+} -+ -+int reiser4_find_entry(struct inode *, struct dentry *, lock_handle *, -+ znode_lock_mode, reiser4_dir_entry_desc *); -+int reiser4_update_dir(struct inode *); -+ -+/* this is common implementation of vfs's rename method of struct -+ inode_operations -+ See comments in the body. -+ -+ It is arguable that this function can be made generic so, that it -+ will be applicable to any kind of directory plugin that deals with -+ directories composed out of directory entries. The only obstacle -+ here is that we don't have any data-type to represent directory -+ entry. This should be re-considered when more than one different -+ directory plugin will be implemented. -+*/ -+int reiser4_rename_common(struct inode *old_dir /* directory where @old -+ * is located */ , -+ struct dentry *old_name /* old name */ , -+ struct inode *new_dir /* directory where @new -+ * is located */ , -+ struct dentry *new_name /* new name */ ) -+{ -+ /* From `The Open Group Base Specifications Issue 6' -+ -+ If either the old or new argument names a symbolic link, rename() -+ shall operate on the symbolic link itself, and shall not resolve -+ the last component of the argument. 
If the old argument and the new -+ argument resolve to the same existing file, rename() shall return -+ successfully and perform no other action. -+ -+ [this is done by VFS: vfs_rename()] -+ -+ If the old argument points to the pathname of a file that is not a -+ directory, the new argument shall not point to the pathname of a -+ directory. -+ -+ [checked by VFS: vfs_rename->may_delete()] -+ -+ If the link named by the new argument exists, it shall -+ be removed and old renamed to new. In this case, a link named new -+ shall remain visible to other processes throughout the renaming -+ operation and refer either to the file referred to by new or old -+ before the operation began. -+ -+ [we should assure this] -+ -+ Write access permission is required for -+ both the directory containing old and the directory containing new. -+ -+ [checked by VFS: vfs_rename->may_delete(), may_create()] -+ -+ If the old argument points to the pathname of a directory, the new -+ argument shall not point to the pathname of a file that is not a -+ directory. -+ -+ [checked by VFS: vfs_rename->may_delete()] -+ -+ If the directory named by the new argument exists, it -+ shall be removed and old renamed to new. In this case, a link named -+ new shall exist throughout the renaming operation and shall refer -+ either to the directory referred to by new or old before the -+ operation began. -+ -+ [we should assure this] -+ -+ If new names an existing directory, it shall be -+ required to be an empty directory. -+ -+ [we should check this] -+ -+ If the old argument points to a pathname of a symbolic link, the -+ symbolic link shall be renamed. If the new argument points to a -+ pathname of a symbolic link, the symbolic link shall be removed. -+ -+ The new pathname shall not contain a path prefix that names -+ old. Write access permission is required for the directory -+ containing old and the directory containing new. 
If the old -+ argument points to the pathname of a directory, write access -+ permission may be required for the directory named by old, and, if -+ it exists, the directory named by new. -+ -+ [checked by VFS: vfs_rename(), vfs_rename_dir()] -+ -+ If the link named by the new argument exists and the file's link -+ count becomes 0 when it is removed and no process has the file -+ open, the space occupied by the file shall be freed and the file -+ shall no longer be accessible. If one or more processes have the -+ file open when the last link is removed, the link shall be removed -+ before rename() returns, but the removal of the file contents shall -+ be postponed until all references to the file are closed. -+ -+ [iput() handles this, but we can do this manually, a la -+ reiser4_unlink()] -+ -+ Upon successful completion, rename() shall mark for update the -+ st_ctime and st_mtime fields of the parent directory of each file. -+ -+ [N/A] -+ -+ */ -+ reiser4_context *ctx; -+ int result; -+ int is_dir; /* is @old_name directory */ -+ -+ struct inode *old_inode; -+ struct inode *new_inode; -+ coord_t *new_coord; -+ -+ reiser4_dentry_fsdata *new_fsdata; -+ dir_plugin *dplug; -+ file_plugin *fplug; -+ -+ reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry; -+ lock_handle *new_lh, *dotdot_lh; -+ struct dentry *dotdot_name; -+ reiser4_dentry_fsdata *dataonstack; -+ -+ ctx = reiser4_init_context(old_dir->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ old_entry = kmalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) + -+ sizeof(*dotdot_name) + sizeof(*dataonstack), -+ reiser4_ctx_gfp_mask_get()); -+ if (old_entry == NULL) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOMEM); -+ } -+ memset(old_entry, 0, 3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) + -+ sizeof(*dotdot_name) + sizeof(*dataonstack)); -+ -+ new_entry = old_entry + 1; -+ dotdot_entry = old_entry + 2; -+ new_lh = (lock_handle *)(old_entry + 3); -+ dotdot_lh 
= new_lh + 1; -+ dotdot_name = (struct dentry *)(new_lh + 2); -+ dataonstack = (reiser4_dentry_fsdata *)(dotdot_name + 1); -+ -+ assert("nikita-2318", old_dir != NULL); -+ assert("nikita-2319", new_dir != NULL); -+ assert("nikita-2320", old_name != NULL); -+ assert("nikita-2321", new_name != NULL); -+ -+ old_inode = old_name->d_inode; -+ new_inode = new_name->d_inode; -+ -+ dplug = inode_dir_plugin(old_dir); -+ fplug = NULL; -+ -+ new_fsdata = reiser4_get_dentry_fsdata(new_name); -+ if (IS_ERR(new_fsdata)) { -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return PTR_ERR(new_fsdata); -+ } -+ -+ new_coord = &new_fsdata->dec.entry_coord; -+ coord_clear_iplug(new_coord); -+ -+ is_dir = S_ISDIR(old_inode->i_mode); -+ -+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir); -+ -+ /* if target is existing directory and it's not empty---return error. -+ -+ This check is done specifically, because is_dir_empty() requires -+ tree traversal and have to be done before locks are taken. 
-+ */ -+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) { -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOTEMPTY); -+ } -+ -+ result = can_rename(old_dir, old_inode, new_dir, new_inode); -+ if (result != 0) { -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = hashed_rename_estimate_and_grab(old_dir, old_name, -+ new_dir, new_name); -+ if (result != 0) { -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ init_lh(new_lh); -+ -+ /* find entry for @new_name */ -+ result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK, -+ new_entry); -+ -+ if (IS_CBKERR(result)) { -+ done_lh(new_lh); -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ reiser4_seal_done(&new_fsdata->dec.entry_seal); -+ -+ /* add or replace name for @old_inode as @new_name */ -+ if (new_inode != NULL) { -+ /* target (@new_name) exists. */ -+ /* Not clear what to do with objects that are -+ both directories and files at the same time. */ -+ if (result == CBK_COORD_FOUND) { -+ result = replace_name(old_inode, -+ new_dir, -+ new_inode, new_coord, new_lh); -+ if (result == 0) -+ fplug = inode_file_plugin(new_inode); -+ } else if (result == CBK_COORD_NOTFOUND) { -+ /* VFS told us that @new_name is bound to existing -+ inode, but we failed to find directory entry. */ -+ warning("nikita-2324", "Target not found"); -+ result = RETERR(-ENOENT); -+ } -+ } else { -+ /* target (@new_name) doesn't exists. */ -+ if (result == CBK_COORD_NOTFOUND) -+ result = add_name(old_inode, -+ new_dir, -+ new_name, new_coord, new_lh, is_dir); -+ else if (result == CBK_COORD_FOUND) { -+ /* VFS told us that @new_name is "negative" dentry, -+ but we found directory entry. 
*/ -+ warning("nikita-2331", "Target found unexpectedly"); -+ result = RETERR(-EIO); -+ } -+ } -+ -+ assert("nikita-3462", ergo(result == 0, -+ old_inode->i_nlink >= 2 + !!is_dir)); -+ -+ /* We are done with all modifications to the @new_dir, release lock on -+ node. */ -+ done_lh(new_lh); -+ -+ if (fplug != NULL) { -+ /* detach @new_inode from name-space */ -+ result = fplug->detach(new_inode, new_dir); -+ if (result != 0) -+ warning("nikita-2330", "Cannot detach %lli: %i. %s", -+ (unsigned long long)get_inode_oid(new_inode), -+ result, possible_leak); -+ } -+ -+ if (new_inode != NULL) -+ reiser4_update_sd(new_inode); -+ -+ if (result == 0) { -+ old_entry->obj = old_inode; -+ -+ dplug->build_entry_key(old_dir, -+ &old_name->d_name, &old_entry->key); -+ -+ /* At this stage new name was introduced for -+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink -+ counters were updated. -+ -+ We want to remove @old_name now. If @old_inode wasn't -+ directory this is simple. -+ */ -+ result = dplug->rem_entry(old_dir, old_name, old_entry); -+ if (result != 0 && result != -ENOMEM) { -+ warning("nikita-2335", -+ "Cannot remove old name: %i", result); -+ } else { -+ result = reiser4_del_nlink(old_inode, old_dir, 0); -+ if (result != 0 && result != -ENOMEM) { -+ warning("nikita-2337", -+ "Cannot drop link on old: %i", result); -+ } -+ } -+ -+ if (result == 0 && is_dir) { -+ /* @old_inode is directory. We also have to update -+ dotdot entry. */ -+ coord_t *dotdot_coord; -+ -+ memset(dataonstack, 0, sizeof dataonstack); -+ memset(dotdot_entry, 0, sizeof dotdot_entry); -+ dotdot_entry->obj = old_dir; -+ memset(dotdot_name, 0, sizeof dotdot_name); -+ dotdot_name->d_name.name = ".."; -+ dotdot_name->d_name.len = 2; -+ /* -+ * allocate ->d_fsdata on the stack to avoid using -+ * reiser4_get_dentry_fsdata(). Locking is not needed, -+ * because dentry is private to the current thread. 
-+ */ -+ dotdot_name->d_fsdata = dataonstack; -+ init_lh(dotdot_lh); -+ -+ dotdot_coord = &dataonstack->dec.entry_coord; -+ coord_clear_iplug(dotdot_coord); -+ -+ result = reiser4_find_entry(old_inode, dotdot_name, -+ dotdot_lh, ZNODE_WRITE_LOCK, -+ dotdot_entry); -+ if (result == 0) { -+ /* replace_name() decreases i_nlink on -+ * @old_dir */ -+ result = replace_name(new_dir, -+ old_inode, -+ old_dir, -+ dotdot_coord, dotdot_lh); -+ } else -+ result = RETERR(-EIO); -+ done_lh(dotdot_lh); -+ } -+ } -+ reiser4_update_dir(new_dir); -+ reiser4_update_dir(old_dir); -+ reiser4_update_sd(old_inode); -+ if (result == 0) { -+ file_plugin *fplug; -+ -+ if (new_inode != NULL) { -+ /* add safe-link for target file (in case we removed -+ * last reference to the poor fellow */ -+ fplug = inode_file_plugin(new_inode); -+ if (new_inode->i_nlink == 0) -+ result = safe_link_add(new_inode, SAFE_UNLINK); -+ } -+ } -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+#if 0 -+int reiser4_rename_common(struct inode *old_dir /* directory where @old -+ * is located */ , -+ struct dentry *old_name /* old name */ , -+ struct inode *new_dir /* directory where @new -+ * is located */ , -+ struct dentry *new_name /* new name */ ) -+{ -+ /* From `The Open Group Base Specifications Issue 6' -+ -+ If either the old or new argument names a symbolic link, rename() -+ shall operate on the symbolic link itself, and shall not resolve -+ the last component of the argument. If the old argument and the new -+ argument resolve to the same existing file, rename() shall return -+ successfully and perform no other action. -+ -+ [this is done by VFS: vfs_rename()] -+ -+ If the old argument points to the pathname of a file that is not a -+ directory, the new argument shall not point to the pathname of a -+ directory. 
-+ -+ [checked by VFS: vfs_rename->may_delete()] -+ -+ If the link named by the new argument exists, it shall -+ be removed and old renamed to new. In this case, a link named new -+ shall remain visible to other processes throughout the renaming -+ operation and refer either to the file referred to by new or old -+ before the operation began. -+ -+ [we should assure this] -+ -+ Write access permission is required for -+ both the directory containing old and the directory containing new. -+ -+ [checked by VFS: vfs_rename->may_delete(), may_create()] -+ -+ If the old argument points to the pathname of a directory, the new -+ argument shall not point to the pathname of a file that is not a -+ directory. -+ -+ [checked by VFS: vfs_rename->may_delete()] -+ -+ If the directory named by the new argument exists, it -+ shall be removed and old renamed to new. In this case, a link named -+ new shall exist throughout the renaming operation and shall refer -+ either to the directory referred to by new or old before the -+ operation began. -+ -+ [we should assure this] -+ -+ If new names an existing directory, it shall be -+ required to be an empty directory. -+ -+ [we should check this] -+ -+ If the old argument points to a pathname of a symbolic link, the -+ symbolic link shall be renamed. If the new argument points to a -+ pathname of a symbolic link, the symbolic link shall be removed. -+ -+ The new pathname shall not contain a path prefix that names -+ old. Write access permission is required for the directory -+ containing old and the directory containing new. If the old -+ argument points to the pathname of a directory, write access -+ permission may be required for the directory named by old, and, if -+ it exists, the directory named by new. 
-+ -+ [checked by VFS: vfs_rename(), vfs_rename_dir()] -+ -+ If the link named by the new argument exists and the file's link -+ count becomes 0 when it is removed and no process has the file -+ open, the space occupied by the file shall be freed and the file -+ shall no longer be accessible. If one or more processes have the -+ file open when the last link is removed, the link shall be removed -+ before rename() returns, but the removal of the file contents shall -+ be postponed until all references to the file are closed. -+ -+ [iput() handles this, but we can do this manually, a la -+ reiser4_unlink()] -+ -+ Upon successful completion, rename() shall mark for update the -+ st_ctime and st_mtime fields of the parent directory of each file. -+ -+ [N/A] -+ -+ */ -+ reiser4_context *ctx; -+ int result; -+ int is_dir; /* is @old_name directory */ -+ struct inode *old_inode; -+ struct inode *new_inode; -+ reiser4_dir_entry_desc old_entry; -+ reiser4_dir_entry_desc new_entry; -+ coord_t *new_coord; -+ reiser4_dentry_fsdata *new_fsdata; -+ lock_handle new_lh; -+ dir_plugin *dplug; -+ file_plugin *fplug; -+ -+ ctx = reiser4_init_context(old_dir->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ assert("nikita-2318", old_dir != NULL); -+ assert("nikita-2319", new_dir != NULL); -+ assert("nikita-2320", old_name != NULL); -+ assert("nikita-2321", new_name != NULL); -+ -+ old_inode = old_name->d_inode; -+ new_inode = new_name->d_inode; -+ -+ dplug = inode_dir_plugin(old_dir); -+ fplug = NULL; -+ -+ new_fsdata = reiser4_get_dentry_fsdata(new_name); -+ if (IS_ERR(new_fsdata)) { -+ result = PTR_ERR(new_fsdata); -+ goto exit; -+ } -+ -+ new_coord = &new_fsdata->dec.entry_coord; -+ coord_clear_iplug(new_coord); -+ -+ is_dir = S_ISDIR(old_inode->i_mode); -+ -+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir); -+ -+ /* if target is existing directory and it's not empty---return error. 
-+ -+ This check is done specifically, because is_dir_empty() requires -+ tree traversal and have to be done before locks are taken. -+ */ -+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) -+ return RETERR(-ENOTEMPTY); -+ -+ result = can_rename(old_dir, old_inode, new_dir, new_inode); -+ if (result != 0) -+ goto exit; -+ -+ result = hashed_rename_estimate_and_grab(old_dir, old_name, -+ new_dir, new_name); -+ if (result != 0) -+ goto exit; -+ -+ init_lh(&new_lh); -+ -+ /* find entry for @new_name */ -+ result = reiser4_find_entry(new_dir, new_name, &new_lh, -+ ZNODE_WRITE_LOCK, &new_entry); -+ -+ if (IS_CBKERR(result)) { -+ done_lh(&new_lh); -+ goto exit; -+ } -+ -+ reiser4_seal_done(&new_fsdata->dec.entry_seal); -+ -+ /* add or replace name for @old_inode as @new_name */ -+ if (new_inode != NULL) { -+ /* target (@new_name) exists. */ -+ /* Not clear what to do with objects that are -+ both directories and files at the same time. */ -+ if (result == CBK_COORD_FOUND) { -+ result = replace_name(old_inode, -+ new_dir, -+ new_inode, new_coord, &new_lh); -+ if (result == 0) -+ fplug = inode_file_plugin(new_inode); -+ } else if (result == CBK_COORD_NOTFOUND) { -+ /* VFS told us that @new_name is bound to existing -+ inode, but we failed to find directory entry. */ -+ warning("nikita-2324", "Target not found"); -+ result = RETERR(-ENOENT); -+ } -+ } else { -+ /* target (@new_name) doesn't exists. */ -+ if (result == CBK_COORD_NOTFOUND) -+ result = add_name(old_inode, -+ new_dir, -+ new_name, new_coord, &new_lh, is_dir); -+ else if (result == CBK_COORD_FOUND) { -+ /* VFS told us that @new_name is "negative" dentry, -+ but we found directory entry. */ -+ warning("nikita-2331", "Target found unexpectedly"); -+ result = RETERR(-EIO); -+ } -+ } -+ -+ assert("nikita-3462", ergo(result == 0, -+ old_inode->i_nlink >= 2 + !!is_dir)); -+ -+ /* We are done with all modifications to the @new_dir, release lock on -+ node. 
*/ -+ done_lh(&new_lh); -+ -+ if (fplug != NULL) { -+ /* detach @new_inode from name-space */ -+ result = fplug->detach(new_inode, new_dir); -+ if (result != 0) -+ warning("nikita-2330", "Cannot detach %lli: %i. %s", -+ (unsigned long long)get_inode_oid(new_inode), -+ result, possible_leak); -+ } -+ -+ if (new_inode != NULL) -+ reiser4_update_sd(new_inode); -+ -+ if (result == 0) { -+ memset(&old_entry, 0, sizeof old_entry); -+ old_entry.obj = old_inode; -+ -+ dplug->build_entry_key(old_dir, -+ &old_name->d_name, &old_entry.key); -+ -+ /* At this stage new name was introduced for -+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink -+ counters were updated. -+ -+ We want to remove @old_name now. If @old_inode wasn't -+ directory this is simple. -+ */ -+ result = dplug->rem_entry(old_dir, old_name, &old_entry); -+ /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */ -+ if (result != 0 && result != -ENOMEM) { -+ warning("nikita-2335", -+ "Cannot remove old name: %i", result); -+ } else { -+ result = reiser4_del_nlink(old_inode, old_dir, 0); -+ if (result != 0 && result != -ENOMEM) { -+ warning("nikita-2337", -+ "Cannot drop link on old: %i", result); -+ } -+ } -+ -+ if (result == 0 && is_dir) { -+ /* @old_inode is directory. We also have to update -+ dotdot entry. */ -+ coord_t *dotdot_coord; -+ lock_handle dotdot_lh; -+ struct dentry dotdot_name; -+ reiser4_dir_entry_desc dotdot_entry; -+ reiser4_dentry_fsdata dataonstack; -+ reiser4_dentry_fsdata *fsdata; -+ -+ memset(&dataonstack, 0, sizeof dataonstack); -+ memset(&dotdot_entry, 0, sizeof dotdot_entry); -+ dotdot_entry.obj = old_dir; -+ memset(&dotdot_name, 0, sizeof dotdot_name); -+ dotdot_name.d_name.name = ".."; -+ dotdot_name.d_name.len = 2; -+ /* -+ * allocate ->d_fsdata on the stack to avoid using -+ * reiser4_get_dentry_fsdata(). Locking is not needed, -+ * because dentry is private to the current thread. 
-+ */ -+ dotdot_name.d_fsdata = &dataonstack; -+ init_lh(&dotdot_lh); -+ -+ fsdata = &dataonstack; -+ dotdot_coord = &fsdata->dec.entry_coord; -+ coord_clear_iplug(dotdot_coord); -+ -+ result = reiser4_find_entry(old_inode, -+ &dotdot_name, -+ &dotdot_lh, -+ ZNODE_WRITE_LOCK, -+ &dotdot_entry); -+ if (result == 0) { -+ /* replace_name() decreases i_nlink on -+ * @old_dir */ -+ result = replace_name(new_dir, -+ old_inode, -+ old_dir, -+ dotdot_coord, &dotdot_lh); -+ } else -+ result = RETERR(-EIO); -+ done_lh(&dotdot_lh); -+ } -+ } -+ reiser4_update_dir(new_dir); -+ reiser4_update_dir(old_dir); -+ reiser4_update_sd(old_inode); -+ if (result == 0) { -+ file_plugin *fplug; -+ -+ if (new_inode != NULL) { -+ /* add safe-link for target file (in case we removed -+ * last reference to the poor fellow */ -+ fplug = inode_file_plugin(new_inode); -+ if (new_inode->i_nlink == 0) -+ result = safe_link_add(new_inode, SAFE_UNLINK); -+ } -+ } -+ exit: -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+#endif -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/acl.h linux-2.6.20/fs/reiser4/plugin/item/acl.h ---- linux-2.6.20.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/acl.h 2007-05-06 14:50:43.799006970 +0400 -@@ -0,0 +1,66 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Directory entry. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ) -+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+ -+#include -+#include /* for struct dentry */ -+ -+typedef struct directory_entry_format { -+ /* key of object stat-data. It's not necessary to store whole -+ key here, because it's always key of stat-data, so minor -+ packing locality and offset can be omitted here. 
But this -+ relies on particular key allocation scheme for stat-data, so, -+ for extensibility sake, whole key can be stored here. -+ -+ We store key as array of bytes, because we don't want 8-byte -+ alignment of dir entries. -+ */ -+ obj_key_id id; -+ /* file name. Null terminated string. */ -+ d8 name[0]; -+} directory_entry_format; -+ -+void print_de(const char *prefix, coord_t * coord); -+int extract_key_de(const coord_t * coord, reiser4_key * key); -+int update_key_de(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh); -+char *extract_name_de(const coord_t * coord, char *buf); -+unsigned extract_file_type_de(const coord_t * coord); -+int add_entry_de(struct inode *dir, coord_t * coord, -+ lock_handle * lh, const struct dentry *name, -+ reiser4_dir_entry_desc * entry); -+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord, -+ lock_handle * lh, reiser4_dir_entry_desc * entry); -+int max_name_len_de(const struct inode *dir); -+ -+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length); -+ -+char *extract_dent_name(const coord_t * coord, -+ directory_entry_format * dent, char *buf); -+ -+#if REISER4_LARGE_KEY -+#define DE_NAME_BUF_LEN (24) -+#else -+#define DE_NAME_BUF_LEN (16) -+#endif -+ -+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.20/fs/reiser4/plugin/item/blackbox.c ---- linux-2.6.20.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/blackbox.c 2007-05-06 14:50:43.799006970 +0400 -@@ -0,0 +1,142 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Black box item implementation */ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../coord.h" -+#include "../../tree.h" -+#include "../../lock.h" -+ -+#include "blackbox.h" -+#include "item.h" -+#include "../plugin.h" -+ -+int -+store_black_box(reiser4_tree * tree, -+ const reiser4_key * key, void *data, int length) -+{ -+ int result; -+ reiser4_item_data idata; -+ coord_t coord; -+ lock_handle lh; -+ -+ memset(&idata, 0, sizeof idata); -+ -+ idata.data = data; -+ idata.user = 0; -+ idata.length = length; -+ idata.iplug = item_plugin_by_id(BLACK_BOX_ID); -+ -+ init_lh(&lh); -+ result = insert_by_key(tree, key, -+ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE); -+ -+ assert("nikita-3413", -+ ergo(result == 0, -+ WITH_COORD(&coord, -+ item_length_by_coord(&coord) == length))); -+ -+ done_lh(&lh); -+ return result; -+} -+ -+int -+load_black_box(reiser4_tree * tree, -+ reiser4_key * key, void *data, int length, int exact) -+{ -+ int result; -+ coord_t coord; -+ lock_handle lh; -+ -+ init_lh(&lh); -+ result = coord_by_key(tree, key, -+ &coord, &lh, ZNODE_READ_LOCK, -+ exact ? 
FIND_EXACT : FIND_MAX_NOT_MORE_THAN, -+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL); -+ -+ if (result == 0) { -+ int ilen; -+ -+ result = zload(coord.node); -+ if (result == 0) { -+ ilen = item_length_by_coord(&coord); -+ if (ilen <= length) { -+ memcpy(data, item_body_by_coord(&coord), ilen); -+ unit_key_by_coord(&coord, key); -+ } else if (exact) { -+ /* -+ * item is larger than buffer provided by the -+ * user. Only issue a warning if @exact is -+ * set. If @exact is false, we are iterating -+ * over all safe-links and here we are reaching -+ * the end of the iteration. -+ */ -+ warning("nikita-3415", -+ "Wrong black box length: %i > %i", -+ ilen, length); -+ result = RETERR(-EIO); -+ } -+ zrelse(coord.node); -+ } -+ } -+ -+ done_lh(&lh); -+ return result; -+ -+} -+ -+int -+update_black_box(reiser4_tree * tree, -+ const reiser4_key * key, void *data, int length) -+{ -+ int result; -+ coord_t coord; -+ lock_handle lh; -+ -+ init_lh(&lh); -+ result = coord_by_key(tree, key, -+ &coord, &lh, ZNODE_READ_LOCK, -+ FIND_EXACT, -+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL); -+ if (result == 0) { -+ int ilen; -+ -+ result = zload(coord.node); -+ if (result == 0) { -+ ilen = item_length_by_coord(&coord); -+ if (length <= ilen) { -+ memcpy(item_body_by_coord(&coord), data, -+ length); -+ } else { -+ warning("nikita-3437", -+ "Wrong black box length: %i < %i", -+ ilen, length); -+ result = RETERR(-EIO); -+ } -+ zrelse(coord.node); -+ } -+ } -+ -+ done_lh(&lh); -+ return result; -+ -+} -+ -+int kill_black_box(reiser4_tree * tree, const reiser4_key * key) -+{ -+ return reiser4_cut_tree(tree, key, key, NULL, 1); -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.20/fs/reiser4/plugin/item/blackbox.h ---- linux-2.6.20.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/blackbox.h 2007-05-06 14:50:43.799006970 +0400 -@@ -0,0 +1,33 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* "Black box" entry to fixed-width contain user supplied data */ -+ -+#if !defined( __FS_REISER4_BLACK_BOX_H__ ) -+#define __FS_REISER4_BLACK_BOX_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+ -+extern int store_black_box(reiser4_tree * tree, -+ const reiser4_key * key, void *data, int length); -+extern int load_black_box(reiser4_tree * tree, -+ reiser4_key * key, void *data, int length, int exact); -+extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key); -+extern int update_black_box(reiser4_tree * tree, -+ const reiser4_key * key, void *data, int length); -+ -+/* __FS_REISER4_BLACK_BOX_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/cde.c linux-2.6.20/fs/reiser4/plugin/item/cde.c ---- linux-2.6.20.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/cde.c 2007-05-06 14:50:43.799006970 +0400 -@@ -0,0 +1,1008 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Directory entry implementation */ -+ -+/* DESCRIPTION: -+ -+ This is "compound" directory item plugin implementation. 
This directory -+ item type is compound (as opposed to the "simple directory item" in -+ fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory -+ entries. -+ -+ The reason behind this decision is disk space efficiency: all directory -+ entries inside the same directory have identical fragment in their -+ keys. This, of course, depends on key assignment policy. In our default key -+ assignment policy, all directory entries have the same locality which is -+ equal to the object id of their directory. -+ -+ Composing directory item out of several directory entries for the same -+ directory allows us to store said key fragment only once. That is, this is -+ some ad hoc form of key compression (stem compression) that is implemented -+ here, because general key compression is not supposed to be implemented in -+ v4.0. -+ -+ Another decision that was made regarding all directory item plugins, is -+ that they will store entry keys unaligned. This is for that sake of disk -+ space efficiency again. -+ -+ In should be noted, that storing keys unaligned increases CPU consumption, -+ at least on some architectures. -+ -+ Internal on-disk structure of the compound directory item is the following: -+ -+ HEADER cde_item_format. Here number of entries is stored. -+ ENTRY_HEADER_0 cde_unit_header. Here part of entry key and -+ ENTRY_HEADER_1 offset of entry body are stored. -+ ENTRY_HEADER_2 (basically two last parts of key) -+ ... -+ ENTRY_HEADER_N -+ ENTRY_BODY_0 directory_entry_format. Here part of stat data key and -+ ENTRY_BODY_1 NUL-terminated name are stored. -+ ENTRY_BODY_2 (part of statadta key in the -+ sence that since all SDs have -+ zero offset, this offset is not -+ stored on disk). -+ ... -+ ENTRY_BODY_N -+ -+ When it comes to the balancing, each directory entry in compound directory -+ item is unit, that is, something that can be cut from one item and pasted -+ into another item of the same type. 
Handling of unit cut and paste is major -+ reason for the complexity of code below. -+ -+*/ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "sde.h" -+#include "cde.h" -+#include "item.h" -+#include "../node/node.h" -+#include "../plugin.h" -+#include "../../znode.h" -+#include "../../carry.h" -+#include "../../tree.h" -+#include "../../inode.h" -+ -+#include /* for struct inode */ -+#include /* for struct dentry */ -+#include -+ -+#if 0 -+#define CHECKME(coord) \ -+({ \ -+ const char *message; \ -+ coord_t dup; \ -+ \ -+ coord_dup_nocheck(&dup, (coord)); \ -+ dup.unit_pos = 0; \ -+ assert("nikita-2871", cde_check(&dup, &message) == 0); \ -+}) -+#else -+#define CHECKME(coord) noop -+#endif -+ -+/* return body of compound directory item at @coord */ -+static inline cde_item_format *formatted_at(const coord_t * coord) -+{ -+ assert("nikita-1282", coord != NULL); -+ return item_body_by_coord(coord); -+} -+ -+/* return entry header at @coord */ -+static inline cde_unit_header *header_at(const coord_t * -+ coord /* coord of item */ , -+ int idx /* index of unit */ ) -+{ -+ assert("nikita-1283", coord != NULL); -+ return &formatted_at(coord)->entry[idx]; -+} -+ -+/* return number of units in compound directory item at @coord */ -+static int units(const coord_t * coord /* coord of item */ ) -+{ -+ return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries)); -+} -+ -+/* return offset of the body of @idx-th entry in @coord */ -+static unsigned int offset_of(const coord_t * coord /* coord of item */ , -+ int idx /* index of unit */ ) -+{ -+ if (idx < units(coord)) -+ return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset)); -+ else if (idx == units(coord)) -+ return item_length_by_coord(coord); -+ else -+ impossible("nikita-1308", "Wrong idx"); -+ return 0; -+} -+ -+/* set offset of the body of @idx-th entry in @coord */ 
-+static void set_offset(const coord_t * coord /* coord of item */ , -+ int idx /* index of unit */ , -+ unsigned int offset /* new offset */ ) -+{ -+ put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset); -+} -+ -+static void adj_offset(const coord_t * coord /* coord of item */ , -+ int idx /* index of unit */ , -+ int delta /* offset change */ ) -+{ -+ d16 *doffset; -+ __u16 offset; -+ -+ doffset = &header_at(coord, idx)->offset; -+ offset = le16_to_cpu(get_unaligned(doffset)); -+ offset += delta; -+ put_unaligned(cpu_to_le16((__u16) offset), doffset); -+} -+ -+/* return pointer to @offset-th byte from the beginning of @coord */ -+static char *address(const coord_t * coord /* coord of item */ , -+ int offset) -+{ -+ return ((char *)item_body_by_coord(coord)) + offset; -+} -+ -+/* return pointer to the body of @idx-th entry in @coord */ -+static directory_entry_format *entry_at(const coord_t * coord /* coord of -+ * item */ , -+ int idx /* index of unit */ ) -+{ -+ return (directory_entry_format *) address(coord, -+ (int)offset_of(coord, idx)); -+} -+ -+/* return number of unit referenced by @coord */ -+static int idx_of(const coord_t * coord /* coord of item */ ) -+{ -+ assert("nikita-1285", coord != NULL); -+ return coord->unit_pos; -+} -+ -+/* find position where entry with @entry_key would be inserted into @coord */ -+static int find(const coord_t * coord /* coord of item */ , -+ const reiser4_key * entry_key /* key to look for */ , -+ cmp_t * last /* result of last comparison */ ) -+{ -+ int entries; -+ -+ int left; -+ int right; -+ -+ cde_unit_header *header; -+ -+ assert("nikita-1295", coord != NULL); -+ assert("nikita-1296", entry_key != NULL); -+ assert("nikita-1297", last != NULL); -+ -+ entries = units(coord); -+ left = 0; -+ right = entries - 1; -+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) { -+ int median; -+ -+ median = (left + right) >> 1; -+ -+ header = header_at(coord, median); -+ *last = de_id_key_cmp(&header->hash, 
entry_key); -+ switch (*last) { -+ case LESS_THAN: -+ left = median; -+ break; -+ case GREATER_THAN: -+ right = median; -+ break; -+ case EQUAL_TO:{ -+ do { -+ median--; -+ header--; -+ } while (median >= 0 && -+ de_id_key_cmp(&header->hash, -+ entry_key) == EQUAL_TO); -+ return median + 1; -+ } -+ } -+ } -+ header = header_at(coord, left); -+ for (; left < entries; ++left, ++header) { -+ prefetch(header + 1); -+ *last = de_id_key_cmp(&header->hash, entry_key); -+ if (*last != LESS_THAN) -+ break; -+ } -+ if (left < entries) -+ return left; -+ else -+ return RETERR(-ENOENT); -+ -+} -+ -+/* expand @coord as to accommodate for insertion of @no new entries starting -+ from @pos, with total bodies size @size. */ -+static int expand_item(const coord_t * coord /* coord of item */ , -+ int pos /* unit position */ , int no /* number of new -+ * units*/ , -+ int size /* total size of new units' data */ , -+ unsigned int data_size /* free space already reserved -+ * in the item for insertion */ ) -+{ -+ int entries; -+ cde_unit_header *header; -+ char *dent; -+ int i; -+ -+ assert("nikita-1310", coord != NULL); -+ assert("nikita-1311", pos >= 0); -+ assert("nikita-1312", no > 0); -+ assert("nikita-1313", data_size >= no * sizeof(directory_entry_format)); -+ assert("nikita-1343", -+ item_length_by_coord(coord) >= -+ (int)(size + data_size + no * sizeof *header)); -+ -+ entries = units(coord); -+ -+ if (pos == entries) -+ dent = address(coord, size); -+ else -+ dent = (char *)entry_at(coord, pos); -+ /* place where new header will be in */ -+ header = header_at(coord, pos); -+ /* free space for new entry headers */ -+ memmove(header + no, header, -+ (unsigned)(address(coord, size) - (char *)header)); -+ /* if adding to the end initialise first new header */ -+ if (pos == entries) { -+ set_offset(coord, pos, (unsigned)size); -+ } -+ -+ /* adjust entry pointer and size */ -+ dent = dent + no * sizeof *header; -+ size += no * sizeof *header; -+ /* free space for new entries */ -+ 
memmove(dent + data_size, dent, -+ (unsigned)(address(coord, size) - dent)); -+ -+ /* increase counter */ -+ entries += no; -+ put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries); -+ -+ /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header ) -+ bytes. */ -+ for (i = 0; i <= pos; ++i) -+ adj_offset(coord, i, no * sizeof *header); -+ /* [ pos + no ... +\infty ) entries were shifted by ( no * -+ sizeof *header + data_size ) bytes */ -+ for (i = pos + no; i < entries; ++i) -+ adj_offset(coord, i, no * sizeof *header + data_size); -+ return 0; -+} -+ -+/* insert new @entry into item */ -+static int expand(const coord_t * coord /* coord of item */ , -+ cde_entry * entry /* entry to insert */ , -+ int len /* length of @entry data */ , -+ int *pos /* position to insert */ , -+ reiser4_dir_entry_desc * dir_entry /* parameters for new -+ * entry */ ) -+{ -+ cmp_t cmp_res; -+ int datasize; -+ -+ *pos = find(coord, &dir_entry->key, &cmp_res); -+ if (*pos < 0) -+ *pos = units(coord); -+ -+ datasize = sizeof(directory_entry_format); -+ if (is_longname(entry->name->name, entry->name->len)) -+ datasize += entry->name->len + 1; -+ -+ expand_item(coord, *pos, 1, item_length_by_coord(coord) - len, -+ datasize); -+ return 0; -+} -+ -+/* paste body of @entry into item */ -+static int paste_entry(const coord_t * coord /* coord of item */ , -+ cde_entry * entry /* new entry */ , -+ int pos /* position to insert */ , -+ reiser4_dir_entry_desc * dir_entry /* parameters for -+ * new entry */ ) -+{ -+ cde_unit_header *header; -+ directory_entry_format *dent; -+ const char *name; -+ int len; -+ -+ header = header_at(coord, pos); -+ dent = entry_at(coord, pos); -+ -+ build_de_id_by_key(&dir_entry->key, &header->hash); -+ build_inode_key_id(entry->obj, &dent->id); -+ /* AUDIT unsafe strcpy() operation! 
It should be replaced with -+ much less CPU hungry -+ memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len ); -+ -+ Also a more major thing is that there should be a way to figure out -+ amount of space in dent -> name and be able to check that we are -+ not going to overwrite more than we supposed to */ -+ name = entry->name->name; -+ len = entry->name->len; -+ if (is_longname(name, len)) { -+ strcpy((unsigned char *)dent->name, name); -+ put_unaligned(0, &dent->name[len]); -+ } -+ return 0; -+} -+ -+/* estimate how much space is necessary in item to insert/paste set of entries -+ described in @data. */ -+int estimate_cde(const coord_t * coord /* coord of item */ , -+ const reiser4_item_data * data /* parameters for new item */ ) -+{ -+ cde_entry_data *e; -+ int result; -+ int i; -+ -+ e = (cde_entry_data *) data->data; -+ -+ assert("nikita-1288", e != NULL); -+ assert("nikita-1289", e->num_of_entries >= 0); -+ -+ if (coord == NULL) -+ /* insert */ -+ result = sizeof(cde_item_format); -+ else -+ /* paste */ -+ result = 0; -+ -+ result += e->num_of_entries * -+ (sizeof(cde_unit_header) + sizeof(directory_entry_format)); -+ for (i = 0; i < e->num_of_entries; ++i) { -+ const char *name; -+ int len; -+ -+ name = e->entry[i].name->name; -+ len = e->entry[i].name->len; -+ assert("nikita-2054", strlen(name) == len); -+ if (is_longname(name, len)) -+ result += len + 1; -+ } -+ ((reiser4_item_data *) data)->length = result; -+ return result; -+} -+ -+/* ->nr_units() method for this item plugin. */ -+pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ ) -+{ -+ return units(coord); -+} -+ -+/* ->unit_key() method for this item plugin. 
*/ -+reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ , -+ reiser4_key * key /* resulting key */ ) -+{ -+ assert("nikita-1452", coord != NULL); -+ assert("nikita-1345", idx_of(coord) < units(coord)); -+ assert("nikita-1346", key != NULL); -+ -+ item_key_by_coord(coord, key); -+ extract_key_from_de_id(extract_dir_id_from_key(key), -+ &header_at(coord, idx_of(coord))->hash, key); -+ return key; -+} -+ -+/* mergeable_cde(): implementation of ->mergeable() item method. -+ -+ Two directory items are mergeable iff they are from the same -+ directory. That simple. -+ -+*/ -+int mergeable_cde(const coord_t * p1 /* coord of first item */ , -+ const coord_t * p2 /* coord of second item */ ) -+{ -+ reiser4_key k1; -+ reiser4_key k2; -+ -+ assert("nikita-1339", p1 != NULL); -+ assert("nikita-1340", p2 != NULL); -+ -+ return -+ (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) && -+ (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) == -+ extract_dir_id_from_key(item_key_by_coord(p2, &k2))); -+ -+} -+ -+/* ->max_key_inside() method for this item plugin. */ -+reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ , -+ reiser4_key * result /* resulting key */ ) -+{ -+ assert("nikita-1342", coord != NULL); -+ -+ item_key_by_coord(coord, result); -+ set_key_ordering(result, get_key_ordering(reiser4_max_key())); -+ set_key_fulloid(result, get_key_fulloid(reiser4_max_key())); -+ set_key_offset(result, get_key_offset(reiser4_max_key())); -+ return result; -+} -+ -+/* @data contains data which are to be put into tree */ -+int can_contain_key_cde(const coord_t * coord /* coord of item */ , -+ const reiser4_key * key /* key to check */ , -+ const reiser4_item_data * data /* parameters of new -+ * item/unit being -+ * created */ ) -+{ -+ reiser4_key item_key; -+ -+ /* FIXME-VS: do not rely on anything but iplug field of @data. 
Only -+ data->iplug is initialized */ -+ assert("vs-457", data && data->iplug); -+/* assert( "vs-553", data -> user == 0 );*/ -+ item_key_by_coord(coord, &item_key); -+ -+ return (item_plugin_by_coord(coord) == data->iplug) && -+ (extract_dir_id_from_key(&item_key) == -+ extract_dir_id_from_key(key)); -+} -+ -+#if REISER4_DEBUG -+/* cde_check ->check() method for compressed directory items -+ -+ used for debugging, every item should have here the most complete -+ possible check of the consistency of the item that the inventor can -+ construct -+*/ -+int reiser4_check_cde(const coord_t * coord /* coord of item to check */, -+ const char **error /* where to store error message */) -+{ -+ int i; -+ int result; -+ char *item_start; -+ char *item_end; -+ reiser4_key key; -+ -+ coord_t c; -+ -+ assert("nikita-1357", coord != NULL); -+ assert("nikita-1358", error != NULL); -+ -+ if (!ergo(coord->item_pos != 0, -+ is_dot_key(item_key_by_coord(coord, &key)))) { -+ *error = "CDE doesn't start with dot"; -+ return -1; -+ } -+ item_start = item_body_by_coord(coord); -+ item_end = item_start + item_length_by_coord(coord); -+ -+ coord_dup(&c, coord); -+ result = 0; -+ for (i = 0; i < units(coord); ++i) { -+ directory_entry_format *entry; -+ -+ if ((char *)(header_at(coord, i) + 1) > -+ item_end - units(coord) * sizeof *entry) { -+ *error = "CDE header is out of bounds"; -+ result = -1; -+ break; -+ } -+ entry = entry_at(coord, i); -+ if ((char *)entry < item_start + sizeof(cde_item_format)) { -+ *error = "CDE header is too low"; -+ result = -1; -+ break; -+ } -+ if ((char *)(entry + 1) > item_end) { -+ *error = "CDE header is too high"; -+ result = -1; -+ break; -+ } -+ } -+ -+ return result; -+} -+#endif -+ -+/* ->init() method for this item plugin. 
*/ -+int init_cde(coord_t * coord /* coord of item */ , -+ coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */ -+ UNUSED_ARG) -+{ -+ put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries); -+ return 0; -+} -+ -+/* ->lookup() method for this item plugin. */ -+lookup_result lookup_cde(const reiser4_key * key /* key to search for */ , -+ lookup_bias bias /* search bias */ , -+ coord_t * coord /* coord of item to lookup in */ ) -+{ -+ cmp_t last_comp; -+ int pos; -+ -+ reiser4_key utmost_key; -+ -+ assert("nikita-1293", coord != NULL); -+ assert("nikita-1294", key != NULL); -+ -+ CHECKME(coord); -+ -+ if (keygt(item_key_by_coord(coord, &utmost_key), key)) { -+ coord->unit_pos = 0; -+ coord->between = BEFORE_UNIT; -+ return CBK_COORD_NOTFOUND; -+ } -+ pos = find(coord, key, &last_comp); -+ if (pos >= 0) { -+ coord->unit_pos = (int)pos; -+ switch (last_comp) { -+ case EQUAL_TO: -+ coord->between = AT_UNIT; -+ return CBK_COORD_FOUND; -+ case GREATER_THAN: -+ coord->between = BEFORE_UNIT; -+ return RETERR(-ENOENT); -+ case LESS_THAN: -+ default: -+ impossible("nikita-1298", "Broken find"); -+ return RETERR(-EIO); -+ } -+ } else { -+ coord->unit_pos = units(coord) - 1; -+ coord->between = AFTER_UNIT; -+ return (bias == -+ FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND : -+ CBK_COORD_NOTFOUND; -+ } -+} -+ -+/* ->paste() method for this item plugin. 
*/ -+int paste_cde(coord_t * coord /* coord of item */ , -+ reiser4_item_data * data /* parameters of new unit being -+ * inserted */ , -+ carry_plugin_info * info UNUSED_ARG /* todo carry queue */ ) -+{ -+ cde_entry_data *e; -+ int result; -+ int i; -+ -+ CHECKME(coord); -+ e = (cde_entry_data *) data->data; -+ -+ result = 0; -+ for (i = 0; i < e->num_of_entries; ++i) { -+ int pos; -+ int phantom_size; -+ -+ phantom_size = data->length; -+ if (units(coord) == 0) -+ phantom_size -= sizeof(cde_item_format); -+ -+ result = -+ expand(coord, e->entry + i, phantom_size, &pos, data->arg); -+ if (result != 0) -+ break; -+ result = paste_entry(coord, e->entry + i, pos, data->arg); -+ if (result != 0) -+ break; -+ } -+ CHECKME(coord); -+ return result; -+} -+ -+/* amount of space occupied by all entries starting from @idx both headers and -+ bodies. */ -+static unsigned int part_size(const coord_t * coord /* coord of item */ , -+ int idx /* index of unit */ ) -+{ -+ assert("nikita-1299", coord != NULL); -+ assert("nikita-1300", idx < (int)units(coord)); -+ -+ return sizeof(cde_item_format) + -+ (idx + 1) * sizeof(cde_unit_header) + offset_of(coord, -+ idx + 1) - -+ offset_of(coord, 0); -+} -+ -+/* how many but not more than @want units of @source can be merged with -+ item in @target node. If pend == append - we try to append last item -+ of @target by first units of @source. If pend == prepend - we try to -+ "prepend" first item in @target by last units of @source. @target -+ node has @free_space bytes of free space. 
Total size of those units -+ are returned via @size */ -+int can_shift_cde(unsigned free_space /* free space in item */ , -+ coord_t * coord /* coord of source item */ , -+ znode * target /* target node */ , -+ shift_direction pend /* shift direction */ , -+ unsigned *size /* resulting number of shifted bytes */ , -+ unsigned want /* maximal number of bytes to shift */ ) -+{ -+ int shift; -+ -+ CHECKME(coord); -+ if (want == 0) { -+ *size = 0; -+ return 0; -+ } -+ -+ /* pend == SHIFT_LEFT <==> shifting to the left */ -+ if (pend == SHIFT_LEFT) { -+ for (shift = min((int)want - 1, units(coord)); shift >= 0; -+ --shift) { -+ *size = part_size(coord, shift); -+ if (target != NULL) -+ *size -= sizeof(cde_item_format); -+ if (*size <= free_space) -+ break; -+ } -+ shift = shift + 1; -+ } else { -+ int total_size; -+ -+ assert("nikita-1301", pend == SHIFT_RIGHT); -+ -+ total_size = item_length_by_coord(coord); -+ for (shift = units(coord) - want - 1; shift < units(coord) - 1; -+ ++shift) { -+ *size = total_size - part_size(coord, shift); -+ if (target == NULL) -+ *size += sizeof(cde_item_format); -+ if (*size <= free_space) -+ break; -+ } -+ shift = units(coord) - shift - 1; -+ } -+ if (shift == 0) -+ *size = 0; -+ CHECKME(coord); -+ return shift; -+} -+ -+/* ->copy_units() method for this item plugin. 
*/ -+void copy_units_cde(coord_t * target /* coord of target item */ , -+ coord_t * source /* coord of source item */ , -+ unsigned from /* starting unit */ , -+ unsigned count /* how many units to copy */ , -+ shift_direction where_is_free_space /* shift direction */ , -+ unsigned free_space /* free space in item */ ) -+{ -+ char *header_from; -+ char *header_to; -+ -+ char *entry_from; -+ char *entry_to; -+ -+ int pos_in_target; -+ int data_size; -+ int data_delta; -+ int i; -+ -+ assert("nikita-1303", target != NULL); -+ assert("nikita-1304", source != NULL); -+ assert("nikita-1305", (int)from < units(source)); -+ assert("nikita-1307", (int)(from + count) <= units(source)); -+ -+ if (where_is_free_space == SHIFT_LEFT) { -+ assert("nikita-1453", from == 0); -+ pos_in_target = units(target); -+ } else { -+ assert("nikita-1309", (int)(from + count) == units(source)); -+ pos_in_target = 0; -+ memmove(item_body_by_coord(target), -+ (char *)item_body_by_coord(target) + free_space, -+ item_length_by_coord(target) - free_space); -+ } -+ -+ CHECKME(target); -+ CHECKME(source); -+ -+ /* expand @target */ -+ data_size = -+ offset_of(source, (int)(from + count)) - offset_of(source, -+ (int)from); -+ -+ if (units(target) == 0) -+ free_space -= sizeof(cde_item_format); -+ -+ expand_item(target, pos_in_target, (int)count, -+ (int)(item_length_by_coord(target) - free_space), -+ (unsigned)data_size); -+ -+ /* copy first @count units of @source into @target */ -+ data_delta = -+ offset_of(target, pos_in_target) - offset_of(source, (int)from); -+ -+ /* copy entries */ -+ entry_from = (char *)entry_at(source, (int)from); -+ entry_to = (char *)entry_at(source, (int)(from + count)); -+ memmove(entry_at(target, pos_in_target), entry_from, -+ (unsigned)(entry_to - entry_from)); -+ -+ /* copy headers */ -+ header_from = (char *)header_at(source, (int)from); -+ header_to = (char *)header_at(source, (int)(from + count)); -+ memmove(header_at(target, pos_in_target), header_from, -+ 
(unsigned)(header_to - header_from)); -+ -+ /* update offsets */ -+ for (i = pos_in_target; i < (int)(pos_in_target + count); ++i) -+ adj_offset(target, i, data_delta); -+ CHECKME(target); -+ CHECKME(source); -+} -+ -+/* ->cut_units() method for this item plugin. */ -+int cut_units_cde(coord_t * coord /* coord of item */ , -+ pos_in_node_t from /* start unit pos */ , -+ pos_in_node_t to /* stop unit pos */ , -+ struct carry_cut_data *cdata UNUSED_ARG, -+ reiser4_key * smallest_removed, reiser4_key * new_first) -+{ -+ char *header_from; -+ char *header_to; -+ -+ char *entry_from; -+ char *entry_to; -+ -+ int size; -+ int entry_delta; -+ int header_delta; -+ int i; -+ -+ unsigned count; -+ -+ CHECKME(coord); -+ -+ count = to - from + 1; -+ -+ assert("nikita-1454", coord != NULL); -+ assert("nikita-1455", (int)(from + count) <= units(coord)); -+ -+ if (smallest_removed) -+ unit_key_by_coord(coord, smallest_removed); -+ -+ if (new_first) { -+ coord_t next; -+ -+ /* not everything is cut from item head */ -+ assert("vs-1527", from == 0); -+ assert("vs-1528", to < units(coord) - 1); -+ -+ coord_dup(&next, coord); -+ next.unit_pos++; -+ unit_key_by_coord(&next, new_first); -+ } -+ -+ size = item_length_by_coord(coord); -+ if (count == (unsigned)units(coord)) { -+ return size; -+ } -+ -+ header_from = (char *)header_at(coord, (int)from); -+ header_to = (char *)header_at(coord, (int)(from + count)); -+ -+ entry_from = (char *)entry_at(coord, (int)from); -+ entry_to = (char *)entry_at(coord, (int)(from + count)); -+ -+ /* move headers */ -+ memmove(header_from, header_to, -+ (unsigned)(address(coord, size) - header_to)); -+ -+ header_delta = header_to - header_from; -+ -+ entry_from -= header_delta; -+ entry_to -= header_delta; -+ size -= header_delta; -+ -+ /* copy entries */ -+ memmove(entry_from, entry_to, -+ (unsigned)(address(coord, size) - entry_to)); -+ -+ entry_delta = entry_to - entry_from; -+ size -= entry_delta; -+ -+ /* update offsets */ -+ -+ for (i = 0; i < 
(int)from; ++i) -+ adj_offset(coord, i, -header_delta); -+ -+ for (i = from; i < units(coord) - (int)count; ++i) -+ adj_offset(coord, i, -header_delta - entry_delta); -+ -+ put_unaligned(cpu_to_le16((__u16) units(coord) - count), -+ &formatted_at(coord)->num_of_entries); -+ -+ if (from == 0) { -+ /* entries from head was removed - move remaining to right */ -+ memmove((char *)item_body_by_coord(coord) + -+ header_delta + entry_delta, item_body_by_coord(coord), -+ (unsigned)size); -+ if (REISER4_DEBUG) -+ memset(item_body_by_coord(coord), 0, -+ (unsigned)header_delta + entry_delta); -+ } else { -+ /* freed space is already at the end of item */ -+ if (REISER4_DEBUG) -+ memset((char *)item_body_by_coord(coord) + size, 0, -+ (unsigned)header_delta + entry_delta); -+ } -+ -+ return header_delta + entry_delta; -+} -+ -+int kill_units_cde(coord_t * coord /* coord of item */ , -+ pos_in_node_t from /* start unit pos */ , -+ pos_in_node_t to /* stop unit pos */ , -+ struct carry_kill_data *kdata UNUSED_ARG, -+ reiser4_key * smallest_removed, reiser4_key * new_first) -+{ -+ return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first); -+} -+ -+/* ->s.dir.extract_key() method for this item plugin. 
*/ -+int extract_key_cde(const coord_t * coord /* coord of item */ , -+ reiser4_key * key /* resulting key */ ) -+{ -+ directory_entry_format *dent; -+ -+ assert("nikita-1155", coord != NULL); -+ assert("nikita-1156", key != NULL); -+ -+ dent = entry_at(coord, idx_of(coord)); -+ return extract_key_from_id(&dent->id, key); -+} -+ -+int -+update_key_cde(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh UNUSED_ARG) -+{ -+ directory_entry_format *dent; -+ obj_key_id obj_id; -+ int result; -+ -+ assert("nikita-2344", coord != NULL); -+ assert("nikita-2345", key != NULL); -+ -+ dent = entry_at(coord, idx_of(coord)); -+ result = build_obj_key_id(key, &obj_id); -+ if (result == 0) { -+ dent->id = obj_id; -+ znode_make_dirty(coord->node); -+ } -+ return 0; -+} -+ -+/* ->s.dir.extract_name() method for this item plugin. */ -+char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf) -+{ -+ directory_entry_format *dent; -+ -+ assert("nikita-1157", coord != NULL); -+ -+ dent = entry_at(coord, idx_of(coord)); -+ return extract_dent_name(coord, dent, buf); -+} -+ -+static int cde_bytes(int pasting, const reiser4_item_data * data) -+{ -+ int result; -+ -+ result = data->length; -+ if (!pasting) -+ result -= sizeof(cde_item_format); -+ return result; -+} -+ -+/* ->s.dir.add_entry() method for this item plugin */ -+int add_entry_cde(struct inode *dir /* directory object */ , -+ coord_t * coord /* coord of item */ , -+ lock_handle * lh /* lock handle for insertion */ , -+ const struct dentry *name /* name to insert */ , -+ reiser4_dir_entry_desc * dir_entry /* parameters of new -+ * directory entry */ ) -+{ -+ reiser4_item_data data; -+ cde_entry entry; -+ cde_entry_data edata; -+ int result; -+ -+ assert("nikita-1656", coord->node == lh->node); -+ assert("nikita-1657", znode_is_write_locked(coord->node)); -+ -+ edata.num_of_entries = 1; -+ edata.entry = &entry; -+ -+ entry.dir = dir; -+ entry.obj = dir_entry->obj; -+ entry.name = &name->d_name; 
-+ -+ data.data = (char *)&edata; -+ data.user = 0; /* &edata is not user space */ -+ data.iplug = item_plugin_by_id(COMPOUND_DIR_ID); -+ data.arg = dir_entry; -+ assert("nikita-1302", data.iplug != NULL); -+ -+ result = is_dot_key(&dir_entry->key); -+ data.length = estimate_cde(result ? coord : NULL, &data); -+ -+ /* NOTE-NIKITA quota plugin? */ -+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data))) -+ return RETERR(-EDQUOT); -+ -+ if (result) -+ result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0); -+ else -+ result = reiser4_resize_item(coord, &data, &dir_entry->key, -+ lh, 0); -+ return result; -+} -+ -+/* ->s.dir.rem_entry() */ -+int rem_entry_cde(struct inode *dir /* directory of item */ , -+ const struct qstr *name, coord_t * coord /* coord of item */ , -+ lock_handle * lh UNUSED_ARG /* lock handle for -+ * removal */ , -+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of -+ * directory entry -+ * being removed */ ) -+{ -+ coord_t shadow; -+ int result; -+ int length; -+ ON_DEBUG(char buf[DE_NAME_BUF_LEN]); -+ -+ assert("nikita-2870", strlen(name->name) == name->len); -+ assert("nikita-2869", -+ !strcmp(name->name, extract_name_cde(coord, buf))); -+ -+ length = sizeof(directory_entry_format) + sizeof(cde_unit_header); -+ if (is_longname(name->name, name->len)) -+ length += name->len + 1; -+ -+ if (inode_get_bytes(dir) < length) { -+ warning("nikita-2628", "Dir is broke: %llu: %llu", -+ (unsigned long long)get_inode_oid(dir), -+ inode_get_bytes(dir)); -+ -+ return RETERR(-EIO); -+ } -+ -+ /* cut_node() is supposed to take pointers to _different_ -+ coords, because it will modify them without respect to -+ possible aliasing. To work around this, create temporary copy -+ of @coord. -+ */ -+ coord_dup(&shadow, coord); -+ result = -+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0); -+ if (result == 0) { -+ /* NOTE-NIKITA quota plugin? 
*/ -+ DQUOT_FREE_SPACE_NODIRTY(dir, length); -+ } -+ return result; -+} -+ -+/* ->s.dir.max_name_len() method for this item plugin */ -+int max_name_len_cde(const struct inode *dir /* directory */ ) -+{ -+ return -+ reiser4_tree_by_inode(dir)->nplug->max_item_size() - -+ sizeof(directory_entry_format) - sizeof(cde_item_format) - -+ sizeof(cde_unit_header) - 2; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/cde.h linux-2.6.20/fs/reiser4/plugin/item/cde.h ---- linux-2.6.20.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/cde.h 2007-05-06 14:50:43.803008220 +0400 -@@ -0,0 +1,87 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Compound directory item. See cde.c for description. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ ) -+#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ -+ -+#include "../../forward.h" -+#include "../../kassign.h" -+#include "../../dformat.h" -+ -+#include /* for struct inode */ -+#include /* for struct dentry, etc */ -+ -+typedef struct cde_unit_header { -+ de_id hash; -+ d16 offset; -+} cde_unit_header; -+ -+typedef struct cde_item_format { -+ d16 num_of_entries; -+ cde_unit_header entry[0]; -+} cde_item_format; -+ -+typedef struct cde_entry { -+ const struct inode *dir; -+ const struct inode *obj; -+ const struct qstr *name; -+} cde_entry; -+ -+typedef struct cde_entry_data { -+ int num_of_entries; -+ cde_entry *entry; -+} cde_entry_data; -+ -+/* plugin->item.b.* */ -+reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result); -+int can_contain_key_cde(const coord_t * coord, const reiser4_key * key, -+ const reiser4_item_data *); -+int mergeable_cde(const coord_t * p1, const coord_t * p2); -+pos_in_node_t nr_units_cde(const coord_t * 
coord); -+reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key); -+int estimate_cde(const coord_t * coord, const reiser4_item_data * data); -+void print_cde(const char *prefix, coord_t * coord); -+int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data); -+lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias, -+ coord_t * coord); -+int paste_cde(coord_t * coord, reiser4_item_data * data, -+ carry_plugin_info * info UNUSED_ARG); -+int can_shift_cde(unsigned free_space, coord_t * coord, znode * target, -+ shift_direction pend, unsigned *size, unsigned want); -+void copy_units_cde(coord_t * target, coord_t * source, unsigned from, -+ unsigned count, shift_direction where_is_free_space, -+ unsigned free_space); -+int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+void print_cde(const char *prefix, coord_t * coord); -+int reiser4_check_cde(const coord_t * coord, const char **error); -+ -+/* plugin->u.item.s.dir.* */ -+int extract_key_cde(const coord_t * coord, reiser4_key * key); -+int update_key_cde(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh); -+char *extract_name_cde(const coord_t * coord, char *buf); -+int add_entry_cde(struct inode *dir, coord_t * coord, -+ lock_handle * lh, const struct dentry *name, -+ reiser4_dir_entry_desc * entry); -+int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord, -+ lock_handle * lh, reiser4_dir_entry_desc * entry); -+int max_name_len_cde(const struct inode *dir); -+ -+/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.20/fs/reiser4/plugin/item/ctail.c ---- linux-2.6.20.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/ctail.c 2007-05-06 14:50:43.803008220 +0400 -@@ -0,0 +1,1570 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* ctails (aka "clustered tails") are items for cryptcompress objects */ -+ -+/* DESCRIPTION: -+ -+Each cryptcompress object is stored on disk as a set of clusters sliced -+into ctails. -+ -+Internal on-disk structure: -+ -+ HEADER (1) Here stored disk cluster shift -+ BODY -+*/ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "item.h" -+#include "../node/node.h" -+#include "../plugin.h" -+#include "../object.h" -+#include "../../znode.h" -+#include "../../carry.h" -+#include "../../tree.h" -+#include "../../inode.h" -+#include "../../super.h" -+#include "../../context.h" -+#include "../../page_cache.h" -+#include "../cluster.h" -+#include "../../flush.h" -+#include "../../tree_walk.h" -+ -+#include -+#include -+#include -+ -+/* return body of ctail item at @coord */ -+static ctail_item_format *ctail_formatted_at(const coord_t * coord) -+{ -+ assert("edward-60", coord != NULL); -+ return item_body_by_coord(coord); -+} -+ -+static int cluster_shift_by_coord(const coord_t * coord) -+{ -+ return get_unaligned(&ctail_formatted_at(coord)->cluster_shift); -+} -+ -+static inline void dclust_set_extension_shift(hint_t * hint) -+{ -+ assert("edward-1270", -+ item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID); -+ hint->ext_coord.extension.ctail.shift = -+ cluster_shift_by_coord(&hint->ext_coord.coord); -+} -+ -+static 
loff_t off_by_coord(const coord_t * coord) -+{ -+ reiser4_key key; -+ return get_key_offset(item_key_by_coord(coord, &key)); -+} -+ -+int coord_is_unprepped_ctail(const coord_t * coord) -+{ -+ assert("edward-1233", coord != NULL); -+ assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID); -+ assert("edward-1235", -+ ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT, -+ nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS)); -+ -+ return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT; -+} -+ -+static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode) -+{ -+ int shift; -+ -+ if (inode != NULL) { -+ shift = inode_cluster_shift(inode); -+ assert("edward-1236", -+ ergo(!coord_is_unprepped_ctail(coord), -+ shift == cluster_shift_by_coord(coord))); -+ } else { -+ assert("edward-1237", !coord_is_unprepped_ctail(coord)); -+ shift = cluster_shift_by_coord(coord); -+ } -+ return off_by_coord(coord) >> shift; -+} -+ -+static int disk_cluster_size(const coord_t * coord) -+{ -+ assert("edward-1156", -+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID)); -+ /* calculation of disk cluster size -+ is meaninless if ctail is unprepped */ -+ assert("edward-1238", !coord_is_unprepped_ctail(coord)); -+ -+ return 1 << cluster_shift_by_coord(coord); -+} -+ -+/* true if the key is of first disk cluster item */ -+static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord) -+{ -+ assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID); -+ -+ return coord_is_unprepped_ctail(coord) || -+ ((get_key_offset(key) & -+ ((loff_t) disk_cluster_size(coord) - 1)) == 0); -+} -+ -+static char *first_unit(coord_t * coord) -+{ -+ /* FIXME: warning: pointer of type `void *' used in arithmetic */ -+ return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format); -+} -+ -+/* plugin->u.item.b.max_key_inside : -+ tail_max_key_inside */ -+ -+/* plugin->u.item.b.can_contain_key */ -+int -+can_contain_key_ctail(const coord_t * coord, 
const reiser4_key * key, -+ const reiser4_item_data * data) -+{ -+ reiser4_key item_key; -+ -+ if (item_plugin_by_coord(coord) != data->iplug) -+ return 0; -+ -+ item_key_by_coord(coord, &item_key); -+ if (get_key_locality(key) != get_key_locality(&item_key) || -+ get_key_objectid(key) != get_key_objectid(&item_key)) -+ return 0; -+ if (get_key_offset(&item_key) + nr_units_ctail(coord) != -+ get_key_offset(key)) -+ return 0; -+ if (is_disk_cluster_key(key, coord)) -+ return 0; -+ return 1; -+} -+ -+/* plugin->u.item.b.mergeable -+ c-tails of different clusters are not mergeable */ -+int mergeable_ctail(const coord_t * p1, const coord_t * p2) -+{ -+ reiser4_key key1, key2; -+ -+ assert("edward-62", item_id_by_coord(p1) == CTAIL_ID); -+ assert("edward-61", plugin_of_group(item_plugin_by_coord(p1), -+ UNIX_FILE_METADATA_ITEM_TYPE)); -+ -+ if (item_id_by_coord(p2) != CTAIL_ID) { -+ /* second item is of another type */ -+ return 0; -+ } -+ -+ item_key_by_coord(p1, &key1); -+ item_key_by_coord(p2, &key2); -+ if (get_key_locality(&key1) != get_key_locality(&key2) || -+ get_key_objectid(&key1) != get_key_objectid(&key2) || -+ get_key_type(&key1) != get_key_type(&key2)) { -+ /* items of different objects */ -+ return 0; -+ } -+ if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2)) -+ /* not adjacent items */ -+ return 0; -+ if (is_disk_cluster_key(&key2, p2)) -+ return 0; -+ return 1; -+} -+ -+/* plugin->u.item.b.nr_units */ -+pos_in_node_t nr_units_ctail(const coord_t * coord) -+{ -+ return (item_length_by_coord(coord) - -+ sizeof(ctail_formatted_at(coord)->cluster_shift)); -+} -+ -+/* plugin->u.item.b.estimate: -+ estimate how much space is needed to insert/paste @data->length bytes -+ into ctail at @coord */ -+int estimate_ctail(const coord_t * coord /* coord of item */ , -+ const reiser4_item_data * -+ data /* parameters for new item */ ) -+{ -+ if (coord == NULL) -+ /* insert */ -+ return (sizeof(ctail_item_format) + data->length); -+ else -+ /* 
paste */ -+ return data->length; -+} -+ -+/* ->init() method for this item plugin. */ -+int init_ctail(coord_t * to /* coord of item */ , -+ coord_t * from /* old_item */ , -+ reiser4_item_data * data /* structure used for insertion */ ) -+{ -+ int cluster_shift; /* cpu value to convert */ -+ -+ if (data) { -+ assert("edward-463", data->length > sizeof(ctail_item_format)); -+ cluster_shift = *((int *)(data->arg)); -+ data->length -= sizeof(ctail_item_format); -+ } else { -+ assert("edward-464", from != NULL); -+ assert("edward-855", ctail_ok(from)); -+ cluster_shift = (int)(cluster_shift_by_coord(from)); -+ } -+ put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift); -+ assert("edward-856", ctail_ok(to)); -+ return 0; -+} -+ -+/* plugin->u.item.b.lookup: -+ NULL: We are looking for item keys only */ -+ -+#if REISER4_DEBUG -+int ctail_ok(const coord_t * coord) -+{ -+ return coord_is_unprepped_ctail(coord) || -+ cluster_shift_ok(cluster_shift_by_coord(coord)); -+} -+ -+/* plugin->u.item.b.check */ -+int check_ctail(const coord_t * coord, const char **error) -+{ -+ if (!ctail_ok(coord)) { -+ if (error) -+ *error = "bad cluster shift in ctail"; -+ return 1; -+ } -+ return 0; -+} -+#endif -+ -+/* plugin->u.item.b.paste */ -+int -+paste_ctail(coord_t * coord, reiser4_item_data * data, -+ carry_plugin_info * info UNUSED_ARG) -+{ -+ unsigned old_nr_units; -+ -+ assert("edward-268", data->data != NULL); -+ /* copy only from kernel space */ -+ assert("edward-66", data->user == 0); -+ -+ old_nr_units = -+ item_length_by_coord(coord) - sizeof(ctail_item_format) - -+ data->length; -+ -+ /* ctail items never get pasted in the middle */ -+ -+ if (coord->unit_pos == 0 && coord->between == AT_UNIT) { -+ -+ /* paste at the beginning when create new item */ -+ assert("edward-450", -+ item_length_by_coord(coord) == -+ data->length + sizeof(ctail_item_format)); -+ assert("edward-451", old_nr_units == 0); -+ } else if (coord->unit_pos == old_nr_units - 1 -+ && 
coord->between == AFTER_UNIT) { -+ -+ /* paste at the end */ -+ coord->unit_pos++; -+ } else -+ impossible("edward-453", "bad paste position"); -+ -+ memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length); -+ -+ assert("edward-857", ctail_ok(coord)); -+ -+ return 0; -+} -+ -+/* plugin->u.item.b.fast_paste */ -+ -+/* plugin->u.item.b.can_shift -+ number of units is returned via return value, number of bytes via @size. For -+ ctail items they coincide */ -+int -+can_shift_ctail(unsigned free_space, coord_t * source, -+ znode * target, shift_direction direction UNUSED_ARG, -+ unsigned *size /* number of bytes */ , unsigned want) -+{ -+ /* make sure that that we do not want to shift more than we have */ -+ assert("edward-68", want > 0 && want <= nr_units_ctail(source)); -+ -+ *size = min(want, free_space); -+ -+ if (!target) { -+ /* new item will be created */ -+ if (*size <= sizeof(ctail_item_format)) { -+ *size = 0; -+ return 0; -+ } -+ return *size - sizeof(ctail_item_format); -+ } -+ return *size; -+} -+ -+/* plugin->u.item.b.copy_units -+ cooperates with ->can_shift() */ -+void -+copy_units_ctail(coord_t * target, coord_t * source, -+ unsigned from, unsigned count /* units */ , -+ shift_direction where_is_free_space, -+ unsigned free_space /* bytes */ ) -+{ -+ /* make sure that item @target is expanded already */ -+ assert("edward-69", (unsigned)item_length_by_coord(target) >= count); -+ assert("edward-70", free_space == count || free_space == count + 1); -+ -+ assert("edward-858", ctail_ok(source)); -+ -+ if (where_is_free_space == SHIFT_LEFT) { -+ /* append item @target with @count first bytes of @source: -+ this restriction came from ordinary tails */ -+ assert("edward-71", from == 0); -+ assert("edward-860", ctail_ok(target)); -+ -+ memcpy(first_unit(target) + nr_units_ctail(target) - count, -+ first_unit(source), count); -+ } else { -+ /* target item is moved to right already */ -+ reiser4_key key; -+ -+ assert("edward-72", 
nr_units_ctail(source) == from + count); -+ -+ if (free_space == count) { -+ init_ctail(target, source, NULL); -+ } else { -+ /* new item has been created */ -+ assert("edward-862", ctail_ok(target)); -+ } -+ memcpy(first_unit(target), first_unit(source) + from, count); -+ -+ assert("edward-863", ctail_ok(target)); -+ -+ /* new units are inserted before first unit in an item, -+ therefore, we have to update item key */ -+ item_key_by_coord(source, &key); -+ set_key_offset(&key, get_key_offset(&key) + from); -+ -+ node_plugin_by_node(target->node)->update_item_key(target, &key, -+ NULL /*info */); -+ } -+} -+ -+/* plugin->u.item.b.create_hook */ -+int create_hook_ctail(const coord_t * coord, void *arg) -+{ -+ assert("edward-864", znode_is_loaded(coord->node)); -+ -+ znode_set_convertible(coord->node); -+ return 0; -+} -+ -+/* plugin->u.item.b.kill_hook */ -+int -+kill_hook_ctail(const coord_t * coord, pos_in_node_t from, pos_in_node_t count, -+ carry_kill_data * kdata) -+{ -+ struct inode *inode; -+ -+ assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID); -+ assert("edward-291", znode_is_write_locked(coord->node)); -+ -+ inode = kdata->inode; -+ if (inode) { -+ reiser4_key key; -+ item_key_by_coord(coord, &key); -+ -+ if (from == 0 && is_disk_cluster_key(&key, coord)) { -+ /* disk cluster is killed */ -+ cloff_t start = -+ off_to_clust(get_key_offset(&key), inode); -+ truncate_page_cluster_cryptcompress(inode, start, -+ kdata->params.truncate); -+ inode_sub_bytes(inode, inode_cluster_size(inode)); -+ } -+ } -+ return 0; -+} -+ -+/* for shift_hook_ctail(), -+ return true if the first disk cluster item has dirty child -+*/ -+static int ctail_convertible(const coord_t * coord) -+{ -+ int result; -+ reiser4_key key; -+ jnode *child = NULL; -+ -+ assert("edward-477", coord != NULL); -+ assert("edward-478", item_id_by_coord(coord) == CTAIL_ID); -+ -+ if (coord_is_unprepped_ctail(coord)) -+ /* unprepped ctail should be converted */ -+ return 1; -+ -+ 
item_key_by_coord(coord, &key); -+ child = jlookup(current_tree, -+ get_key_objectid(&key), -+ off_to_pg(off_by_coord(coord))); -+ if (!child) -+ return 0; -+ result = JF_ISSET(child, JNODE_DIRTY); -+ jput(child); -+ return result; -+} -+ -+/* FIXME-EDWARD */ -+/* plugin->u.item.b.shift_hook */ -+int shift_hook_ctail(const coord_t * item /* coord of item */ , -+ unsigned from UNUSED_ARG /* start unit */ , -+ unsigned count UNUSED_ARG /* stop unit */ , -+ znode * old_node /* old parent */ ) -+{ -+ assert("edward-479", item != NULL); -+ assert("edward-480", item->node != old_node); -+ -+ if (!znode_convertible(old_node) || znode_convertible(item->node)) -+ return 0; -+ if (ctail_convertible(item)) -+ znode_set_convertible(item->node); -+ return 0; -+} -+ -+static int -+cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ int cut, void *p, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ pos_in_node_t count; /* number of units to cut */ -+ char *item; -+ -+ count = to - from + 1; -+ item = item_body_by_coord(coord); -+ -+ assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord))); -+ -+ if (smallest_removed) { -+ /* store smallest key removed */ -+ item_key_by_coord(coord, smallest_removed); -+ set_key_offset(smallest_removed, -+ get_key_offset(smallest_removed) + from); -+ } -+ -+ if (new_first) { -+ assert("vs-1531", from == 0); -+ -+ item_key_by_coord(coord, new_first); -+ set_key_offset(new_first, -+ get_key_offset(new_first) + from + count); -+ } -+ -+ if (!cut) -+ kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p); -+ -+ if (from == 0) { -+ if (count != nr_units_ctail(coord)) { -+ /* part of item is removed, so move free space at the beginning -+ of the item and update item key */ -+ reiser4_key key; -+ memcpy(item + to + 1, item, sizeof(ctail_item_format)); -+ item_key_by_coord(coord, &key); -+ set_key_offset(&key, get_key_offset(&key) + count); -+ 
node_plugin_by_node(coord->node)->update_item_key(coord, -+ &key, -+ NULL); -+ } else { -+ /* cut_units should not be called to cut evrything */ -+ assert("vs-1532", ergo(cut, 0)); -+ /* whole item is cut, so more then amount of space occupied -+ by units got freed */ -+ count += sizeof(ctail_item_format); -+ } -+ if (REISER4_DEBUG) -+ memset(item, 0, count); -+ } else if (REISER4_DEBUG) -+ memset(item + sizeof(ctail_item_format) + from, 0, count); -+ return count; -+} -+ -+/* plugin->u.item.b.cut_units */ -+int -+cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to, -+ carry_cut_data * cdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ return cut_or_kill_ctail_units(item, from, to, 1, NULL, -+ smallest_removed, new_first); -+} -+ -+/* plugin->u.item.b.kill_units */ -+int -+kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *kdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ return cut_or_kill_ctail_units(item, from, to, 0, kdata, -+ smallest_removed, new_first); -+} -+ -+/* plugin->u.item.s.file.read */ -+int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint) -+{ -+ uf_coord_t *uf_coord; -+ coord_t *coord; -+ -+ uf_coord = &hint->ext_coord; -+ coord = &uf_coord->coord; -+ assert("edward-127", f->user == 0); -+ assert("edward-129", coord && coord->node); -+ assert("edward-130", coord_is_existing_unit(coord)); -+ assert("edward-132", znode_is_loaded(coord->node)); -+ -+ /* start read only from the beginning of ctail */ -+ assert("edward-133", coord->unit_pos == 0); -+ /* read only whole ctails */ -+ assert("edward-135", nr_units_ctail(coord) <= f->length); -+ -+ assert("edward-136", reiser4_schedulable()); -+ assert("edward-886", ctail_ok(coord)); -+ -+ if (f->data) -+ memcpy(f->data, (char *)first_unit(coord), -+ (size_t) nr_units_ctail(coord)); -+ -+ dclust_set_extension_shift(hint); -+ mark_page_accessed(znode_page(coord->node)); -+ 
move_flow_forward(f, nr_units_ctail(coord)); -+ -+ return 0; -+} -+ -+/* Reads a disk cluster consists of ctail items, -+ attaches a transform stream with plain text */ -+int ctail_read_disk_cluster(reiser4_cluster_t * clust, struct inode *inode, -+ znode_lock_mode mode) -+{ -+ int result; -+ assert("edward-1450", mode == ZNODE_READ_LOCK || ZNODE_WRITE_LOCK); -+ assert("edward-671", clust->hint != NULL); -+ assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER); -+ assert("edward-672", cryptcompress_inode_ok(inode)); -+ -+ /* set input stream */ -+ result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM); -+ if (result) -+ return result; -+ -+ result = find_disk_cluster(clust, inode, 1 /* read items */, mode); -+ assert("edward-1340", !result); -+ if (result) -+ return result; -+ if (mode == ZNODE_READ_LOCK) -+ /* write still need the lock to insert unprepped -+ items, etc... */ -+ put_hint_cluster(clust, inode, ZNODE_READ_LOCK); -+ -+ if (clust->dstat == FAKE_DISK_CLUSTER || -+ clust->dstat == UNPR_DISK_CLUSTER) { -+ tfm_cluster_set_uptodate(&clust->tc); -+ return 0; -+ } -+ result = grab_coa(&clust->tc, inode_compression_plugin(inode)); -+ if (result) -+ return result; -+ result = reiser4_inflate_cluster(clust, inode); -+ if (result) -+ return result; -+ tfm_cluster_set_uptodate(&clust->tc); -+ return 0; -+} -+ -+/* read one locked page */ -+int do_readpage_ctail(struct inode * inode, reiser4_cluster_t * clust, -+ struct page *page, znode_lock_mode mode) -+{ -+ int ret; -+ unsigned cloff; -+ char *data; -+ size_t pgcnt; -+ tfm_cluster_t *tc = &clust->tc; -+ -+ assert("edward-212", PageLocked(page)); -+ -+ if (PageUptodate(page)) -+ goto exit; -+ -+ if (!tfm_cluster_is_uptodate(&clust->tc)) { -+ clust->index = pg_to_clust(page->index, inode); -+ unlock_page(page); -+ ret = ctail_read_disk_cluster(clust, inode, mode); -+ lock_page(page); -+ if (ret) -+ return ret; -+ } -+ if (PageUptodate(page)) -+ /* races with another read/write */ -+ goto exit; -+ -+ /* bytes 
in the page */ -+ pgcnt = cnt_to_pgcnt(i_size_read(inode), page->index); -+ -+ if (pgcnt == 0) { -+ assert("edward-1290", 0); -+ return RETERR(-EINVAL); -+ } -+ assert("edward-119", tfm_cluster_is_uptodate(tc)); -+ -+ switch (clust->dstat) { -+ case UNPR_DISK_CLUSTER: -+ assert("edward-1285", 0); -+#if REISER4_DEBUG -+ warning("edward-1168", -+ "page %lu is not uptodate and disk cluster %lu (inode %llu) is unprepped\n", -+ page->index, clust->index, -+ (unsigned long long)get_inode_oid(inode)); -+#endif -+ case FAKE_DISK_CLUSTER: -+ /* fill the page by zeroes */ -+ data = kmap_atomic(page, KM_USER0); -+ -+ memset(data, 0, PAGE_CACHE_SIZE); -+ flush_dcache_page(page); -+ kunmap_atomic(data, KM_USER0); -+ SetPageUptodate(page); -+ break; -+ case PREP_DISK_CLUSTER: -+ /* fill the page by transformed data */ -+ assert("edward-1058", !PageUptodate(page)); -+ assert("edward-120", tc->len <= inode_cluster_size(inode)); -+ -+ /* start page offset in the cluster */ -+ cloff = pg_to_off_to_cloff(page->index, inode); -+ -+ data = kmap(page); -+ memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, pgcnt); -+ memset(data + pgcnt, 0, (size_t) PAGE_CACHE_SIZE - pgcnt); -+ flush_dcache_page(page); -+ kunmap(page); -+ SetPageUptodate(page); -+ break; -+ default: -+ impossible("edward-1169", "bad disk cluster state"); -+ } -+ exit: -+ return 0; -+} -+ -+/* plugin->u.item.s.file.readpage */ -+int readpage_ctail(void *vp, struct page *page) -+{ -+ int result; -+ hint_t *hint; -+ reiser4_cluster_t *clust = vp; -+ -+ assert("edward-114", clust != NULL); -+ assert("edward-115", PageLocked(page)); -+ assert("edward-116", !PageUptodate(page)); -+ assert("edward-117", !jprivate(page) && !PagePrivate(page)); -+ assert("edward-118", page->mapping && page->mapping->host); -+ assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc)); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) { -+ unlock_page(page); -+ return RETERR(-ENOMEM); -+ } -+ clust->hint = 
hint; -+ result = load_file_hint(clust->file, hint); -+ if (result) { -+ kfree(hint); -+ unlock_page(page); -+ return result; -+ } -+ assert("vs-25", hint->ext_coord.lh == &hint->lh); -+ result = do_readpage_ctail(page->mapping->host, clust, page, -+ ZNODE_READ_LOCK); -+ -+ assert("edward-213", PageLocked(page)); -+ assert("edward-1163", ergo(!result, PageUptodate(page))); -+ assert("edward-868", -+ ergo(!result, tfm_cluster_is_uptodate(&clust->tc))); -+ -+ unlock_page(page); -+ done_lh(&hint->lh); -+ hint->ext_coord.valid = 0; -+ save_file_hint(clust->file, hint); -+ kfree(hint); -+ tfm_cluster_clr_uptodate(&clust->tc); -+ -+ return result; -+} -+ -+/* Helper function for ->readpages() */ -+static int -+ctail_read_page_cluster(reiser4_cluster_t * clust, struct inode *inode) -+{ -+ int i; -+ int result; -+ assert("edward-779", clust != NULL); -+ assert("edward-1059", clust->win == NULL); -+ assert("edward-780", inode != NULL); -+ -+ result = prepare_page_cluster(inode, clust, 0 /* do not capture */ ); -+ if (result) -+ return result; -+ result = ctail_read_disk_cluster(clust, inode, ZNODE_READ_LOCK); -+ if (result) -+ goto out; -+ /* at this point stream with valid plain text is attached */ -+ assert("edward-781", tfm_cluster_is_uptodate(&clust->tc)); -+ -+ for (i = 0; i < clust->nr_pages; i++) { -+ struct page *page = clust->pages[i]; -+ lock_page(page); -+ result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK); -+ unlock_page(page); -+ if (result) -+ break; -+ } -+ tfm_cluster_clr_uptodate(&clust->tc); -+ out: -+ reiser4_release_cluster_pages(clust); -+ return result; -+} -+ -+/* filler for read_cache_pages() */ -+static int ctail_readpages_filler(void * data, struct page * page) -+{ -+ int ret = 0; -+ reiser4_cluster_t * clust = data; -+ struct inode * inode = clust->file->f_dentry->d_inode; -+ -+ if (PageUptodate(page)) { -+ unlock_page(page); -+ return 0; -+ } -+ unlock_page(page); -+ move_cluster_forward(clust, inode, page->index); -+ ret = 
ctail_read_page_cluster(clust, inode); -+ if (ret) -+ return ret; -+ assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc)); -+ -+ lock_page(page); -+ ret = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK); -+ assert("edward-1061", ergo(!ret, PageUptodate(page))); -+ unlock_page(page); -+ -+ return ret; -+} -+ -+/* We populate a bit more then upper readahead suggests: -+ with each nominated page we read the whole page cluster -+ this page belongs to. */ -+int readpages_ctail(struct file *file, struct address_space *mapping, -+ struct list_head *pages) -+{ -+ int ret = 0; -+ hint_t *hint; -+ reiser4_cluster_t clust; -+ struct inode *inode = mapping->host; -+ -+ assert("edward-1521", inode == file->f_dentry->d_inode); -+ -+ cluster_init_read(&clust, NULL); -+ clust.file = file; -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) { -+ warning("vs-28", "failed to allocate hint"); -+ ret = RETERR(-ENOMEM); -+ goto exit1; -+ } -+ clust.hint = hint; -+ ret = load_file_hint(clust.file, hint); -+ if (ret) { -+ warning("edward-1522", "failed to load hint"); -+ goto exit2; -+ } -+ assert("vs-26", hint->ext_coord.lh == &hint->lh); -+ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); -+ if (ret) { -+ warning("edward-1523", "failed to alloc pgset"); -+ goto exit3; -+ } -+ ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust); -+ -+ assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc)); -+ exit3: -+ done_lh(&hint->lh); -+ save_file_hint(file, hint); -+ hint->ext_coord.valid = 0; -+ exit2: -+ kfree(hint); -+ exit1: -+ put_cluster_handle(&clust); -+ return ret; -+} -+ -+/* -+ plugin->u.item.s.file.append_key -+ key of the first item of the next disk cluster -+*/ -+reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key) -+{ -+ assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID); -+ assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord))); -+ -+ item_key_by_coord(coord, key); 
-+ set_key_offset(key, -+ ((__u64) (clust_by_coord(coord, NULL)) + -+ 1) << cluster_shift_by_coord(coord)); -+ return key; -+} -+ -+static int -+insert_unprepped_ctail(reiser4_cluster_t * clust, struct inode *inode) -+{ -+ int result; -+ char buf[UCTAIL_NR_UNITS]; -+ reiser4_item_data data; -+ reiser4_key key; -+ int shift = (int)UCTAIL_SHIFT; -+ -+ memset(buf, 0, (size_t) UCTAIL_NR_UNITS); -+ result = key_by_inode_cryptcompress(inode, -+ clust_to_off(clust->index, inode), -+ &key); -+ if (result) -+ return result; -+ data.user = 0; -+ data.iplug = item_plugin_by_id(CTAIL_ID); -+ data.arg = &shift; -+ data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS; -+ data.data = buf; -+ -+ result = insert_by_coord(&clust->hint->ext_coord.coord, -+ &data, &key, clust->hint->ext_coord.lh, 0); -+ return result; -+} -+ -+static int -+insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f, -+ struct inode *inode) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ reiser4_item_data *data; -+ carry_op *op; -+ int cluster_shift = inode_cluster_shift(inode); -+ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*data)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ data = (reiser4_item_data *) (lowest_level + 3); -+ -+ assert("edward-466", coord->between == AFTER_ITEM -+ || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM -+ || coord->between == EMPTY_NODE -+ || coord->between == BEFORE_UNIT); -+ -+ if (coord->between == AFTER_UNIT) { -+ coord->unit_pos = 0; -+ coord->between = AFTER_ITEM; -+ } -+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node, -+ 0 /* operate directly on coord -> node */); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_carry_pool(pool); -+ return RETERR(op ? 
PTR_ERR(op) : -EIO); -+ } -+ data->user = 0; -+ data->iplug = item_plugin_by_id(CTAIL_ID); -+ data->arg = &cluster_shift; -+ -+ data->length = 0; -+ data->data = NULL; -+ -+ op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT; -+ op->u.insert_flow.insert_point = coord; -+ op->u.insert_flow.flow = f; -+ op->u.insert_flow.data = data; -+ op->u.insert_flow.new_nodes = 0; -+ -+ lowest_level->track_type = CARRY_TRACK_CHANGE; -+ lowest_level->tracked = lh; -+ -+ result = reiser4_carry(lowest_level, NULL); -+ done_carry_pool(pool); -+ -+ return result; -+} -+ -+/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */ -+static int insert_cryptcompress_flow_in_place(coord_t * coord, -+ lock_handle * lh, flow_t * f, -+ struct inode *inode) -+{ -+ int ret; -+ coord_t pos; -+ lock_handle lock; -+ -+ assert("edward-674", f->length <= inode_scaled_cluster_size(inode)); -+ assert("edward-484", coord->between == AT_UNIT -+ || coord->between == AFTER_ITEM); -+ assert("edward-485", item_id_by_coord(coord) == CTAIL_ID); -+ -+ coord_dup(&pos, coord); -+ pos.unit_pos = 0; -+ pos.between = AFTER_ITEM; -+ -+ init_lh(&lock); -+ copy_lh(&lock, lh); -+ -+ ret = insert_cryptcompress_flow(&pos, &lock, f, inode); -+ done_lh(&lock); -+ assert("edward-1347", znode_is_write_locked(lh->node)); -+ assert("edward-1228", !ret); -+ return ret; -+} -+ -+/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */ -+static int overwrite_ctail(coord_t * coord, flow_t * f) -+{ -+ unsigned count; -+ -+ assert("edward-269", f->user == 0); -+ assert("edward-270", f->data != NULL); -+ assert("edward-271", f->length > 0); -+ assert("edward-272", coord_is_existing_unit(coord)); -+ assert("edward-273", coord->unit_pos == 0); -+ assert("edward-274", znode_is_write_locked(coord->node)); -+ assert("edward-275", reiser4_schedulable()); -+ assert("edward-467", item_id_by_coord(coord) == CTAIL_ID); -+ assert("edward-1243", ctail_ok(coord)); -+ -+ count = nr_units_ctail(coord); -+ -+ 
if (count > f->length) -+ count = f->length; -+ memcpy(first_unit(coord), f->data, count); -+ move_flow_forward(f, count); -+ coord->unit_pos += count; -+ return 0; -+} -+ -+/* Implementation of CRC_CUT_ITEM mode of ctail conversion: -+ cut ctail (part or whole) starting from next unit position */ -+static int cut_ctail(coord_t * coord) -+{ -+ coord_t stop; -+ -+ assert("edward-435", coord->between == AT_UNIT && -+ coord->item_pos < coord_num_items(coord) && -+ coord->unit_pos <= coord_num_units(coord)); -+ -+ if (coord->unit_pos == coord_num_units(coord)) -+ /* nothing to cut */ -+ return 0; -+ coord_dup(&stop, coord); -+ stop.unit_pos = coord_last_unit_pos(coord); -+ -+ return cut_node_content(coord, &stop, NULL, NULL, NULL); -+} -+ -+int -+ctail_insert_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode) -+{ -+ int result; -+ assert("edward-1244", inode != NULL); -+ assert("edward-1245", clust->hint != NULL); -+ assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER); -+ assert("edward-1247", clust->reserved == 1); -+ -+ result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK); -+ if (cbk_errored(result)) -+ return result; -+ assert("edward-1249", result == CBK_COORD_NOTFOUND); -+ assert("edward-1250", znode_is_write_locked(clust->hint->lh.node)); -+ -+ assert("edward-1295", -+ clust->hint->ext_coord.lh->node == -+ clust->hint->ext_coord.coord.node); -+ -+ coord_set_between_clusters(&clust->hint->ext_coord.coord); -+ -+ result = insert_unprepped_ctail(clust, inode); -+ all_grabbed2free(); -+ -+ assert("edward-1251", !result); -+ assert("edward-1252", cryptcompress_inode_ok(inode)); -+ assert("edward-1253", znode_is_write_locked(clust->hint->lh.node)); -+ assert("edward-1254", -+ reiser4_clustered_blocks(reiser4_get_current_sb())); -+ assert("edward-1255", -+ znode_convertible(clust->hint->ext_coord.coord.node)); -+ -+ return result; -+} -+ -+static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode) -+{ -+ int result = 
0; -+ convert_item_info_t *info; -+ -+ assert("edward-468", pos != NULL); -+ assert("edward-469", pos->sq != NULL); -+ assert("edward-845", item_convert_data(pos) != NULL); -+ -+ info = item_convert_data(pos); -+ assert("edward-679", info->flow.data != NULL); -+ -+ switch (mode) { -+ case CRC_APPEND_ITEM: -+ assert("edward-1229", info->flow.length != 0); -+ assert("edward-1256", -+ cluster_shift_ok(cluster_shift_by_coord(&pos->coord))); -+ result = -+ insert_cryptcompress_flow_in_place(&pos->coord, -+ &pos->lock, -+ &info->flow, -+ info->inode); -+ break; -+ case CRC_OVERWRITE_ITEM: -+ assert("edward-1230", info->flow.length != 0); -+ overwrite_ctail(&pos->coord, &info->flow); -+ if (info->flow.length != 0) -+ break; -+ case CRC_CUT_ITEM: -+ assert("edward-1231", info->flow.length == 0); -+ result = cut_ctail(&pos->coord); -+ break; -+ default: -+ result = RETERR(-EIO); -+ impossible("edward-244", "bad convert mode"); -+ } -+ return result; -+} -+ -+/* plugin->u.item.f.scan */ -+int scan_ctail(flush_scan * scan) -+{ -+ int result = 0; -+ struct page *page; -+ struct inode *inode; -+ jnode *node = scan->node; -+ -+ assert("edward-227", scan->node != NULL); -+ assert("edward-228", jnode_is_cluster_page(scan->node)); -+ assert("edward-639", znode_is_write_locked(scan->parent_lock.node)); -+ -+ page = jnode_page(node); -+ inode = page->mapping->host; -+ -+ if (!reiser4_scanning_left(scan)) -+ return result; -+ if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY)) -+ znode_make_dirty(scan->parent_lock.node); -+ -+ if (!znode_convertible(scan->parent_lock.node)) { -+ if (JF_ISSET(scan->node, JNODE_DIRTY)) -+ znode_set_convertible(scan->parent_lock.node); -+ else { -+ warning("edward-681", -+ "cluster page is already processed"); -+ return -EAGAIN; -+ } -+ } -+ return result; -+} -+ -+/* If true, this function attaches children */ -+static int should_attach_convert_idata(flush_pos_t * pos) -+{ -+ int result; -+ assert("edward-431", pos != NULL); -+ assert("edward-432", 
pos->child == NULL); -+ assert("edward-619", znode_is_write_locked(pos->coord.node)); -+ assert("edward-470", -+ item_plugin_by_coord(&pos->coord) == -+ item_plugin_by_id(CTAIL_ID)); -+ -+ /* check for leftmost child */ -+ utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child); -+ -+ if (!pos->child) -+ return 0; -+ spin_lock_jnode(pos->child); -+ result = (JF_ISSET(pos->child, JNODE_DIRTY) && -+ pos->child->atom == ZJNODE(pos->coord.node)->atom); -+ spin_unlock_jnode(pos->child); -+ if (!result && pos->child) { -+ /* existing child isn't to attach, clear up this one */ -+ jput(pos->child); -+ pos->child = NULL; -+ } -+ return result; -+} -+ -+/* plugin->init_convert_data() */ -+static int -+init_convert_data_ctail(convert_item_info_t * idata, struct inode *inode) -+{ -+ assert("edward-813", idata != NULL); -+ assert("edward-814", inode != NULL); -+ -+ idata->inode = inode; -+ idata->d_cur = DC_FIRST_ITEM; -+ idata->d_next = DC_INVALID_STATE; -+ -+ return 0; -+} -+ -+static int alloc_item_convert_data(convert_info_t * sq) -+{ -+ assert("edward-816", sq != NULL); -+ assert("edward-817", sq->itm == NULL); -+ -+ sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get()); -+ if (sq->itm == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+static void free_item_convert_data(convert_info_t * sq) -+{ -+ assert("edward-818", sq != NULL); -+ assert("edward-819", sq->itm != NULL); -+ assert("edward-820", sq->iplug != NULL); -+ -+ kfree(sq->itm); -+ sq->itm = NULL; -+ return; -+} -+ -+static int alloc_convert_data(flush_pos_t * pos) -+{ -+ assert("edward-821", pos != NULL); -+ assert("edward-822", pos->sq == NULL); -+ -+ pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get()); -+ if (!pos->sq) -+ return RETERR(-ENOMEM); -+ memset(pos->sq, 0, sizeof(*pos->sq)); -+ cluster_init_write(&pos->sq->clust, 0); -+ return 0; -+} -+ -+void free_convert_data(flush_pos_t * pos) -+{ -+ convert_info_t *sq; -+ -+ assert("edward-823", pos != NULL); -+ assert("edward-824", 
pos->sq != NULL); -+ -+ sq = pos->sq; -+ if (sq->itm) -+ free_item_convert_data(sq); -+ put_cluster_handle(&sq->clust); -+ kfree(pos->sq); -+ pos->sq = NULL; -+ return; -+} -+ -+static int init_item_convert_data(flush_pos_t * pos, struct inode *inode) -+{ -+ convert_info_t *sq; -+ -+ assert("edward-825", pos != NULL); -+ assert("edward-826", pos->sq != NULL); -+ assert("edward-827", item_convert_data(pos) != NULL); -+ assert("edward-828", inode != NULL); -+ -+ sq = pos->sq; -+ -+ memset(sq->itm, 0, sizeof(*sq->itm)); -+ -+ /* iplug->init_convert_data() */ -+ return init_convert_data_ctail(sq->itm, inode); -+} -+ -+/* create and attach disk cluster info used by 'convert' phase of the flush -+ squalloc() */ -+static int attach_convert_idata(flush_pos_t * pos, struct inode *inode) -+{ -+ int ret = 0; -+ convert_item_info_t *info; -+ reiser4_cluster_t *clust; -+ file_plugin *fplug = inode_file_plugin(inode); -+ compression_plugin *cplug = inode_compression_plugin(inode); -+ -+ assert("edward-248", pos != NULL); -+ assert("edward-249", pos->child != NULL); -+ assert("edward-251", inode != NULL); -+ assert("edward-682", cryptcompress_inode_ok(inode)); -+ assert("edward-252", -+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ assert("edward-473", -+ item_plugin_by_coord(&pos->coord) == -+ item_plugin_by_id(CTAIL_ID)); -+ -+ if (!pos->sq) { -+ ret = alloc_convert_data(pos); -+ if (ret) -+ return ret; -+ } -+ clust = &pos->sq->clust; -+ ret = grab_coa(&clust->tc, cplug); -+ if (ret) -+ goto err; -+ ret = set_cluster_by_page(clust, -+ jnode_page(pos->child), -+ MAX_CLUSTER_NRPAGES); -+ if (ret) -+ goto err; -+ -+ assert("edward-829", pos->sq != NULL); -+ assert("edward-250", item_convert_data(pos) == NULL); -+ -+ pos->sq->iplug = item_plugin_by_id(CTAIL_ID); -+ -+ ret = alloc_item_convert_data(pos->sq); -+ if (ret) -+ goto err; -+ ret = init_item_convert_data(pos, inode); -+ if (ret) -+ goto err; -+ info = item_convert_data(pos); -+ -+ ret = 
flush_cluster_pages(clust, pos->child, inode); -+ if (ret) -+ goto err; -+ -+ reiser4_deflate_cluster(clust, inode); -+ inc_item_convert_count(pos); -+ -+ /* make flow by transformed stream */ -+ fplug->flow_by_inode(info->inode, -+ (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM), -+ 0 /* kernel space */ , -+ clust->tc.len, -+ clust_to_off(clust->index, inode), -+ WRITE_OP, &info->flow); -+ jput(pos->child); -+ -+ assert("edward-683", cryptcompress_inode_ok(inode)); -+ return 0; -+ err: -+ jput(pos->child); -+ free_convert_data(pos); -+ return ret; -+} -+ -+/* clear up disk cluster info */ -+static void detach_convert_idata(convert_info_t * sq) -+{ -+ convert_item_info_t *info; -+ -+ assert("edward-253", sq != NULL); -+ assert("edward-840", sq->itm != NULL); -+ -+ info = sq->itm; -+ assert("edward-255", info->inode != NULL); -+ assert("edward-1212", info->flow.length == 0); -+ -+ free_item_convert_data(sq); -+ return; -+} -+ -+/* plugin->u.item.f.utmost_child */ -+ -+/* This function sets leftmost child for a first cluster item, -+ if the child exists, and NULL in other cases. -+ NOTE-EDWARD: Do not call this for RIGHT_SIDE */ -+ -+int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child) -+{ -+ reiser4_key key; -+ -+ item_key_by_coord(coord, &key); -+ -+ assert("edward-257", coord != NULL); -+ assert("edward-258", child != NULL); -+ assert("edward-259", side == LEFT_SIDE); -+ assert("edward-260", -+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID)); -+ -+ if (!is_disk_cluster_key(&key, coord)) -+ *child = NULL; -+ else -+ *child = jlookup(current_tree, -+ get_key_objectid(item_key_by_coord -+ (coord, &key)), -+ off_to_pg(get_key_offset(&key))); -+ return 0; -+} -+ -+/* Returns true if @p2 is the next item to @p1 -+ in the _same_ disk cluster. -+ Disk cluster is a set of items. 
If ->clustered() != NULL, -+ with each item the whole disk cluster should be read/modified -+*/ -+static int clustered_ctail(const coord_t * p1, const coord_t * p2) -+{ -+ return mergeable_ctail(p1, p2); -+} -+ -+/* Go rightward and check for next disk cluster item, set -+ d_next to DC_CHAINED_ITEM, if the last one exists. -+ If the current position is last item, go to right neighbor. -+ Skip empty nodes. Note, that right neighbors may be not in -+ the slum because of races. If so, make it dirty and -+ convertible. -+*/ -+static int next_item_dc_stat(flush_pos_t * pos) -+{ -+ int ret = 0; -+ int stop = 0; -+ znode *cur; -+ coord_t coord; -+ lock_handle lh; -+ lock_handle right_lock; -+ -+ assert("edward-1232", !node_is_empty(pos->coord.node)); -+ assert("edward-1014", -+ pos->coord.item_pos < coord_num_items(&pos->coord)); -+ assert("edward-1015", chaining_data_present(pos)); -+ assert("edward-1017", -+ item_convert_data(pos)->d_next == DC_INVALID_STATE); -+ -+ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER; -+ -+ if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER) -+ return ret; -+ if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1) -+ return ret; -+ -+ /* check next slum item */ -+ init_lh(&right_lock); -+ cur = pos->coord.node; -+ -+ while (!stop) { -+ init_lh(&lh); -+ ret = reiser4_get_right_neighbor(&lh, -+ cur, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (ret) -+ break; -+ ret = zload(lh.node); -+ if (ret) { -+ done_lh(&lh); -+ break; -+ } -+ coord_init_before_first_item(&coord, lh.node); -+ -+ if (node_is_empty(lh.node)) { -+ znode_make_dirty(lh.node); -+ znode_set_convertible(lh.node); -+ stop = 0; -+ } else if (clustered_ctail(&pos->coord, &coord)) { -+ -+ item_convert_data(pos)->d_next = DC_CHAINED_ITEM; -+ -+ if (!ZF_ISSET(lh.node, JNODE_DIRTY)) { -+ /* -+ warning("edward-1024", -+ "next slum item mergeable, " -+ "but znode %p isn't dirty\n", -+ lh.node); -+ */ -+ znode_make_dirty(lh.node); -+ } -+ if 
(!znode_convertible(lh.node)) { -+ /* -+ warning("edward-1272", -+ "next slum item mergeable, " -+ "but znode %p isn't convertible\n", -+ lh.node); -+ */ -+ znode_set_convertible(lh.node); -+ } -+ stop = 1; -+ } else -+ stop = 1; -+ zrelse(lh.node); -+ done_lh(&right_lock); -+ copy_lh(&right_lock, &lh); -+ done_lh(&lh); -+ cur = right_lock.node; -+ } -+ done_lh(&right_lock); -+ -+ if (ret == -E_NO_NEIGHBOR) -+ ret = 0; -+ return ret; -+} -+ -+static int -+assign_convert_mode(convert_item_info_t * idata, -+ cryptcompress_write_mode_t * mode) -+{ -+ int result = 0; -+ -+ assert("edward-1025", idata != NULL); -+ -+ if (idata->flow.length) { -+ /* append or overwrite */ -+ switch (idata->d_cur) { -+ case DC_FIRST_ITEM: -+ case DC_CHAINED_ITEM: -+ *mode = CRC_OVERWRITE_ITEM; -+ break; -+ case DC_AFTER_CLUSTER: -+ *mode = CRC_APPEND_ITEM; -+ break; -+ default: -+ impossible("edward-1018", "wrong current item state"); -+ } -+ } else { -+ /* cut or invalidate */ -+ switch (idata->d_cur) { -+ case DC_FIRST_ITEM: -+ case DC_CHAINED_ITEM: -+ *mode = CRC_CUT_ITEM; -+ break; -+ case DC_AFTER_CLUSTER: -+ result = 1; -+ break; -+ default: -+ impossible("edward-1019", "wrong current item state"); -+ } -+ } -+ return result; -+} -+ -+/* plugin->u.item.f.convert */ -+/* write ctail in guessed mode */ -+int convert_ctail(flush_pos_t * pos) -+{ -+ int result; -+ int nr_items; -+ cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM; -+ -+ assert("edward-1020", pos != NULL); -+ assert("edward-1213", coord_num_items(&pos->coord) != 0); -+ assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID); -+ assert("edward-1258", ctail_ok(&pos->coord)); -+ assert("edward-261", pos->coord.node != NULL); -+ -+ nr_items = coord_num_items(&pos->coord); -+ if (!chaining_data_present(pos)) { -+ if (should_attach_convert_idata(pos)) { -+ /* attach convert item info */ -+ struct inode *inode; -+ -+ assert("edward-264", pos->child != NULL); -+ assert("edward-265", jnode_page(pos->child) != NULL); 
-+ assert("edward-266", -+ jnode_page(pos->child)->mapping != NULL); -+ -+ inode = jnode_page(pos->child)->mapping->host; -+ -+ assert("edward-267", inode != NULL); -+ -+ /* attach item convert info by child and put the last one */ -+ result = attach_convert_idata(pos, inode); -+ pos->child = NULL; -+ if (result == -E_REPEAT) { -+ /* jnode became clean, or there is no dirty -+ pages (nothing to update in disk cluster) */ -+ warning("edward-1021", -+ "convert_ctail: nothing to attach"); -+ return 0; -+ } -+ if (result != 0) -+ return result; -+ } else -+ /* unconvertible */ -+ return 0; -+ } else { -+ /* use old convert info */ -+ -+ convert_item_info_t *idata; -+ -+ idata = item_convert_data(pos); -+ -+ result = assign_convert_mode(idata, &mode); -+ if (result) { -+ /* disk cluster is over, -+ nothing to update anymore */ -+ detach_convert_idata(pos->sq); -+ return 0; -+ } -+ } -+ -+ assert("edward-433", chaining_data_present(pos)); -+ assert("edward-1022", -+ pos->coord.item_pos < coord_num_items(&pos->coord)); -+ -+ result = next_item_dc_stat(pos); -+ if (result) { -+ detach_convert_idata(pos->sq); -+ return result; -+ } -+ result = do_convert_ctail(pos, mode); -+ if (result) { -+ detach_convert_idata(pos->sq); -+ return result; -+ } -+ switch (mode) { -+ case CRC_CUT_ITEM: -+ assert("edward-1214", item_convert_data(pos)->flow.length == 0); -+ assert("edward-1215", -+ coord_num_items(&pos->coord) == nr_items || -+ coord_num_items(&pos->coord) == nr_items - 1); -+ if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM) -+ break; -+ if (coord_num_items(&pos->coord) != nr_items) { -+ /* the item was killed, no more chained items */ -+ detach_convert_idata(pos->sq); -+ if (!node_is_empty(pos->coord.node)) -+ /* make sure the next item will be scanned */ -+ coord_init_before_item(&pos->coord); -+ break; -+ } -+ case CRC_APPEND_ITEM: -+ assert("edward-434", item_convert_data(pos)->flow.length == 0); -+ detach_convert_idata(pos->sq); -+ break; -+ case CRC_OVERWRITE_ITEM: 
-+ if (coord_is_unprepped_ctail(&pos->coord)) { -+ /* convert unpprepped ctail to prepped one */ -+ int shift; -+ shift = -+ inode_cluster_shift(item_convert_data(pos)->inode); -+ assert("edward-1259", cluster_shift_ok(shift)); -+ put_unaligned((d8)shift, -+ &ctail_formatted_at(&pos->coord)-> -+ cluster_shift); -+ } -+ break; -+ } -+ return result; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.20/fs/reiser4/plugin/item/ctail.h ---- linux-2.6.20.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/ctail.h 2007-05-06 14:50:43.803008220 +0400 -@@ -0,0 +1,97 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined( __FS_REISER4_CTAIL_H__ ) -+#define __FS_REISER4_CTAIL_H__ -+ -+/* Disk format of ctail item */ -+typedef struct ctail_item_format { -+ /* packed shift; size of (prepped) disk cluster -+ is calculated as (1 << cluster_shift) */ -+ d8 cluster_shift; -+ /* ctail body */ -+ d8 body[0]; -+} __attribute__ ((packed)) ctail_item_format; -+ -+/* Unprepped disk cluster is represented by a single ctail item -+ with the following "magic" attributes: */ -+/* "magic" cluster_shift */ -+#define UCTAIL_SHIFT 0xff -+/* How many units unprepped ctail item has */ -+#define UCTAIL_NR_UNITS 1 -+ -+/* The following is a set of various item states in a disk cluster. -+ Disk cluster is a set of items whose keys belong to the interval -+ [dc_key , dc_key + disk_cluster_size - 1] */ -+typedef enum { -+ DC_INVALID_STATE = 0, -+ DC_FIRST_ITEM = 1, -+ DC_CHAINED_ITEM = 2, -+ DC_AFTER_CLUSTER = 3 -+} dc_item_stat; -+ -+/* ctail-specific extension. 
-+ In particular this describes parameters of disk cluster an item belongs to */ -+typedef struct { -+ int shift; /* this contains cluster_shift extracted from -+ ctail_item_format (above), or UCTAIL_SHIFT -+ (the last one is the "magic" of unprepped disk clusters)*/ -+ int dsize; /* size of a prepped disk cluster */ -+ int ncount; /* count of nodes occupied by a disk cluster */ -+} ctail_coord_extension_t; -+ -+struct cut_list; -+ -+/* plugin->item.b.* */ -+int can_contain_key_ctail(const coord_t *, const reiser4_key *, -+ const reiser4_item_data *); -+int mergeable_ctail(const coord_t * p1, const coord_t * p2); -+pos_in_node_t nr_units_ctail(const coord_t * coord); -+int estimate_ctail(const coord_t * coord, const reiser4_item_data * data); -+void print_ctail(const char *prefix, coord_t * coord); -+lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *); -+ -+int paste_ctail(coord_t * coord, reiser4_item_data * data, -+ carry_plugin_info * info UNUSED_ARG); -+int init_ctail(coord_t *, coord_t *, reiser4_item_data *); -+int can_shift_ctail(unsigned free_space, coord_t * coord, -+ znode * target, shift_direction pend, unsigned *size, -+ unsigned want); -+void copy_units_ctail(coord_t * target, coord_t * source, unsigned from, -+ unsigned count, shift_direction where_is_free_space, -+ unsigned free_space); -+int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ carry_cut_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ carry_kill_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int ctail_ok(const coord_t * coord); -+int check_ctail(const coord_t * coord, const char **error); -+ -+/* plugin->u.item.s.* */ -+int read_ctail(struct file *, flow_t *, hint_t *); -+int readpage_ctail(void *, struct page *); -+int readpages_ctail(struct file *, struct address_space *, struct list_head *); -+reiser4_key 
*append_key_ctail(const coord_t *, reiser4_key *); -+int create_hook_ctail(const coord_t * coord, void *arg); -+int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t, -+ carry_kill_data *); -+int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *); -+ -+/* plugin->u.item.f */ -+int utmost_child_ctail(const coord_t *, sideof, jnode **); -+int scan_ctail(flush_scan *); -+int convert_ctail(flush_pos_t *); -+size_t inode_scaled_cluster_size(struct inode *); -+ -+#endif /* __FS_REISER4_CTAIL_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent.c linux-2.6.20/fs/reiser4/plugin/item/extent.c ---- linux-2.6.20.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/extent.c 2007-05-06 14:50:43.807009470 +0400 -@@ -0,0 +1,197 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../key.h" -+#include "../../super.h" -+#include "../../carry.h" -+#include "../../inode.h" -+#include "../../page_cache.h" -+#include "../../flush.h" -+#include "../object.h" -+ -+/* prepare structure reiser4_item_data. 
It is used to put one extent unit into tree */ -+/* Audited by: green(2002.06.13) */ -+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit, -+ int nr_extents) -+{ -+ data->data = ext_unit; -+ /* data->data is kernel space */ -+ data->user = 0; -+ data->length = sizeof(reiser4_extent) * nr_extents; -+ data->arg = NULL; -+ data->iplug = item_plugin_by_id(EXTENT_POINTER_ID); -+ return data; -+} -+ -+/* how many bytes are addressed by @nr first extents of the extent item */ -+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr) -+{ -+ pos_in_node_t i; -+ reiser4_block_nr blocks; -+ reiser4_extent *ext; -+ -+ ext = item_body_by_coord(coord); -+ assert("vs-263", nr <= nr_units_extent(coord)); -+ -+ blocks = 0; -+ for (i = 0; i < nr; i++, ext++) { -+ blocks += extent_get_width(ext); -+ } -+ -+ return blocks * current_blocksize; -+} -+ -+extent_state state_of_extent(reiser4_extent * ext) -+{ -+ switch ((int)extent_get_start(ext)) { -+ case 0: -+ return HOLE_EXTENT; -+ case 1: -+ return UNALLOCATED_EXTENT; -+ default: -+ break; -+ } -+ return ALLOCATED_EXTENT; -+} -+ -+int extent_is_unallocated(const coord_t * item) -+{ -+ assert("jmacd-5133", item_is_extent(item)); -+ -+ return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT; -+} -+ -+/* set extent's start and width */ -+void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start, -+ reiser4_block_nr width) -+{ -+ extent_set_start(ext, start); -+ extent_set_width(ext, width); -+} -+ -+/** -+ * reiser4_replace_extent - replace extent and paste 1 or 2 after it -+ * @un_extent: coordinate of extent to be overwritten -+ * @lh: need better comment -+ * @key: need better comment -+ * @exts_to_add: data prepared for insertion into tree -+ * @replace: need better comment -+ * @flags: need better comment -+ * @return_insert_position: need better comment -+ * -+ * Overwrites one extent, pastes 1 or 2 more ones after overwritten one. 
If -+ * @return_inserted_position is 1 - @un_extent and @lh are returned set to -+ * first of newly inserted units, if it is 0 - @un_extent and @lh are returned -+ * set to extent which was overwritten. -+ */ -+int reiser4_replace_extent(struct replace_handle *h, -+ int return_inserted_position) -+{ -+ int result; -+ znode *orig_znode; -+ /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */ -+ -+ assert("vs-990", coord_is_existing_unit(h->coord)); -+ assert("vs-1375", znode_is_write_locked(h->coord->node)); -+ assert("vs-1426", extent_get_width(&h->overwrite) != 0); -+ assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0); -+ assert("vs-1427", ergo(h->nr_new_extents == 2, -+ extent_get_width(&h->new_extents[1]) != 0)); -+ -+ /* compose structure for paste */ -+ init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents); -+ -+ coord_dup(&h->coord_after, h->coord); -+ init_lh(&h->lh_after); -+ copy_lh(&h->lh_after, h->lh); -+ reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK); -+ reiser4_tap_monitor(&h->watch); -+ -+ ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord)); -+ orig_znode = h->coord->node; -+ -+#if REISER4_DEBUG -+ /* make sure that key is set properly */ -+ unit_key_by_coord(h->coord, &h->tmp); -+ set_key_offset(&h->tmp, -+ get_key_offset(&h->tmp) + -+ extent_get_width(&h->overwrite) * current_blocksize); -+ assert("vs-1080", keyeq(&h->tmp, &h->paste_key)); -+#endif -+ -+ /* set insert point after unit to be replaced */ -+ h->coord->between = AFTER_UNIT; -+ -+ result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL, -+ &h->paste_key, &h->item, h->flags); -+ if (!result) { -+ /* now we have to replace the unit after which new units were -+ inserted. 
Its position is tracked by @watch */ -+ reiser4_extent *ext; -+ znode *node; -+ -+ node = h->coord_after.node; -+ if (node != orig_znode) { -+ coord_clear_iplug(&h->coord_after); -+ result = zload(node); -+ } -+ -+ if (likely(!result)) { -+ ext = extent_by_coord(&h->coord_after); -+ -+ assert("vs-987", znode_is_loaded(node)); -+ assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext))); -+ -+ /* overwrite extent unit */ -+ memcpy(ext, &h->overwrite, sizeof(reiser4_extent)); -+ znode_make_dirty(node); -+ -+ if (node != orig_znode) -+ zrelse(node); -+ -+ if (return_inserted_position == 0) { -+ /* coord and lh are to be set to overwritten -+ extent */ -+ assert("vs-1662", -+ WITH_DATA(node, !memcmp(&h->overwrite, -+ extent_by_coord( -+ &h->coord_after), -+ sizeof(reiser4_extent)))); -+ -+ *h->coord = h->coord_after; -+ done_lh(h->lh); -+ copy_lh(h->lh, &h->lh_after); -+ } else { -+ /* h->coord and h->lh are to be set to first of -+ inserted units */ -+ assert("vs-1663", -+ WITH_DATA(h->coord->node, -+ !memcmp(&h->new_extents[0], -+ extent_by_coord(h->coord), -+ sizeof(reiser4_extent)))); -+ assert("vs-1664", h->lh->node == h->coord->node); -+ } -+ } -+ } -+ reiser4_tap_done(&h->watch); -+ -+ return result; -+} -+ -+lock_handle *znode_lh(znode *node) -+{ -+ assert("vs-1371", znode_is_write_locked(node)); -+ assert("vs-1372", znode_is_wlocked_once(node)); -+ return list_entry(node->lock.owners.next, lock_handle, owners_link); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.20/fs/reiser4/plugin/item/extent_file_ops.c ---- linux-2.6.20.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/extent_file_ops.c 2007-05-06 14:50:43.807009470 +0400 -@@ -0,0 +1,1443 @@ -+/* COPYRIGHT 2001, 2002, 
2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../inode.h" -+#include "../../page_cache.h" -+#include "../object.h" -+ -+#include -+#include -+#include "../../../../mm/filemap.h" -+ -+static inline reiser4_extent *ext_by_offset(const znode *node, int offset) -+{ -+ reiser4_extent *ext; -+ -+ ext = (reiser4_extent *) (zdata(node) + offset); -+ return ext; -+} -+ -+/** -+ * check_uf_coord - verify coord extension -+ * @uf_coord: -+ * @key: -+ * -+ * Makes sure that all fields of @uf_coord are set properly. If @key is -+ * specified - check whether @uf_coord is set correspondingly. -+ */ -+static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key) -+{ -+#if REISER4_DEBUG -+ const coord_t *coord; -+ const extent_coord_extension_t *ext_coord; -+ reiser4_extent *ext; -+ -+ coord = &uf_coord->coord; -+ ext_coord = &uf_coord->extension.extent; -+ ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset); -+ -+ assert("", -+ WITH_DATA(coord->node, -+ (uf_coord->valid == 1 && -+ coord_is_iplug_set(coord) && -+ item_is_extent(coord) && -+ ext_coord->nr_units == nr_units_extent(coord) && -+ ext == extent_by_coord(coord) && -+ ext_coord->width == extent_get_width(ext) && -+ coord->unit_pos < ext_coord->nr_units && -+ ext_coord->pos_in_unit < ext_coord->width && -+ memcmp(ext, &ext_coord->extent, -+ sizeof(reiser4_extent)) == 0))); -+ if (key) { -+ reiser4_key coord_key; -+ -+ unit_key_by_coord(&uf_coord->coord, &coord_key); -+ set_key_offset(&coord_key, -+ get_key_offset(&coord_key) + -+ (uf_coord->extension.extent. 
-+ pos_in_unit << PAGE_CACHE_SHIFT)); -+ assert("", keyeq(key, &coord_key)); -+ } -+#endif -+} -+ -+static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord) -+{ -+ check_uf_coord(uf_coord, NULL); -+ -+ return ext_by_offset(uf_coord->coord.node, -+ uf_coord->extension.extent.ext_offset); -+} -+ -+#if REISER4_DEBUG -+ -+/** -+ * offset_is_in_unit -+ * -+ * -+ * -+ */ -+/* return 1 if offset @off is inside of extent unit pointed to by @coord. Set -+ pos_in_unit inside of unit correspondingly */ -+static int offset_is_in_unit(const coord_t *coord, loff_t off) -+{ -+ reiser4_key unit_key; -+ __u64 unit_off; -+ reiser4_extent *ext; -+ -+ ext = extent_by_coord(coord); -+ -+ unit_key_extent(coord, &unit_key); -+ unit_off = get_key_offset(&unit_key); -+ if (off < unit_off) -+ return 0; -+ if (off >= (unit_off + (current_blocksize * extent_get_width(ext)))) -+ return 0; -+ return 1; -+} -+ -+static int -+coord_matches_key_extent(const coord_t * coord, const reiser4_key * key) -+{ -+ reiser4_key item_key; -+ -+ assert("vs-771", coord_is_existing_unit(coord)); -+ assert("vs-1258", keylt(key, append_key_extent(coord, &item_key))); -+ assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key))); -+ -+ return offset_is_in_unit(coord, get_key_offset(key)); -+} -+ -+#endif -+ -+/** -+ * can_append - -+ * @key: -+ * @coord: -+ * -+ * Returns 1 if @key is equal to an append key of item @coord is set to -+ */ -+static int can_append(const reiser4_key *key, const coord_t *coord) -+{ -+ reiser4_key append_key; -+ -+ return keyeq(key, append_key_extent(coord, &append_key)); -+} -+ -+/** -+ * append_hole -+ * @coord: -+ * @lh: -+ * @key: -+ * -+ */ -+static int append_hole(coord_t *coord, lock_handle *lh, -+ const reiser4_key *key) -+{ -+ reiser4_key append_key; -+ reiser4_block_nr hole_width; -+ reiser4_extent *ext, new_ext; -+ reiser4_item_data idata; -+ -+ /* last item of file may have to be appended with hole */ -+ assert("vs-708", znode_get_level(coord->node) 
== TWIG_LEVEL); -+ assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID); -+ -+ /* key of first byte which is not addressed by this extent */ -+ append_key_extent(coord, &append_key); -+ -+ assert("", keyle(&append_key, key)); -+ -+ /* -+ * extent item has to be appended with hole. Calculate length of that -+ * hole -+ */ -+ hole_width = ((get_key_offset(key) - get_key_offset(&append_key) + -+ current_blocksize - 1) >> current_blocksize_bits); -+ assert("vs-954", hole_width > 0); -+ -+ /* set coord after last unit */ -+ coord_init_after_item_end(coord); -+ -+ /* get last extent in the item */ -+ ext = extent_by_coord(coord); -+ if (state_of_extent(ext) == HOLE_EXTENT) { -+ /* -+ * last extent of a file is hole extent. Widen that extent by -+ * @hole_width blocks. Note that we do not worry about -+ * overflowing - extent width is 64 bits -+ */ -+ reiser4_set_extent(ext, HOLE_EXTENT_START, -+ extent_get_width(ext) + hole_width); -+ znode_make_dirty(coord->node); -+ return 0; -+ } -+ -+ /* append last item of the file with hole extent unit */ -+ assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT || -+ state_of_extent(ext) == UNALLOCATED_EXTENT)); -+ -+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width); -+ init_new_extent(&idata, &new_ext, 1); -+ return insert_into_item(coord, lh, &append_key, &idata, 0); -+} -+ -+/** -+ * check_jnodes -+ * @twig: longterm locked twig node -+ * @key: -+ * -+ */ -+static void check_jnodes(znode *twig, const reiser4_key *key, int count) -+{ -+#if REISER4_DEBUG -+ coord_t c; -+ reiser4_key node_key, jnode_key; -+ -+ jnode_key = *key; -+ -+ assert("", twig != NULL); -+ assert("", znode_get_level(twig) == TWIG_LEVEL); -+ assert("", znode_is_write_locked(twig)); -+ -+ zload(twig); -+ /* get the smallest key in twig node */ -+ coord_init_first_unit(&c, twig); -+ unit_key_by_coord(&c, &node_key); -+ assert("", keyle(&node_key, &jnode_key)); -+ -+ coord_init_last_unit(&c, twig); -+ unit_key_by_coord(&c, &node_key); -+ 
if (item_plugin_by_coord(&c)->s.file.append_key) -+ item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key); -+ set_key_offset(&jnode_key, -+ get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1); -+ assert("", keylt(&jnode_key, &node_key)); -+ zrelse(twig); -+#endif -+} -+ -+/** -+ * append_last_extent - append last file item -+ * @uf_coord: coord to start insertion from -+ * @jnodes: array of jnodes -+ * @count: number of jnodes in the array -+ * -+ * There is already at least one extent item of file @inode in the tree. Append -+ * the last of them with unallocated extent unit of width @count. Assign -+ * fake block numbers to jnodes corresponding to the inserted extent. -+ */ -+static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key, -+ jnode **jnodes, int count) -+{ -+ int result; -+ reiser4_extent new_ext; -+ reiser4_item_data idata; -+ coord_t *coord; -+ extent_coord_extension_t *ext_coord; -+ reiser4_extent *ext; -+ reiser4_block_nr block; -+ jnode *node; -+ int i; -+ -+ coord = &uf_coord->coord; -+ ext_coord = &uf_coord->extension.extent; -+ ext = ext_by_ext_coord(uf_coord); -+ -+ /* check correctness of position in the item */ -+ assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord)); -+ assert("vs-1311", coord->between == AFTER_UNIT); -+ assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1); -+ -+ if (!can_append(key, coord)) { -+ /* hole extent has to be inserted */ -+ result = append_hole(coord, uf_coord->lh, key); -+ uf_coord->valid = 0; -+ return result; -+ } -+ -+ if (count == 0) -+ return 0; -+ -+ assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE); -+ -+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, -+ count); -+ BUG_ON(result != 0); -+ -+ switch (state_of_extent(ext)) { -+ case UNALLOCATED_EXTENT: -+ /* -+ * last extent unit of the file is unallocated one. 
Increase -+ * its width by @count -+ */ -+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, -+ extent_get_width(ext) + count); -+ znode_make_dirty(coord->node); -+ -+ /* update coord extension */ -+ ext_coord->width += count; -+ ON_DEBUG(extent_set_width -+ (&uf_coord->extension.extent.extent, -+ ext_coord->width)); -+ break; -+ -+ case HOLE_EXTENT: -+ case ALLOCATED_EXTENT: -+ /* -+ * last extent unit of the file is either hole or allocated -+ * one. Append one unallocated extent of width @count -+ */ -+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count); -+ init_new_extent(&idata, &new_ext, 1); -+ result = insert_into_item(coord, uf_coord->lh, key, &idata, 0); -+ uf_coord->valid = 0; -+ if (result) -+ return result; -+ break; -+ -+ default: -+ return RETERR(-EIO); -+ } -+ -+ /* -+ * make sure that we hold long term locked twig node containing all -+ * jnodes we are about to capture -+ */ -+ check_jnodes(uf_coord->lh->node, key, count); -+ -+ /* -+ * assign fake block numbers to all jnodes. 
FIXME: make sure whether -+ * twig node containing inserted extent item is locked -+ */ -+ block = fake_blocknr_unformatted(count); -+ for (i = 0; i < count; i ++, block ++) { -+ node = jnodes[i]; -+ spin_lock_jnode(node); -+ JF_SET(node, JNODE_CREATED); -+ jnode_set_block(node, &block); -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ BUG_ON(result != 0); -+ jnode_make_dirty_locked(node); -+ spin_unlock_jnode(node); -+ } -+ return count; -+} -+ -+/** -+ * insert_first_hole - inser hole extent into tree -+ * @coord: -+ * @lh: -+ * @key: -+ * -+ * -+ */ -+static int insert_first_hole(coord_t *coord, lock_handle *lh, -+ const reiser4_key *key) -+{ -+ reiser4_extent new_ext; -+ reiser4_item_data idata; -+ reiser4_key item_key; -+ reiser4_block_nr hole_width; -+ -+ /* @coord must be set for inserting of new item */ -+ assert("vs-711", coord_is_between_items(coord)); -+ -+ item_key = *key; -+ set_key_offset(&item_key, 0ull); -+ -+ hole_width = ((get_key_offset(key) + current_blocksize - 1) >> -+ current_blocksize_bits); -+ assert("vs-710", hole_width > 0); -+ -+ /* compose body of hole extent and insert item into tree */ -+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width); -+ init_new_extent(&idata, &new_ext, 1); -+ return insert_extent_by_coord(coord, &idata, &item_key, lh); -+} -+ -+ -+/** -+ * insert_first_extent - insert first file item -+ * @inode: inode of file -+ * @uf_coord: coord to start insertion from -+ * @jnodes: array of jnodes -+ * @count: number of jnodes in the array -+ * @inode: -+ * -+ * There are no items of file @inode in the tree yet. Insert unallocated extent -+ * of width @count into tree or hole extent if writing not to the -+ * beginning. Assign fake block numbers to jnodes corresponding to the inserted -+ * unallocated extent. Returns number of jnodes or error code. 
-+ */ -+static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key, -+ jnode **jnodes, int count, -+ struct inode *inode) -+{ -+ int result; -+ int i; -+ reiser4_extent new_ext; -+ reiser4_item_data idata; -+ reiser4_block_nr block; -+ unix_file_info_t *uf_info; -+ jnode *node; -+ -+ /* first extent insertion starts at leaf level */ -+ assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL); -+ assert("vs-711", coord_is_between_items(&uf_coord->coord)); -+ -+ if (get_key_offset(key) != 0) { -+ result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key); -+ uf_coord->valid = 0; -+ uf_info = unix_file_inode_data(inode); -+ -+ /* -+ * first item insertion is only possible when writing to empty -+ * file or performing tail conversion -+ */ -+ assert("", (uf_info->container == UF_CONTAINER_EMPTY || -+ (reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED) && -+ reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)))); -+ /* if file was empty - update its state */ -+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY) -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ return result; -+ } -+ -+ if (count == 0) -+ return 0; -+ -+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count); -+ BUG_ON(result != 0); -+ -+ /* -+ * prepare for tree modification: compose body of item and item data -+ * structure needed for insertion -+ */ -+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count); -+ init_new_extent(&idata, &new_ext, 1); -+ -+ /* insert extent item into the tree */ -+ result = insert_extent_by_coord(&uf_coord->coord, &idata, key, -+ uf_coord->lh); -+ if (result) -+ return result; -+ -+ /* -+ * make sure that we hold long term locked twig node containing all -+ * jnodes we are about to capture -+ */ -+ check_jnodes(uf_coord->lh->node, key, count); -+ /* -+ * assign fake block numbers to all jnodes, capture and mark them dirty -+ */ -+ block = fake_blocknr_unformatted(count); -+ for (i = 0; i < count; i 
++, block ++) { -+ node = jnodes[i]; -+ spin_lock_jnode(node); -+ JF_SET(node, JNODE_CREATED); -+ jnode_set_block(node, &block); -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ BUG_ON(result != 0); -+ jnode_make_dirty_locked(node); -+ spin_unlock_jnode(node); -+ } -+ -+ /* -+ * invalidate coordinate, research must be performed to continue -+ * because write will continue on twig level -+ */ -+ uf_coord->valid = 0; -+ return count; -+} -+ -+/** -+ * plug_hole - replace hole extent with unallocated and holes -+ * @uf_coord: -+ * @key: -+ * @node: -+ * @h: structure containing coordinate, lock handle, key, etc -+ * -+ * Creates an unallocated extent of width 1 within a hole. In worst case two -+ * additional extents can be created. -+ */ -+static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how) -+{ -+ struct replace_handle rh; -+ reiser4_extent *ext; -+ reiser4_block_nr width, pos_in_unit; -+ coord_t *coord; -+ extent_coord_extension_t *ext_coord; -+ int return_inserted_position; -+ -+ check_uf_coord(uf_coord, key); -+ -+ rh.coord = coord_by_uf_coord(uf_coord); -+ rh.lh = uf_coord->lh; -+ rh.flags = 0; -+ -+ coord = coord_by_uf_coord(uf_coord); -+ ext_coord = ext_coord_by_uf_coord(uf_coord); -+ ext = ext_by_ext_coord(uf_coord); -+ -+ width = ext_coord->width; -+ pos_in_unit = ext_coord->pos_in_unit; -+ -+ *how = 0; -+ if (width == 1) { -+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1); -+ znode_make_dirty(coord->node); -+ /* update uf_coord */ -+ ON_DEBUG(ext_coord->extent = *ext); -+ *how = 1; -+ return 0; -+ } else if (pos_in_unit == 0) { -+ /* we deal with first element of extent */ -+ if (coord->unit_pos) { -+ /* there is an extent to the left */ -+ if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) { -+ /* -+ * left neighboring unit is an unallocated -+ * extent. 
Increase its width and decrease -+ * width of hole -+ */ -+ extent_set_width(ext - 1, -+ extent_get_width(ext - 1) + 1); -+ extent_set_width(ext, width - 1); -+ znode_make_dirty(coord->node); -+ -+ /* update coord extension */ -+ coord->unit_pos--; -+ ext_coord->width = extent_get_width(ext - 1); -+ ext_coord->pos_in_unit = ext_coord->width - 1; -+ ext_coord->ext_offset -= sizeof(reiser4_extent); -+ ON_DEBUG(ext_coord->extent = -+ *extent_by_coord(coord)); -+ *how = 2; -+ return 0; -+ } -+ } -+ /* extent for replace */ -+ reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1); -+ /* extent to be inserted */ -+ reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START, -+ width - 1); -+ rh.nr_new_extents = 1; -+ -+ /* have reiser4_replace_extent to return with @coord and -+ @uf_coord->lh set to unit which was replaced */ -+ return_inserted_position = 0; -+ *how = 3; -+ } else if (pos_in_unit == width - 1) { -+ /* we deal with last element of extent */ -+ if (coord->unit_pos < nr_units_extent(coord) - 1) { -+ /* there is an extent unit to the right */ -+ if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) { -+ /* -+ * right neighboring unit is an unallocated -+ * extent. 
Increase its width and decrease -+ * width of hole -+ */ -+ extent_set_width(ext + 1, -+ extent_get_width(ext + 1) + 1); -+ extent_set_width(ext, width - 1); -+ znode_make_dirty(coord->node); -+ -+ /* update coord extension */ -+ coord->unit_pos++; -+ ext_coord->width = extent_get_width(ext + 1); -+ ext_coord->pos_in_unit = 0; -+ ext_coord->ext_offset += sizeof(reiser4_extent); -+ ON_DEBUG(ext_coord->extent = -+ *extent_by_coord(coord)); -+ *how = 4; -+ return 0; -+ } -+ } -+ /* extent for replace */ -+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1); -+ /* extent to be inserted */ -+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, -+ 1); -+ rh.nr_new_extents = 1; -+ -+ /* have reiser4_replace_extent to return with @coord and -+ @uf_coord->lh set to unit which was inserted */ -+ return_inserted_position = 1; -+ *how = 5; -+ } else { -+ /* extent for replace */ -+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, -+ pos_in_unit); -+ /* extents to be inserted */ -+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, -+ 1); -+ reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START, -+ width - pos_in_unit - 1); -+ rh.nr_new_extents = 2; -+ -+ /* have reiser4_replace_extent to return with @coord and -+ @uf_coord->lh set to first of units which were inserted */ -+ return_inserted_position = 1; -+ *how = 6; -+ } -+ unit_key_by_coord(coord, &rh.paste_key); -+ set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) + -+ extent_get_width(&rh.overwrite) * current_blocksize); -+ -+ uf_coord->valid = 0; -+ return reiser4_replace_extent(&rh, return_inserted_position); -+} -+ -+/** -+ * overwrite_one_block - -+ * @uf_coord: -+ * @key: -+ * @node: -+ * -+ * If @node corresponds to hole extent - create unallocated extent for it and -+ * assign fake block number. 
If @node corresponds to allocated extent - assign -+ * block number of jnode -+ */ -+static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key, -+ jnode *node, int *hole_plugged) -+{ -+ int result; -+ extent_coord_extension_t *ext_coord; -+ reiser4_extent *ext; -+ reiser4_block_nr block; -+ int how; -+ -+ assert("vs-1312", uf_coord->coord.between == AT_UNIT); -+ -+ result = 0; -+ ext_coord = ext_coord_by_uf_coord(uf_coord); -+ ext = ext_by_ext_coord(uf_coord); -+ assert("", state_of_extent(ext) != UNALLOCATED_EXTENT); -+ -+ switch (state_of_extent(ext)) { -+ case ALLOCATED_EXTENT: -+ block = extent_get_start(ext) + ext_coord->pos_in_unit; -+ break; -+ -+ case HOLE_EXTENT: -+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1); -+ BUG_ON(result != 0); -+ result = plug_hole(uf_coord, key, &how); -+ if (result) -+ return result; -+ block = fake_blocknr_unformatted(1); -+ if (hole_plugged) -+ *hole_plugged = 1; -+ JF_SET(node, JNODE_CREATED); -+ break; -+ -+ default: -+ return RETERR(-EIO); -+ } -+ -+ jnode_set_block(node, &block); -+ return 0; -+} -+ -+/** -+ * move_coord - move coordinate forward -+ * @uf_coord: -+ * -+ * Move coordinate one data block pointer forward. Return 1 if coord is set to -+ * the last one already or is invalid. -+ */ -+static int move_coord(uf_coord_t *uf_coord) -+{ -+ extent_coord_extension_t *ext_coord; -+ -+ if (uf_coord->valid == 0) -+ return 1; -+ ext_coord = &uf_coord->extension.extent; -+ ext_coord->pos_in_unit ++; -+ if (ext_coord->pos_in_unit < ext_coord->width) -+ /* coordinate moved within the unit */ -+ return 0; -+ -+ /* end of unit is reached. 
Try to move to next unit */ -+ ext_coord->pos_in_unit = 0; -+ uf_coord->coord.unit_pos ++; -+ if (uf_coord->coord.unit_pos < ext_coord->nr_units) { -+ /* coordinate moved to next unit */ -+ ext_coord->ext_offset += sizeof(reiser4_extent); -+ ext_coord->width = -+ extent_get_width(ext_by_offset -+ (uf_coord->coord.node, -+ ext_coord->ext_offset)); -+ ON_DEBUG(ext_coord->extent = -+ *ext_by_offset(uf_coord->coord.node, -+ ext_coord->ext_offset)); -+ return 0; -+ } -+ /* end of item is reached */ -+ uf_coord->valid = 0; -+ return 1; -+} -+ -+/** -+ * overwrite_extent - -+ * @inode: -+ * -+ * Returns number of handled jnodes. -+ */ -+static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key, -+ jnode **jnodes, int count, int *plugged_hole) -+{ -+ int result; -+ reiser4_key k; -+ int i; -+ jnode *node; -+ -+ k = *key; -+ for (i = 0; i < count; i ++) { -+ node = jnodes[i]; -+ if (*jnode_get_block(node) == 0) { -+ result = overwrite_one_block(uf_coord, &k, node, plugged_hole); -+ if (result) -+ return result; -+ } -+ /* -+ * make sure that we hold long term locked twig node containing -+ * all jnodes we are about to capture -+ */ -+ check_jnodes(uf_coord->lh->node, &k, 1); -+ /* -+ * assign fake block numbers to all jnodes, capture and mark -+ * them dirty -+ */ -+ spin_lock_jnode(node); -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ BUG_ON(result != 0); -+ jnode_make_dirty_locked(node); -+ spin_unlock_jnode(node); -+ -+ if (uf_coord->valid == 0) -+ return i + 1; -+ -+ check_uf_coord(uf_coord, &k); -+ -+ if (move_coord(uf_coord)) { -+ /* -+ * failed to move to the next node pointer. Either end -+ * of file or end of twig node is reached. In the later -+ * case we might go to the right neighbor. 
-+ */ -+ uf_coord->valid = 0; -+ return i + 1; -+ } -+ set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE); -+ } -+ -+ return count; -+} -+ -+/** -+ * reiser4_update_extent -+ * @file: -+ * @jnodes: -+ * @count: -+ * @off: -+ * -+ */ -+int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos, -+ int *plugged_hole) -+{ -+ int result; -+ znode *loaded; -+ uf_coord_t uf_coord; -+ coord_t *coord; -+ lock_handle lh; -+ reiser4_key key; -+ -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ -+ key_by_inode_and_offset_common(inode, pos, &key); -+ -+ init_uf_coord(&uf_coord, &lh); -+ coord = &uf_coord.coord; -+ result = find_file_item_nohint(coord, &lh, &key, -+ ZNODE_WRITE_LOCK, inode); -+ if (IS_CBKERR(result)) { -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ return result; -+ } -+ -+ result = zload(coord->node); -+ BUG_ON(result != 0); -+ loaded = coord->node; -+ -+ if (coord->between == AFTER_UNIT) { -+ /* -+ * append existing extent item with unallocated extent of width -+ * nr_jnodes -+ */ -+ init_coord_extension_extent(&uf_coord, -+ get_key_offset(&key)); -+ result = append_last_extent(&uf_coord, &key, -+ &node, 1); -+ } else if (coord->between == AT_UNIT) { -+ /* -+ * overwrite -+ * not optimal yet. Will be optimized if new write will show -+ * performance win. -+ */ -+ init_coord_extension_extent(&uf_coord, -+ get_key_offset(&key)); -+ result = overwrite_extent(&uf_coord, &key, -+ &node, 1, plugged_hole); -+ } else { -+ /* -+ * there are no items of this file in the tree yet. Create -+ * first item of the file inserting one unallocated extent of -+ * width nr_jnodes -+ */ -+ result = insert_first_extent(&uf_coord, &key, &node, 1, inode); -+ } -+ assert("", result == 1 || result < 0); -+ zrelse(loaded); -+ done_lh(&lh); -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ return (result == 1) ? 
0 : result; -+} -+ -+/** -+ * update_extents -+ * @file: -+ * @jnodes: -+ * @count: -+ * @off: -+ * -+ */ -+static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos) -+{ -+ struct inode *inode; -+ struct hint hint; -+ reiser4_key key; -+ int result; -+ znode *loaded; -+ -+ result = load_file_hint(file, &hint); -+ BUG_ON(result != 0); -+ -+ inode = file->f_dentry->d_inode; -+ if (count != 0) -+ /* -+ * count == 0 is special case: expanding truncate -+ */ -+ pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT; -+ key_by_inode_and_offset_common(inode, pos, &key); -+ -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ -+ do { -+ result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode); -+ if (IS_CBKERR(result)) { -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ return result; -+ } -+ -+ result = zload(hint.ext_coord.coord.node); -+ BUG_ON(result != 0); -+ loaded = hint.ext_coord.coord.node; -+ -+ if (hint.ext_coord.coord.between == AFTER_UNIT) { -+ /* -+ * append existing extent item with unallocated extent -+ * of width nr_jnodes -+ */ -+ if (hint.ext_coord.valid == 0) -+ /* NOTE: get statistics on this */ -+ init_coord_extension_extent(&hint.ext_coord, -+ get_key_offset(&key)); -+ result = append_last_extent(&hint.ext_coord, &key, -+ jnodes, count); -+ } else if (hint.ext_coord.coord.between == AT_UNIT) { -+ /* -+ * overwrite -+ * not optimal yet. Will be optimized if new write will -+ * show performance win. -+ */ -+ if (hint.ext_coord.valid == 0) -+ /* NOTE: get statistics on this */ -+ init_coord_extension_extent(&hint.ext_coord, -+ get_key_offset(&key)); -+ result = overwrite_extent(&hint.ext_coord, &key, -+ jnodes, count, NULL); -+ } else { -+ /* -+ * there are no items of this file in the tree -+ * yet. 
Create first item of the file inserting one -+ * unallocated extent of * width nr_jnodes -+ */ -+ result = insert_first_extent(&hint.ext_coord, &key, -+ jnodes, count, inode); -+ } -+ zrelse(loaded); -+ if (result < 0) { -+ done_lh(hint.ext_coord.lh); -+ break; -+ } -+ -+ jnodes += result; -+ count -= result; -+ set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE); -+ -+ /* seal and unlock znode */ -+ if (hint.ext_coord.valid) -+ reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK); -+ else -+ reiser4_unset_hint(&hint); -+ -+ } while (count > 0); -+ -+ save_file_hint(file, &hint); -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ return result; -+} -+ -+/** -+ * write_extent_reserve_space - reserve space for extent write operation -+ * @inode: -+ * -+ * Estimates and reserves space which may be required for writing -+ * WRITE_GRANULARITY pages of file. -+ */ -+static int write_extent_reserve_space(struct inode *inode) -+{ -+ __u64 count; -+ reiser4_tree *tree; -+ -+ /* -+ * to write WRITE_GRANULARITY pages to a file by extents we have to -+ * reserve disk space for: -+ -+ * 1. find_file_item may have to insert empty node to the tree (empty -+ * leaf node between two extent items). This requires 1 block and -+ * number of blocks which are necessary to perform insertion of an -+ * internal item into twig level. -+ -+ * 2. for each of written pages there might be needed 1 block and -+ * number of blocks which might be necessary to perform insertion of or -+ * paste to an extent item. -+ -+ * 3. 
stat data update -+ */ -+ tree = reiser4_tree_by_inode(inode); -+ count = estimate_one_insert_item(tree) + -+ WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) + -+ estimate_one_insert_item(tree); -+ grab_space_enable(); -+ return reiser4_grab_space(count, 0 /* flags */); -+} -+ -+/** -+ * reiser4_write_extent - write method of extent item plugin -+ * @file: file to write to -+ * @buf: address of user-space buffer -+ * @count: number of bytes to write -+ * @pos: position in file to write to -+ * -+ */ -+ssize_t reiser4_write_extent(struct file *file, const char __user *buf, -+ size_t count, loff_t *pos) -+{ -+ int have_to_update_extent; -+ int nr_pages, nr_dirty; -+ struct page *page; -+ jnode *jnodes[WRITE_GRANULARITY + 1]; -+ struct inode *inode; -+ unsigned long index; -+ unsigned long end; -+ int i; -+ int to_page, page_off; -+ size_t left, written; -+ int result = 0; -+ -+ inode = file->f_dentry->d_inode; -+ if (write_extent_reserve_space(inode)) -+ return RETERR(-ENOSPC); -+ -+ if (count == 0) { -+ /* truncate case */ -+ update_extents(file, jnodes, 0, *pos); -+ return 0; -+ } -+ -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ -+ left = count; -+ index = *pos >> PAGE_CACHE_SHIFT; -+ /* calculate number of pages which are to be written */ -+ end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT); -+ nr_pages = end - index + 1; -+ nr_dirty = 0; -+ assert("", nr_pages <= WRITE_GRANULARITY + 1); -+ -+ /* get pages and jnodes */ -+ for (i = 0; i < nr_pages; i ++) { -+ page = find_or_create_page(inode->i_mapping, index + i, -+ reiser4_ctx_gfp_mask_get()); -+ if (page == NULL) { -+ nr_pages = i; -+ result = RETERR(-ENOMEM); -+ goto out; -+ } -+ -+ jnodes[i] = jnode_of_page(page); -+ if (IS_ERR(jnodes[i])) { -+ unlock_page(page); -+ page_cache_release(page); -+ nr_pages = i; -+ result = RETERR(-ENOMEM); -+ goto out; -+ } -+ /* prevent jnode and page from disconnecting */ -+ JF_SET(jnodes[i], JNODE_WRITE_PREPARED); -+ unlock_page(page); -+ } -+ -+ 
BUG_ON(get_current_context()->trans->atom != NULL); -+ -+ have_to_update_extent = 0; -+ -+ page_off = (*pos & (PAGE_CACHE_SIZE - 1)); -+ for (i = 0; i < nr_pages; i ++) { -+ to_page = PAGE_CACHE_SIZE - page_off; -+ if (to_page > left) -+ to_page = left; -+ page = jnode_page(jnodes[i]); -+ if (page_offset(page) < inode->i_size && -+ !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) { -+ /* -+ * the above is not optimal for partial write to last -+ * page of file when file size is not at boundary of -+ * page -+ */ -+ lock_page(page); -+ if (!PageUptodate(page)) { -+ result = readpage_unix_file(NULL, page); -+ BUG_ON(result != 0); -+ /* wait for read completion */ -+ lock_page(page); -+ BUG_ON(!PageUptodate(page)); -+ } else -+ result = 0; -+ unlock_page(page); -+ } -+ -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ fault_in_pages_readable(buf, to_page); -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ -+ lock_page(page); -+ if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE) { -+ void *kaddr; -+ -+ kaddr = kmap_atomic(page, KM_USER0); -+ memset(kaddr, 0, page_off); -+ memset(kaddr + page_off + to_page, 0, -+ PAGE_CACHE_SIZE - (page_off + to_page)); -+ flush_dcache_page(page); -+ kunmap_atomic(kaddr, KM_USER0); -+ } -+ -+ written = filemap_copy_from_user(page, page_off, buf, to_page); -+ if (unlikely(written != to_page)) { -+ unlock_page(page); -+ result = RETERR(-EFAULT); -+ break; -+ } -+ -+ flush_dcache_page(page); -+ reiser4_set_page_dirty_internal(page); -+ unlock_page(page); -+ nr_dirty++; -+ -+ mark_page_accessed(page); -+ SetPageUptodate(page); -+ -+ if (jnodes[i]->blocknr == 0) -+ have_to_update_extent ++; -+ -+ page_off = 0; -+ buf += to_page; -+ left -= to_page; -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ } -+ -+ if (have_to_update_extent) { -+ update_extents(file, jnodes, nr_dirty, *pos); -+ } else { -+ for (i = 0; i < nr_dirty; i ++) { -+ int ret; -+ spin_lock_jnode(jnodes[i]); -+ ret = reiser4_try_capture(jnodes[i], 
-+ ZNODE_WRITE_LOCK, 0); -+ BUG_ON(ret != 0); -+ jnode_make_dirty_locked(jnodes[i]); -+ spin_unlock_jnode(jnodes[i]); -+ } -+ } -+out: -+ for (i = 0; i < nr_pages; i ++) { -+ page_cache_release(jnode_page(jnodes[i])); -+ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED); -+ jput(jnodes[i]); -+ } -+ -+ /* the only errors handled so far is ENOMEM and -+ EFAULT on copy_from_user */ -+ -+ return (count - left) ? (count - left) : result; -+} -+ -+static inline void zero_page(struct page *page) -+{ -+ char *kaddr = kmap_atomic(page, KM_USER0); -+ -+ memset(kaddr, 0, PAGE_CACHE_SIZE); -+ flush_dcache_page(page); -+ kunmap_atomic(kaddr, KM_USER0); -+ SetPageUptodate(page); -+ unlock_page(page); -+} -+ -+int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos, -+ struct page *page) -+{ -+ jnode *j; -+ struct address_space *mapping; -+ unsigned long index; -+ oid_t oid; -+ reiser4_block_nr block; -+ -+ mapping = page->mapping; -+ oid = get_inode_oid(mapping->host); -+ index = page->index; -+ -+ switch (state_of_extent(ext)) { -+ case HOLE_EXTENT: -+ /* -+ * it is possible to have hole page with jnode, if page was -+ * eflushed previously. 
-+ */ -+ j = jfind(mapping, index); -+ if (j == NULL) { -+ zero_page(page); -+ return 0; -+ } -+ spin_lock_jnode(j); -+ if (!jnode_page(j)) { -+ jnode_attach_page(j, page); -+ } else { -+ BUG_ON(jnode_page(j) != page); -+ assert("vs-1504", jnode_page(j) == page); -+ } -+ block = *jnode_get_io_block(j); -+ spin_unlock_jnode(j); -+ if (block == 0) { -+ zero_page(page); -+ jput(j); -+ return 0; -+ } -+ break; -+ -+ case ALLOCATED_EXTENT: -+ j = jnode_of_page(page); -+ if (IS_ERR(j)) -+ return PTR_ERR(j); -+ if (*jnode_get_block(j) == 0) { -+ reiser4_block_nr blocknr; -+ -+ blocknr = extent_get_start(ext) + pos; -+ jnode_set_block(j, &blocknr); -+ } else -+ assert("vs-1403", -+ j->blocknr == extent_get_start(ext) + pos); -+ break; -+ -+ case UNALLOCATED_EXTENT: -+ j = jfind(mapping, index); -+ assert("nikita-2688", j); -+ assert("vs-1426", jnode_page(j) == NULL); -+ -+ spin_lock_jnode(j); -+ jnode_attach_page(j, page); -+ spin_unlock_jnode(j); -+ break; -+ -+ default: -+ warning("vs-957", "wrong extent\n"); -+ return RETERR(-EIO); -+ } -+ -+ BUG_ON(j == 0); -+ reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get()); -+ jput(j); -+ return 0; -+} -+ -+/* Implements plugin->u.item.s.file.read operation for extent items. 
*/ -+int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint) -+{ -+ int result; -+ struct page *page; -+ unsigned long cur_page, next_page; -+ unsigned long page_off, count; -+ struct address_space *mapping; -+ loff_t file_off; -+ uf_coord_t *uf_coord; -+ coord_t *coord; -+ extent_coord_extension_t *ext_coord; -+ unsigned long nr_pages; -+ char *kaddr; -+ -+ assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE); -+ assert("vs-572", flow->user == 1); -+ assert("vs-1351", flow->length > 0); -+ -+ uf_coord = &hint->ext_coord; -+ -+ check_uf_coord(uf_coord, NULL); -+ assert("vs-33", uf_coord->lh == &hint->lh); -+ -+ coord = &uf_coord->coord; -+ assert("vs-1119", znode_is_rlocked(coord->node)); -+ assert("vs-1120", znode_is_loaded(coord->node)); -+ assert("vs-1256", coord_matches_key_extent(coord, &flow->key)); -+ -+ mapping = file->f_dentry->d_inode->i_mapping; -+ ext_coord = &uf_coord->extension.extent; -+ -+ /* offset in a file to start read from */ -+ file_off = get_key_offset(&flow->key); -+ /* offset within the page to start read from */ -+ page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1)); -+ /* bytes which can be read from the page which contains file_off */ -+ count = PAGE_CACHE_SIZE - page_off; -+ -+ /* index of page containing offset read is to start from */ -+ cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT); -+ next_page = cur_page; -+ /* number of pages flow spans over */ -+ nr_pages = -+ ((file_off + flow->length + PAGE_CACHE_SIZE - -+ 1) >> PAGE_CACHE_SHIFT) - cur_page; -+ -+ /* we start having twig node read locked. However, we do not want to -+ keep that lock all the time readahead works. So, set a sel and -+ release twig node. 
*/ -+ reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK); -+ /* &hint->lh is done-ed */ -+ -+ do { -+ reiser4_txn_restart_current(); -+ page = read_mapping_page(mapping, cur_page, file); -+ if (IS_ERR(page)) -+ return PTR_ERR(page); -+ lock_page(page); -+ if (!PageUptodate(page)) { -+ unlock_page(page); -+ page_cache_release(page); -+ warning("jmacd-97178", "extent_read: page is not up to date"); -+ return RETERR(-EIO); -+ } -+ mark_page_accessed(page); -+ unlock_page(page); -+ -+ /* If users can be writing to this page using arbitrary virtual -+ addresses, take care about potential aliasing before reading -+ the page on the kernel side. -+ */ -+ if (mapping_writably_mapped(mapping)) -+ flush_dcache_page(page); -+ -+ assert("nikita-3034", reiser4_schedulable()); -+ -+ /* number of bytes which are to be read from the page */ -+ if (count > flow->length) -+ count = flow->length; -+ -+ result = fault_in_pages_writeable(flow->data, count); -+ if (result) { -+ page_cache_release(page); -+ return RETERR(-EFAULT); -+ } -+ -+ kaddr = kmap_atomic(page, KM_USER0); -+ result = __copy_to_user_inatomic(flow->data, -+ kaddr + page_off, count); -+ kunmap_atomic(kaddr, KM_USER0); -+ if (result != 0) { -+ kaddr = kmap(page); -+ result = __copy_to_user(flow->data, kaddr + page_off, count); -+ kunmap(page); -+ if (unlikely(result)) -+ return RETERR(-EFAULT); -+ } -+ -+ page_cache_release(page); -+ -+ /* increase key (flow->key), update user area pointer (flow->data) */ -+ move_flow_forward(flow, count); -+ -+ page_off = 0; -+ cur_page ++; -+ count = PAGE_CACHE_SIZE; -+ nr_pages--; -+ } while (flow->length); -+ -+ return 0; -+} -+ -+/* -+ plugin->s.file.readpage -+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage -+ or -+ filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_extent -+ -+ At the beginning: coord->node is read locked, zloaded, page is -+ locked, coord is set to existing unit inside of extent item (it 
is not necessary that coord matches to page->index) -+*/ -+int reiser4_readpage_extent(void *vp, struct page *page) -+{ -+ uf_coord_t *uf_coord = vp; -+ ON_DEBUG(coord_t * coord = &uf_coord->coord); -+ ON_DEBUG(reiser4_key key); -+ -+ assert("vs-1040", PageLocked(page)); -+ assert("vs-1050", !PageUptodate(page)); -+ assert("vs-1039", page->mapping && page->mapping->host); -+ -+ assert("vs-1044", znode_is_loaded(coord->node)); -+ assert("vs-758", item_is_extent(coord)); -+ assert("vs-1046", coord_is_existing_unit(coord)); -+ assert("vs-1045", znode_is_rlocked(coord->node)); -+ assert("vs-1047", -+ page->mapping->host->i_ino == -+ get_key_objectid(item_key_by_coord(coord, &key))); -+ check_uf_coord(uf_coord, NULL); -+ -+ return reiser4_do_readpage_extent( -+ ext_by_ext_coord(uf_coord), -+ uf_coord->extension.extent.pos_in_unit, page); -+} -+ -+/** -+ * get_block_address_extent -+ * @coord: -+ * @block: -+ * @result: -+ * -+ * -+ */ -+int get_block_address_extent(const coord_t *coord, sector_t block, -+ sector_t *result) -+{ -+ reiser4_extent *ext; -+ -+ if (!coord_is_existing_unit(coord)) -+ return RETERR(-EINVAL); -+ -+ ext = extent_by_coord(coord); -+ -+ if (state_of_extent(ext) != ALLOCATED_EXTENT) -+ /* FIXME: bad things may happen if it is unallocated extent */ -+ *result = 0; -+ else { -+ reiser4_key key; -+ -+ unit_key_by_coord(coord, &key); -+ assert("vs-1645", -+ block >= get_key_offset(&key) >> current_blocksize_bits); -+ assert("vs-1646", -+ block < -+ (get_key_offset(&key) >> current_blocksize_bits) + -+ extent_get_width(ext)); -+ *result = -+ extent_get_start(ext) + (block - -+ (get_key_offset(&key) >> -+ current_blocksize_bits)); -+ } -+ return 0; -+} -+ -+/* -+ plugin->u.item.s.file.append_key -+ key of first byte which is the next to last byte by addressed by this extent -+*/ -+reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key) -+{ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, -+ get_key_offset(key) + 
reiser4_extent_size(coord, -+ nr_units_extent -+ (coord))); -+ -+ assert("vs-610", get_key_offset(key) -+ && (get_key_offset(key) & (current_blocksize - 1)) == 0); -+ return key; -+} -+ -+/* plugin->u.item.s.file.init_coord_extension */ -+void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped) -+{ -+ coord_t *coord; -+ extent_coord_extension_t *ext_coord; -+ reiser4_key key; -+ loff_t offset; -+ -+ assert("vs-1295", uf_coord->valid == 0); -+ -+ coord = &uf_coord->coord; -+ assert("vs-1288", coord_is_iplug_set(coord)); -+ assert("vs-1327", znode_is_loaded(coord->node)); -+ -+ if (coord->between != AFTER_UNIT && coord->between != AT_UNIT) -+ return; -+ -+ ext_coord = &uf_coord->extension.extent; -+ ext_coord->nr_units = nr_units_extent(coord); -+ ext_coord->ext_offset = -+ (char *)extent_by_coord(coord) - zdata(coord->node); -+ ext_coord->width = extent_get_width(extent_by_coord(coord)); -+ ON_DEBUG(ext_coord->extent = *extent_by_coord(coord)); -+ uf_coord->valid = 1; -+ -+ /* pos_in_unit is the only uninitialized field in extended coord */ -+ if (coord->between == AFTER_UNIT) { -+ assert("vs-1330", -+ coord->unit_pos == nr_units_extent(coord) - 1); -+ -+ ext_coord->pos_in_unit = ext_coord->width - 1; -+ } else { -+ /* AT_UNIT */ -+ unit_key_by_coord(coord, &key); -+ offset = get_key_offset(&key); -+ -+ assert("vs-1328", offset <= lookuped); -+ assert("vs-1329", -+ lookuped < -+ offset + ext_coord->width * current_blocksize); -+ ext_coord->pos_in_unit = -+ ((lookuped - offset) >> current_blocksize_bits); -+ } -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.20/fs/reiser4/plugin/item/extent_flush_ops.c ---- linux-2.6.20.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ 
linux-2.6.20/fs/reiser4/plugin/item/extent_flush_ops.c 2007-05-06 14:50:43.811010720 +0400 -@@ -0,0 +1,1028 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../tree.h" -+#include "../../jnode.h" -+#include "../../super.h" -+#include "../../flush.h" -+#include "../../carry.h" -+#include "../object.h" -+ -+#include -+ -+static reiser4_block_nr extent_unit_start(const coord_t * item); -+ -+/* Return either first or last extent (depending on @side) of the item -+ @coord is set to. Set @pos_in_unit either to first or to last block -+ of extent. */ -+static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side, -+ reiser4_block_nr * pos_in_unit) -+{ -+ reiser4_extent *ext; -+ -+ if (side == LEFT_SIDE) { -+ /* get first extent of item */ -+ ext = extent_item(coord); -+ *pos_in_unit = 0; -+ } else { -+ /* get last extent of item and last position within it */ -+ assert("vs-363", side == RIGHT_SIDE); -+ ext = extent_item(coord) + coord_last_unit_pos(coord); -+ *pos_in_unit = extent_get_width(ext) - 1; -+ } -+ -+ return ext; -+} -+ -+/* item_plugin->f.utmost_child */ -+/* Return the child. Coord is set to extent item. 
Find jnode corresponding -+ either to first or to last unformatted node pointed by the item */ -+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp) -+{ -+ reiser4_extent *ext; -+ reiser4_block_nr pos_in_unit; -+ -+ ext = extent_utmost_ext(coord, side, &pos_in_unit); -+ -+ switch (state_of_extent(ext)) { -+ case HOLE_EXTENT: -+ *childp = NULL; -+ return 0; -+ case ALLOCATED_EXTENT: -+ case UNALLOCATED_EXTENT: -+ break; -+ default: -+ /* this should never happen */ -+ assert("vs-1417", 0); -+ } -+ -+ { -+ reiser4_key key; -+ reiser4_tree *tree; -+ unsigned long index; -+ -+ if (side == LEFT_SIDE) { -+ /* get key of first byte addressed by the extent */ -+ item_key_by_coord(coord, &key); -+ } else { -+ /* get key of byte which next after last byte addressed by the extent */ -+ append_key_extent(coord, &key); -+ } -+ -+ assert("vs-544", -+ (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul); -+ /* index of first or last (depending on @side) page addressed -+ by the extent */ -+ index = -+ (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT); -+ if (side == RIGHT_SIDE) -+ index--; -+ -+ tree = coord->node->zjnode.tree; -+ *childp = jlookup(tree, get_key_objectid(&key), index); -+ } -+ -+ return 0; -+} -+ -+/* item_plugin->f.utmost_child_real_block */ -+/* Return the child's block, if allocated. */ -+int -+utmost_child_real_block_extent(const coord_t * coord, sideof side, -+ reiser4_block_nr * block) -+{ -+ reiser4_extent *ext; -+ -+ ext = extent_by_coord(coord); -+ -+ switch (state_of_extent(ext)) { -+ case ALLOCATED_EXTENT: -+ *block = extent_get_start(ext); -+ if (side == RIGHT_SIDE) -+ *block += extent_get_width(ext) - 1; -+ break; -+ case HOLE_EXTENT: -+ case UNALLOCATED_EXTENT: -+ *block = 0; -+ break; -+ default: -+ /* this should never happen */ -+ assert("vs-1418", 0); -+ } -+ -+ return 0; -+} -+ -+/* item_plugin->f.scan */ -+/* Performs leftward scanning starting from an unformatted node and its parent coordinate. 
-+ This scan continues, advancing the parent coordinate, until either it encounters a -+ formatted child or it finishes scanning this node. -+ -+ If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm -+ not sure this is last property (same atom) is enforced, but it should be the case since -+ one atom must write the parent and the others must read the parent, thus fusing?). In -+ any case, the code below asserts this case for unallocated extents. Unallocated -+ extents are thus optimized because we can skip to the endpoint when scanning. -+ -+ It returns control to reiser4_scan_extent, handles these terminating conditions, -+ e.g., by loading the next twig. -+*/ -+int reiser4_scan_extent(flush_scan * scan) -+{ -+ coord_t coord; -+ jnode *neighbor; -+ unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist; -+ reiser4_block_nr unit_start; -+ __u64 oid; -+ reiser4_key key; -+ int ret = 0, allocated, incr; -+ reiser4_tree *tree; -+ -+ if (!JF_ISSET(scan->node, JNODE_DIRTY)) { -+ scan->stop = 1; -+ return 0; /* Race with truncate, this node is already -+ * truncated. */ -+ } -+ -+ coord_dup(&coord, &scan->parent_coord); -+ -+ assert("jmacd-1404", !reiser4_scan_finished(scan)); -+ assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL); -+ assert("jmacd-1406", jnode_is_unformatted(scan->node)); -+ -+ /* The scan_index variable corresponds to the current page index of the -+ unformatted block scan position. 
*/ -+ scan_index = index_jnode(scan->node); -+ -+ assert("jmacd-7889", item_is_extent(&coord)); -+ -+ repeat: -+ /* objectid of file */ -+ oid = get_key_objectid(item_key_by_coord(&coord, &key)); -+ -+ allocated = !extent_is_unallocated(&coord); -+ /* Get the values of this extent unit: */ -+ unit_index = extent_unit_index(&coord); -+ unit_width = extent_unit_width(&coord); -+ unit_start = extent_unit_start(&coord); -+ -+ assert("jmacd-7187", unit_width > 0); -+ assert("jmacd-7188", scan_index >= unit_index); -+ assert("jmacd-7189", scan_index <= unit_index + unit_width - 1); -+ -+ /* Depending on the scan direction, we set different maximum values for scan_index -+ (scan_max) and the number of nodes that would be passed if the scan goes the -+ entire way (scan_dist). Incr is an integer reflecting the incremental -+ direction of scan_index. */ -+ if (reiser4_scanning_left(scan)) { -+ scan_max = unit_index; -+ scan_dist = scan_index - unit_index; -+ incr = -1; -+ } else { -+ scan_max = unit_index + unit_width - 1; -+ scan_dist = scan_max - unit_index; -+ incr = +1; -+ } -+ -+ tree = coord.node->zjnode.tree; -+ -+ /* If the extent is allocated we have to check each of its blocks. If the extent -+ is unallocated we can skip to the scan_max. */ -+ if (allocated) { -+ do { -+ neighbor = jlookup(tree, oid, scan_index); -+ if (neighbor == NULL) -+ goto stop_same_parent; -+ -+ if (scan->node != neighbor -+ && !reiser4_scan_goto(scan, neighbor)) { -+ /* @neighbor was jput() by reiser4_scan_goto */ -+ goto stop_same_parent; -+ } -+ -+ ret = scan_set_current(scan, neighbor, 1, &coord); -+ if (ret != 0) { -+ goto exit; -+ } -+ -+ /* reference to @neighbor is stored in @scan, no need -+ to jput(). */ -+ scan_index += incr; -+ -+ } while (incr + scan_max != scan_index); -+ -+ } else { -+ /* Optimized case for unallocated extents, skip to the end. 
*/ -+ neighbor = jlookup(tree, oid, scan_max /*index */ ); -+ if (neighbor == NULL) { -+ /* Race with truncate */ -+ scan->stop = 1; -+ ret = 0; -+ goto exit; -+ } -+ -+ assert("zam-1043", -+ reiser4_blocknr_is_fake(jnode_get_block(neighbor))); -+ -+ ret = scan_set_current(scan, neighbor, scan_dist, &coord); -+ if (ret != 0) { -+ goto exit; -+ } -+ } -+ -+ if (coord_sideof_unit(&coord, scan->direction) == 0 -+ && item_is_extent(&coord)) { -+ /* Continue as long as there are more extent units. */ -+ -+ scan_index = -+ extent_unit_index(&coord) + -+ (reiser4_scanning_left(scan) ? -+ extent_unit_width(&coord) - 1 : 0); -+ goto repeat; -+ } -+ -+ if (0) { -+ stop_same_parent: -+ -+ /* If we are scanning left and we stop in the middle of an allocated -+ extent, we know the preceder immediately.. */ -+ /* middle of extent is (scan_index - unit_index) != 0. */ -+ if (reiser4_scanning_left(scan) && -+ (scan_index - unit_index) != 0) { -+ /* FIXME(B): Someone should step-through and verify that this preceder -+ calculation is indeed correct. */ -+ /* @unit_start is starting block (number) of extent -+ unit. Flush stopped at the @scan_index block from -+ the beginning of the file, which is (scan_index - -+ unit_index) block within extent. -+ */ -+ if (unit_start) { -+ /* skip preceder update when we are at hole */ -+ scan->preceder_blk = -+ unit_start + scan_index - unit_index; -+ check_preceder(scan->preceder_blk); -+ } -+ } -+ -+ /* In this case, we leave coord set to the parent of scan->node. */ -+ scan->stop = 1; -+ -+ } else { -+ /* In this case, we are still scanning, coord is set to the next item which is -+ either off-the-end of the node or not an extent. 
*/ -+ assert("jmacd-8912", scan->stop == 0); -+ assert("jmacd-7812", -+ (coord_is_after_sideof_unit(&coord, scan->direction) -+ || !item_is_extent(&coord))); -+ } -+ -+ ret = 0; -+ exit: -+ return ret; -+} -+ -+/* ask block allocator for some blocks */ -+static void extent_allocate_blocks(reiser4_blocknr_hint *preceder, -+ reiser4_block_nr wanted_count, -+ reiser4_block_nr *first_allocated, -+ reiser4_block_nr *allocated, -+ block_stage_t block_stage) -+{ -+ *allocated = wanted_count; -+ preceder->max_dist = 0; /* scan whole disk, if needed */ -+ -+ /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */ -+ preceder->block_stage = block_stage; -+ -+ /* FIXME: we do not handle errors here now */ -+ check_me("vs-420", -+ reiser4_alloc_blocks(preceder, first_allocated, allocated, -+ BA_PERMANENT) == 0); -+ /* update flush_pos's preceder to last allocated block number */ -+ preceder->blk = *first_allocated + *allocated - 1; -+} -+ -+/* when on flush time unallocated extent is to be replaced with allocated one it may happen that one unallocated extent -+ will have to be replaced with set of allocated extents. In this case insert_into_item will be called which may have -+ to add new nodes into tree. Space for that is taken from inviolable reserve (5%). 
*/ -+static reiser4_block_nr reserve_replace(void) -+{ -+ reiser4_block_nr grabbed, needed; -+ -+ grabbed = get_current_context()->grabbed_blocks; -+ needed = estimate_one_insert_into_item(current_tree); -+ check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED)); -+ return grabbed; -+} -+ -+static void free_replace_reserved(reiser4_block_nr grabbed) -+{ -+ reiser4_context *ctx; -+ -+ ctx = get_current_context(); -+ grabbed2free(ctx, get_super_private(ctx->super), -+ ctx->grabbed_blocks - grabbed); -+} -+ -+/* Block offset of first block addressed by unit */ -+__u64 extent_unit_index(const coord_t * item) -+{ -+ reiser4_key key; -+ -+ assert("vs-648", coord_is_existing_unit(item)); -+ unit_key_by_coord(item, &key); -+ return get_key_offset(&key) >> current_blocksize_bits; -+} -+ -+/* AUDIT shouldn't return value be of reiser4_block_nr type? -+ Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */ -+__u64 extent_unit_width(const coord_t * item) -+{ -+ assert("vs-649", coord_is_existing_unit(item)); -+ return width_by_coord(item); -+} -+ -+/* Starting block location of this unit */ -+static reiser4_block_nr extent_unit_start(const coord_t * item) -+{ -+ return extent_get_start(extent_by_coord(item)); -+} -+ -+/** -+ * split_allocated_extent - -+ * @coord: -+ * @pos_in_unit: -+ * -+ * replace allocated extent with two allocated extents -+ */ -+static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit) -+{ -+ int result; -+ struct replace_handle *h; -+ reiser4_extent *ext; -+ reiser4_block_nr grabbed; -+ -+ ext = extent_by_coord(coord); -+ assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT); -+ assert("vs-1411", extent_get_width(ext) > pos_in_unit); -+ -+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get()); -+ if (h == NULL) -+ return RETERR(-ENOMEM); -+ h->coord = coord; -+ h->lh = znode_lh(coord->node); -+ h->pkey = &h->key; -+ unit_key_by_coord(coord, h->pkey); -+ set_key_offset(h->pkey, -+ 
(get_key_offset(h->pkey) + -+ pos_in_unit * current_blocksize)); -+ reiser4_set_extent(&h->overwrite, extent_get_start(ext), -+ pos_in_unit); -+ reiser4_set_extent(&h->new_extents[0], -+ extent_get_start(ext) + pos_in_unit, -+ extent_get_width(ext) - pos_in_unit); -+ h->nr_new_extents = 1; -+ h->flags = COPI_DONT_SHIFT_LEFT; -+ h->paste_key = h->key; -+ -+ /* reserve space for extent unit paste, @grabbed is reserved before */ -+ grabbed = reserve_replace(); -+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten -+ extent */); -+ /* restore reserved */ -+ free_replace_reserved(grabbed); -+ kfree(h); -+ return result; -+} -+ -+/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is -+ one). Return 1 if it succeeded, 0 - otherwise */ -+static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext, -+ reiser4_extent *replace) -+{ -+ assert("vs-1415", extent_by_coord(coord) == ext); -+ -+ if (coord->unit_pos == 0 -+ || state_of_extent(ext - 1) != ALLOCATED_EXTENT) -+ /* @ext either does not exist or is not allocated extent */ -+ return 0; -+ if (extent_get_start(ext - 1) + extent_get_width(ext - 1) != -+ extent_get_start(replace)) -+ return 0; -+ -+ /* we can glue, widen previous unit */ -+ extent_set_width(ext - 1, -+ extent_get_width(ext - 1) + extent_get_width(replace)); -+ -+ if (extent_get_width(ext) != extent_get_width(replace)) { -+ /* make current extent narrower */ -+ if (state_of_extent(ext) == ALLOCATED_EXTENT) -+ extent_set_start(ext, -+ extent_get_start(ext) + -+ extent_get_width(replace)); -+ extent_set_width(ext, -+ extent_get_width(ext) - -+ extent_get_width(replace)); -+ } else { -+ /* current extent completely glued with its left neighbor, remove it */ -+ coord_t from, to; -+ -+ coord_dup(&from, coord); -+ from.unit_pos = nr_units_extent(coord) - 1; -+ coord_dup(&to, &from); -+ -+ /* currently cut from extent can cut either from the beginning or from the end. 
Move place which got -+ freed after unit removal to end of item */ -+ memmove(ext, ext + 1, -+ (from.unit_pos - -+ coord->unit_pos) * sizeof(reiser4_extent)); -+ /* wipe part of item which is going to be cut, so that node_check will not be confused */ -+ cut_node_content(&from, &to, NULL, NULL, NULL); -+ } -+ znode_make_dirty(coord->node); -+ /* move coord back */ -+ coord->unit_pos--; -+ return 1; -+} -+ -+/** -+ * conv_extent - replace extent with 2 ones -+ * @coord: coordinate of extent to be replaced -+ * @replace: extent to overwrite the one @coord is set to -+ * -+ * Overwrites extent @coord is set to and paste one extent unit after -+ * overwritten one if @replace is shorter than initial extent -+ */ -+static int conv_extent(coord_t *coord, reiser4_extent *replace) -+{ -+ int result; -+ struct replace_handle *h; -+ reiser4_extent *ext; -+ reiser4_block_nr start, width, new_width; -+ reiser4_block_nr grabbed; -+ extent_state state; -+ -+ ext = extent_by_coord(coord); -+ state = state_of_extent(ext); -+ start = extent_get_start(ext); -+ width = extent_get_width(ext); -+ new_width = extent_get_width(replace); -+ -+ assert("vs-1458", (state == UNALLOCATED_EXTENT || -+ state == ALLOCATED_EXTENT)); -+ assert("vs-1459", width >= new_width); -+ -+ if (try_to_merge_with_left(coord, ext, replace)) { -+ /* merged @replace with left neighbor. 
Current unit is either -+ removed or narrowed */ -+ return 0; -+ } -+ -+ if (width == new_width) { -+ /* replace current extent with @replace */ -+ *ext = *replace; -+ znode_make_dirty(coord->node); -+ return 0; -+ } -+ -+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get()); -+ if (h == NULL) -+ return RETERR(-ENOMEM); -+ h->coord = coord; -+ h->lh = znode_lh(coord->node); -+ h->pkey = &h->key; -+ unit_key_by_coord(coord, h->pkey); -+ set_key_offset(h->pkey, -+ (get_key_offset(h->pkey) + new_width * current_blocksize)); -+ h->overwrite = *replace; -+ -+ /* replace @ext with @replace and padding extent */ -+ reiser4_set_extent(&h->new_extents[0], -+ (state == ALLOCATED_EXTENT) ? -+ (start + new_width) : -+ UNALLOCATED_EXTENT_START, -+ width - new_width); -+ h->nr_new_extents = 1; -+ h->flags = COPI_DONT_SHIFT_LEFT; -+ h->paste_key = h->key; -+ -+ /* reserve space for extent unit paste, @grabbed is reserved before */ -+ grabbed = reserve_replace(); -+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten -+ extent */); -+ -+ /* restore reserved */ -+ free_replace_reserved(grabbed); -+ kfree(h); -+ return result; -+} -+ -+/** -+ * assign_real_blocknrs -+ * @flush_pos: -+ * @oid: objectid of file jnodes to assign block number to belongs to -+ * @index: first jnode on the range -+ * @count: number of jnodes to assign block numbers to -+ * @first: start of allocated block range -+ * -+ * Assigns block numbers to each of @count jnodes. Index of first jnode is -+ * @index. Jnodes get lookuped with jlookup. 
-+ */ -+static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid, -+ unsigned long index, reiser4_block_nr count, -+ reiser4_block_nr first) -+{ -+ unsigned long i; -+ reiser4_tree *tree; -+ txn_atom *atom; -+ int nr; -+ -+ atom = atom_locked_by_fq(flush_pos->fq); -+ assert("vs-1468", atom); -+ BUG_ON(atom == NULL); -+ -+ nr = 0; -+ tree = current_tree; -+ for (i = 0; i < count; ++i, ++index) { -+ jnode *node; -+ -+ node = jlookup(tree, oid, index); -+ assert("", node != NULL); -+ BUG_ON(node == NULL); -+ -+ spin_lock_jnode(node); -+ assert("", !jnode_is_flushprepped(node)); -+ assert("vs-1475", node->atom == atom); -+ assert("vs-1476", atomic_read(&node->x_count) > 0); -+ -+ JF_CLR(node, JNODE_FLUSH_RESERVED); -+ jnode_set_block(node, &first); -+ unformatted_make_reloc(node, flush_pos->fq); -+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node), -+ FQ_LIST, 0)); -+ spin_unlock_jnode(node); -+ first++; -+ -+ atomic_dec(&node->x_count); -+ nr ++; -+ } -+ -+ spin_unlock_atom(atom); -+ return; -+} -+ -+/** -+ * make_node_ovrwr - assign node to overwrite set -+ * @jnodes: overwrite set list head -+ * @node: jnode to belong to overwrite set -+ * -+ * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes -+ * which is an accumulator for nodes before they get to overwrite set list of -+ * atom. 
-+ */ -+static void make_node_ovrwr(struct list_head *jnodes, jnode *node) -+{ -+ spin_lock_jnode(node); -+ -+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC)); -+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR)); -+ -+ JF_SET(node, JNODE_OVRWR); -+ list_move_tail(&node->capture_link, jnodes); -+ ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0)); -+ -+ spin_unlock_jnode(node); -+} -+ -+/** -+ * mark_jnodes_overwrite - put bunch of jnodes to overwrite set -+ * @flush_pos: flush position -+ * @oid: objectid of file jnodes belong to -+ * @index: starting index -+ * @width: extent width -+ * -+ * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's -+ * overwrite set. Starting from the one with index @index. If end of slum is -+ * detected (node is not found or flushprepped) - stop iterating and set flush -+ * position's state to POS_INVALID. -+ */ -+static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid, -+ unsigned long index, reiser4_block_nr width) -+{ -+ unsigned long i; -+ reiser4_tree *tree; -+ jnode *node; -+ txn_atom *atom; -+ LIST_HEAD(jnodes); -+ -+ tree = current_tree; -+ -+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos)); -+ assert("vs-1478", atom); -+ -+ for (i = flush_pos->pos_in_unit; i < width; i++, index++) { -+ node = jlookup(tree, oid, index); -+ if (!node) { -+ flush_pos->state = POS_INVALID; -+ break; -+ } -+ if (jnode_check_flushprepped(node)) { -+ flush_pos->state = POS_INVALID; -+ atomic_dec(&node->x_count); -+ break; -+ } -+ if (node->atom != atom) { -+ flush_pos->state = POS_INVALID; -+ atomic_dec(&node->x_count); -+ break; -+ } -+ make_node_ovrwr(&jnodes, node); -+ atomic_dec(&node->x_count); -+ } -+ -+ list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev); -+ spin_unlock_atom(atom); -+} -+ -+/** -+ * allocated_extent_slum_size -+ * @flush_pos: -+ * @oid: -+ * @index: -+ * @count: -+ * -+ * -+ */ -+static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid, -+ unsigned 
long index, unsigned long count) -+{ -+ unsigned long i; -+ reiser4_tree *tree; -+ txn_atom *atom; -+ int nr; -+ -+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos)); -+ assert("vs-1468", atom); -+ -+ nr = 0; -+ tree = current_tree; -+ for (i = 0; i < count; ++i, ++index) { -+ jnode *node; -+ -+ node = jlookup(tree, oid, index); -+ if (!node) -+ break; -+ -+ if (jnode_check_flushprepped(node)) { -+ atomic_dec(&node->x_count); -+ break; -+ } -+ -+ if (node->atom != atom) { -+ /* -+ * this is possible on overwrite: extent_write may -+ * capture several unformatted nodes without capturing -+ * any formatted nodes. -+ */ -+ atomic_dec(&node->x_count); -+ break; -+ } -+ -+ assert("vs-1476", atomic_read(&node->x_count) > 1); -+ atomic_dec(&node->x_count); -+ nr ++; -+ } -+ -+ spin_unlock_atom(atom); -+ return nr; -+} -+ -+/** -+ * alloc_extent -+ * @flush_pos: -+ * -+ * -+ * this is called by handle_pos_on_twig to proceed extent unit flush_pos->coord -+ * is set to. It is to prepare for flushing sequence of not flushprepped nodes -+ * (slum). It supposes that slum starts at flush_pos->pos_in_unit position -+ * within the extent. 
Slum gets to relocate set if flush_pos->leaf_relocate is -+ * set to 1 and to overwrite set otherwise -+ */ -+int reiser4_alloc_extent(flush_pos_t *flush_pos) -+{ -+ coord_t *coord; -+ reiser4_extent *ext; -+ reiser4_extent replace_ext; -+ oid_t oid; -+ reiser4_block_nr protected; -+ reiser4_block_nr start; -+ __u64 index; -+ __u64 width; -+ extent_state state; -+ int result; -+ reiser4_block_nr first_allocated; -+ __u64 allocated; -+ reiser4_key key; -+ block_stage_t block_stage; -+ -+ assert("vs-1468", flush_pos->state == POS_ON_EPOINT); -+ assert("vs-1469", coord_is_existing_unit(&flush_pos->coord) -+ && item_is_extent(&flush_pos->coord)); -+ -+ coord = &flush_pos->coord; -+ -+ ext = extent_by_coord(coord); -+ state = state_of_extent(ext); -+ if (state == HOLE_EXTENT) { -+ flush_pos->state = POS_INVALID; -+ return 0; -+ } -+ -+ item_key_by_coord(coord, &key); -+ oid = get_key_objectid(&key); -+ index = extent_unit_index(coord) + flush_pos->pos_in_unit; -+ start = extent_get_start(ext); -+ width = extent_get_width(ext); -+ -+ assert("vs-1457", width > flush_pos->pos_in_unit); -+ -+ if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) { -+ /* relocate */ -+ if (flush_pos->pos_in_unit) { -+ /* split extent unit into two */ -+ result = -+ split_allocated_extent(coord, -+ flush_pos->pos_in_unit); -+ flush_pos->pos_in_unit = 0; -+ return result; -+ } -+ -+ /* limit number of nodes to allocate */ -+ if (flush_pos->nr_to_write < width) -+ width = flush_pos->nr_to_write; -+ -+ if (state == ALLOCATED_EXTENT) { -+ /* -+ * all protected nodes are not flushprepped, therefore -+ * they are counted as flush_reserved -+ */ -+ block_stage = BLOCK_FLUSH_RESERVED; -+ protected = allocated_extent_slum_size(flush_pos, oid, -+ index, width); -+ if (protected == 0) { -+ flush_pos->state = POS_INVALID; -+ flush_pos->pos_in_unit = 0; -+ return 0; -+ } -+ } else { -+ block_stage = BLOCK_UNALLOCATED; -+ protected = width; -+ } -+ -+ /* -+ * look at previous unit if possible. 
If it is allocated, make -+ * preceder more precise -+ */ -+ if (coord->unit_pos && -+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT)) -+ reiser4_pos_hint(flush_pos)->blk = -+ extent_get_start(ext - 1) + -+ extent_get_width(ext - 1); -+ -+ /* allocate new block numbers for protected nodes */ -+ extent_allocate_blocks(reiser4_pos_hint(flush_pos), -+ protected, -+ &first_allocated, &allocated, -+ block_stage); -+ -+ if (state == ALLOCATED_EXTENT) -+ /* -+ * on relocating - free nodes which are going to be -+ * relocated -+ */ -+ reiser4_dealloc_blocks(&start, &allocated, -+ BLOCK_ALLOCATED, BA_DEFER); -+ -+ /* assign new block numbers to protected nodes */ -+ assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated); -+ -+ /* prepare extent which will replace current one */ -+ reiser4_set_extent(&replace_ext, first_allocated, allocated); -+ -+ /* adjust extent item */ -+ result = conv_extent(coord, &replace_ext); -+ if (result != 0 && result != -ENOMEM) { -+ warning("vs-1461", -+ "Failed to allocate extent. Should not happen\n"); -+ return result; -+ } -+ -+ /* -+ * break flush: we prepared for flushing as many blocks as we -+ * were asked for -+ */ -+ if (flush_pos->nr_to_write == allocated) -+ flush_pos->state = POS_INVALID; -+ } else { -+ /* overwrite */ -+ mark_jnodes_overwrite(flush_pos, oid, index, width); -+ } -+ flush_pos->pos_in_unit = 0; -+ return 0; -+} -+ -+/* if @key is glueable to the item @coord is set to */ -+static int must_insert(const coord_t *coord, const reiser4_key *key) -+{ -+ reiser4_key last; -+ -+ if (item_id_by_coord(coord) == EXTENT_POINTER_ID -+ && keyeq(append_key_extent(coord, &last), key)) -+ return 0; -+ return 1; -+} -+ -+/* copy extent @copy to the end of @node. 
It may have to either insert new item after the last one, or append last item, -+ or modify last unit of last item to have greater width */ -+static int put_unit_to_end(znode *node, const reiser4_key *key, -+ reiser4_extent *copy_ext) -+{ -+ int result; -+ coord_t coord; -+ cop_insert_flag flags; -+ reiser4_extent *last_ext; -+ reiser4_item_data data; -+ -+ /* set coord after last unit in an item */ -+ coord_init_last_unit(&coord, node); -+ coord.between = AFTER_UNIT; -+ -+ flags = -+ COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE; -+ if (must_insert(&coord, key)) { -+ result = -+ insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1), -+ key, NULL /*lh */ , flags); -+ -+ } else { -+ /* try to glue with last unit */ -+ last_ext = extent_by_coord(&coord); -+ if (state_of_extent(last_ext) && -+ extent_get_start(last_ext) + extent_get_width(last_ext) == -+ extent_get_start(copy_ext)) { -+ /* widen last unit of node */ -+ extent_set_width(last_ext, -+ extent_get_width(last_ext) + -+ extent_get_width(copy_ext)); -+ znode_make_dirty(node); -+ return 0; -+ } -+ -+ /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */ -+ result = -+ insert_into_item(&coord, NULL /*lh */ , key, -+ init_new_extent(&data, copy_ext, 1), -+ flags); -+ } -+ -+ assert("vs-438", result == 0 || result == -E_NODE_FULL); -+ return result; -+} -+ -+/* @coord is set to extent unit */ -+squeeze_result squalloc_extent(znode *left, const coord_t *coord, -+ flush_pos_t *flush_pos, -+ reiser4_key *stop_key) -+{ -+ reiser4_extent *ext; -+ __u64 index; -+ __u64 width; -+ reiser4_block_nr start; -+ extent_state state; -+ oid_t oid; -+ reiser4_block_nr first_allocated; -+ __u64 allocated; -+ __u64 protected; -+ reiser4_extent copy_extent; -+ reiser4_key key; -+ int result; -+ block_stage_t block_stage; -+ -+ assert("vs-1457", flush_pos->pos_in_unit == 0); -+ assert("vs-1467", coord_is_leftmost_unit(coord)); -+ assert("vs-1467", item_is_extent(coord)); 
-+ -+ ext = extent_by_coord(coord); -+ index = extent_unit_index(coord); -+ start = extent_get_start(ext); -+ width = extent_get_width(ext); -+ state = state_of_extent(ext); -+ unit_key_by_coord(coord, &key); -+ oid = get_key_objectid(&key); -+ -+ if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) || -+ (state == UNALLOCATED_EXTENT)) { -+ /* relocate */ -+ if (state == ALLOCATED_EXTENT) { -+ /* all protected nodes are not flushprepped, therefore -+ * they are counted as flush_reserved */ -+ block_stage = BLOCK_FLUSH_RESERVED; -+ protected = allocated_extent_slum_size(flush_pos, oid, -+ index, width); -+ if (protected == 0) { -+ flush_pos->state = POS_INVALID; -+ flush_pos->pos_in_unit = 0; -+ return 0; -+ } -+ } else { -+ block_stage = BLOCK_UNALLOCATED; -+ protected = width; -+ } -+ -+ /* -+ * look at previous unit if possible. If it is allocated, make -+ * preceder more precise -+ */ -+ if (coord->unit_pos && -+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT)) -+ reiser4_pos_hint(flush_pos)->blk = -+ extent_get_start(ext - 1) + -+ extent_get_width(ext - 1); -+ -+ /* allocate new block numbers for protected nodes */ -+ extent_allocate_blocks(reiser4_pos_hint(flush_pos), -+ protected, -+ &first_allocated, &allocated, -+ block_stage); -+ -+ /* prepare extent which will be copied to left */ -+ reiser4_set_extent(©_extent, first_allocated, allocated); -+ -+ result = put_unit_to_end(left, &key, ©_extent); -+ if (result == -E_NODE_FULL) { -+ int target_block_stage; -+ -+ /* free blocks which were just allocated */ -+ target_block_stage = -+ (state == -+ ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED : -+ BLOCK_UNALLOCATED; -+ reiser4_dealloc_blocks(&first_allocated, &allocated, -+ target_block_stage, -+ BA_PERMANENT); -+ -+ /* rewind the preceder. 
*/ -+ flush_pos->preceder.blk = first_allocated; -+ check_preceder(flush_pos->preceder.blk); -+ -+ return SQUEEZE_TARGET_FULL; -+ } -+ -+ if (state == ALLOCATED_EXTENT) { -+ /* free nodes which were relocated */ -+ reiser4_dealloc_blocks(&start, &allocated, -+ BLOCK_ALLOCATED, BA_DEFER); -+ } -+ -+ /* assign new block numbers to protected nodes */ -+ assign_real_blocknrs(flush_pos, oid, index, allocated, -+ first_allocated); -+ -+ set_key_offset(&key, -+ get_key_offset(&key) + -+ (allocated << current_blocksize_bits)); -+ } else { -+ /* -+ * overwrite: try to copy unit as it is to left neighbor and -+ * make all first not flushprepped nodes overwrite nodes -+ */ -+ reiser4_set_extent(©_extent, start, width); -+ result = put_unit_to_end(left, &key, ©_extent); -+ if (result == -E_NODE_FULL) -+ return SQUEEZE_TARGET_FULL; -+ -+ if (state != HOLE_EXTENT) -+ mark_jnodes_overwrite(flush_pos, oid, index, width); -+ set_key_offset(&key, -+ get_key_offset(&key) + -+ (width << current_blocksize_bits)); -+ } -+ *stop_key = key; -+ return SQUEEZE_CONTINUE; -+} -+ -+int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key) -+{ -+ return key_by_inode_and_offset_common(inode, off, key); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent.h linux-2.6.20/fs/reiser4/plugin/item/extent.h ---- linux-2.6.20.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/extent.h 2007-05-06 14:50:43.811010720 +0400 -@@ -0,0 +1,231 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#ifndef __REISER4_EXTENT_H__ -+#define __REISER4_EXTENT_H__ -+ -+/* on disk extent */ -+typedef struct { -+ reiser4_dblock_nr start; -+ reiser4_dblock_nr width; -+} reiser4_extent; -+ -+typedef struct 
extent_stat { -+ int unallocated_units; -+ int unallocated_blocks; -+ int allocated_units; -+ int allocated_blocks; -+ int hole_units; -+ int hole_blocks; -+} extent_stat; -+ -+/* extents in an extent item can be either holes, or unallocated or allocated -+ extents */ -+typedef enum { -+ HOLE_EXTENT, -+ UNALLOCATED_EXTENT, -+ ALLOCATED_EXTENT -+} extent_state; -+ -+#define HOLE_EXTENT_START 0 -+#define UNALLOCATED_EXTENT_START 1 -+#define UNALLOCATED_EXTENT_START2 2 -+ -+typedef struct { -+ reiser4_block_nr pos_in_unit; -+ reiser4_block_nr width; /* width of current unit */ -+ pos_in_node_t nr_units; /* number of units */ -+ int ext_offset; /* offset from the beginning of zdata() */ -+ unsigned long expected_page; -+#if REISER4_DEBUG -+ reiser4_extent extent; -+#endif -+} extent_coord_extension_t; -+ -+/* macros to set/get fields of on-disk extent */ -+static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext) -+{ -+ return le64_to_cpu(ext->start); -+} -+ -+static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext) -+{ -+ return le64_to_cpu(ext->width); -+} -+ -+extern __u64 reiser4_current_block_count(void); -+ -+static inline void -+extent_set_start(reiser4_extent * ext, reiser4_block_nr start) -+{ -+ cassert(sizeof(ext->start) == 8); -+ assert("nikita-2510", -+ ergo(start > 1, start < reiser4_current_block_count())); -+ put_unaligned(cpu_to_le64(start), &ext->start); -+} -+ -+static inline void -+extent_set_width(reiser4_extent * ext, reiser4_block_nr width) -+{ -+ cassert(sizeof(ext->width) == 8); -+ assert("", width > 0); -+ put_unaligned(cpu_to_le64(width), &ext->width); -+ assert("nikita-2511", -+ ergo(extent_get_start(ext) > 1, -+ extent_get_start(ext) + width <= -+ reiser4_current_block_count())); -+} -+ -+#define extent_item(coord) \ -+({ \ -+ assert("nikita-3143", item_is_extent(coord)); \ -+ ((reiser4_extent *)item_body_by_coord (coord)); \ -+}) -+ -+#define extent_by_coord(coord) \ -+({ \ -+ assert("nikita-3144", 
item_is_extent(coord)); \ -+ (extent_item (coord) + (coord)->unit_pos); \ -+}) -+ -+#define width_by_coord(coord) \ -+({ \ -+ assert("nikita-3145", item_is_extent(coord)); \ -+ extent_get_width (extent_by_coord(coord)); \ -+}) -+ -+struct carry_cut_data; -+struct carry_kill_data; -+ -+/* plugin->u.item.b.* */ -+reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *); -+int can_contain_key_extent(const coord_t * coord, const reiser4_key * key, -+ const reiser4_item_data *); -+int mergeable_extent(const coord_t * p1, const coord_t * p2); -+pos_in_node_t nr_units_extent(const coord_t *); -+lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *); -+void init_coord_extent(coord_t *); -+int init_extent(coord_t *, reiser4_item_data *); -+int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *); -+int can_shift_extent(unsigned free_space, -+ coord_t * source, znode * target, shift_direction, -+ unsigned *size, unsigned want); -+void copy_units_extent(coord_t * target, coord_t * source, unsigned from, -+ unsigned count, shift_direction where_is_free_space, -+ unsigned free_space); -+int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count, -+ struct carry_kill_data *); -+int create_hook_extent(const coord_t * coord, void *arg); -+int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+reiser4_key *unit_key_extent(const coord_t *, reiser4_key *); -+reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *); -+void print_extent(const char *, coord_t *); -+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child); -+int utmost_child_real_block_extent(const coord_t * coord, sideof side, -+ reiser4_block_nr * block); 
-+void item_stat_extent(const coord_t * coord, void *vp); -+int reiser4_check_extent(const coord_t * coord, const char **error); -+ -+/* plugin->u.item.s.file.* */ -+ssize_t reiser4_write_extent(struct file *, const char __user *, -+ size_t, loff_t *); -+int reiser4_read_extent(struct file *, flow_t *, hint_t *); -+int reiser4_readpage_extent(void *, struct page *); -+int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*); -+reiser4_key *append_key_extent(const coord_t *, reiser4_key *); -+void init_coord_extension_extent(uf_coord_t *, loff_t offset); -+int get_block_address_extent(const coord_t *, sector_t block, -+ sector_t * result); -+ -+/* these are used in flush.c -+ FIXME-VS: should they be somewhere in item_plugin? */ -+int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos); -+int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos, -+ reiser4_key * stop_key); -+ -+int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */ -+__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */ -+__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */ -+ -+/* plugin->u.item.f. 
*/ -+int reiser4_scan_extent(flush_scan * scan); -+extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *); -+ -+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit, -+ int nr_extents); -+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr); -+extent_state state_of_extent(reiser4_extent * ext); -+void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start, -+ reiser4_block_nr width); -+int reiser4_update_extent(struct inode *, jnode *, loff_t pos, -+ int *plugged_hole); -+ -+#include "../../coord.h" -+#include "../../lock.h" -+#include "../../tap.h" -+ -+struct replace_handle { -+ /* these are to be set before calling reiser4_replace_extent */ -+ coord_t *coord; -+ lock_handle *lh; -+ reiser4_key key; -+ reiser4_key *pkey; -+ reiser4_extent overwrite; -+ reiser4_extent new_extents[2]; -+ int nr_new_extents; -+ unsigned flags; -+ -+ /* these are used by reiser4_replace_extent */ -+ reiser4_item_data item; -+ coord_t coord_after; -+ lock_handle lh_after; -+ tap_t watch; -+ reiser4_key paste_key; -+#if REISER4_DEBUG -+ reiser4_extent orig_ext; -+ reiser4_key tmp; -+#endif -+}; -+ -+/* this structure is kmalloced before calling make_extent to avoid excessive -+ stack consumption on plug_hole->reiser4_replace_extent */ -+struct make_extent_handle { -+ uf_coord_t *uf_coord; -+ reiser4_block_nr blocknr; -+ int created; -+ struct inode *inode; -+ union { -+ struct { -+ } append; -+ struct replace_handle replace; -+ } u; -+}; -+ -+int reiser4_replace_extent(struct replace_handle *, -+ int return_inserted_position); -+lock_handle *znode_lh(znode *); -+ -+/* the reiser4 repacker support */ -+struct repacker_cursor; -+extern int process_extent_backward_for_repacking(tap_t *, -+ struct repacker_cursor *); -+extern int mark_extent_for_repacking(tap_t *, int); -+ -+#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord)) -+#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent)) -+ -+/* 
__REISER4_EXTENT_H__ */ -+#endif -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.20/fs/reiser4/plugin/item/extent_item_ops.c ---- linux-2.6.20.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/extent_item_ops.c 2007-05-06 14:50:43.815011970 +0400 -@@ -0,0 +1,889 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../inode.h" -+#include "../../tree_walk.h" /* check_sibling_list() */ -+#include "../../page_cache.h" -+#include "../../carry.h" -+ -+#include -+ -+/* item_plugin->b.max_key_inside */ -+reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key) -+{ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, get_key_offset(reiser4_max_key())); -+ return key; -+} -+ -+/* item_plugin->b.can_contain_key -+ this checks whether @key of @data is matching to position set by @coord */ -+int -+can_contain_key_extent(const coord_t * coord, const reiser4_key * key, -+ const reiser4_item_data * data) -+{ -+ reiser4_key item_key; -+ -+ if (item_plugin_by_coord(coord) != data->iplug) -+ return 0; -+ -+ item_key_by_coord(coord, &item_key); -+ if (get_key_locality(key) != get_key_locality(&item_key) || -+ get_key_objectid(key) != get_key_objectid(&item_key) || -+ get_key_ordering(key) != get_key_ordering(&item_key)) -+ return 0; -+ -+ return 1; -+} -+ -+/* item_plugin->b.mergeable -+ first item is of extent type */ -+/* Audited by: green(2002.06.13) */ -+int mergeable_extent(const coord_t * p1, const coord_t * p2) -+{ -+ reiser4_key key1, key2; -+ -+ assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID); -+ /* FIXME-VS: Which is it? 
Assert or return 0 */ -+ if (item_id_by_coord(p2) != EXTENT_POINTER_ID) { -+ return 0; -+ } -+ -+ item_key_by_coord(p1, &key1); -+ item_key_by_coord(p2, &key2); -+ if (get_key_locality(&key1) != get_key_locality(&key2) || -+ get_key_objectid(&key1) != get_key_objectid(&key2) || -+ get_key_ordering(&key1) != get_key_ordering(&key2) || -+ get_key_type(&key1) != get_key_type(&key2)) -+ return 0; -+ if (get_key_offset(&key1) + -+ reiser4_extent_size(p1, nr_units_extent(p1)) != -+ get_key_offset(&key2)) -+ return 0; -+ return 1; -+} -+ -+/* item_plugin->b.nr_units */ -+pos_in_node_t nr_units_extent(const coord_t * coord) -+{ -+ /* length of extent item has to be multiple of extent size */ -+ assert("vs-1424", -+ (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0); -+ return item_length_by_coord(coord) / sizeof(reiser4_extent); -+} -+ -+/* item_plugin->b.lookup */ -+lookup_result -+lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG, -+ coord_t * coord) -+{ /* znode and item_pos are -+ set to an extent item to -+ look through */ -+ reiser4_key item_key; -+ reiser4_block_nr lookuped, offset; -+ unsigned i, nr_units; -+ reiser4_extent *ext; -+ unsigned blocksize; -+ unsigned char blocksize_bits; -+ -+ item_key_by_coord(coord, &item_key); -+ offset = get_key_offset(&item_key); -+ -+ /* key we are looking for must be greater than key of item @coord */ -+ assert("vs-414", keygt(key, &item_key)); -+ -+ assert("umka-99945", -+ !keygt(key, max_key_inside_extent(coord, &item_key))); -+ -+ ext = extent_item(coord); -+ assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset)); -+ -+ blocksize = current_blocksize; -+ blocksize_bits = current_blocksize_bits; -+ -+ /* offset we are looking for */ -+ lookuped = get_key_offset(key); -+ -+ nr_units = nr_units_extent(coord); -+ /* go through all extents until the one which address given offset */ -+ for (i = 0; i < nr_units; i++, ext++) { -+ offset += (extent_get_width(ext) << blocksize_bits); -+ if 
(offset > lookuped) { -+ /* desired byte is somewhere in this extent */ -+ coord->unit_pos = i; -+ coord->between = AT_UNIT; -+ return CBK_COORD_FOUND; -+ } -+ } -+ -+ /* set coord after last unit */ -+ coord->unit_pos = nr_units - 1; -+ coord->between = AFTER_UNIT; -+ return CBK_COORD_FOUND; -+} -+ -+/* item_plugin->b.paste -+ item @coord is set to has been appended with @data->length of free -+ space. data->data contains data to be pasted into the item in position -+ @coord->in_item.unit_pos. It must fit into that free space. -+ @coord must be set between units. -+*/ -+int -+paste_extent(coord_t * coord, reiser4_item_data * data, -+ carry_plugin_info * info UNUSED_ARG) -+{ -+ unsigned old_nr_units; -+ reiser4_extent *ext; -+ int item_length; -+ -+ ext = extent_item(coord); -+ item_length = item_length_by_coord(coord); -+ old_nr_units = (item_length - data->length) / sizeof(reiser4_extent); -+ -+ /* this is also used to copy extent into newly created item, so -+ old_nr_units could be 0 */ -+ assert("vs-260", item_length >= data->length); -+ -+ /* make sure that coord is set properly */ -+ assert("vs-35", -+ ((!coord_is_existing_unit(coord)) -+ || (!old_nr_units && !coord->unit_pos))); -+ -+ /* first unit to be moved */ -+ switch (coord->between) { -+ case AFTER_UNIT: -+ coord->unit_pos++; -+ case BEFORE_UNIT: -+ coord->between = AT_UNIT; -+ break; -+ case AT_UNIT: -+ assert("vs-331", !old_nr_units && !coord->unit_pos); -+ break; -+ default: -+ impossible("vs-330", "coord is set improperly"); -+ } -+ -+ /* prepare space for new units */ -+ memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent), -+ ext + coord->unit_pos, -+ (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent)); -+ -+ /* copy new data from kernel space */ -+ assert("vs-556", data->user == 0); -+ memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length); -+ -+ /* after paste @coord is set to first of pasted units */ -+ assert("vs-332", coord_is_existing_unit(coord)); -+ 
assert("vs-333", -+ !memcmp(data->data, extent_by_coord(coord), -+ (unsigned)data->length)); -+ return 0; -+} -+ -+/* item_plugin->b.can_shift */ -+int -+can_shift_extent(unsigned free_space, coord_t * source, -+ znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG, -+ unsigned *size, unsigned want) -+{ -+ *size = item_length_by_coord(source); -+ if (*size > free_space) -+ /* never split a unit of extent item */ -+ *size = free_space - free_space % sizeof(reiser4_extent); -+ -+ /* we can shift *size bytes, calculate how many do we want to shift */ -+ if (*size > want * sizeof(reiser4_extent)) -+ *size = want * sizeof(reiser4_extent); -+ -+ if (*size % sizeof(reiser4_extent) != 0) -+ impossible("vs-119", "Wrong extent size: %i %zd", *size, -+ sizeof(reiser4_extent)); -+ return *size / sizeof(reiser4_extent); -+ -+} -+ -+/* item_plugin->b.copy_units */ -+void -+copy_units_extent(coord_t * target, coord_t * source, -+ unsigned from, unsigned count, -+ shift_direction where_is_free_space, unsigned free_space) -+{ -+ char *from_ext, *to_ext; -+ -+ assert("vs-217", free_space == count * sizeof(reiser4_extent)); -+ -+ from_ext = item_body_by_coord(source); -+ to_ext = item_body_by_coord(target); -+ -+ if (where_is_free_space == SHIFT_LEFT) { -+ assert("vs-215", from == 0); -+ -+ /* At this moment, item length was already updated in the item -+ header by shifting code, hence nr_units_extent() will -+ return "new" number of units---one we obtain after copying -+ units. 
-+ */ -+ to_ext += -+ (nr_units_extent(target) - count) * sizeof(reiser4_extent); -+ } else { -+ reiser4_key key; -+ coord_t coord; -+ -+ assert("vs-216", -+ from + count == coord_last_unit_pos(source) + 1); -+ -+ from_ext += item_length_by_coord(source) - free_space; -+ -+ /* new units are inserted before first unit in an item, -+ therefore, we have to update item key */ -+ coord = *source; -+ coord.unit_pos = from; -+ unit_key_extent(&coord, &key); -+ -+ node_plugin_by_node(target->node)->update_item_key(target, &key, -+ NULL /*info */); -+ } -+ -+ memcpy(to_ext, from_ext, free_space); -+} -+ -+/* item_plugin->b.create_hook -+ @arg is znode of leaf node for which we need to update right delimiting key */ -+int create_hook_extent(const coord_t * coord, void *arg) -+{ -+ coord_t *child_coord; -+ znode *node; -+ reiser4_key key; -+ reiser4_tree *tree; -+ -+ if (!arg) -+ return 0; -+ -+ child_coord = arg; -+ tree = znode_get_tree(coord->node); -+ -+ assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL); -+ -+ write_lock_tree(tree); -+ write_lock_dk(tree); -+ /* find a node on the left level for which right delimiting key has to -+ be updated */ -+ if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) { -+ assert("vs-411", znode_is_left_connected(child_coord->node)); -+ node = child_coord->node->left; -+ } else { -+ assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT); -+ node = child_coord->node; -+ assert("nikita-3314", node != NULL); -+ } -+ -+ if (node != NULL) { -+ znode_set_rd_key(node, item_key_by_coord(coord, &key)); -+ -+ assert("nikita-3282", check_sibling_list(node)); -+ /* break sibling links */ -+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) { -+ ON_DEBUG(node->right->left_version = -+ atomic_inc_return(&delim_key_version); -+ node->right_version = -+ atomic_inc_return(&delim_key_version);); -+ -+ node->right->left = NULL; -+ node->right = NULL; -+ } -+ } -+ write_unlock_dk(tree); -+ write_unlock_tree(tree); -+ return 
0; -+} -+ -+#define ITEM_TAIL_KILLED 0 -+#define ITEM_HEAD_KILLED 1 -+#define ITEM_KILLED 2 -+ -+/* item_plugin->b.kill_hook -+ this is called when @count units starting from @from-th one are going to be removed -+ */ -+int -+kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count, -+ struct carry_kill_data *kdata) -+{ -+ reiser4_extent *ext; -+ reiser4_block_nr start, length; -+ const reiser4_key *pfrom_key, *pto_key; -+ struct inode *inode; -+ reiser4_tree *tree; -+ pgoff_t from_off, to_off, offset, skip; -+ int retval; -+ -+ /* these are located in memory kmalloc-ed by kill_node_content */ -+ reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key; -+ coord_t *dup, *next; -+ -+ assert("zam-811", znode_is_write_locked(coord->node)); -+ assert("nikita-3315", kdata != NULL); -+ assert("vs-34", kdata->buf != NULL); -+ -+ /* map structures to kdata->buf */ -+ min_item_key = (reiser4_key *) (kdata->buf); -+ max_item_key = min_item_key + 1; -+ from_key = max_item_key + 1; -+ to_key = from_key + 1; -+ key = to_key + 1; -+ dup = (coord_t *) (key + 1); -+ next = dup + 1; -+ -+ item_key_by_coord(coord, min_item_key); -+ max_item_key_by_coord(coord, max_item_key); -+ -+ if (kdata->params.from_key) { -+ pfrom_key = kdata->params.from_key; -+ pto_key = kdata->params.to_key; -+ } else { -+ assert("vs-1549", from == coord->unit_pos); -+ unit_key_by_coord(coord, from_key); -+ pfrom_key = from_key; -+ -+ coord_dup(dup, coord); -+ dup->unit_pos = from + count - 1; -+ max_unit_key_by_coord(dup, to_key); -+ pto_key = to_key; -+ } -+ -+ if (!keylt(pto_key, max_item_key)) { -+ if (!keygt(pfrom_key, min_item_key)) { -+ znode *left, *right; -+ -+ /* item is to be removed completely */ -+ assert("nikita-3316", kdata->left != NULL -+ && kdata->right != NULL); -+ -+ left = kdata->left->node; -+ right = kdata->right->node; -+ -+ tree = current_tree; -+ /* we have to do two things: -+ * -+ * 1. 
link left and right formatted neighbors of -+ * extent being removed, and -+ * -+ * 2. update their delimiting keys. -+ * -+ * atomicity of these operations is protected by -+ * taking dk-lock and tree-lock. -+ */ -+ /* if neighbors of item being removed are znodes - -+ * link them */ -+ write_lock_tree(tree); -+ write_lock_dk(tree); -+ link_left_and_right(left, right); -+ if (left) { -+ /* update right delimiting key of left -+ * neighbor of extent item */ -+ /*coord_t next; -+ reiser4_key key; */ -+ -+ coord_dup(next, coord); -+ -+ if (coord_next_item(next)) -+ *key = *znode_get_rd_key(coord->node); -+ else -+ item_key_by_coord(next, key); -+ znode_set_rd_key(left, key); -+ } -+ write_unlock_dk(tree); -+ write_unlock_tree(tree); -+ -+ from_off = -+ get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT; -+ to_off = -+ (get_key_offset(max_item_key) + -+ 1) >> PAGE_CACHE_SHIFT; -+ retval = ITEM_KILLED; -+ } else { -+ /* tail of item is to be removed */ -+ from_off = -+ (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE - -+ 1) >> PAGE_CACHE_SHIFT; -+ to_off = -+ (get_key_offset(max_item_key) + -+ 1) >> PAGE_CACHE_SHIFT; -+ retval = ITEM_TAIL_KILLED; -+ } -+ } else { -+ /* head of item is to be removed */ -+ assert("vs-1571", keyeq(pfrom_key, min_item_key)); -+ assert("vs-1572", -+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == -+ 0); -+ assert("vs-1573", -+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - -+ 1)) == 0); -+ -+ if (kdata->left->node) { -+ /* update right delimiting key of left neighbor of extent item */ -+ /*reiser4_key key; */ -+ -+ *key = *pto_key; -+ set_key_offset(key, get_key_offset(pto_key) + 1); -+ -+ write_lock_dk(current_tree); -+ znode_set_rd_key(kdata->left->node, key); -+ write_unlock_dk(current_tree); -+ } -+ -+ from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT; -+ to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT; -+ retval = ITEM_HEAD_KILLED; -+ } -+ -+ inode = kdata->inode; -+ assert("vs-1545", inode != NULL); -+ if 
(inode != NULL) -+ /* take care of pages and jnodes corresponding to part of item being killed */ -+ reiser4_invalidate_pages(inode->i_mapping, from_off, -+ to_off - from_off, -+ kdata->params.truncate); -+ -+ ext = extent_item(coord) + from; -+ offset = -+ (get_key_offset(min_item_key) + -+ reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT; -+ -+ assert("vs-1551", from_off >= offset); -+ assert("vs-1552", from_off - offset <= extent_get_width(ext)); -+ skip = from_off - offset; -+ offset = from_off; -+ -+ while (offset < to_off) { -+ length = extent_get_width(ext) - skip; -+ if (state_of_extent(ext) == HOLE_EXTENT) { -+ skip = 0; -+ offset += length; -+ ext++; -+ continue; -+ } -+ -+ if (offset + length > to_off) { -+ length = to_off - offset; -+ } -+ -+ DQUOT_FREE_BLOCK_NODIRTY(inode, length); -+ -+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) { -+ /* some jnodes corresponding to this unallocated extent */ -+ fake_allocated2free(length, 0 /* unformatted */ ); -+ -+ skip = 0; -+ offset += length; -+ ext++; -+ continue; -+ } -+ -+ assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT); -+ -+ if (length != 0) { -+ start = extent_get_start(ext) + skip; -+ -+ /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed -+ immediately */ -+ reiser4_dealloc_blocks(&start, &length, -+ 0 /* not used */ , -+ BA_DEFER -+ /* unformatted with defer */ ); -+ } -+ skip = 0; -+ offset += length; -+ ext++; -+ } -+ return retval; -+} -+ -+/* item_plugin->b.kill_units */ -+int -+kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *kdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ reiser4_extent *ext; -+ reiser4_key item_key; -+ pos_in_node_t count; -+ reiser4_key from_key, to_key; -+ const reiser4_key *pfrom_key, *pto_key; -+ loff_t off; -+ int result; -+ -+ assert("vs-1541", -+ ((kdata->params.from_key == NULL && kdata->params.to_key == NULL) -+ || 
(kdata->params.from_key != NULL -+ && kdata->params.to_key != NULL))); -+ -+ if (kdata->params.from_key) { -+ pfrom_key = kdata->params.from_key; -+ pto_key = kdata->params.to_key; -+ } else { -+ coord_t dup; -+ -+ /* calculate key range of kill */ -+ assert("vs-1549", from == coord->unit_pos); -+ unit_key_by_coord(coord, &from_key); -+ pfrom_key = &from_key; -+ -+ coord_dup(&dup, coord); -+ dup.unit_pos = to; -+ max_unit_key_by_coord(&dup, &to_key); -+ pto_key = &to_key; -+ } -+ -+ item_key_by_coord(coord, &item_key); -+ -+#if REISER4_DEBUG -+ { -+ reiser4_key max_item_key; -+ -+ max_item_key_by_coord(coord, &max_item_key); -+ -+ if (new_first) { -+ /* head of item is to be cut */ -+ assert("vs-1542", keyeq(pfrom_key, &item_key)); -+ assert("vs-1538", keylt(pto_key, &max_item_key)); -+ } else { -+ /* tail of item is to be cut */ -+ assert("vs-1540", keygt(pfrom_key, &item_key)); -+ assert("vs-1543", !keylt(pto_key, &max_item_key)); -+ } -+ } -+#endif -+ -+ if (smallest_removed) -+ *smallest_removed = *pfrom_key; -+ -+ if (new_first) { -+ /* item head is cut. Item key will change. This new key is calculated here */ -+ assert("vs-1556", -+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == -+ (PAGE_CACHE_SIZE - 1)); -+ *new_first = *pto_key; -+ set_key_offset(new_first, get_key_offset(new_first) + 1); -+ } -+ -+ count = to - from + 1; -+ result = kill_hook_extent(coord, from, count, kdata); -+ if (result == ITEM_TAIL_KILLED) { -+ assert("vs-1553", -+ get_key_offset(pfrom_key) >= -+ get_key_offset(&item_key) + -+ reiser4_extent_size(coord, from)); -+ off = -+ get_key_offset(pfrom_key) - -+ (get_key_offset(&item_key) + -+ reiser4_extent_size(coord, from)); -+ if (off) { -+ /* unit @from is to be cut partially. 
Its width decreases */ -+ ext = extent_item(coord) + from; -+ extent_set_width(ext, -+ (off + PAGE_CACHE_SIZE - -+ 1) >> PAGE_CACHE_SHIFT); -+ count--; -+ } -+ } else { -+ __u64 max_to_offset; -+ __u64 rest; -+ -+ assert("vs-1575", result == ITEM_HEAD_KILLED); -+ assert("", from == 0); -+ assert("", -+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - -+ 1)) == 0); -+ assert("", -+ get_key_offset(pto_key) + 1 > -+ get_key_offset(&item_key) + -+ reiser4_extent_size(coord, to)); -+ max_to_offset = -+ get_key_offset(&item_key) + -+ reiser4_extent_size(coord, to + 1) - 1; -+ assert("", get_key_offset(pto_key) <= max_to_offset); -+ -+ rest = -+ (max_to_offset - -+ get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT; -+ if (rest) { -+ /* unit @to is to be cut partially */ -+ ext = extent_item(coord) + to; -+ -+ assert("", extent_get_width(ext) > rest); -+ -+ if (state_of_extent(ext) == ALLOCATED_EXTENT) -+ extent_set_start(ext, -+ extent_get_start(ext) + -+ (extent_get_width(ext) - -+ rest)); -+ -+ extent_set_width(ext, rest); -+ count--; -+ } -+ } -+ return count * sizeof(reiser4_extent); -+} -+ -+/* item_plugin->b.cut_units -+ this is too similar to kill_units_extent */ -+int -+cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *cdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ reiser4_extent *ext; -+ reiser4_key item_key; -+ pos_in_node_t count; -+ reiser4_key from_key, to_key; -+ const reiser4_key *pfrom_key, *pto_key; -+ loff_t off; -+ -+ assert("vs-1541", -+ ((cdata->params.from_key == NULL && cdata->params.to_key == NULL) -+ || (cdata->params.from_key != NULL -+ && cdata->params.to_key != NULL))); -+ -+ if (cdata->params.from_key) { -+ pfrom_key = cdata->params.from_key; -+ pto_key = cdata->params.to_key; -+ } else { -+ coord_t dup; -+ -+ /* calculate key range of kill */ -+ coord_dup(&dup, coord); -+ dup.unit_pos = from; -+ unit_key_by_coord(&dup, &from_key); -+ -+ dup.unit_pos = to; -+ 
max_unit_key_by_coord(&dup, &to_key); -+ -+ pfrom_key = &from_key; -+ pto_key = &to_key; -+ } -+ -+ assert("vs-1555", -+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0); -+ assert("vs-1556", -+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == -+ (PAGE_CACHE_SIZE - 1)); -+ -+ item_key_by_coord(coord, &item_key); -+ -+#if REISER4_DEBUG -+ { -+ reiser4_key max_item_key; -+ -+ assert("vs-1584", -+ get_key_locality(pfrom_key) == -+ get_key_locality(&item_key)); -+ assert("vs-1585", -+ get_key_type(pfrom_key) == get_key_type(&item_key)); -+ assert("vs-1586", -+ get_key_objectid(pfrom_key) == -+ get_key_objectid(&item_key)); -+ assert("vs-1587", -+ get_key_ordering(pfrom_key) == -+ get_key_ordering(&item_key)); -+ -+ max_item_key_by_coord(coord, &max_item_key); -+ -+ if (new_first != NULL) { -+ /* head of item is to be cut */ -+ assert("vs-1542", keyeq(pfrom_key, &item_key)); -+ assert("vs-1538", keylt(pto_key, &max_item_key)); -+ } else { -+ /* tail of item is to be cut */ -+ assert("vs-1540", keygt(pfrom_key, &item_key)); -+ assert("vs-1543", keyeq(pto_key, &max_item_key)); -+ } -+ } -+#endif -+ -+ if (smallest_removed) -+ *smallest_removed = *pfrom_key; -+ -+ if (new_first) { -+ /* item head is cut. Item key will change. This new key is calculated here */ -+ *new_first = *pto_key; -+ set_key_offset(new_first, get_key_offset(new_first) + 1); -+ } -+ -+ count = to - from + 1; -+ -+ assert("vs-1553", -+ get_key_offset(pfrom_key) >= -+ get_key_offset(&item_key) + reiser4_extent_size(coord, from)); -+ off = -+ get_key_offset(pfrom_key) - (get_key_offset(&item_key) + -+ reiser4_extent_size(coord, from)); -+ if (off) { -+ /* tail of unit @from is to be cut partially. 
Its width decreases */ -+ assert("vs-1582", new_first == NULL); -+ ext = extent_item(coord) + from; -+ extent_set_width(ext, off >> PAGE_CACHE_SHIFT); -+ count--; -+ } -+ -+ assert("vs-1554", -+ get_key_offset(pto_key) <= -+ get_key_offset(&item_key) + -+ reiser4_extent_size(coord, to + 1) - 1); -+ off = -+ (get_key_offset(&item_key) + -+ reiser4_extent_size(coord, to + 1) - 1) - -+ get_key_offset(pto_key); -+ if (off) { -+ /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased -+ and width decreased. */ -+ assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0); -+ ext = extent_item(coord) + to; -+ if (state_of_extent(ext) == ALLOCATED_EXTENT) -+ extent_set_start(ext, -+ extent_get_start(ext) + -+ (extent_get_width(ext) - -+ (off >> PAGE_CACHE_SHIFT))); -+ -+ extent_set_width(ext, (off >> PAGE_CACHE_SHIFT)); -+ count--; -+ } -+ return count * sizeof(reiser4_extent); -+} -+ -+/* item_plugin->b.unit_key */ -+reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key) -+{ -+ assert("vs-300", coord_is_existing_unit(coord)); -+ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, -+ (get_key_offset(key) + -+ reiser4_extent_size(coord, coord->unit_pos))); -+ -+ return key; -+} -+ -+/* item_plugin->b.max_unit_key */ -+reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key) -+{ -+ assert("vs-300", coord_is_existing_unit(coord)); -+ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, -+ (get_key_offset(key) + -+ reiser4_extent_size(coord, coord->unit_pos + 1) - 1)); -+ return key; -+} -+ -+/* item_plugin->b.estimate -+ item_plugin->b.item_data_by_flow */ -+ -+#if REISER4_DEBUG -+ -+/* item_plugin->b.check -+ used for debugging, every item should have here the most complete -+ possible check of the consistency of the item that the inventor can -+ construct -+*/ -+int reiser4_check_extent(const coord_t * coord /* coord of item to check */, -+ const char **error /* where to store error message 
*/) -+{ -+ reiser4_extent *ext, *first; -+ unsigned i, j; -+ reiser4_block_nr start, width, blk_cnt; -+ unsigned num_units; -+ reiser4_tree *tree; -+ oid_t oid; -+ reiser4_key key; -+ coord_t scan; -+ -+ assert("vs-933", REISER4_DEBUG); -+ -+ if (znode_get_level(coord->node) != TWIG_LEVEL) { -+ *error = "Extent on the wrong level"; -+ return -1; -+ } -+ if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) { -+ *error = "Wrong item size"; -+ return -1; -+ } -+ ext = first = extent_item(coord); -+ blk_cnt = reiser4_block_count(reiser4_get_current_sb()); -+ num_units = coord_num_units(coord); -+ tree = znode_get_tree(coord->node); -+ item_key_by_coord(coord, &key); -+ oid = get_key_objectid(&key); -+ coord_dup(&scan, coord); -+ -+ for (i = 0; i < num_units; ++i, ++ext) { -+ __u64 index; -+ -+ scan.unit_pos = i; -+ index = extent_unit_index(&scan); -+ -+#if 0 -+ /* check that all jnodes are present for the unallocated -+ * extent */ -+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) { -+ for (j = 0; j < extent_get_width(ext); j++) { -+ jnode *node; -+ -+ node = jlookup(tree, oid, index + j); -+ if (node == NULL) { -+ print_coord("scan", &scan, 0); -+ *error = "Jnode missing"; -+ return -1; -+ } -+ jput(node); -+ } -+ } -+#endif -+ -+ start = extent_get_start(ext); -+ if (start < 2) -+ continue; -+ /* extent is allocated one */ -+ width = extent_get_width(ext); -+ if (start >= blk_cnt) { -+ *error = "Start too large"; -+ return -1; -+ } -+ if (start + width > blk_cnt) { -+ *error = "End too large"; -+ return -1; -+ } -+ /* make sure that this extent does not overlap with other -+ allocated extents extents */ -+ for (j = 0; j < i; j++) { -+ if (state_of_extent(first + j) != ALLOCATED_EXTENT) -+ continue; -+ if (! 
-+ ((extent_get_start(ext) >= -+ extent_get_start(first + j) + -+ extent_get_width(first + j)) -+ || (extent_get_start(ext) + -+ extent_get_width(ext) <= -+ extent_get_start(first + j)))) { -+ *error = "Extent overlaps with others"; -+ return -1; -+ } -+ } -+ -+ } -+ -+ return 0; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/internal.c linux-2.6.20/fs/reiser4/plugin/item/internal.c ---- linux-2.6.20.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/internal.c 2007-05-06 14:50:43.815011970 +0400 -@@ -0,0 +1,396 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Implementation of internal-item plugin methods. */ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "internal.h" -+#include "item.h" -+#include "../node/node.h" -+#include "../plugin.h" -+#include "../../jnode.h" -+#include "../../znode.h" -+#include "../../tree_walk.h" -+#include "../../tree_mod.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../../block_alloc.h" -+ -+/* see internal.h for explanation */ -+ -+/* plugin->u.item.b.mergeable */ -+int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ , -+ const coord_t * p2 UNUSED_ARG /* second item */ ) -+{ -+ /* internal items are not mergeable */ -+ return 0; -+} -+ -+/* ->lookup() method for internal items */ -+lookup_result lookup_internal(const reiser4_key * key /* key to look up */ , -+ lookup_bias bias UNUSED_ARG /* lookup bias */ , -+ coord_t * coord /* coord of item */ ) -+{ -+ reiser4_key ukey; -+ -+ switch (keycmp(unit_key_by_coord(coord, &ukey), key)) { -+ default: -+ impossible("", "keycmp()?!"); 
-+ case LESS_THAN: -+ /* FIXME-VS: AFTER_ITEM used to be here. But with new coord -+ item plugin can not be taken using coord set this way */ -+ assert("vs-681", coord->unit_pos == 0); -+ coord->between = AFTER_UNIT; -+ case EQUAL_TO: -+ return CBK_COORD_FOUND; -+ case GREATER_THAN: -+ return CBK_COORD_NOTFOUND; -+ } -+} -+ -+/* return body of internal item at @coord */ -+static internal_item_layout *internal_at(const coord_t * coord /* coord of -+ * item */ ) -+{ -+ assert("nikita-607", coord != NULL); -+ assert("nikita-1650", -+ item_plugin_by_coord(coord) == -+ item_plugin_by_id(NODE_POINTER_ID)); -+ return (internal_item_layout *) item_body_by_coord(coord); -+} -+ -+void reiser4_update_internal(const coord_t * coord, -+ const reiser4_block_nr * blocknr) -+{ -+ internal_item_layout *item = internal_at(coord); -+ assert("nikita-2959", reiser4_blocknr_is_sane(blocknr)); -+ -+ put_unaligned(cpu_to_le64(*blocknr), &item->pointer); -+} -+ -+/* return child block number stored in the internal item at @coord */ -+static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ ) -+{ -+ assert("nikita-608", coord != NULL); -+ return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer)); -+} -+ -+/* get znode pointed to by internal @item */ -+static znode *znode_at(const coord_t * item /* coord of item */ , -+ znode * parent /* parent node */ ) -+{ -+ return child_znode(item, parent, 1, 0); -+} -+ -+/* store pointer from internal item into "block". 
Implementation of -+ ->down_link() method */ -+void down_link_internal(const coord_t * coord /* coord of item */ , -+ const reiser4_key * key UNUSED_ARG /* key to get -+ * pointer for */ , -+ reiser4_block_nr * block /* resulting block number */ ) -+{ -+ ON_DEBUG(reiser4_key item_key); -+ -+ assert("nikita-609", coord != NULL); -+ assert("nikita-611", block != NULL); -+ assert("nikita-612", (key == NULL) || -+ /* twig horrors */ -+ (znode_get_level(coord->node) == TWIG_LEVEL) -+ || keyle(item_key_by_coord(coord, &item_key), key)); -+ -+ *block = pointer_at(coord); -+ assert("nikita-2960", reiser4_blocknr_is_sane(block)); -+} -+ -+/* Get the child's block number, or 0 if the block is unallocated. */ -+int -+utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG, -+ reiser4_block_nr * block) -+{ -+ assert("jmacd-2059", coord != NULL); -+ -+ *block = pointer_at(coord); -+ assert("nikita-2961", reiser4_blocknr_is_sane(block)); -+ -+ if (reiser4_blocknr_is_fake(block)) { -+ *block = 0; -+ } -+ -+ return 0; -+} -+ -+/* Return the child. 
*/ -+int -+utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG, -+ jnode ** childp) -+{ -+ reiser4_block_nr block = pointer_at(coord); -+ znode *child; -+ -+ assert("jmacd-2059", childp != NULL); -+ assert("nikita-2962", reiser4_blocknr_is_sane(&block)); -+ -+ child = zlook(znode_get_tree(coord->node), &block); -+ -+ if (IS_ERR(child)) { -+ return PTR_ERR(child); -+ } -+ -+ *childp = ZJNODE(child); -+ -+ return 0; -+} -+ -+#if REISER4_DEBUG -+ -+static void check_link(znode * left, znode * right) -+{ -+ znode *scan; -+ -+ for (scan = left; scan != right; scan = scan->right) { -+ if (ZF_ISSET(scan, JNODE_RIP)) -+ break; -+ if (znode_is_right_connected(scan) && scan->right != NULL) { -+ if (ZF_ISSET(scan->right, JNODE_RIP)) -+ break; -+ assert("nikita-3285", -+ znode_is_left_connected(scan->right)); -+ assert("nikita-3265", -+ ergo(scan != left, -+ ZF_ISSET(scan, JNODE_HEARD_BANSHEE))); -+ assert("nikita-3284", scan->right->left == scan); -+ } else -+ break; -+ } -+} -+ -+int check__internal(const coord_t * coord, const char **error) -+{ -+ reiser4_block_nr blk; -+ znode *child; -+ coord_t cpy; -+ -+ blk = pointer_at(coord); -+ if (!reiser4_blocknr_is_sane(&blk)) { -+ *error = "Invalid pointer"; -+ return -1; -+ } -+ coord_dup(&cpy, coord); -+ child = znode_at(&cpy, cpy.node); -+ if (child != NULL) { -+ znode *left_child; -+ znode *right_child; -+ -+ left_child = right_child = NULL; -+ -+ assert("nikita-3256", znode_invariant(child)); -+ if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) { -+ left_child = znode_at(&cpy, cpy.node); -+ if (left_child != NULL) { -+ read_lock_tree(znode_get_tree(child)); -+ check_link(left_child, child); -+ read_unlock_tree(znode_get_tree(child)); -+ zput(left_child); -+ } -+ } -+ coord_dup(&cpy, coord); -+ if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) { -+ right_child = znode_at(&cpy, cpy.node); -+ if (right_child != NULL) { -+ read_lock_tree(znode_get_tree(child)); -+ check_link(child, right_child); 
-+ read_unlock_tree(znode_get_tree(child)); -+ zput(right_child); -+ } -+ } -+ zput(child); -+ } -+ return 0; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/* return true only if this item really points to "block" */ -+/* Audited by: green(2002.06.14) */ -+int has_pointer_to_internal(const coord_t * coord /* coord of item */ , -+ const reiser4_block_nr * block /* block number to -+ * check */ ) -+{ -+ assert("nikita-613", coord != NULL); -+ assert("nikita-614", block != NULL); -+ -+ return pointer_at(coord) == *block; -+} -+ -+/* hook called by ->create_item() method of node plugin after new internal -+ item was just created. -+ -+ This is point where pointer to new node is inserted into tree. Initialize -+ parent pointer in child znode, insert child into sibling list and slum. -+ -+*/ -+int create_hook_internal(const coord_t * item /* coord of item */ , -+ void *arg /* child's left neighbor, if any */ ) -+{ -+ znode *child; -+ __u64 child_ptr; -+ -+ assert("nikita-1252", item != NULL); -+ assert("nikita-1253", item->node != NULL); -+ assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL); -+ assert("nikita-1450", item->unit_pos == 0); -+ -+ /* -+ * preparing to item insertion build_child_ptr_data sets pointer to -+ * data to be inserted to jnode's blocknr which is in cpu byte -+ * order. Node's create_item simply copied those data. As result we -+ * have child pointer in cpu's byte order. Convert content of internal -+ * item to little endian byte order. 
-+ */ -+ child_ptr = get_unaligned((__u64 *)item_body_by_coord(item)); -+ reiser4_update_internal(item, &child_ptr); -+ -+ child = znode_at(item, item->node); -+ if (child != NULL && !IS_ERR(child)) { -+ znode *left; -+ int result = 0; -+ reiser4_tree *tree; -+ -+ left = arg; -+ tree = znode_get_tree(item->node); -+ write_lock_tree(tree); -+ write_lock_dk(tree); -+ assert("nikita-1400", (child->in_parent.node == NULL) -+ || (znode_above_root(child->in_parent.node))); -+ ++item->node->c_count; -+ coord_to_parent_coord(item, &child->in_parent); -+ sibling_list_insert_nolock(child, left); -+ -+ assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN)); -+ ZF_CLR(child, JNODE_ORPHAN); -+ -+ if ((left != NULL) && !keyeq(znode_get_rd_key(left), -+ znode_get_rd_key(child))) { -+ znode_set_rd_key(child, znode_get_rd_key(left)); -+ } -+ write_unlock_dk(tree); -+ write_unlock_tree(tree); -+ zput(child); -+ return result; -+ } else { -+ if (child == NULL) -+ child = ERR_PTR(-EIO); -+ return PTR_ERR(child); -+ } -+} -+ -+/* hook called by ->cut_and_kill() method of node plugin just before internal -+ item is removed. -+ -+ This is point where empty node is removed from the tree. Clear parent -+ pointer in child, and mark node for pending deletion. -+ -+ Node will be actually deleted later and in several installations: -+ -+ . when last lock on this node will be released, node will be removed from -+ the sibling list and its lock will be invalidated -+ -+ . when last reference to this node will be dropped, bitmap will be updated -+ and node will be actually removed from the memory. 
-+ -+*/ -+int kill_hook_internal(const coord_t * item /* coord of item */ , -+ pos_in_node_t from UNUSED_ARG /* start unit */ , -+ pos_in_node_t count UNUSED_ARG /* stop unit */ , -+ struct carry_kill_data *p UNUSED_ARG) -+{ -+ znode *child; -+ -+ assert("nikita-1222", item != NULL); -+ assert("nikita-1224", from == 0); -+ assert("nikita-1225", count == 1); -+ -+ child = znode_at(item, item->node); -+ if (IS_ERR(child)) -+ return PTR_ERR(child); -+ else if (node_is_empty(child)) { -+ reiser4_tree *tree; -+ -+ assert("nikita-1397", znode_is_write_locked(child)); -+ assert("nikita-1398", child->c_count == 0); -+ assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE)); -+ -+ tree = znode_get_tree(item->node); -+ write_lock_tree(tree); -+ init_parent_coord(&child->in_parent, NULL); -+ --item->node->c_count; -+ write_unlock_tree(tree); -+ zput(child); -+ return 0; -+ } else { -+ warning("nikita-1223", -+ "Cowardly refuse to remove link to non-empty node"); -+ zput(child); -+ return RETERR(-EIO); -+ } -+} -+ -+/* hook called by ->shift() node plugin method when iternal item was just -+ moved from one node to another. 
-+ -+ Update parent pointer in child and c_counts in old and new parent -+ -+*/ -+int shift_hook_internal(const coord_t * item /* coord of item */ , -+ unsigned from UNUSED_ARG /* start unit */ , -+ unsigned count UNUSED_ARG /* stop unit */ , -+ znode * old_node /* old parent */ ) -+{ -+ znode *child; -+ znode *new_node; -+ reiser4_tree *tree; -+ -+ assert("nikita-1276", item != NULL); -+ assert("nikita-1277", from == 0); -+ assert("nikita-1278", count == 1); -+ assert("nikita-1451", item->unit_pos == 0); -+ -+ new_node = item->node; -+ assert("nikita-2132", new_node != old_node); -+ tree = znode_get_tree(item->node); -+ child = child_znode(item, old_node, 1, 0); -+ if (child == NULL) -+ return 0; -+ if (!IS_ERR(child)) { -+ write_lock_tree(tree); -+ ++new_node->c_count; -+ assert("nikita-1395", znode_parent(child) == old_node); -+ assert("nikita-1396", old_node->c_count > 0); -+ coord_to_parent_coord(item, &child->in_parent); -+ assert("nikita-1781", znode_parent(child) == new_node); -+ assert("nikita-1782", -+ check_tree_pointer(item, child) == NS_FOUND); -+ --old_node->c_count; -+ write_unlock_tree(tree); -+ zput(child); -+ return 0; -+ } else -+ return PTR_ERR(child); -+} -+ -+/* plugin->u.item.b.max_key_inside - not defined */ -+ -+/* plugin->u.item.b.nr_units - item.c:single_unit */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/internal.h linux-2.6.20/fs/reiser4/plugin/item/internal.h ---- linux-2.6.20.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/internal.h 2007-05-06 14:50:43.815011970 +0400 -@@ -0,0 +1,57 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* Internal item contains down-link to the child of the internal/twig -+ node in a tree. 
It is internal items that are actually used during -+ tree traversal. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ ) -+#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+ -+/* on-disk layout of internal item */ -+typedef struct internal_item_layout { -+ /* 0 */ reiser4_dblock_nr pointer; -+ /* 4 */ -+} internal_item_layout; -+ -+struct cut_list; -+ -+int mergeable_internal(const coord_t * p1, const coord_t * p2); -+lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias, -+ coord_t * coord); -+/* store pointer from internal item into "block". Implementation of -+ ->down_link() method */ -+extern void down_link_internal(const coord_t * coord, const reiser4_key * key, -+ reiser4_block_nr * block); -+extern int has_pointer_to_internal(const coord_t * coord, -+ const reiser4_block_nr * block); -+extern int create_hook_internal(const coord_t * item, void *arg); -+extern int kill_hook_internal(const coord_t * item, pos_in_node_t from, -+ pos_in_node_t count, struct carry_kill_data *); -+extern int shift_hook_internal(const coord_t * item, unsigned from, -+ unsigned count, znode * old_node); -+extern void reiser4_print_internal(const char *prefix, coord_t * coord); -+ -+extern int utmost_child_internal(const coord_t * coord, sideof side, -+ jnode ** child); -+int utmost_child_real_block_internal(const coord_t * coord, sideof side, -+ reiser4_block_nr * block); -+ -+extern void reiser4_update_internal(const coord_t * coord, -+ const reiser4_block_nr * blocknr); -+/* FIXME: reiserfs has check_internal */ -+extern int check__internal(const coord_t * coord, const char **error); -+ -+/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/item.c linux-2.6.20/fs/reiser4/plugin/item/item.c ---- linux-2.6.20.orig/fs/reiser4/plugin/item/item.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/item.c 2007-05-06 14:50:43.815011970 +0400 -@@ -0,0 +1,719 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* definition of item plugins. */ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "../plugin_header.h" -+#include "sde.h" -+#include "internal.h" -+#include "item.h" -+#include "static_stat.h" -+#include "../plugin.h" -+#include "../../znode.h" -+#include "../../tree.h" -+#include "../../context.h" -+#include "ctail.h" -+ -+/* return pointer to item body */ -+void item_body_by_coord_hard(coord_t * coord /* coord to query */ ) -+{ -+ assert("nikita-324", coord != NULL); -+ assert("nikita-325", coord->node != NULL); -+ assert("nikita-326", znode_is_loaded(coord->node)); -+ assert("nikita-3200", coord->offset == INVALID_OFFSET); -+ -+ coord->offset = -+ node_plugin_by_node(coord->node)->item_by_coord(coord) - -+ zdata(coord->node); -+ ON_DEBUG(coord->body_v = coord->node->times_locked); -+} -+ -+void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ ) -+{ -+ return zdata(coord->node) + coord->offset; -+} -+ -+#if REISER4_DEBUG -+ -+int item_body_is_valid(const coord_t * coord) -+{ -+ return -+ coord->offset == -+ node_plugin_by_node(coord->node)->item_by_coord(coord) - -+ zdata(coord->node); -+} -+ -+#endif -+ -+/* return length of item at @coord */ -+pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ ) -+{ -+ int len; -+ -+ assert("nikita-327", coord != NULL); -+ assert("nikita-328", coord->node != NULL); -+ assert("nikita-329", 
znode_is_loaded(coord->node)); -+ -+ len = node_plugin_by_node(coord->node)->length_by_coord(coord); -+ return len; -+} -+ -+void obtain_item_plugin(const coord_t * coord) -+{ -+ assert("nikita-330", coord != NULL); -+ assert("nikita-331", coord->node != NULL); -+ assert("nikita-332", znode_is_loaded(coord->node)); -+ -+ coord_set_iplug((coord_t *) coord, -+ node_plugin_by_node(coord->node)-> -+ plugin_by_coord(coord)); -+ assert("nikita-2479", -+ coord_iplug(coord) == -+ node_plugin_by_node(coord->node)->plugin_by_coord(coord)); -+} -+ -+/* return id of item */ -+/* Audited by: green(2002.06.15) */ -+item_id item_id_by_coord(const coord_t * coord /* coord to query */ ) -+{ -+ assert("vs-539", coord != NULL); -+ assert("vs-538", coord->node != NULL); -+ assert("vs-537", znode_is_loaded(coord->node)); -+ assert("vs-536", item_plugin_by_coord(coord) != NULL); -+ assert("vs-540", -+ item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID); -+ -+ return item_id_by_plugin(item_plugin_by_coord(coord)); -+} -+ -+/* return key of item at @coord */ -+/* Audited by: green(2002.06.15) */ -+reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ , -+ reiser4_key * key /* result */ ) -+{ -+ assert("nikita-338", coord != NULL); -+ assert("nikita-339", coord->node != NULL); -+ assert("nikita-340", znode_is_loaded(coord->node)); -+ -+ return node_plugin_by_node(coord->node)->key_at(coord, key); -+} -+ -+/* this returns max key in the item */ -+reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ , -+ reiser4_key * key /* result */ ) -+{ -+ coord_t last; -+ -+ assert("nikita-338", coord != NULL); -+ assert("nikita-339", coord->node != NULL); -+ assert("nikita-340", znode_is_loaded(coord->node)); -+ -+ /* make coord pointing to last item's unit */ -+ coord_dup(&last, coord); -+ last.unit_pos = coord_num_units(&last) - 1; -+ assert("vs-1560", coord_is_existing_unit(&last)); -+ -+ max_unit_key_by_coord(&last, key); -+ return key; -+} -+ 
-+/* return key of unit at @coord */ -+reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ , -+ reiser4_key * key /* result */ ) -+{ -+ assert("nikita-772", coord != NULL); -+ assert("nikita-774", coord->node != NULL); -+ assert("nikita-775", znode_is_loaded(coord->node)); -+ -+ if (item_plugin_by_coord(coord)->b.unit_key != NULL) -+ return item_plugin_by_coord(coord)->b.unit_key(coord, key); -+ else -+ return item_key_by_coord(coord, key); -+} -+ -+/* return the biggest key contained the unit @coord */ -+reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ , -+ reiser4_key * key /* result */ ) -+{ -+ assert("nikita-772", coord != NULL); -+ assert("nikita-774", coord->node != NULL); -+ assert("nikita-775", znode_is_loaded(coord->node)); -+ -+ if (item_plugin_by_coord(coord)->b.max_unit_key != NULL) -+ return item_plugin_by_coord(coord)->b.max_unit_key(coord, key); -+ else -+ return unit_key_by_coord(coord, key); -+} -+ -+/* ->max_key_inside() method for items consisting of exactly one key (like -+ stat-data) */ -+static reiser4_key *max_key_inside_single_key(const coord_t * -+ coord /* coord of item */ , -+ reiser4_key * -+ result /* resulting key */ ) -+{ -+ assert("nikita-604", coord != NULL); -+ -+ /* coord -> key is starting key of this item and it has to be already -+ filled in */ -+ return unit_key_by_coord(coord, result); -+} -+ -+/* ->nr_units() method for items consisting of exactly one unit always */ -+pos_in_node_t -+nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ ) -+{ -+ return 1; -+} -+ -+static int -+paste_no_paste(coord_t * coord UNUSED_ARG, -+ reiser4_item_data * data UNUSED_ARG, -+ carry_plugin_info * info UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* default ->fast_paste() method */ -+static int -+agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ ) -+{ -+ return 1; -+} -+ -+int item_can_contain_key(const coord_t * item /* coord of item */ , -+ const 
reiser4_key * key /* key to check */ , -+ const reiser4_item_data * data /* parameters of item -+ * being created */ ) -+{ -+ item_plugin *iplug; -+ reiser4_key min_key_in_item; -+ reiser4_key max_key_in_item; -+ -+ assert("nikita-1658", item != NULL); -+ assert("nikita-1659", key != NULL); -+ -+ iplug = item_plugin_by_coord(item); -+ if (iplug->b.can_contain_key != NULL) -+ return iplug->b.can_contain_key(item, key, data); -+ else { -+ assert("nikita-1681", iplug->b.max_key_inside != NULL); -+ item_key_by_coord(item, &min_key_in_item); -+ iplug->b.max_key_inside(item, &max_key_in_item); -+ -+ /* can contain key if -+ min_key_in_item <= key && -+ key <= max_key_in_item -+ */ -+ return keyle(&min_key_in_item, key) -+ && keyle(key, &max_key_in_item); -+ } -+} -+ -+/* mergeable method for non mergeable items */ -+static int -+not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */ -+int are_items_mergeable(const coord_t * i1 /* coord of first item */ , -+ const coord_t * i2 /* coord of second item */ ) -+{ -+ item_plugin *iplug; -+ reiser4_key k1; -+ reiser4_key k2; -+ -+ assert("nikita-1336", i1 != NULL); -+ assert("nikita-1337", i2 != NULL); -+ -+ iplug = item_plugin_by_coord(i1); -+ assert("nikita-1338", iplug != NULL); -+ -+ /* NOTE-NIKITA are_items_mergeable() is also called by assertions in -+ shifting code when nodes are in "suspended" state. 
*/ -+ assert("nikita-1663", -+ keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2))); -+ -+ if (iplug->b.mergeable != NULL) { -+ return iplug->b.mergeable(i1, i2); -+ } else if (iplug->b.max_key_inside != NULL) { -+ iplug->b.max_key_inside(i1, &k1); -+ item_key_by_coord(i2, &k2); -+ -+ /* mergeable if ->max_key_inside() >= key of i2; */ -+ return keyge(iplug->b.max_key_inside(i1, &k1), -+ item_key_by_coord(i2, &k2)); -+ } else { -+ item_key_by_coord(i1, &k1); -+ item_key_by_coord(i2, &k2); -+ -+ return -+ (get_key_locality(&k1) == get_key_locality(&k2)) && -+ (get_key_objectid(&k1) == get_key_objectid(&k2)) -+ && (iplug == item_plugin_by_coord(i2)); -+ } -+} -+ -+int item_is_extent(const coord_t * item) -+{ -+ assert("vs-482", coord_is_existing_item(item)); -+ return item_id_by_coord(item) == EXTENT_POINTER_ID; -+} -+ -+int item_is_tail(const coord_t * item) -+{ -+ assert("vs-482", coord_is_existing_item(item)); -+ return item_id_by_coord(item) == FORMATTING_ID; -+} -+ -+#if REISER4_DEBUG -+ -+int item_is_statdata(const coord_t * item) -+{ -+ assert("vs-516", coord_is_existing_item(item)); -+ return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE); -+} -+ -+int item_is_ctail(const coord_t * item) -+{ -+ assert("edward-xx", coord_is_existing_item(item)); -+ return item_id_by_coord(item) == CTAIL_ID; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+static int change_item(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ /* cannot change constituent item (sd, or dir_item) */ -+ return RETERR(-EINVAL); -+} -+ -+static reiser4_plugin_ops item_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = change_item -+}; -+ -+item_plugin item_plugins[LAST_ITEM_ID] = { -+ [STATIC_STAT_DATA_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = STATIC_STAT_DATA_ID, -+ .groups = (1 << STAT_DATA_ITEM_TYPE), -+ .pops = &item_plugin_ops, -+ .label = "sd", -+ .desc = "stat-data", -+ 
.linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_single_key, -+ .can_contain_key = NULL, -+ .mergeable = not_mergeable, -+ .nr_units = nr_units_single_unit, -+ .lookup = NULL, -+ .init = NULL, -+ .paste = paste_no_paste, -+ .fast_paste = NULL, -+ .can_shift = NULL, -+ .copy_units = NULL, -+ .create_hook = NULL, -+ .kill_hook = NULL, -+ .shift_hook = NULL, -+ .cut_units = NULL, -+ .kill_units = NULL, -+ .unit_key = NULL, -+ .max_unit_key = NULL, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = NULL -+#endif -+ }, -+ .f = { -+ .utmost_child = NULL, -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .sd = { -+ .init_inode = init_inode_static_sd, -+ .save_len = save_len_static_sd, -+ .save = save_static_sd -+ } -+ } -+ }, -+ [SIMPLE_DIR_ENTRY_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = SIMPLE_DIR_ENTRY_ID, -+ .groups = (1 << DIR_ENTRY_ITEM_TYPE), -+ .pops = &item_plugin_ops, -+ .label = "de", -+ .desc = "directory entry", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_single_key, -+ .can_contain_key = NULL, -+ .mergeable = NULL, -+ .nr_units = nr_units_single_unit, -+ .lookup = NULL, -+ .init = NULL, -+ .paste = NULL, -+ .fast_paste = NULL, -+ .can_shift = NULL, -+ .copy_units = NULL, -+ .create_hook = NULL, -+ .kill_hook = NULL, -+ .shift_hook = NULL, -+ .cut_units = NULL, -+ .kill_units = NULL, -+ .unit_key = NULL, -+ .max_unit_key = NULL, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = NULL -+#endif -+ }, -+ .f = { -+ .utmost_child = NULL, -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .dir = { -+ .extract_key = extract_key_de, -+ .update_key = update_key_de, -+ .extract_name = extract_name_de, -+ .extract_file_type = extract_file_type_de, -+ .add_entry = add_entry_de, -+ .rem_entry = rem_entry_de, 
-+ .max_name_len = max_name_len_de -+ } -+ } -+ }, -+ [COMPOUND_DIR_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = COMPOUND_DIR_ID, -+ .groups = (1 << DIR_ENTRY_ITEM_TYPE), -+ .pops = &item_plugin_ops, -+ .label = "cde", -+ .desc = "compressed directory entry", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_cde, -+ .can_contain_key = can_contain_key_cde, -+ .mergeable = mergeable_cde, -+ .nr_units = nr_units_cde, -+ .lookup = lookup_cde, -+ .init = init_cde, -+ .paste = paste_cde, -+ .fast_paste = agree_to_fast_op, -+ .can_shift = can_shift_cde, -+ .copy_units = copy_units_cde, -+ .create_hook = NULL, -+ .kill_hook = NULL, -+ .shift_hook = NULL, -+ .cut_units = cut_units_cde, -+ .kill_units = kill_units_cde, -+ .unit_key = unit_key_cde, -+ .max_unit_key = unit_key_cde, -+ .estimate = estimate_cde, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = reiser4_check_cde -+#endif -+ }, -+ .f = { -+ .utmost_child = NULL, -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .dir = { -+ .extract_key = extract_key_cde, -+ .update_key = update_key_cde, -+ .extract_name = extract_name_cde, -+ .extract_file_type = extract_file_type_de, -+ .add_entry = add_entry_cde, -+ .rem_entry = rem_entry_cde, -+ .max_name_len = max_name_len_cde -+ } -+ } -+ }, -+ [NODE_POINTER_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = NODE_POINTER_ID, -+ .groups = (1 << INTERNAL_ITEM_TYPE), -+ .pops = NULL, -+ .label = "internal", -+ .desc = "internal item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = NULL, -+ .can_contain_key = NULL, -+ .mergeable = mergeable_internal, -+ .nr_units = nr_units_single_unit, -+ .lookup = lookup_internal, -+ .init = NULL, -+ .paste = NULL, -+ .fast_paste = NULL, -+ .can_shift = NULL, -+ .copy_units = NULL, -+ .create_hook = create_hook_internal, -+ .kill_hook = kill_hook_internal, -+ .shift_hook = 
shift_hook_internal, -+ .cut_units = NULL, -+ .kill_units = NULL, -+ .unit_key = NULL, -+ .max_unit_key = NULL, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = check__internal -+#endif -+ }, -+ .f = { -+ .utmost_child = utmost_child_internal, -+ .utmost_child_real_block = -+ utmost_child_real_block_internal, -+ .update = reiser4_update_internal, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .internal = { -+ .down_link = down_link_internal, -+ .has_pointer_to = has_pointer_to_internal -+ } -+ } -+ }, -+ [EXTENT_POINTER_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = EXTENT_POINTER_ID, -+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE), -+ .pops = NULL, -+ .label = "extent", -+ .desc = "extent item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_extent, -+ .can_contain_key = can_contain_key_extent, -+ .mergeable = mergeable_extent, -+ .nr_units = nr_units_extent, -+ .lookup = lookup_extent, -+ .init = NULL, -+ .paste = paste_extent, -+ .fast_paste = agree_to_fast_op, -+ .can_shift = can_shift_extent, -+ .create_hook = create_hook_extent, -+ .copy_units = copy_units_extent, -+ .kill_hook = kill_hook_extent, -+ .shift_hook = NULL, -+ .cut_units = cut_units_extent, -+ .kill_units = kill_units_extent, -+ .unit_key = unit_key_extent, -+ .max_unit_key = max_unit_key_extent, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = reiser4_check_extent -+#endif -+ }, -+ .f = { -+ .utmost_child = utmost_child_extent, -+ .utmost_child_real_block = -+ utmost_child_real_block_extent, -+ .update = NULL, -+ .scan = reiser4_scan_extent, -+ .convert = NULL, -+ .key_by_offset = key_by_offset_extent -+ }, -+ .s = { -+ .file = { -+ .write = reiser4_write_extent, -+ .read = reiser4_read_extent, -+ .readpage = reiser4_readpage_extent, -+ .get_block = get_block_address_extent, -+ .append_key = append_key_extent, -+ .init_coord_extension = -+ init_coord_extension_extent 
-+ } -+ } -+ }, -+ [FORMATTING_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = FORMATTING_ID, -+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE), -+ .pops = NULL, -+ .label = "body", -+ .desc = "body (or tail?) item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_tail, -+ .can_contain_key = can_contain_key_tail, -+ .mergeable = mergeable_tail, -+ .nr_units = nr_units_tail, -+ .lookup = lookup_tail, -+ .init = NULL, -+ .paste = paste_tail, -+ .fast_paste = agree_to_fast_op, -+ .can_shift = can_shift_tail, -+ .create_hook = NULL, -+ .copy_units = copy_units_tail, -+ .kill_hook = kill_hook_tail, -+ .shift_hook = NULL, -+ .cut_units = cut_units_tail, -+ .kill_units = kill_units_tail, -+ .unit_key = unit_key_tail, -+ .max_unit_key = unit_key_tail, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = NULL -+#endif -+ }, -+ .f = { -+ .utmost_child = NULL, -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .file = { -+ .write = reiser4_write_tail, -+ .read = reiser4_read_tail, -+ .readpage = readpage_tail, -+ .get_block = get_block_address_tail, -+ .append_key = append_key_tail, -+ .init_coord_extension = -+ init_coord_extension_tail -+ } -+ } -+ }, -+ [CTAIL_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = CTAIL_ID, -+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE), -+ .pops = NULL, -+ .label = "ctail", -+ .desc = "cryptcompress tail item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_tail, -+ .can_contain_key = can_contain_key_ctail, -+ .mergeable = mergeable_ctail, -+ .nr_units = nr_units_ctail, -+ .lookup = NULL, -+ .init = init_ctail, -+ .paste = paste_ctail, -+ .fast_paste = agree_to_fast_op, -+ .can_shift = can_shift_ctail, -+ .create_hook = create_hook_ctail, -+ .copy_units = copy_units_ctail, -+ .kill_hook = kill_hook_ctail, -+ .shift_hook = shift_hook_ctail, -+ .cut_units = 
cut_units_ctail, -+ .kill_units = kill_units_ctail, -+ .unit_key = unit_key_tail, -+ .max_unit_key = unit_key_tail, -+ .estimate = estimate_ctail, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = check_ctail -+#endif -+ }, -+ .f = { -+ .utmost_child = utmost_child_ctail, -+ /* FIXME-EDWARD: write this */ -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = scan_ctail, -+ .convert = convert_ctail -+ }, -+ .s = { -+ .file = { -+ .write = NULL, -+ .read = read_ctail, -+ .readpage = readpage_ctail, -+ .get_block = get_block_address_tail, -+ .append_key = append_key_ctail, -+ .init_coord_extension = -+ init_coord_extension_tail -+ } -+ } -+ }, -+ [BLACK_BOX_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = BLACK_BOX_ID, -+ .groups = (1 << OTHER_ITEM_TYPE), -+ .pops = NULL, -+ .label = "blackbox", -+ .desc = "black box item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = NULL, -+ .can_contain_key = NULL, -+ .mergeable = not_mergeable, -+ .nr_units = nr_units_single_unit, -+ /* to need for ->lookup method */ -+ .lookup = NULL, -+ .init = NULL, -+ .paste = NULL, -+ .fast_paste = NULL, -+ .can_shift = NULL, -+ .copy_units = NULL, -+ .create_hook = NULL, -+ .kill_hook = NULL, -+ .shift_hook = NULL, -+ .cut_units = NULL, -+ .kill_units = NULL, -+ .unit_key = NULL, -+ .max_unit_key = NULL, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = NULL -+#endif -+ } -+ } -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/item.h linux-2.6.20/fs/reiser4/plugin/item/item.h ---- linux-2.6.20.orig/fs/reiser4/plugin/item/item.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/item.h 2007-05-06 14:50:43.819013220 +0400 -@@ -0,0 +1,400 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* first read balance.c comments before reading this */ -+ -+/* An item_plugin implements all of the operations required for -+ balancing that are item specific. */ -+ -+/* an item plugin also implements other operations that are specific to that -+ item. These go into the item specific operations portion of the item -+ handler, and all of the item specific portions of the item handler are put -+ into a union. */ -+ -+#if !defined( __REISER4_ITEM_H__ ) -+#define __REISER4_ITEM_H__ -+ -+#include "../../forward.h" -+#include "../plugin_header.h" -+#include "../../dformat.h" -+#include "../../seal.h" -+#include "../../plugin/file/file.h" -+ -+#include /* for struct file, struct inode */ -+#include /* for struct page */ -+#include /* for struct dentry */ -+ -+typedef enum { -+ STAT_DATA_ITEM_TYPE, -+ DIR_ENTRY_ITEM_TYPE, -+ INTERNAL_ITEM_TYPE, -+ UNIX_FILE_METADATA_ITEM_TYPE, -+ OTHER_ITEM_TYPE -+} item_type_id; -+ -+/* this is the part of each item plugin that all items are expected to -+ support or at least explicitly fail to support by setting the -+ pointer to null. */ -+typedef struct { -+ /* operations called by balancing -+ -+ It is interesting to consider that some of these item -+ operations could be given sources or targets that are not -+ really items in nodes. This could be ok/useful. 
-+ -+ */ -+ /* maximal key that can _possibly_ be occupied by this item -+ -+ When inserting, and node ->lookup() method (called by -+ coord_by_key()) reaches an item after binary search, -+ the ->max_key_inside() item plugin method is used to determine -+ whether new item should pasted into existing item -+ (new_key<=max_key_inside()) or new item has to be created -+ (new_key>max_key_inside()). -+ -+ For items that occupy exactly one key (like stat-data) -+ this method should return this key. For items that can -+ grow indefinitely (extent, directory item) this should -+ return reiser4_max_key(). -+ -+ For example extent with the key -+ -+ (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks, -+ -+ ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff), and -+ */ -+ reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *); -+ -+ /* true if item @coord can merge data at @key. */ -+ int (*can_contain_key) (const coord_t *, const reiser4_key *, -+ const reiser4_item_data *); -+ /* mergeable() - check items for mergeability -+ -+ Optional method. Returns true if two items can be merged. -+ -+ */ -+ int (*mergeable) (const coord_t *, const coord_t *); -+ -+ /* number of atomic things in an item. -+ NOTE FOR CONTRIBUTORS: use a generic method -+ nr_units_single_unit() for solid (atomic) items, as -+ tree operations use it as a criterion of solidness -+ (see is_solid_item macro) */ -+ pos_in_node_t(*nr_units) (const coord_t *); -+ -+ /* search within item for a unit within the item, and return a -+ pointer to it. This can be used to calculate how many -+ bytes to shrink an item if you use pointer arithmetic and -+ compare to the start of the item body if the item's data -+ are continuous in the node, if the item's data are not -+ continuous in the node, all sorts of other things are maybe -+ going to break as well. 
*/ -+ lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *); -+ /* method called by ode_plugin->create_item() to initialise new -+ item */ -+ int (*init) (coord_t * target, coord_t * from, -+ reiser4_item_data * data); -+ /* method called (e.g., by reiser4_resize_item()) to place new data -+ into item when it grows */ -+ int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *); -+ /* return true if paste into @coord is allowed to skip -+ carry. That is, if such paste would require any changes -+ at the parent level -+ */ -+ int (*fast_paste) (const coord_t *); -+ /* how many but not more than @want units of @source can be -+ shifted into @target node. If pend == append - we try to -+ append last item of @target by first units of @source. If -+ pend == prepend - we try to "prepend" first item in @target -+ by last units of @source. @target node has @free_space -+ bytes of free space. Total size of those units are returned -+ via @size. -+ -+ @target is not NULL if shifting to the mergeable item and -+ NULL is new item will be created during shifting. -+ */ -+ int (*can_shift) (unsigned free_space, coord_t *, -+ znode *, shift_direction, unsigned *size, -+ unsigned want); -+ -+ /* starting off @from-th unit of item @source append or -+ prepend @count units to @target. @target has been already -+ expanded by @free_space bytes. That must be exactly what is -+ needed for those items in @target. If @where_is_free_space -+ == SHIFT_LEFT - free space is at the end of @target item, -+ othersize - it is in the beginning of it. 
*/ -+ void (*copy_units) (coord_t *, coord_t *, -+ unsigned from, unsigned count, -+ shift_direction where_is_free_space, -+ unsigned free_space); -+ -+ int (*create_hook) (const coord_t *, void *); -+ /* do whatever is necessary to do when @count units starting -+ from @from-th one are removed from the tree */ -+ /* FIXME-VS: this is used to be here for, in particular, -+ extents and items of internal type to free blocks they point -+ to at the same time with removing items from a -+ tree. Problems start, however, when dealloc_block fails due -+ to some reason. Item gets removed, but blocks it pointed to -+ are not freed. It is not clear how to fix this for items of -+ internal type because a need to remove internal item may -+ appear in the middle of balancing, and there is no way to -+ undo changes made. OTOH, if space allocator involves -+ balancing to perform dealloc_block - this will probably -+ break balancing due to deadlock issues -+ */ -+ int (*kill_hook) (const coord_t *, pos_in_node_t from, -+ pos_in_node_t count, struct carry_kill_data *); -+ int (*shift_hook) (const coord_t *, unsigned from, unsigned count, -+ znode * _node); -+ -+ /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key -+ including boundaries. When units are cut from item beginning - move space which gets freed to head of -+ item. When units are cut from item end - move freed space to item end. When units are cut from the middle of -+ item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in -+ @smallest_removed if it is not 0. 
Save new first item key in @new_first_key if it is not 0 -+ */ -+ int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *, -+ reiser4_key * smallest_removed, -+ reiser4_key * new_first_key); -+ -+ /* like cut_units, except that these units are removed from the -+ tree, not only from a node */ -+ int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *, -+ reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+ -+ /* if @key_of_coord == 1 - returned key of coord, otherwise - -+ key of unit is returned. If @coord is not set to certain -+ unit - ERR_PTR(-ENOENT) is returned */ -+ reiser4_key *(*unit_key) (const coord_t *, reiser4_key *); -+ reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *); -+ /* estimate how much space is needed for paste @data into item at -+ @coord. if @coord==0 - estimate insertion, otherwise - estimate -+ pasting -+ */ -+ int (*estimate) (const coord_t *, const reiser4_item_data *); -+ -+ /* converts flow @f to item data. @coord == 0 on insert */ -+ int (*item_data_by_flow) (const coord_t *, const flow_t *, -+ reiser4_item_data *); -+ -+ /*void (*show) (struct seq_file *, coord_t *); */ -+ -+#if REISER4_DEBUG -+ /* used for debugging, every item should have here the most -+ complete possible check of the consistency of the item that -+ the inventor can construct */ -+ int (*check) (const coord_t *, const char **error); -+#endif -+ -+} balance_ops; -+ -+typedef struct { -+ /* return the right or left child of @coord, only if it is in memory */ -+ int (*utmost_child) (const coord_t *, sideof side, jnode ** child); -+ -+ /* return whether the right or left child of @coord has a non-fake -+ block number. 
*/ -+ int (*utmost_child_real_block) (const coord_t *, sideof side, -+ reiser4_block_nr *); -+ /* relocate child at @coord to the @block */ -+ void (*update) (const coord_t *, const reiser4_block_nr *); -+ /* count unformatted nodes per item for leave relocation policy, etc.. */ -+ int (*scan) (flush_scan * scan); -+ /* convert item by flush */ -+ int (*convert) (flush_pos_t * pos); -+ /* backward mapping from jnode offset to a key. */ -+ int (*key_by_offset) (struct inode *, loff_t, reiser4_key *); -+} flush_ops; -+ -+/* operations specific to the directory item */ -+typedef struct { -+ /* extract stat-data key from directory entry at @coord and place it -+ into @key. */ -+ int (*extract_key) (const coord_t *, reiser4_key * key); -+ /* update object key in item. */ -+ int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *); -+ /* extract name from directory entry at @coord and return it */ -+ char *(*extract_name) (const coord_t *, char *buf); -+ /* extract file type (DT_* stuff) from directory entry at @coord and -+ return it */ -+ unsigned (*extract_file_type) (const coord_t *); -+ int (*add_entry) (struct inode * dir, -+ coord_t *, lock_handle *, -+ const struct dentry * name, -+ reiser4_dir_entry_desc * entry); -+ int (*rem_entry) (struct inode * dir, const struct qstr * name, -+ coord_t *, lock_handle *, -+ reiser4_dir_entry_desc * entry); -+ int (*max_name_len) (const struct inode * dir); -+} dir_entry_ops; -+ -+/* operations specific to items regular (unix) file metadata are built of */ -+typedef struct { -+ int (*write) (struct file *, const char __user *, size_t, loff_t *pos); -+ int (*read) (struct file *, flow_t *, hint_t *); -+ int (*readpage) (void *, struct page *); -+ int (*get_block) (const coord_t *, sector_t, sector_t *); -+ /* -+ * key of first byte which is not addressed by the item @coord is set -+ * to. 
-+ * For example, for extent item with the key -+ * -+ * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks, -+ * -+ * ->append_key is -+ * -+ * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size) -+ */ -+ reiser4_key *(*append_key) (const coord_t *, reiser4_key *); -+ -+ void (*init_coord_extension) (uf_coord_t *, loff_t); -+} file_ops; -+ -+/* operations specific to items of stat data type */ -+typedef struct { -+ int (*init_inode) (struct inode * inode, char *sd, int len); -+ int (*save_len) (struct inode * inode); -+ int (*save) (struct inode * inode, char **area); -+} sd_ops; -+ -+/* operations specific to internal item */ -+typedef struct { -+ /* all tree traversal want to know from internal item is where -+ to go next. */ -+ void (*down_link) (const coord_t * coord, -+ const reiser4_key * key, reiser4_block_nr * block); -+ /* check that given internal item contains given pointer. */ -+ int (*has_pointer_to) (const coord_t * coord, -+ const reiser4_block_nr * block); -+} internal_item_ops; -+ -+struct item_plugin { -+ /* generic fields */ -+ plugin_header h; -+ -+ /* methods common for all item types */ -+ balance_ops b; -+ /* methods used during flush */ -+ flush_ops f; -+ -+ /* methods specific to particular type of item */ -+ union { -+ dir_entry_ops dir; -+ file_ops file; -+ sd_ops sd; -+ internal_item_ops internal; -+ } s; -+ -+}; -+ -+#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit) -+ -+static inline item_id item_id_by_plugin(item_plugin * plugin) -+{ -+ return plugin->h.id; -+} -+ -+static inline char get_iplugid(item_plugin * iplug) -+{ -+ assert("nikita-2838", iplug != NULL); -+ assert("nikita-2839", iplug->h.id < 0xff); -+ return (char)item_id_by_plugin(iplug); -+} -+ -+extern unsigned long znode_times_locked(const znode * z); -+ -+static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug) -+{ -+ assert("nikita-2837", coord != NULL); -+ assert("nikita-2838", iplug != NULL); -+ coord->iplugid = 
get_iplugid(iplug); -+ ON_DEBUG(coord->plug_v = znode_times_locked(coord->node)); -+} -+ -+static inline item_plugin *coord_iplug(const coord_t * coord) -+{ -+ assert("nikita-2833", coord != NULL); -+ assert("nikita-2834", coord->iplugid != INVALID_PLUGID); -+ assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node)); -+ return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE, -+ coord->iplugid); -+} -+ -+extern int item_can_contain_key(const coord_t * item, const reiser4_key * key, -+ const reiser4_item_data *); -+extern int are_items_mergeable(const coord_t * i1, const coord_t * i2); -+extern int item_is_extent(const coord_t *); -+extern int item_is_tail(const coord_t *); -+extern int item_is_statdata(const coord_t * item); -+extern int item_is_ctail(const coord_t *); -+ -+extern pos_in_node_t item_length_by_coord(const coord_t * coord); -+extern pos_in_node_t nr_units_single_unit(const coord_t * coord); -+extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ ); -+extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key); -+extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *); -+extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key); -+extern reiser4_key *max_unit_key_by_coord(const coord_t * coord, -+ reiser4_key * key); -+extern void obtain_item_plugin(const coord_t * coord); -+ -+#if defined(REISER4_DEBUG) -+extern int znode_is_loaded(const znode * node); -+#endif -+ -+/* return plugin of item at @coord */ -+static inline item_plugin *item_plugin_by_coord(const coord_t * -+ coord /* coord to query */ ) -+{ -+ assert("nikita-330", coord != NULL); -+ assert("nikita-331", coord->node != NULL); -+ assert("nikita-332", znode_is_loaded(coord->node)); -+ -+ if (unlikely(!coord_is_iplug_set(coord))) -+ obtain_item_plugin(coord); -+ return coord_iplug(coord); -+} -+ -+/* this returns true if item is of internal type */ -+static inline int item_is_internal(const 
coord_t * item) -+{ -+ assert("vs-483", coord_is_existing_item(item)); -+ return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE); -+} -+ -+extern void item_body_by_coord_hard(coord_t * coord); -+extern void *item_body_by_coord_easy(const coord_t * coord); -+#if REISER4_DEBUG -+extern int item_body_is_valid(const coord_t * coord); -+#endif -+ -+/* return pointer to item body */ -+static inline void *item_body_by_coord(const coord_t * -+ coord /* coord to query */ ) -+{ -+ assert("nikita-324", coord != NULL); -+ assert("nikita-325", coord->node != NULL); -+ assert("nikita-326", znode_is_loaded(coord->node)); -+ -+ if (coord->offset == INVALID_OFFSET) -+ item_body_by_coord_hard((coord_t *) coord); -+ assert("nikita-3201", item_body_is_valid(coord)); -+ assert("nikita-3550", coord->body_v == znode_times_locked(coord->node)); -+ return item_body_by_coord_easy(coord); -+} -+ -+/* __REISER4_ITEM_H__ */ -+#endif -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/Makefile linux-2.6.20/fs/reiser4/plugin/item/Makefile ---- linux-2.6.20.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/Makefile 2007-05-06 14:50:43.819013220 +0400 -@@ -0,0 +1,18 @@ -+obj-$(CONFIG_REISER4_FS) += item_plugins.o -+ -+item_plugins-objs := \ -+ item.o \ -+ static_stat.o \ -+ sde.o \ -+ cde.o \ -+ blackbox.o \ -+ internal.o \ -+ tail.o \ -+ ctail.o \ -+ extent.o \ -+ extent_item_ops.o \ -+ extent_file_ops.o \ -+ extent_flush_ops.o -+ -+ -+ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/sde.c linux-2.6.20/fs/reiser4/plugin/item/sde.c ---- linux-2.6.20.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/sde.c 2007-05-06 14:50:43.819013220 +0400 -@@ -0,0 +1,190 @@ -+/* Copyright 2001, 
2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Directory entry implementation */ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../coord.h" -+#include "sde.h" -+#include "item.h" -+#include "../plugin.h" -+#include "../../znode.h" -+#include "../../carry.h" -+#include "../../tree.h" -+#include "../../inode.h" -+ -+#include /* for struct inode */ -+#include /* for struct dentry */ -+#include -+ -+/* ->extract_key() method of simple directory item plugin. */ -+int extract_key_de(const coord_t * coord /* coord of item */ , -+ reiser4_key * key /* resulting key */ ) -+{ -+ directory_entry_format *dent; -+ -+ assert("nikita-1458", coord != NULL); -+ assert("nikita-1459", key != NULL); -+ -+ dent = (directory_entry_format *) item_body_by_coord(coord); -+ assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent); -+ return extract_key_from_id(&dent->id, key); -+} -+ -+int -+update_key_de(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh UNUSED_ARG) -+{ -+ directory_entry_format *dent; -+ obj_key_id obj_id; -+ int result; -+ -+ assert("nikita-2342", coord != NULL); -+ assert("nikita-2343", key != NULL); -+ -+ dent = (directory_entry_format *) item_body_by_coord(coord); -+ result = build_obj_key_id(key, &obj_id); -+ if (result == 0) { -+ dent->id = obj_id; -+ znode_make_dirty(coord->node); -+ } -+ return 0; -+} -+ -+char *extract_dent_name(const coord_t * coord, directory_entry_format * dent, -+ char *buf) -+{ -+ reiser4_key key; -+ -+ unit_key_by_coord(coord, &key); -+ if (get_key_type(&key) != KEY_FILE_NAME_MINOR) -+ reiser4_print_address("oops", znode_get_block(coord->node)); -+ if (!is_longname_key(&key)) { -+ if (is_dot_key(&key)) -+ return (char *)"."; -+ else -+ return extract_name_from_key(&key, buf); -+ } else -+ return (char *)dent->name; -+} -+ -+/* ->extract_name() method of simple directory item plugin. 
*/ -+char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf) -+{ -+ directory_entry_format *dent; -+ -+ assert("nikita-1460", coord != NULL); -+ -+ dent = (directory_entry_format *) item_body_by_coord(coord); -+ return extract_dent_name(coord, dent, buf); -+} -+ -+/* ->extract_file_type() method of simple directory item plugin. */ -+unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of -+ * item */ ) -+{ -+ assert("nikita-1764", coord != NULL); -+ /* we don't store file type in the directory entry yet. -+ -+ But see comments at kassign.h:obj_key_id -+ */ -+ return DT_UNKNOWN; -+} -+ -+int add_entry_de(struct inode *dir /* directory of item */ , -+ coord_t * coord /* coord of item */ , -+ lock_handle * lh /* insertion lock handle */ , -+ const struct dentry *de /* name to add */ , -+ reiser4_dir_entry_desc * entry /* parameters of new directory -+ * entry */ ) -+{ -+ reiser4_item_data data; -+ directory_entry_format *dent; -+ int result; -+ const char *name; -+ int len; -+ int longname; -+ -+ name = de->d_name.name; -+ len = de->d_name.len; -+ assert("nikita-1163", strlen(name) == len); -+ -+ longname = is_longname(name, len); -+ -+ data.length = sizeof *dent; -+ if (longname) -+ data.length += len + 1; -+ data.data = NULL; -+ data.user = 0; -+ data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID); -+ -+ /* NOTE-NIKITA quota plugin */ -+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length)) -+ return -EDQUOT; -+ -+ result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ ); -+ if (result != 0) -+ return result; -+ -+ dent = (directory_entry_format *) item_body_by_coord(coord); -+ build_inode_key_id(entry->obj, &dent->id); -+ if (longname) { -+ memcpy(dent->name, name, len); -+ put_unaligned(0, &dent->name[len]); -+ } -+ return 0; -+} -+ -+int rem_entry_de(struct inode *dir /* directory of item */ , -+ const struct qstr *name UNUSED_ARG, -+ coord_t * coord /* coord of item */ , -+ lock_handle * lh UNUSED_ARG /* lock 
handle for -+ * removal */ , -+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of -+ * directory entry -+ * being removed */ ) -+{ -+ coord_t shadow; -+ int result; -+ int length; -+ -+ length = item_length_by_coord(coord); -+ if (inode_get_bytes(dir) < length) { -+ warning("nikita-2627", "Dir is broke: %llu: %llu", -+ (unsigned long long)get_inode_oid(dir), -+ inode_get_bytes(dir)); -+ -+ return RETERR(-EIO); -+ } -+ -+ /* cut_node() is supposed to take pointers to _different_ -+ coords, because it will modify them without respect to -+ possible aliasing. To work around this, create temporary copy -+ of @coord. -+ */ -+ coord_dup(&shadow, coord); -+ result = -+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0); -+ if (result == 0) { -+ /* NOTE-NIKITA quota plugin */ -+ DQUOT_FREE_SPACE_NODIRTY(dir, length); -+ } -+ return result; -+} -+ -+int max_name_len_de(const struct inode *dir) -+{ -+ return reiser4_tree_by_inode(dir)->nplug->max_item_size() - -+ sizeof(directory_entry_format) - 2; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/sde.h linux-2.6.20/fs/reiser4/plugin/item/sde.h ---- linux-2.6.20.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/sde.h 2007-05-06 14:50:43.819013220 +0400 -@@ -0,0 +1,66 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Directory entry. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ) -+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+ -+#include -+#include /* for struct dentry */ -+ -+typedef struct directory_entry_format { -+ /* key of object stat-data. 
It's not necessary to store whole -+ key here, because it's always key of stat-data, so minor -+ packing locality and offset can be omitted here. But this -+ relies on particular key allocation scheme for stat-data, so, -+ for extensibility sake, whole key can be stored here. -+ -+ We store key as array of bytes, because we don't want 8-byte -+ alignment of dir entries. -+ */ -+ obj_key_id id; -+ /* file name. Null terminated string. */ -+ d8 name[0]; -+} directory_entry_format; -+ -+void print_de(const char *prefix, coord_t * coord); -+int extract_key_de(const coord_t * coord, reiser4_key * key); -+int update_key_de(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh); -+char *extract_name_de(const coord_t * coord, char *buf); -+unsigned extract_file_type_de(const coord_t * coord); -+int add_entry_de(struct inode *dir, coord_t * coord, -+ lock_handle * lh, const struct dentry *name, -+ reiser4_dir_entry_desc * entry); -+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord, -+ lock_handle * lh, reiser4_dir_entry_desc * entry); -+int max_name_len_de(const struct inode *dir); -+ -+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length); -+ -+char *extract_dent_name(const coord_t * coord, -+ directory_entry_format * dent, char *buf); -+ -+#if REISER4_LARGE_KEY -+#define DE_NAME_BUF_LEN (24) -+#else -+#define DE_NAME_BUF_LEN (16) -+#endif -+ -+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.20/fs/reiser4/plugin/item/static_stat.c ---- linux-2.6.20.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/static_stat.c 2007-05-06 14:50:43.823014469 +0400 -@@ -0,0 +1,1107 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* stat data manipulation. */ -+ -+#include "../../forward.h" -+#include "../../super.h" -+#include "../../vfs_ops.h" -+#include "../../inode.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../object.h" -+#include "../plugin.h" -+#include "../plugin_header.h" -+#include "static_stat.h" -+#include "item.h" -+ -+#include -+#include -+ -+/* see static_stat.h for explanation */ -+ -+/* helper function used while we are dumping/loading inode/plugin state -+ to/from the stat-data. */ -+ -+static void move_on(int *length /* space remaining in stat-data */ , -+ char **area /* current coord in stat data */ , -+ int size_of /* how many bytes to move forward */ ) -+{ -+ assert("nikita-615", length != NULL); -+ assert("nikita-616", area != NULL); -+ -+ *length -= size_of; -+ *area += size_of; -+ -+ assert("nikita-617", *length >= 0); -+} -+ -+/* helper function used while loading inode/plugin state from stat-data. -+ Complain if there is less space in stat-data than was expected. -+ Can only happen on disk corruption. 
*/ -+static int not_enough_space(struct inode *inode /* object being processed */ , -+ const char *where /* error message */ ) -+{ -+ assert("nikita-618", inode != NULL); -+ -+ warning("nikita-619", "Not enough space in %llu while loading %s", -+ (unsigned long long)get_inode_oid(inode), where); -+ -+ return RETERR(-EINVAL); -+} -+ -+/* helper function used while loading inode/plugin state from -+ stat-data. Call it if invalid plugin id was found. */ -+static int unknown_plugin(reiser4_plugin_id id /* invalid id */ , -+ struct inode *inode /* object being processed */ ) -+{ -+ warning("nikita-620", "Unknown plugin %i in %llu", -+ id, (unsigned long long)get_inode_oid(inode)); -+ -+ return RETERR(-EINVAL); -+} -+ -+/* this is installed as ->init_inode() method of -+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). -+ Copies data from on-disk stat-data format into inode. -+ Handles stat-data extensions. */ -+/* was sd_load */ -+int init_inode_static_sd(struct inode *inode /* object being processed */ , -+ char *sd /* stat-data body */ , -+ int len /* length of stat-data */ ) -+{ -+ int result; -+ int bit; -+ int chunk; -+ __u16 mask; -+ __u64 bigmask; -+ reiser4_stat_data_base *sd_base; -+ reiser4_inode *state; -+ -+ assert("nikita-625", inode != NULL); -+ assert("nikita-626", sd != NULL); -+ -+ result = 0; -+ sd_base = (reiser4_stat_data_base *) sd; -+ state = reiser4_inode_data(inode); -+ mask = le16_to_cpu(get_unaligned(&sd_base->extmask)); -+ bigmask = mask; -+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN); -+ -+ move_on(&len, &sd, sizeof *sd_base); -+ for (bit = 0, chunk = 0; -+ mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION; -+ ++bit, mask >>= 1) { -+ if (((bit + 1) % 16) != 0) { -+ /* handle extension */ -+ sd_ext_plugin *sdplug; -+ -+ if (bit >= LAST_SD_EXTENSION) { -+ warning("vpf-1904", -+ "No such extension %i in inode %llu", -+ bit, -+ (unsigned long long) -+ get_inode_oid(inode)); -+ -+ result = RETERR(-EINVAL); -+ break; -+ } 
-+ -+ sdplug = sd_ext_plugin_by_id(bit); -+ if (sdplug == NULL) { -+ warning("nikita-627", -+ "No such extension %i in inode %llu", -+ bit, -+ (unsigned long long) -+ get_inode_oid(inode)); -+ -+ result = RETERR(-EINVAL); -+ break; -+ } -+ if (mask & 1) { -+ assert("nikita-628", sdplug->present); -+ /* alignment is not supported in node layout -+ plugin yet. -+ result = align( inode, &len, &sd, -+ sdplug -> alignment ); -+ if( result != 0 ) -+ return result; */ -+ result = sdplug->present(inode, &sd, &len); -+ } else if (sdplug->absent != NULL) -+ result = sdplug->absent(inode); -+ if (result) -+ break; -+ /* else, we are looking at the last bit in 16-bit -+ portion of bitmask */ -+ } else if (mask & 1) { -+ /* next portion of bitmask */ -+ if (len < (int)sizeof(d16)) { -+ warning("nikita-629", -+ "No space for bitmap in inode %llu", -+ (unsigned long long) -+ get_inode_oid(inode)); -+ -+ result = RETERR(-EINVAL); -+ break; -+ } -+ mask = le16_to_cpu(get_unaligned((d16 *)sd)); -+ bigmask <<= 16; -+ bigmask |= mask; -+ move_on(&len, &sd, sizeof(d16)); -+ ++chunk; -+ if (chunk == 3) { -+ if (!(mask & 0x8000)) { -+ /* clear last bit */ -+ mask &= ~0x8000; -+ continue; -+ } -+ /* too much */ -+ warning("nikita-630", -+ "Too many extensions in %llu", -+ (unsigned long long) -+ get_inode_oid(inode)); -+ -+ result = RETERR(-EINVAL); -+ break; -+ } -+ } else -+ /* bitmask exhausted */ -+ break; -+ } -+ state->extmask = bigmask; -+ /* common initialisations */ -+ if (len - (bit / 16 * sizeof(d16)) > 0) { -+ /* alignment in save_len_static_sd() is taken into account -+ -edward */ -+ warning("nikita-631", "unused space in inode %llu", -+ (unsigned long long)get_inode_oid(inode)); -+ } -+ -+ return result; -+} -+ -+/* estimates size of stat-data required to store inode. -+ Installed as ->save_len() method of -+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). 
*/ -+/* was sd_len */ -+int save_len_static_sd(struct inode *inode /* object being processed */ ) -+{ -+ unsigned int result; -+ __u64 mask; -+ int bit; -+ -+ assert("nikita-632", inode != NULL); -+ -+ result = sizeof(reiser4_stat_data_base); -+ mask = reiser4_inode_data(inode)->extmask; -+ for (bit = 0; mask != 0; ++bit, mask >>= 1) { -+ if (mask & 1) { -+ sd_ext_plugin *sdplug; -+ -+ sdplug = sd_ext_plugin_by_id(bit); -+ assert("nikita-633", sdplug != NULL); -+ /* no aligment support -+ result += -+ round_up( result, sdplug -> alignment ) - result; */ -+ result += sdplug->save_len(inode); -+ } -+ } -+ result += bit / 16 * sizeof(d16); -+ return result; -+} -+ -+/* saves inode into stat-data. -+ Installed as ->save() method of -+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */ -+/* was sd_save */ -+int save_static_sd(struct inode *inode /* object being processed */ , -+ char **area /* where to save stat-data */ ) -+{ -+ int result; -+ __u64 emask; -+ int bit; -+ unsigned int len; -+ reiser4_stat_data_base *sd_base; -+ -+ assert("nikita-634", inode != NULL); -+ assert("nikita-635", area != NULL); -+ -+ result = 0; -+ emask = reiser4_inode_data(inode)->extmask; -+ sd_base = (reiser4_stat_data_base *) * area; -+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask); -+ /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/ -+ -+ *area += sizeof *sd_base; -+ len = 0xffffffffu; -+ for (bit = 0; emask != 0; ++bit, emask >>= 1) { -+ if (emask & 1) { -+ if ((bit + 1) % 16 != 0) { -+ sd_ext_plugin *sdplug; -+ sdplug = sd_ext_plugin_by_id(bit); -+ assert("nikita-636", sdplug != NULL); -+ /* no alignment support yet -+ align( inode, &len, area, -+ sdplug -> alignment ); */ -+ result = sdplug->save(inode, area); -+ if (result) -+ break; -+ } else { -+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), -+ (d16 *)(*area)); -+ /*cputod16((unsigned)(emask & 0xffff), -+ (d16 *) * area);*/ -+ *area += sizeof(d16); -+ } -+ } -+ } -+ 
return result; -+} -+ -+/* stat-data extension handling functions. */ -+ -+static int present_lw_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */ ) -+{ -+ if (*len >= (int)sizeof(reiser4_light_weight_stat)) { -+ reiser4_light_weight_stat *sd_lw; -+ -+ sd_lw = (reiser4_light_weight_stat *) * area; -+ -+ inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode)); -+ inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink)); -+ inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size)); -+ if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) { -+ inode->i_mode &= ~S_IFIFO; -+ warning("", "partially converted file is encountered"); -+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED); -+ } -+ move_on(len, area, sizeof *sd_lw); -+ return 0; -+ } else -+ return not_enough_space(inode, "lw sd"); -+} -+ -+static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being -+ * processed */ ) -+{ -+ return sizeof(reiser4_light_weight_stat); -+} -+ -+static int save_lw_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ ) -+{ -+ reiser4_light_weight_stat *sd; -+ mode_t delta; -+ -+ assert("nikita-2705", inode != NULL); -+ assert("nikita-2706", area != NULL); -+ assert("nikita-2707", *area != NULL); -+ -+ sd = (reiser4_light_weight_stat *) * area; -+ -+ delta = (reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED) ? 
S_IFIFO : 0); -+ put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode); -+ put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink); -+ put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size); -+ *area += sizeof *sd; -+ return 0; -+} -+ -+static int present_unix_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */ ) -+{ -+ assert("nikita-637", inode != NULL); -+ assert("nikita-638", area != NULL); -+ assert("nikita-639", *area != NULL); -+ assert("nikita-640", len != NULL); -+ assert("nikita-641", *len > 0); -+ -+ if (*len >= (int)sizeof(reiser4_unix_stat)) { -+ reiser4_unix_stat *sd; -+ -+ sd = (reiser4_unix_stat *) * area; -+ -+ inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid)); -+ inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid)); -+ inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime)); -+ inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime)); -+ inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime)); -+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) -+ inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev)); -+ else -+ inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes))); -+ move_on(len, area, sizeof *sd); -+ return 0; -+ } else -+ return not_enough_space(inode, "unix sd"); -+} -+ -+static int absent_unix_sd(struct inode *inode /* object being processed */ ) -+{ -+ inode->i_uid = get_super_private(inode->i_sb)->default_uid; -+ inode->i_gid = get_super_private(inode->i_sb)->default_gid; -+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; -+ inode_set_bytes(inode, inode->i_size); -+ /* mark inode as lightweight, so that caller (lookup_common) will -+ complete initialisation by copying [ug]id from a parent. 
*/ -+ reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT); -+ return 0; -+} -+ -+/* Audited by: green(2002.06.14) */ -+static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being -+ * processed */ ) -+{ -+ return sizeof(reiser4_unix_stat); -+} -+ -+static int save_unix_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ ) -+{ -+ reiser4_unix_stat *sd; -+ -+ assert("nikita-642", inode != NULL); -+ assert("nikita-643", area != NULL); -+ assert("nikita-644", *area != NULL); -+ -+ sd = (reiser4_unix_stat *) * area; -+ put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid); -+ put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid); -+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime); -+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime); -+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime); -+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) -+ put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev); -+ else -+ put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes); -+ *area += sizeof *sd; -+ return 0; -+} -+ -+static int -+present_large_times_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */ ) -+{ -+ if (*len >= (int)sizeof(reiser4_large_times_stat)) { -+ reiser4_large_times_stat *sd_lt; -+ -+ sd_lt = (reiser4_large_times_stat *) * area; -+ -+ inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime)); -+ inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime)); -+ inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime)); -+ -+ move_on(len, area, sizeof *sd_lt); -+ return 0; -+ } else -+ return not_enough_space(inode, "large times sd"); -+} -+ -+static int -+save_len_large_times_sd(struct inode *inode UNUSED_ARG -+ /* object being processed */ ) -+{ -+ return sizeof(reiser4_large_times_stat); -+} -+ -+static int 
-+save_large_times_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ ) -+{ -+ reiser4_large_times_stat *sd; -+ -+ assert("nikita-2817", inode != NULL); -+ assert("nikita-2818", area != NULL); -+ assert("nikita-2819", *area != NULL); -+ -+ sd = (reiser4_large_times_stat *) * area; -+ -+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime); -+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime); -+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime); -+ -+ *area += sizeof *sd; -+ return 0; -+} -+ -+/* symlink stat data extension */ -+ -+/* allocate memory for symlink target and attach it to inode->i_private */ -+static int -+symlink_target_to_inode(struct inode *inode, const char *target, int len) -+{ -+ assert("vs-845", inode->i_private == NULL); -+ assert("vs-846", !reiser4_inode_get_flag(inode, -+ REISER4_GENERIC_PTR_USED)); -+ /* FIXME-VS: this is prone to deadlock. Not more than other similar -+ places, though */ -+ inode->i_private = kmalloc((size_t) len + 1, -+ reiser4_ctx_gfp_mask_get()); -+ if (!inode->i_private) -+ return RETERR(-ENOMEM); -+ -+ memcpy((char *)(inode->i_private), target, (size_t) len); -+ ((char *)(inode->i_private))[len] = 0; -+ reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED); -+ return 0; -+} -+ -+/* this is called on read_inode. There is nothing to do actually, but some -+ sanity checks */ -+static int present_symlink_sd(struct inode *inode, char **area, int *len) -+{ -+ int result; -+ int length; -+ reiser4_symlink_stat *sd; -+ -+ length = (int)inode->i_size; -+ /* -+ * *len is number of bytes in stat data item from *area to the end of -+ * item. 
It must be not less than size of symlink + 1 for ending 0 -+ */ -+ if (length > *len) -+ return not_enough_space(inode, "symlink"); -+ -+ if (*(*area + length) != 0) { -+ warning("vs-840", "Symlink is not zero terminated"); -+ return RETERR(-EIO); -+ } -+ -+ sd = (reiser4_symlink_stat *) * area; -+ result = symlink_target_to_inode(inode, sd->body, length); -+ -+ move_on(len, area, length + 1); -+ return result; -+} -+ -+static int save_len_symlink_sd(struct inode *inode) -+{ -+ return inode->i_size + 1; -+} -+ -+/* this is called on create and update stat data. Do nothing on update but -+ update @area */ -+static int save_symlink_sd(struct inode *inode, char **area) -+{ -+ int result; -+ int length; -+ reiser4_symlink_stat *sd; -+ -+ length = (int)inode->i_size; -+ /* inode->i_size must be set already */ -+ assert("vs-841", length); -+ -+ result = 0; -+ sd = (reiser4_symlink_stat *) * area; -+ if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) { -+ const char *target; -+ -+ target = (const char *)(inode->i_private); -+ inode->i_private = NULL; -+ -+ result = symlink_target_to_inode(inode, target, length); -+ -+ /* copy symlink to stat data */ -+ memcpy(sd->body, target, (size_t) length); -+ (*area)[length] = 0; -+ } else { -+ /* there is nothing to do in update but move area */ -+ assert("vs-844", -+ !memcmp(inode->i_private, sd->body, -+ (size_t) length + 1)); -+ } -+ -+ *area += (length + 1); -+ return result; -+} -+ -+static int present_flags_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */ ) -+{ -+ assert("nikita-645", inode != NULL); -+ assert("nikita-646", area != NULL); -+ assert("nikita-647", *area != NULL); -+ assert("nikita-648", len != NULL); -+ assert("nikita-649", *len > 0); -+ -+ if (*len >= (int)sizeof(reiser4_flags_stat)) { -+ reiser4_flags_stat *sd; -+ -+ sd = (reiser4_flags_stat *) * area; -+ inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags)); -+ 
move_on(len, area, sizeof *sd); -+ return 0; -+ } else -+ return not_enough_space(inode, "generation and attrs"); -+} -+ -+/* Audited by: green(2002.06.14) */ -+static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being -+ * processed */ ) -+{ -+ return sizeof(reiser4_flags_stat); -+} -+ -+static int save_flags_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ ) -+{ -+ reiser4_flags_stat *sd; -+ -+ assert("nikita-650", inode != NULL); -+ assert("nikita-651", area != NULL); -+ assert("nikita-652", *area != NULL); -+ -+ sd = (reiser4_flags_stat *) * area; -+ put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags); -+ *area += sizeof *sd; -+ return 0; -+} -+ -+static int absent_plugin_sd(struct inode *inode); -+static int present_plugin_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */, -+ int is_pset /* 1 if plugin set, 0 if heir set. */) -+{ -+ reiser4_plugin_stat *sd; -+ reiser4_plugin *plugin; -+ reiser4_inode *info; -+ int i; -+ __u16 mask; -+ int result; -+ int num_of_plugins; -+ -+ assert("nikita-653", inode != NULL); -+ assert("nikita-654", area != NULL); -+ assert("nikita-655", *area != NULL); -+ assert("nikita-656", len != NULL); -+ assert("nikita-657", *len > 0); -+ -+ if (*len < (int)sizeof(reiser4_plugin_stat)) -+ return not_enough_space(inode, "plugin"); -+ -+ sd = (reiser4_plugin_stat *) * area; -+ info = reiser4_inode_data(inode); -+ -+ mask = 0; -+ num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no)); -+ move_on(len, area, sizeof *sd); -+ result = 0; -+ for (i = 0; i < num_of_plugins; ++i) { -+ reiser4_plugin_slot *slot; -+ reiser4_plugin_type type; -+ pset_member memb; -+ -+ slot = (reiser4_plugin_slot *) * area; -+ if (*len < (int)sizeof *slot) -+ return not_enough_space(inode, "additional plugin"); -+ -+ memb = le16_to_cpu(get_unaligned(&slot->pset_memb)); -+ type = 
aset_member_to_type_unsafe(memb); -+ -+ if (type == REISER4_PLUGIN_TYPES) { -+ warning("nikita-3502", -+ "wrong %s member (%i) for %llu", is_pset ? -+ "pset" : "hset", memb, -+ (unsigned long long)get_inode_oid(inode)); -+ return RETERR(-EINVAL); -+ } -+ plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode), -+ type, &slot->id); -+ if (plugin == NULL) -+ return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode); -+ -+ /* plugin is loaded into inode, mark this into inode's -+ bitmask of loaded non-standard plugins */ -+ if (!(mask & (1 << memb))) { -+ mask |= (1 << memb); -+ } else { -+ warning("nikita-658", "duplicate plugin for %llu", -+ (unsigned long long)get_inode_oid(inode)); -+ return RETERR(-EINVAL); -+ } -+ move_on(len, area, sizeof *slot); -+ /* load plugin data, if any */ -+ if (plugin->h.pops != NULL && plugin->h.pops->load) -+ result = plugin->h.pops->load(inode, plugin, area, len); -+ else -+ result = aset_set_unsafe(is_pset ? &info->pset : -+ &info->hset, memb, plugin); -+ if (result) -+ return result; -+ } -+ if (is_pset) { -+ /* if object plugin wasn't loaded from stat-data, guess it by -+ mode bits */ -+ plugin = file_plugin_to_plugin(inode_file_plugin(inode)); -+ if (plugin == NULL) -+ result = absent_plugin_sd(inode); -+ info->plugin_mask = mask; -+ } else -+ info->heir_mask = mask; -+ -+ return result; -+} -+ -+static int present_pset_sd(struct inode *inode, char **area, int *len) { -+ return present_plugin_sd(inode, area, len, 1 /* pset */); -+} -+ -+/* Determine object plugin for @inode based on i_mode. -+ -+ Many objects in reiser4 file system are controlled by standard object -+ plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on. -+ -+ For such files we don't explicitly store plugin id in object stat -+ data. Rather required plugin is guessed from mode bits, where file "type" -+ is encoded (see stat(2)). 
-+*/ -+static int -+guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ ) -+{ -+ int fplug_id; -+ int dplug_id; -+ reiser4_inode *info; -+ -+ assert("nikita-736", inode != NULL); -+ -+ dplug_id = fplug_id = -1; -+ -+ switch (inode->i_mode & S_IFMT) { -+ case S_IFSOCK: -+ case S_IFBLK: -+ case S_IFCHR: -+ case S_IFIFO: -+ fplug_id = SPECIAL_FILE_PLUGIN_ID; -+ break; -+ case S_IFLNK: -+ fplug_id = SYMLINK_FILE_PLUGIN_ID; -+ break; -+ case S_IFDIR: -+ fplug_id = DIRECTORY_FILE_PLUGIN_ID; -+ dplug_id = HASHED_DIR_PLUGIN_ID; -+ break; -+ default: -+ warning("nikita-737", "wrong file mode: %o", inode->i_mode); -+ return RETERR(-EIO); -+ case S_IFREG: -+ fplug_id = UNIX_FILE_PLUGIN_ID; -+ break; -+ } -+ info = reiser4_inode_data(inode); -+ set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ? -+ plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL); -+ set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ? -+ plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL); -+ return 0; -+} -+ -+/* Audited by: green(2002.06.14) */ -+static int absent_plugin_sd(struct inode *inode /* object being processed */ ) -+{ -+ int result; -+ -+ assert("nikita-659", inode != NULL); -+ -+ result = guess_plugin_by_mode(inode); -+ /* if mode was wrong, guess_plugin_by_mode() returns "regular file", -+ but setup_inode_ops() will call make_bad_inode(). -+ Another, more logical but bit more complex solution is to add -+ "bad-file plugin". 
*/ -+ /* FIXME-VS: activate was called here */ -+ return result; -+} -+ -+/* helper function for plugin_sd_save_len(): calculate how much space -+ required to save state of given plugin */ -+/* Audited by: green(2002.06.14) */ -+static int len_for(reiser4_plugin * plugin /* plugin to save */ , -+ struct inode *inode /* object being processed */ , -+ pset_member memb, -+ int len, int is_pset) -+{ -+ reiser4_inode *info; -+ assert("nikita-661", inode != NULL); -+ -+ if (plugin == NULL) -+ return len; -+ -+ info = reiser4_inode_data(inode); -+ if (is_pset ? -+ info->plugin_mask & (1 << memb) : -+ info->heir_mask & (1 << memb)) { -+ len += sizeof(reiser4_plugin_slot); -+ if (plugin->h.pops && plugin->h.pops->save_len != NULL) { -+ /* non-standard plugin, call method */ -+ /* commented as it is incompatible with alignment -+ * policy in save_plug() -edward */ -+ /* len = round_up(len, plugin->h.pops->alignment); */ -+ len += plugin->h.pops->save_len(inode, plugin); -+ } -+ } -+ return len; -+} -+ -+/* calculate how much space is required to save state of all plugins, -+ associated with inode */ -+static int save_len_plugin_sd(struct inode *inode /* object being processed */, -+ int is_pset) -+{ -+ int len; -+ int last; -+ reiser4_inode *state; -+ pset_member memb; -+ -+ assert("nikita-663", inode != NULL); -+ -+ state = reiser4_inode_data(inode); -+ -+ /* common case: no non-standard plugins */ -+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0) -+ return 0; -+ len = sizeof(reiser4_plugin_stat); -+ last = PSET_LAST; -+ -+ for (memb = 0; memb < last; ++memb) { -+ len = len_for(aset_get(is_pset ? state->pset : state->hset, memb), -+ inode, memb, len, is_pset); -+ } -+ assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat)); -+ return len; -+} -+ -+static int save_len_pset_sd(struct inode *inode) { -+ return save_len_plugin_sd(inode, 1 /* pset */); -+} -+ -+/* helper function for plugin_sd_save(): save plugin, associated with -+ inode. 
*/ -+static int save_plug(reiser4_plugin * plugin /* plugin to save */ , -+ struct inode *inode /* object being processed */ , -+ int memb /* what element of pset is saved */ , -+ char **area /* position in stat-data */ , -+ int *count /* incremented if plugin were actually saved. */, -+ int is_pset /* 1 for plugin set, 0 for heir set */) -+{ -+ reiser4_plugin_slot *slot; -+ int fake_len; -+ int result; -+ -+ assert("nikita-665", inode != NULL); -+ assert("nikita-666", area != NULL); -+ assert("nikita-667", *area != NULL); -+ -+ if (plugin == NULL) -+ return 0; -+ -+ if (is_pset ? -+ !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) : -+ !(reiser4_inode_data(inode)->heir_mask & (1 << memb))) -+ return 0; -+ slot = (reiser4_plugin_slot *) * area; -+ put_unaligned(cpu_to_le16(memb), &slot->pset_memb); -+ put_unaligned(cpu_to_le16(plugin->h.id), &slot->id); -+ fake_len = (int)0xffff; -+ move_on(&fake_len, area, sizeof *slot); -+ ++*count; -+ result = 0; -+ if (plugin->h.pops != NULL) { -+ if (plugin->h.pops->save != NULL) -+ result = plugin->h.pops->save(inode, plugin, area); -+ } -+ return result; -+} -+ -+/* save state of all non-standard plugins associated with inode */ -+static int save_plugin_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */, -+ int is_pset /* 1 for pset, 0 for hset */) -+{ -+ int fake_len; -+ int result = 0; -+ int num_of_plugins; -+ reiser4_plugin_stat *sd; -+ reiser4_inode *state; -+ pset_member memb; -+ -+ assert("nikita-669", inode != NULL); -+ assert("nikita-670", area != NULL); -+ assert("nikita-671", *area != NULL); -+ -+ state = reiser4_inode_data(inode); -+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0) -+ return 0; -+ sd = (reiser4_plugin_stat *) * area; -+ fake_len = (int)0xffff; -+ move_on(&fake_len, area, sizeof *sd); -+ -+ num_of_plugins = 0; -+ for (memb = 0; memb < PSET_LAST; ++memb) { -+ result = save_plug(aset_get(is_pset ? 
state->pset : state->hset, -+ memb), -+ inode, memb, area, &num_of_plugins, is_pset); -+ if (result != 0) -+ break; -+ } -+ -+ put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no); -+ return result; -+} -+ -+static int save_pset_sd(struct inode *inode, char **area) { -+ return save_plugin_sd(inode, area, 1 /* pset */); -+} -+ -+static int present_hset_sd(struct inode *inode, char **area, int *len) { -+ return present_plugin_sd(inode, area, len, 0 /* hset */); -+} -+ -+static int save_len_hset_sd(struct inode *inode) { -+ return save_len_plugin_sd(inode, 0 /* pset */); -+} -+ -+static int save_hset_sd(struct inode *inode, char **area) { -+ return save_plugin_sd(inode, area, 0 /* hset */); -+} -+ -+/* helper function for crypto_sd_present(), crypto_sd_save. -+ Allocates memory for crypto stat, keyid and attaches it to the inode */ -+static int extract_crypto_stat (struct inode * inode, -+ reiser4_crypto_stat * sd) -+{ -+ crypto_stat_t * info; -+ assert("edward-11", !inode_crypto_stat(inode)); -+ assert("edward-1413", -+ !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)); -+ /* create and attach a crypto-stat without secret key loaded */ -+ info = reiser4_alloc_crypto_stat(inode); -+ if (IS_ERR(info)) -+ return PTR_ERR(info); -+ info->keysize = le16_to_cpu(get_unaligned(&sd->keysize)); -+ memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize); -+ reiser4_attach_crypto_stat(inode, info); -+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED); -+ return 0; -+} -+ -+/* crypto stat-data extension */ -+ -+static int present_crypto_sd(struct inode *inode, char **area, int *len) -+{ -+ int result; -+ reiser4_crypto_stat *sd; -+ digest_plugin *dplug = inode_digest_plugin(inode); -+ -+ assert("edward-06", dplug != NULL); -+ assert("edward-684", dplug->fipsize); -+ assert("edward-07", area != NULL); -+ assert("edward-08", *area != NULL); -+ assert("edward-09", len != NULL); -+ assert("edward-10", *len > 0); -+ -+ if (*len < 
(int)sizeof(reiser4_crypto_stat)) { -+ return not_enough_space(inode, "crypto-sd"); -+ } -+ /* *len is number of bytes in stat data item from *area to the end of -+ item. It must be not less than size of this extension */ -+ assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len); -+ -+ sd = (reiser4_crypto_stat *) * area; -+ result = extract_crypto_stat(inode, sd); -+ move_on(len, area, sizeof(*sd) + dplug->fipsize); -+ -+ return result; -+} -+ -+static int save_len_crypto_sd(struct inode *inode) -+{ -+ return sizeof(reiser4_crypto_stat) + -+ inode_digest_plugin(inode)->fipsize; -+} -+ -+static int save_crypto_sd(struct inode *inode, char **area) -+{ -+ int result = 0; -+ reiser4_crypto_stat *sd; -+ crypto_stat_t * info = inode_crypto_stat(inode); -+ digest_plugin *dplug = inode_digest_plugin(inode); -+ -+ assert("edward-12", dplug != NULL); -+ assert("edward-13", area != NULL); -+ assert("edward-14", *area != NULL); -+ assert("edward-15", info != NULL); -+ assert("edward-1414", info->keyid != NULL); -+ assert("edward-1415", info->keysize != 0); -+ assert("edward-76", reiser4_inode_data(inode) != NULL); -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) { -+ /* file is just created */ -+ sd = (reiser4_crypto_stat *) *area; -+ /* copy everything but private key to the disk stat-data */ -+ put_unaligned(cpu_to_le16(info->keysize), &sd->keysize); -+ memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize); -+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED); -+ } -+ *area += (sizeof(*sd) + dplug->fipsize); -+ return result; -+} -+ -+static int eio(struct inode *inode, char **area, int *len) -+{ -+ return RETERR(-EIO); -+} -+ -+sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = { -+ [LIGHT_WEIGHT_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = LIGHT_WEIGHT_STAT, -+ .pops = NULL, -+ .label = "light-weight sd", -+ .desc = "sd for light-weight files", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_lw_sd, -+ 
.absent = NULL, -+ .save_len = save_len_lw_sd, -+ .save = save_lw_sd, -+ .alignment = 8 -+ }, -+ [UNIX_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = UNIX_STAT, -+ .pops = NULL, -+ .label = "unix-sd", -+ .desc = "unix stat-data fields", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_unix_sd, -+ .absent = absent_unix_sd, -+ .save_len = save_len_unix_sd, -+ .save = save_unix_sd, -+ .alignment = 8 -+ }, -+ [LARGE_TIMES_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = LARGE_TIMES_STAT, -+ .pops = NULL, -+ .label = "64time-sd", -+ .desc = "nanosecond resolution for times", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_large_times_sd, -+ .absent = NULL, -+ .save_len = save_len_large_times_sd, -+ .save = save_large_times_sd, -+ .alignment = 8 -+ }, -+ [SYMLINK_STAT] = { -+ /* stat data of symlink has this extension */ -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = SYMLINK_STAT, -+ .pops = NULL, -+ .label = "symlink-sd", -+ .desc = -+ "stat data is appended with symlink name", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_symlink_sd, -+ .absent = NULL, -+ .save_len = save_len_symlink_sd, -+ .save = save_symlink_sd, -+ .alignment = 8 -+ }, -+ [PLUGIN_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = PLUGIN_STAT, -+ .pops = NULL, -+ .label = "plugin-sd", -+ .desc = "plugin stat-data fields", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_pset_sd, -+ .absent = absent_plugin_sd, -+ .save_len = save_len_pset_sd, -+ .save = save_pset_sd, -+ .alignment = 8 -+ }, -+ [HEIR_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = HEIR_STAT, -+ .pops = NULL, -+ .label = "heir-plugin-sd", -+ .desc = "heir plugin stat-data fields", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_hset_sd, -+ .absent = NULL, -+ .save_len = save_len_hset_sd, -+ .save = save_hset_sd, -+ .alignment = 8 -+ }, -+ [FLAGS_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, 
-+ .id = FLAGS_STAT, -+ .pops = NULL, -+ .label = "flags-sd", -+ .desc = "inode bit flags", -+ .linkage = {NULL, NULL} -+ }, -+ .present = present_flags_sd, -+ .absent = NULL, -+ .save_len = save_len_flags_sd, -+ .save = save_flags_sd, -+ .alignment = 8 -+ }, -+ [CAPABILITIES_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = CAPABILITIES_STAT, -+ .pops = NULL, -+ .label = "capabilities-sd", -+ .desc = "capabilities", -+ .linkage = {NULL, NULL} -+ }, -+ .present = eio, -+ .absent = NULL, -+ .save_len = save_len_flags_sd, -+ .save = save_flags_sd, -+ .alignment = 8 -+ }, -+ [CRYPTO_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = CRYPTO_STAT, -+ .pops = NULL, -+ .label = "crypto-sd", -+ .desc = "secret key size and id", -+ .linkage = {NULL, NULL} -+ }, -+ .present = present_crypto_sd, -+ .absent = NULL, -+ .save_len = save_len_crypto_sd, -+ .save = save_crypto_sd, -+ .alignment = 8 -+ } -+}; -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.20/fs/reiser4/plugin/item/static_stat.h ---- linux-2.6.20.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/static_stat.h 2007-05-06 14:50:43.823014469 +0400 -@@ -0,0 +1,224 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* This describes the static_stat item, used to hold all information needed by the stat() syscall. -+ -+In the case where each file has not less than the fields needed by the -+stat() syscall, it is more compact to store those fields in this -+struct. -+ -+If this item does not exist, then all stats are dynamically resolved. -+At the moment, we either resolve all stats dynamically or all of them -+statically. 
If you think this is not fully optimal, and the rest of -+reiser4 is working, then fix it...:-) -+ -+*/ -+ -+#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ ) -+#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+ -+#include /* for struct inode */ -+ -+/* Stat data layout: goals and implementation. -+ -+ We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to -+ them, including not having semantic metadata attached to them. -+ -+ There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you -+ want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically -+ sized structure because the statically sized structure knows without recording it what the names and lengths of the -+ attributes are. -+ -+ This leads to a natural compromise, which is to special case those files which have simply the standard unix file -+ attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix -+ file in their use of file attributes. -+ -+ Yet this compromise deserves to be compromised a little. -+ -+ We accommodate the case where you have no more than the standard unix file attributes by using an "extension -+ bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum). -+ -+ If the first bit of the extension bitmask bit is 0, we have light-weight file whose attributes are either inherited -+ from parent directory (as uid, gid) or initialised to some sane values. -+ -+ To capitalize on existing code infrastructure, extensions are -+ implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE. 
-+ Each stat-data extension plugin implements four methods: -+ -+ ->present() called by sd_load() when this extension is found in stat-data -+ ->absent() called by sd_load() when this extension is not found in stat-data -+ ->save_len() called by sd_len() to calculate total length of stat-data -+ ->save() called by sd_save() to store extension data into stat-data -+ -+ Implementation is in fs/reiser4/plugin/item/static_stat.c -+*/ -+ -+/* stat-data extension. Please order this by presumed frequency of use */ -+typedef enum { -+ /* support for light-weight files */ -+ LIGHT_WEIGHT_STAT, -+ /* data required to implement unix stat(2) call. Layout is in -+ reiser4_unix_stat. If this is not present, file is light-weight */ -+ UNIX_STAT, -+ /* this contains additional set of 32bit [anc]time fields to implement -+ nanosecond resolution. Layout is in reiser4_large_times_stat. Usage -+ if this extension is governed by 32bittimes mount option. */ -+ LARGE_TIMES_STAT, -+ /* stat data has link name included */ -+ SYMLINK_STAT, -+ /* on-disk slots of non-standard plugins for main plugin table -+ (@reiser4_inode->pset), that is, plugins that cannot be deduced -+ from file mode bits), for example, aggregation, interpolation etc. */ -+ PLUGIN_STAT, -+ /* this extension contains persistent inode flags. These flags are -+ single bits: immutable, append, only, etc. Layout is in -+ reiser4_flags_stat. */ -+ FLAGS_STAT, -+ /* this extension contains capabilities sets, associated with this -+ file. Layout is in reiser4_capabilities_stat */ -+ CAPABILITIES_STAT, -+ /* this extension contains size and public id of the secret key. -+ Layout is in reiser4_crypto_stat */ -+ CRYPTO_STAT, -+ /* on-disk slots of non-default plugins for inheritance, which -+ are extracted to special plugin table (@reiser4_inode->hset). -+ By default, children of the object will inherit plugins from -+ its main plugin table (pset). 
*/ -+ HEIR_STAT, -+ LAST_SD_EXTENSION, -+ /* -+ * init_inode_static_sd() iterates over extension mask until all -+ * non-zero bits are processed. This means, that neither ->present(), -+ * nor ->absent() methods will be called for stat-data extensions that -+ * go after last present extension. But some basic extensions, we want -+ * either ->absent() or ->present() method to be called, because these -+ * extensions set up something in inode even when they are not -+ * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all -+ * extensions before and including LAST_IMPORTANT_SD_EXTENSION either -+ * ->present(), or ->absent() method will be called, independently of -+ * what other extensions are present. -+ */ -+ LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT -+} sd_ext_bits; -+ -+/* minimal stat-data. This allows to support light-weight files. */ -+typedef struct reiser4_stat_data_base { -+ /* 0 */ __le16 extmask; -+ /* 2 */ -+} PACKED reiser4_stat_data_base; -+ -+typedef struct reiser4_light_weight_stat { -+ /* 0 */ __le16 mode; -+ /* 2 */ __le32 nlink; -+ /* 6 */ __le64 size; -+ /* size in bytes */ -+ /* 14 */ -+} PACKED reiser4_light_weight_stat; -+ -+typedef struct reiser4_unix_stat { -+ /* owner id */ -+ /* 0 */ __le32 uid; -+ /* group id */ -+ /* 4 */ __le32 gid; -+ /* access time */ -+ /* 8 */ __le32 atime; -+ /* modification time */ -+ /* 12 */ __le32 mtime; -+ /* change time */ -+ /* 16 */ __le32 ctime; -+ union { -+ /* minor:major for device files */ -+ /* 20 */ __le64 rdev; -+ /* bytes used by file */ -+ /* 20 */ __le64 bytes; -+ } u; -+ /* 28 */ -+} PACKED reiser4_unix_stat; -+ -+/* symlink stored as part of inode */ -+typedef struct reiser4_symlink_stat { -+ char body[0]; -+} PACKED reiser4_symlink_stat; -+ -+typedef struct reiser4_plugin_slot { -+ /* 0 */ __le16 pset_memb; -+ /* 2 */ __le16 id; -+ /* 4 *//* here plugin stores its persistent state */ -+} PACKED reiser4_plugin_slot; -+ -+/* stat-data extension for files with non-standard plugin. 
*/ -+typedef struct reiser4_plugin_stat { -+ /* number of additional plugins, associated with this object */ -+ /* 0 */ __le16 plugins_no; -+ /* 2 */ reiser4_plugin_slot slot[0]; -+ /* 2 */ -+} PACKED reiser4_plugin_stat; -+ -+/* stat-data extension for inode flags. Currently it is just fixed-width 32 -+ * bit mask. If need arise, this can be replaced with variable width -+ * bitmask. */ -+typedef struct reiser4_flags_stat { -+ /* 0 */ __le32 flags; -+ /* 4 */ -+} PACKED reiser4_flags_stat; -+ -+typedef struct reiser4_capabilities_stat { -+ /* 0 */ __le32 effective; -+ /* 8 */ __le32 permitted; -+ /* 16 */ -+} PACKED reiser4_capabilities_stat; -+ -+typedef struct reiser4_cluster_stat { -+/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */ -+ /* 0 */ d8 cluster_shift; -+ /* 1 */ -+} PACKED reiser4_cluster_stat; -+ -+typedef struct reiser4_crypto_stat { -+ /* secret key size, bits */ -+ /* 0 */ d16 keysize; -+ /* secret key id */ -+ /* 2 */ d8 keyid[0]; -+ /* 2 */ -+} PACKED reiser4_crypto_stat; -+ -+typedef struct reiser4_large_times_stat { -+ /* access time */ -+ /* 0 */ d32 atime; -+ /* modification time */ -+ /* 4 */ d32 mtime; -+ /* change time */ -+ /* 8 */ d32 ctime; -+ /* 12 */ -+} PACKED reiser4_large_times_stat; -+ -+/* this structure is filled by sd_item_stat */ -+typedef struct sd_stat { -+ int dirs; -+ int files; -+ int others; -+} sd_stat; -+ -+/* plugin->item.common.* */ -+extern void print_sd(const char *prefix, coord_t * coord); -+extern void item_stat_static_sd(const coord_t * coord, void *vp); -+ -+/* plugin->item.s.sd.* */ -+extern int init_inode_static_sd(struct inode *inode, char *sd, int len); -+extern int save_len_static_sd(struct inode *inode); -+extern int save_static_sd(struct inode *inode, char **area); -+ -+/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/tail.c linux-2.6.20/fs/reiser4/plugin/item/tail.c ---- linux-2.6.20.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/tail.c 2007-05-06 14:50:43.823014469 +0400 -@@ -0,0 +1,812 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../inode.h" -+#include "../../page_cache.h" -+#include "../../carry.h" -+#include "../../vfs_ops.h" -+ -+#include -+#include -+#include -+#include -+ -+/* plugin->u.item.b.max_key_inside */ -+reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key) -+{ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, get_key_offset(reiser4_max_key())); -+ return key; -+} -+ -+/* plugin->u.item.b.can_contain_key */ -+int can_contain_key_tail(const coord_t *coord, const reiser4_key *key, -+ const reiser4_item_data *data) -+{ -+ reiser4_key item_key; -+ -+ if (item_plugin_by_coord(coord) != data->iplug) -+ return 0; -+ -+ item_key_by_coord(coord, &item_key); -+ if (get_key_locality(key) != get_key_locality(&item_key) || -+ get_key_objectid(key) != get_key_objectid(&item_key)) -+ return 0; -+ -+ return 1; -+} -+ -+/* plugin->u.item.b.mergeable -+ first item is of tail type */ -+/* Audited by: green(2002.06.14) */ -+int mergeable_tail(const coord_t *p1, const coord_t *p2) -+{ -+ reiser4_key key1, key2; -+ -+ assert("vs-535", plugin_of_group(item_plugin_by_coord(p1), -+ UNIX_FILE_METADATA_ITEM_TYPE)); -+ assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID); -+ -+ if (item_id_by_coord(p2) != FORMATTING_ID) { -+ /* second item is of another type */ -+ return 0; -+ } -+ -+ item_key_by_coord(p1, &key1); -+ item_key_by_coord(p2, &key2); -+ if (get_key_locality(&key1) != get_key_locality(&key2) || -+ 
get_key_objectid(&key1) != get_key_objectid(&key2) -+ || get_key_type(&key1) != get_key_type(&key2)) { -+ /* items of different objects */ -+ return 0; -+ } -+ if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) { -+ /* not adjacent items */ -+ return 0; -+ } -+ return 1; -+} -+ -+/* plugin->u.item.b.print -+ plugin->u.item.b.check */ -+ -+/* plugin->u.item.b.nr_units */ -+pos_in_node_t nr_units_tail(const coord_t * coord) -+{ -+ return item_length_by_coord(coord); -+} -+ -+/* plugin->u.item.b.lookup */ -+lookup_result -+lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord) -+{ -+ reiser4_key item_key; -+ __u64 lookuped, offset; -+ unsigned nr_units; -+ -+ item_key_by_coord(coord, &item_key); -+ offset = get_key_offset(item_key_by_coord(coord, &item_key)); -+ nr_units = nr_units_tail(coord); -+ -+ /* key we are looking for must be greater than key of item @coord */ -+ assert("vs-416", keygt(key, &item_key)); -+ -+ /* offset we are looking for */ -+ lookuped = get_key_offset(key); -+ -+ if (lookuped >= offset && lookuped < offset + nr_units) { -+ /* byte we are looking for is in this item */ -+ coord->unit_pos = lookuped - offset; -+ coord->between = AT_UNIT; -+ return CBK_COORD_FOUND; -+ } -+ -+ /* set coord after last unit */ -+ coord->unit_pos = nr_units - 1; -+ coord->between = AFTER_UNIT; -+ return bias == -+ FIND_MAX_NOT_MORE_THAN ? 
CBK_COORD_FOUND : CBK_COORD_NOTFOUND; -+} -+ -+/* plugin->u.item.b.paste */ -+int -+paste_tail(coord_t *coord, reiser4_item_data *data, -+ carry_plugin_info *info UNUSED_ARG) -+{ -+ unsigned old_item_length; -+ char *item; -+ -+ /* length the item had before resizing has been performed */ -+ old_item_length = item_length_by_coord(coord) - data->length; -+ -+ /* tail items never get pasted in the middle */ -+ assert("vs-363", -+ (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) || -+ (coord->unit_pos == old_item_length - 1 && -+ coord->between == AFTER_UNIT) || -+ (coord->unit_pos == 0 && old_item_length == 0 -+ && coord->between == AT_UNIT)); -+ -+ item = item_body_by_coord(coord); -+ if (coord->unit_pos == 0) -+ /* make space for pasted data when pasting at the beginning of -+ the item */ -+ memmove(item + data->length, item, old_item_length); -+ -+ if (coord->between == AFTER_UNIT) -+ coord->unit_pos++; -+ -+ if (data->data) { -+ assert("vs-554", data->user == 0 || data->user == 1); -+ if (data->user) { -+ assert("nikita-3035", reiser4_schedulable()); -+ /* copy from user space */ -+ if (__copy_from_user(item + coord->unit_pos, -+ (const char __user *)data->data, -+ (unsigned)data->length)) -+ return RETERR(-EFAULT); -+ } else -+ /* copy from kernel space */ -+ memcpy(item + coord->unit_pos, data->data, -+ (unsigned)data->length); -+ } else { -+ memset(item + coord->unit_pos, 0, (unsigned)data->length); -+ } -+ return 0; -+} -+ -+/* plugin->u.item.b.fast_paste */ -+ -+/* plugin->u.item.b.can_shift -+ number of units is returned via return value, number of bytes via @size. 
For -+ tail items they coincide */ -+int -+can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG, -+ znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG, -+ unsigned *size, unsigned want) -+{ -+ /* make sure that that we do not want to shift more than we have */ -+ assert("vs-364", want > 0 -+ && want <= (unsigned)item_length_by_coord(source)); -+ -+ *size = min(want, free_space); -+ return *size; -+} -+ -+/* plugin->u.item.b.copy_units */ -+void -+copy_units_tail(coord_t * target, coord_t * source, -+ unsigned from, unsigned count, -+ shift_direction where_is_free_space, -+ unsigned free_space UNUSED_ARG) -+{ -+ /* make sure that item @target is expanded already */ -+ assert("vs-366", (unsigned)item_length_by_coord(target) >= count); -+ assert("vs-370", free_space >= count); -+ -+ if (where_is_free_space == SHIFT_LEFT) { -+ /* append item @target with @count first bytes of @source */ -+ assert("vs-365", from == 0); -+ -+ memcpy((char *)item_body_by_coord(target) + -+ item_length_by_coord(target) - count, -+ (char *)item_body_by_coord(source), count); -+ } else { -+ /* target item is moved to right already */ -+ reiser4_key key; -+ -+ assert("vs-367", -+ (unsigned)item_length_by_coord(source) == from + count); -+ -+ memcpy((char *)item_body_by_coord(target), -+ (char *)item_body_by_coord(source) + from, count); -+ -+ /* new units are inserted before first unit in an item, -+ therefore, we have to update item key */ -+ item_key_by_coord(source, &key); -+ set_key_offset(&key, get_key_offset(&key) + from); -+ -+ node_plugin_by_node(target->node)->update_item_key(target, &key, -+ NULL /*info */); -+ } -+} -+ -+/* plugin->u.item.b.create_hook */ -+ -+/* item_plugin->b.kill_hook -+ this is called when @count units starting from @from-th one are going to be removed -+ */ -+int -+kill_hook_tail(const coord_t * coord, pos_in_node_t from, -+ pos_in_node_t count, struct carry_kill_data *kdata) -+{ -+ reiser4_key key; -+ loff_t start, end; -+ -+ 
assert("vs-1577", kdata); -+ assert("vs-1579", kdata->inode); -+ -+ item_key_by_coord(coord, &key); -+ start = get_key_offset(&key) + from; -+ end = start + count; -+ fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate); -+ return 0; -+} -+ -+/* plugin->u.item.b.shift_hook */ -+ -+/* helper for kill_units_tail and cut_units_tail */ -+static int -+do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ reiser4_key * smallest_removed, reiser4_key * new_first) -+{ -+ pos_in_node_t count; -+ -+ /* this method is only called to remove part of item */ -+ assert("vs-374", (to - from + 1) < item_length_by_coord(coord)); -+ /* tails items are never cut from the middle of an item */ -+ assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord))); -+ assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord))); -+ -+ count = to - from + 1; -+ -+ if (smallest_removed) { -+ /* store smallest key removed */ -+ item_key_by_coord(coord, smallest_removed); -+ set_key_offset(smallest_removed, -+ get_key_offset(smallest_removed) + from); -+ } -+ if (new_first) { -+ /* head of item is cut */ -+ assert("vs-1529", from == 0); -+ -+ item_key_by_coord(coord, new_first); -+ set_key_offset(new_first, -+ get_key_offset(new_first) + from + count); -+ } -+ -+ if (REISER4_DEBUG) -+ memset((char *)item_body_by_coord(coord) + from, 0, count); -+ return count; -+} -+ -+/* plugin->u.item.b.cut_units */ -+int -+cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *cdata UNUSED_ARG, -+ reiser4_key * smallest_removed, reiser4_key * new_first) -+{ -+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first); -+} -+ -+/* plugin->u.item.b.kill_units */ -+int -+kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *kdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ kill_hook_tail(coord, from, to - from + 1, kdata); -+ return 
do_cut_or_kill(coord, from, to, smallest_removed, new_first); -+} -+ -+/* plugin->u.item.b.unit_key */ -+reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key) -+{ -+ assert("vs-375", coord_is_existing_unit(coord)); -+ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, (get_key_offset(key) + coord->unit_pos)); -+ -+ return key; -+} -+ -+/* plugin->u.item.b.estimate -+ plugin->u.item.b.item_data_by_flow */ -+ -+/* tail redpage function. It is called from readpage_tail(). */ -+static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page) -+{ -+ tap_t tap; -+ int result; -+ coord_t coord; -+ lock_handle lh; -+ int count, mapped; -+ struct inode *inode; -+ char *pagedata; -+ -+ /* saving passed coord in order to do not move it by tap. */ -+ init_lh(&lh); -+ copy_lh(&lh, uf_coord->lh); -+ inode = page->mapping->host; -+ coord_dup(&coord, &uf_coord->coord); -+ -+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK); -+ -+ if ((result = reiser4_tap_load(&tap))) -+ goto out_tap_done; -+ -+ /* lookup until page is filled up. */ -+ for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) { -+ /* number of bytes to be copied to page */ -+ count = item_length_by_coord(&coord) - coord.unit_pos; -+ if (count > PAGE_CACHE_SIZE - mapped) -+ count = PAGE_CACHE_SIZE - mapped; -+ -+ /* attach @page to address space and get data address */ -+ pagedata = kmap_atomic(page, KM_USER0); -+ -+ /* copy tail item to page */ -+ memcpy(pagedata + mapped, -+ ((char *)item_body_by_coord(&coord) + coord.unit_pos), -+ count); -+ mapped += count; -+ -+ flush_dcache_page(page); -+ -+ /* dettach page from address space */ -+ kunmap_atomic(pagedata, KM_USER0); -+ -+ /* Getting next tail item. */ -+ if (mapped < PAGE_CACHE_SIZE) { -+ /* -+ * unlock page in order to avoid keep it locked -+ * during tree lookup, which takes long term locks -+ */ -+ unlock_page(page); -+ -+ /* getting right neighbour. 
*/ -+ result = go_dir_el(&tap, RIGHT_SIDE, 0); -+ -+ /* lock page back */ -+ lock_page(page); -+ if (PageUptodate(page)) { -+ /* -+ * another thread read the page, we have -+ * nothing to do -+ */ -+ result = 0; -+ goto out_unlock_page; -+ } -+ -+ if (result) { -+ if (result == -E_NO_NEIGHBOR) { -+ /* -+ * rigth neighbor is not a formatted -+ * node -+ */ -+ result = 0; -+ goto done; -+ } else { -+ goto out_tap_relse; -+ } -+ } else { -+ if (!inode_file_plugin(inode)-> -+ owns_item(inode, &coord)) { -+ /* item of another file is found */ -+ result = 0; -+ goto done; -+ } -+ } -+ } -+ } -+ -+ done: -+ if (mapped != PAGE_CACHE_SIZE) { -+ pagedata = kmap_atomic(page, KM_USER0); -+ memset(pagedata + mapped, 0, PAGE_CACHE_SIZE - mapped); -+ flush_dcache_page(page); -+ kunmap_atomic(pagedata, KM_USER0); -+ } -+ SetPageUptodate(page); -+ out_unlock_page: -+ unlock_page(page); -+ out_tap_relse: -+ reiser4_tap_relse(&tap); -+ out_tap_done: -+ reiser4_tap_done(&tap); -+ return result; -+} -+ -+/* -+ plugin->s.file.readpage -+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail -+ or -+ filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_tail -+ -+ At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail -+ item. 
*/ -+int readpage_tail(void *vp, struct page *page) -+{ -+ uf_coord_t *uf_coord = vp; -+ ON_DEBUG(coord_t * coord = &uf_coord->coord); -+ ON_DEBUG(reiser4_key key); -+ -+ assert("umka-2515", PageLocked(page)); -+ assert("umka-2516", !PageUptodate(page)); -+ assert("umka-2517", !jprivate(page) && !PagePrivate(page)); -+ assert("umka-2518", page->mapping && page->mapping->host); -+ -+ assert("umka-2519", znode_is_loaded(coord->node)); -+ assert("umka-2520", item_is_tail(coord)); -+ assert("umka-2521", coord_is_existing_unit(coord)); -+ assert("umka-2522", znode_is_rlocked(coord->node)); -+ assert("umka-2523", -+ page->mapping->host->i_ino == -+ get_key_objectid(item_key_by_coord(coord, &key))); -+ -+ return do_readpage_tail(uf_coord, page); -+} -+ -+/** -+ * overwrite_tail -+ * @flow: -+ * @coord: -+ * -+ * Overwrites tail item or its part by user data. Returns number of bytes -+ * written or error code. -+ */ -+static int overwrite_tail(flow_t *flow, coord_t *coord) -+{ -+ unsigned count; -+ -+ assert("vs-570", flow->user == 1); -+ assert("vs-946", flow->data); -+ assert("vs-947", coord_is_existing_unit(coord)); -+ assert("vs-948", znode_is_write_locked(coord->node)); -+ assert("nikita-3036", reiser4_schedulable()); -+ -+ count = item_length_by_coord(coord) - coord->unit_pos; -+ if (count > flow->length) -+ count = flow->length; -+ -+ if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos, -+ (const char __user *)flow->data, count)) -+ return RETERR(-EFAULT); -+ -+ znode_make_dirty(coord->node); -+ return count; -+} -+ -+/** -+ * insert_first_tail -+ * @inode: -+ * @flow: -+ * @coord: -+ * @lh: -+ * -+ * Returns number of bytes written or error code. 
-+ */ -+static ssize_t insert_first_tail(struct inode *inode, flow_t *flow, -+ coord_t *coord, lock_handle *lh) -+{ -+ int result; -+ loff_t to_write; -+ unix_file_info_t *uf_info; -+ -+ if (get_key_offset(&flow->key) != 0) { -+ /* -+ * file is empty and we have to write not to the beginning of -+ * file. Create a hole at the beginning of file. On success -+ * insert_flow returns 0 as number of written bytes which is -+ * what we have to return on padding a file with holes -+ */ -+ flow->data = NULL; -+ flow->length = get_key_offset(&flow->key); -+ set_key_offset(&flow->key, 0); -+ /* -+ * holes in files built of tails are stored just like if there -+ * were real data which are all zeros. Therefore we have to -+ * allocate quota here as well -+ */ -+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length)) -+ return RETERR(-EDQUOT); -+ result = reiser4_insert_flow(coord, lh, flow); -+ if (flow->length) -+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ /* -+ * first item insertion is only possible when writing to empty -+ * file or performing tail conversion -+ */ -+ assert("", (uf_info->container == UF_CONTAINER_EMPTY || -+ (reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED) && -+ reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)))); -+ /* if file was empty - update its state */ -+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY) -+ uf_info->container = UF_CONTAINER_TAILS; -+ return result; -+ } -+ -+ /* check quota before appending data */ -+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length)) -+ return RETERR(-EDQUOT); -+ -+ to_write = flow->length; -+ result = reiser4_insert_flow(coord, lh, flow); -+ if (flow->length) -+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); -+ return (to_write - flow->length) ? (to_write - flow->length) : result; -+} -+ -+/** -+ * append_tail -+ * @inode: -+ * @flow: -+ * @coord: -+ * @lh: -+ * -+ * Returns number of bytes written or error code. 
-+ */ -+static ssize_t append_tail(struct inode *inode, -+ flow_t *flow, coord_t *coord, lock_handle *lh) -+{ -+ int result; -+ reiser4_key append_key; -+ loff_t to_write; -+ -+ if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) { -+ flow->data = NULL; -+ flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key); -+ set_key_offset(&flow->key, get_key_offset(&append_key)); -+ /* -+ * holes in files built of tails are stored just like if there -+ * were real data which are all zeros. Therefore we have to -+ * allocate quota here as well -+ */ -+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length)) -+ return RETERR(-EDQUOT); -+ result = reiser4_insert_flow(coord, lh, flow); -+ if (flow->length) -+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); -+ return result; -+ } -+ -+ /* check quota before appending data */ -+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length)) -+ return RETERR(-EDQUOT); -+ -+ to_write = flow->length; -+ result = reiser4_insert_flow(coord, lh, flow); -+ if (flow->length) -+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); -+ return (to_write - flow->length) ? (to_write - flow->length) : result; -+} -+ -+/** -+ * write_tail_reserve_space - reserve space for tail write operation -+ * @inode: -+ * -+ * Estimates and reserves space which may be required for writing one flow to a -+ * file -+ */ -+static int write_extent_reserve_space(struct inode *inode) -+{ -+ __u64 count; -+ reiser4_tree *tree; -+ -+ /* -+ * to write one flow to a file by tails we have to reserve disk space for: -+ -+ * 1. find_file_item may have to insert empty node to the tree (empty -+ * leaf node between two extent items). This requires 1 block and -+ * number of blocks which are necessary to perform insertion of an -+ * internal item into twig level. -+ * -+ * 2. flow insertion -+ * -+ * 3. 
stat data update -+ */ -+ tree = reiser4_tree_by_inode(inode); -+ count = estimate_one_insert_item(tree) + -+ estimate_insert_flow(tree->height) + -+ estimate_one_insert_item(tree); -+ grab_space_enable(); -+ return reiser4_grab_space(count, 0 /* flags */); -+} -+ -+#define PAGE_PER_FLOW 4 -+ -+static loff_t faultin_user_pages(const char __user *buf, size_t count) -+{ -+ loff_t faulted; -+ int to_fault; -+ -+ if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE) -+ count = PAGE_PER_FLOW * PAGE_CACHE_SIZE; -+ faulted = 0; -+ while (count > 0) { -+ to_fault = PAGE_CACHE_SIZE; -+ if (count < to_fault) -+ to_fault = count; -+ fault_in_pages_readable(buf + faulted, to_fault); -+ count -= to_fault; -+ faulted += to_fault; -+ } -+ return faulted; -+} -+ -+/** -+ * reiser4_write_extent - write method of tail item plugin -+ * @file: file to write to -+ * @buf: address of user-space buffer -+ * @count: number of bytes to write -+ * @pos: position in file to write to -+ * -+ * Returns number of written bytes or error code. 
-+ */ -+ssize_t reiser4_write_tail(struct file *file, const char __user *buf, -+ size_t count, loff_t *pos) -+{ -+ struct inode *inode; -+ struct hint hint; -+ int result; -+ flow_t flow; -+ coord_t *coord; -+ lock_handle *lh; -+ znode *loaded; -+ -+ inode = file->f_dentry->d_inode; -+ -+ if (write_extent_reserve_space(inode)) -+ return RETERR(-ENOSPC); -+ -+ result = load_file_hint(file, &hint); -+ BUG_ON(result != 0); -+ -+ flow.length = faultin_user_pages(buf, count); -+ flow.user = 1; -+ memcpy(&flow.data, &buf, sizeof(buf)); -+ flow.op = WRITE_OP; -+ key_by_inode_and_offset_common(inode, *pos, &flow.key); -+ -+ result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode); -+ if (IS_CBKERR(result)) -+ return result; -+ -+ coord = &hint.ext_coord.coord; -+ lh = hint.ext_coord.lh; -+ -+ result = zload(coord->node); -+ BUG_ON(result != 0); -+ loaded = coord->node; -+ -+ if (coord->between == AFTER_UNIT) { -+ /* append with data or hole */ -+ result = append_tail(inode, &flow, coord, lh); -+ } else if (coord->between == AT_UNIT) { -+ /* overwrite */ -+ result = overwrite_tail(&flow, coord); -+ } else { -+ /* no items of this file yet. 
insert data or hole */ -+ result = insert_first_tail(inode, &flow, coord, lh); -+ } -+ zrelse(loaded); -+ if (result < 0) { -+ done_lh(lh); -+ return result; -+ } -+ -+ /* seal and unlock znode */ -+ hint.ext_coord.valid = 0; -+ if (hint.ext_coord.valid) -+ reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK); -+ else -+ reiser4_unset_hint(&hint); -+ -+ save_file_hint(file, &hint); -+ return result; -+} -+ -+#if REISER4_DEBUG -+ -+static int -+coord_matches_key_tail(const coord_t * coord, const reiser4_key * key) -+{ -+ reiser4_key item_key; -+ -+ assert("vs-1356", coord_is_existing_unit(coord)); -+ assert("vs-1354", keylt(key, append_key_tail(coord, &item_key))); -+ assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key))); -+ return get_key_offset(key) == -+ get_key_offset(&item_key) + coord->unit_pos; -+ -+} -+ -+#endif -+ -+/* plugin->u.item.s.file.read */ -+int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint) -+{ -+ unsigned count; -+ int item_length; -+ coord_t *coord; -+ uf_coord_t *uf_coord; -+ -+ uf_coord = &hint->ext_coord; -+ coord = &uf_coord->coord; -+ -+ assert("vs-571", f->user == 1); -+ assert("vs-571", f->data); -+ assert("vs-967", coord && coord->node); -+ assert("vs-1117", znode_is_rlocked(coord->node)); -+ assert("vs-1118", znode_is_loaded(coord->node)); -+ -+ assert("nikita-3037", reiser4_schedulable()); -+ assert("vs-1357", coord_matches_key_tail(coord, &f->key)); -+ -+ /* calculate number of bytes to read off the item */ -+ item_length = item_length_by_coord(coord); -+ count = item_length_by_coord(coord) - coord->unit_pos; -+ if (count > f->length) -+ count = f->length; -+ -+ /* user page has to be brought in so that major page fault does not -+ * occur here when longtem lock is held */ -+ if (__copy_to_user((char __user *)f->data, -+ ((char *)item_body_by_coord(coord) + coord->unit_pos), -+ count)) -+ return RETERR(-EFAULT); -+ -+ /* probably mark_page_accessed() should only be called if -+ * coord->unit_pos 
is zero. */ -+ mark_page_accessed(znode_page(coord->node)); -+ move_flow_forward(f, count); -+ -+ coord->unit_pos += count; -+ if (item_length == coord->unit_pos) { -+ coord->unit_pos--; -+ coord->between = AFTER_UNIT; -+ } -+ -+ return 0; -+} -+ -+/* -+ plugin->u.item.s.file.append_key -+ key of first byte which is the next to last byte by addressed by this item -+*/ -+reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key) -+{ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord)); -+ return key; -+} -+ -+/* plugin->u.item.s.file.init_coord_extension */ -+void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped) -+{ -+ uf_coord->valid = 1; -+} -+ -+/* -+ plugin->u.item.s.file.get_block -+*/ -+int -+get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block) -+{ -+ assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL); -+ -+ if (reiser4_blocknr_is_fake(znode_get_block(coord->node))) -+ /* if node has'nt obtainet its block number yet, return 0. 
-+ * Lets avoid upsetting users with some cosmic numbers beyond -+ * the device capacity.*/ -+ *block = 0; -+ else -+ *block = *znode_get_block(coord->node); -+ return 0; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/item/tail.h linux-2.6.20/fs/reiser4/plugin/item/tail.h ---- linux-2.6.20.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/item/tail.h 2007-05-06 14:50:43.827015719 +0400 -@@ -0,0 +1,58 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined( __REISER4_TAIL_H__ ) -+#define __REISER4_TAIL_H__ -+ -+typedef struct { -+ int not_used; -+} tail_coord_extension_t; -+ -+struct cut_list; -+ -+/* plugin->u.item.b.* */ -+reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *); -+int can_contain_key_tail(const coord_t * coord, const reiser4_key * key, -+ const reiser4_item_data *); -+int mergeable_tail(const coord_t * p1, const coord_t * p2); -+pos_in_node_t nr_units_tail(const coord_t *); -+lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *); -+int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *); -+int can_shift_tail(unsigned free_space, coord_t * source, -+ znode * target, shift_direction, unsigned *size, -+ unsigned want); -+void copy_units_tail(coord_t * target, coord_t * source, unsigned from, -+ unsigned count, shift_direction, unsigned free_space); -+int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count, -+ struct carry_kill_data *); -+int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *, reiser4_key * 
smallest_removed, -+ reiser4_key * new_first); -+reiser4_key *unit_key_tail(const coord_t *, reiser4_key *); -+ -+/* plugin->u.item.s.* */ -+ssize_t reiser4_write_tail(struct file *file, const char __user *buf, -+ size_t count, loff_t *pos); -+int reiser4_read_tail(struct file *, flow_t *, hint_t *); -+int readpage_tail(void *vp, struct page *page); -+reiser4_key *append_key_tail(const coord_t *, reiser4_key *); -+void init_coord_extension_tail(uf_coord_t *, loff_t offset); -+int get_block_address_tail(const coord_t *, sector_t, sector_t *); -+int item_balance_dirty_pages(struct address_space *, const flow_t *, -+ hint_t *, int back_to_dirty, int set_hint); -+ -+/* __REISER4_TAIL_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/Makefile linux-2.6.20/fs/reiser4/plugin/Makefile ---- linux-2.6.20.orig/fs/reiser4/plugin/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/Makefile 2007-05-06 14:50:43.827015719 +0400 -@@ -0,0 +1,26 @@ -+obj-$(CONFIG_REISER4_FS) += plugins.o -+ -+plugins-objs := \ -+ plugin.o \ -+ plugin_set.o \ -+ object.o \ -+ inode_ops.o \ -+ inode_ops_rename.o \ -+ file_ops.o \ -+ file_ops_readdir.o \ -+ file_plugin_common.o \ -+ dir_plugin_common.o \ -+ digest.o \ -+ hash.o \ -+ fibration.o \ -+ tail_policy.o \ -+ regular.o -+ -+obj-$(CONFIG_REISER4_FS) += item/ -+obj-$(CONFIG_REISER4_FS) += file/ -+obj-$(CONFIG_REISER4_FS) += dir/ -+obj-$(CONFIG_REISER4_FS) += node/ -+obj-$(CONFIG_REISER4_FS) += compress/ -+obj-$(CONFIG_REISER4_FS) += space/ -+obj-$(CONFIG_REISER4_FS) += disk_format/ -+obj-$(CONFIG_REISER4_FS) += security/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/Makefile linux-2.6.20/fs/reiser4/plugin/node/Makefile ---- linux-2.6.20.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ 
linux-2.6.20/fs/reiser4/plugin/node/Makefile 2007-05-06 14:50:43.827015719 +0400 -@@ -0,0 +1,5 @@ -+obj-$(CONFIG_REISER4_FS) += node_plugins.o -+ -+node_plugins-objs := \ -+ node.o \ -+ node40.o -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/node40.c linux-2.6.20/fs/reiser4/plugin/node/node40.c ---- linux-2.6.20.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/node/node40.c 2007-05-06 14:50:43.831016969 +0400 -@@ -0,0 +1,2924 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../debug.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "../plugin_header.h" -+#include "../item/item.h" -+#include "node.h" -+#include "node40.h" -+#include "../plugin.h" -+#include "../../jnode.h" -+#include "../../znode.h" -+#include "../../pool.h" -+#include "../../carry.h" -+#include "../../tap.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../../reiser4.h" -+ -+#include -+#include -+#include -+ -+/* leaf 40 format: -+ -+ [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ] -+ plugin_id (16) key -+ free_space (16) pluginid (16) -+ free_space_start (16) offset (16) -+ level (8) -+ num_items (16) -+ magic (32) -+ flush_time (32) -+*/ -+/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". 
*/ -+/* magic number that is stored in ->magic field of node header */ -+static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */ -+ -+static int prepare_for_update(znode * left, znode * right, -+ carry_plugin_info * info); -+ -+/* header of node of reiser40 format is at the beginning of node */ -+static inline node40_header *node40_node_header(const znode * node /* node to -+ * query */ ) -+{ -+ assert("nikita-567", node != NULL); -+ assert("nikita-568", znode_page(node) != NULL); -+ assert("nikita-569", zdata(node) != NULL); -+ return (node40_header *) zdata(node); -+} -+ -+/* functions to get/set fields of node40_header */ -+#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic)) -+#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space)) -+#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start)) -+#define nh40_get_level(nh) get_unaligned(&(nh)->level) -+#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items)) -+#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id)) -+ -+#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic) -+#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space) -+#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start) -+#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level) -+#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items) -+#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id) -+ -+/* plugin field of node header should be read/set by -+ plugin_by_disk_id/save_disk_plugin */ -+ -+/* array of item headers is at the end of node */ -+static inline item_header40 *node40_ih_at(const znode * node, unsigned pos) -+{ -+ return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1; -+} -+ -+/* ( page_address( node -> pg ) + 
PAGE_CACHE_SIZE ) - pos - 1 -+ */ -+static inline item_header40 *node40_ih_at_coord(const coord_t * coord) -+{ -+ return (item_header40 *) (zdata(coord->node) + -+ znode_size(coord->node)) - (coord->item_pos) - -+ 1; -+} -+ -+/* functions to get/set fields of item_header40 */ -+#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset)) -+ -+#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset) -+ -+/* plugin field of item header should be read/set by -+ plugin_by_disk_id/save_disk_plugin */ -+ -+/* plugin methods */ -+ -+/* plugin->u.node.item_overhead -+ look for description of this method in plugin/node/node.h */ -+size_t -+item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG) -+{ -+ return sizeof(item_header40); -+} -+ -+/* plugin->u.node.free_space -+ look for description of this method in plugin/node/node.h */ -+size_t free_space_node40(znode * node) -+{ -+ assert("nikita-577", node != NULL); -+ assert("nikita-578", znode_is_loaded(node)); -+ assert("nikita-579", zdata(node) != NULL); -+ -+ return nh40_get_free_space(node40_node_header(node)); -+} -+ -+/* private inline version of node40_num_of_items() for use in this file. This -+ is necessary, because address of node40_num_of_items() is taken and it is -+ never inlined as a result. 
*/ -+static inline short node40_num_of_items_internal(const znode * node) -+{ -+ return nh40_get_num_items(node40_node_header(node)); -+} -+ -+#if REISER4_DEBUG -+static inline void check_num_items(const znode * node) -+{ -+ assert("nikita-2749", -+ node40_num_of_items_internal(node) == node->nr_items); -+ assert("nikita-2746", znode_is_write_locked(node)); -+} -+#else -+#define check_num_items(node) noop -+#endif -+ -+/* plugin->u.node.num_of_items -+ look for description of this method in plugin/node/node.h */ -+int num_of_items_node40(const znode * node) -+{ -+ return node40_num_of_items_internal(node); -+} -+ -+static void -+node40_set_num_items(znode * node, node40_header * nh, unsigned value) -+{ -+ assert("nikita-2751", node != NULL); -+ assert("nikita-2750", nh == node40_node_header(node)); -+ -+ check_num_items(node); -+ nh40_set_num_items(nh, value); -+ node->nr_items = value; -+ check_num_items(node); -+} -+ -+/* plugin->u.node.item_by_coord -+ look for description of this method in plugin/node/node.h */ -+char *item_by_coord_node40(const coord_t * coord) -+{ -+ item_header40 *ih; -+ char *p; -+ -+ /* @coord is set to existing item */ -+ assert("nikita-596", coord != NULL); -+ assert("vs-255", coord_is_existing_item(coord)); -+ -+ ih = node40_ih_at_coord(coord); -+ p = zdata(coord->node) + ih40_get_offset(ih); -+ return p; -+} -+ -+/* plugin->u.node.length_by_coord -+ look for description of this method in plugin/node/node.h */ -+int length_by_coord_node40(const coord_t * coord) -+{ -+ item_header40 *ih; -+ int result; -+ -+ /* @coord is set to existing item */ -+ assert("vs-256", coord != NULL); -+ assert("vs-257", coord_is_existing_item(coord)); -+ -+ ih = node40_ih_at_coord(coord); -+ if ((int)coord->item_pos == -+ node40_num_of_items_internal(coord->node) - 1) -+ result = -+ nh40_get_free_space_start(node40_node_header(coord->node)) - -+ ih40_get_offset(ih); -+ else -+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih); -+ -+ return result; -+} 
-+ -+static pos_in_node_t -+node40_item_length(const znode * node, pos_in_node_t item_pos) -+{ -+ item_header40 *ih; -+ pos_in_node_t result; -+ -+ /* @coord is set to existing item */ -+ assert("vs-256", node != NULL); -+ assert("vs-257", node40_num_of_items_internal(node) > item_pos); -+ -+ ih = node40_ih_at(node, item_pos); -+ if (item_pos == node40_num_of_items_internal(node) - 1) -+ result = -+ nh40_get_free_space_start(node40_node_header(node)) - -+ ih40_get_offset(ih); -+ else -+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih); -+ -+ return result; -+} -+ -+/* plugin->u.node.plugin_by_coord -+ look for description of this method in plugin/node/node.h */ -+item_plugin *plugin_by_coord_node40(const coord_t * coord) -+{ -+ item_header40 *ih; -+ item_plugin *result; -+ -+ /* @coord is set to existing item */ -+ assert("vs-258", coord != NULL); -+ assert("vs-259", coord_is_existing_item(coord)); -+ -+ ih = node40_ih_at_coord(coord); -+ /* pass NULL in stead of current tree. This is time critical call. 
*/ -+ result = item_plugin_by_disk_id(NULL, &ih->plugin_id); -+ return result; -+} -+ -+/* plugin->u.node.key_at -+ look for description of this method in plugin/node/node.h */ -+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key) -+{ -+ item_header40 *ih; -+ -+ assert("nikita-1765", coord_is_existing_item(coord)); -+ -+ /* @coord is set to existing item */ -+ ih = node40_ih_at_coord(coord); -+ memcpy(key, &ih->key, sizeof(reiser4_key)); -+ return key; -+} -+ -+/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */ -+ -+#define NODE_INCSTAT(n, counter) \ -+ reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter) -+ -+#define NODE_ADDSTAT(n, counter, val) \ -+ reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val) -+ -+/* plugin->u.node.lookup -+ look for description of this method in plugin/node/node.h */ -+node_search_result lookup_node40(znode * node /* node to query */ , -+ const reiser4_key * key /* key to look for */ , -+ lookup_bias bias /* search bias */ , -+ coord_t * coord /* resulting coord */ ) -+{ -+ int left; -+ int right; -+ int found; -+ int items; -+ -+ item_header40 *lefth; -+ item_header40 *righth; -+ -+ item_plugin *iplug; -+ item_header40 *bstop; -+ item_header40 *ih; -+ cmp_t order; -+ -+ assert("nikita-583", node != NULL); -+ assert("nikita-584", key != NULL); -+ assert("nikita-585", coord != NULL); -+ assert("nikita-2693", znode_is_any_locked(node)); -+ cassert(REISER4_SEQ_SEARCH_BREAK > 2); -+ -+ items = node_num_items(node); -+ -+ if (unlikely(items == 0)) { -+ coord_init_first_unit(coord, node); -+ return NS_NOT_FOUND; -+ } -+ -+ /* binary search for item that can contain given key */ -+ left = 0; -+ right = items - 1; -+ coord->node = node; -+ coord_clear_iplug(coord); -+ found = 0; -+ -+ lefth = node40_ih_at(node, left); -+ righth = node40_ih_at(node, right); -+ -+ /* It is known that for small arrays sequential search is on average -+ more 
efficient than binary. This is because sequential search is -+ coded as tight loop that can be better optimized by compilers and -+ for small array size gain from this optimization makes sequential -+ search the winner. Another, maybe more important, reason for this, -+ is that sequential array is more CPU cache friendly, whereas binary -+ search effectively destroys CPU caching. -+ -+ Critical here is the notion of "smallness". Reasonable value of -+ REISER4_SEQ_SEARCH_BREAK can be found by playing with code in -+ fs/reiser4/ulevel/ulevel.c:test_search(). -+ -+ Don't try to further optimize sequential search by scanning from -+ right to left in attempt to use more efficient loop termination -+ condition (comparison with 0). This doesn't work. -+ -+ */ -+ -+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) { -+ int median; -+ item_header40 *medianh; -+ -+ median = (left + right) / 2; -+ medianh = node40_ih_at(node, median); -+ -+ assert("nikita-1084", median >= 0); -+ assert("nikita-1085", median < items); -+ switch (keycmp(key, &medianh->key)) { -+ case LESS_THAN: -+ right = median; -+ righth = medianh; -+ break; -+ default: -+ wrong_return_value("nikita-586", "keycmp"); -+ case GREATER_THAN: -+ left = median; -+ lefth = medianh; -+ break; -+ case EQUAL_TO: -+ do { -+ --median; -+ /* headers are ordered from right to left */ -+ ++medianh; -+ } while (median >= 0 && keyeq(key, &medianh->key)); -+ right = left = median + 1; -+ ih = lefth = righth = medianh - 1; -+ found = 1; -+ break; -+ } -+ } -+ /* sequential scan. Item headers, and, therefore, keys are stored at -+ the rightmost part of a node from right to left. We are trying to -+ access memory from left to right, and hence, scan in _descending_ -+ order of item numbers. 
-+ */ -+ if (!found) { -+ for (left = right, ih = righth; left >= 0; ++ih, --left) { -+ cmp_t comparison; -+ -+ prefetchkey(&(ih + 1)->key); -+ comparison = keycmp(&ih->key, key); -+ if (comparison == GREATER_THAN) -+ continue; -+ if (comparison == EQUAL_TO) { -+ found = 1; -+ do { -+ --left; -+ ++ih; -+ } while (left >= 0 && keyeq(&ih->key, key)); -+ ++left; -+ --ih; -+ } else { -+ assert("nikita-1256", comparison == LESS_THAN); -+ } -+ break; -+ } -+ if (unlikely(left < 0)) -+ left = 0; -+ } -+ -+ assert("nikita-3212", right >= left); -+ assert("nikita-3214", -+ equi(found, keyeq(&node40_ih_at(node, left)->key, key))); -+ -+ coord_set_item_pos(coord, left); -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ -+ /* key < leftmost key in a mode or node is corrupted and keys -+ are not sorted */ -+ bstop = node40_ih_at(node, (unsigned)left); -+ order = keycmp(&bstop->key, key); -+ if (unlikely(order == GREATER_THAN)) { -+ if (unlikely(left != 0)) { -+ /* screw up */ -+ warning("nikita-587", "Key less than %i key in a node", -+ left); -+ reiser4_print_key("key", key); -+ reiser4_print_key("min", &bstop->key); -+ print_coord_content("coord", coord); -+ return RETERR(-EIO); -+ } else { -+ coord->between = BEFORE_UNIT; -+ return NS_NOT_FOUND; -+ } -+ } -+ /* left <= key, ok */ -+ iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id); -+ -+ if (unlikely(iplug == NULL)) { -+ warning("nikita-588", "Unknown plugin %i", -+ le16_to_cpu(get_unaligned(&bstop->plugin_id))); -+ reiser4_print_key("key", key); -+ print_coord_content("coord", coord); -+ return RETERR(-EIO); -+ } -+ -+ coord_set_iplug(coord, iplug); -+ -+ /* if exact key from item header was found by binary search, no -+ further checks are necessary. 
*/ -+ if (found) { -+ assert("nikita-1259", order == EQUAL_TO); -+ return NS_FOUND; -+ } -+ if (iplug->b.max_key_inside != NULL) { -+ reiser4_key max_item_key; -+ -+ /* key > max_item_key --- outside of an item */ -+ if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) { -+ coord->unit_pos = 0; -+ coord->between = AFTER_ITEM; -+ /* FIXME-VS: key we are looking for does not fit into -+ found item. Return NS_NOT_FOUND then. Without that -+ the following case does not work: there is extent of -+ file 10000, 10001. File 10000, 10002 has been just -+ created. When writing to position 0 in that file - -+ traverse_tree will stop here on twig level. When we -+ want it to go down to leaf level -+ */ -+ return NS_NOT_FOUND; -+ } -+ } -+ -+ if (iplug->b.lookup != NULL) { -+ return iplug->b.lookup(key, bias, coord); -+ } else { -+ assert("nikita-1260", order == LESS_THAN); -+ coord->between = AFTER_UNIT; -+ return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND; -+ } -+} -+ -+#undef NODE_ADDSTAT -+#undef NODE_INCSTAT -+ -+/* plugin->u.node.estimate -+ look for description of this method in plugin/node/node.h */ -+size_t estimate_node40(znode * node) -+{ -+ size_t result; -+ -+ assert("nikita-597", node != NULL); -+ -+ result = free_space_node40(node) - sizeof(item_header40); -+ -+ return (result > 0) ? 
result : 0; -+} -+ -+/* plugin->u.node.check -+ look for description of this method in plugin/node/node.h */ -+int check_node40(const znode * node /* node to check */ , -+ __u32 flags /* check flags */ , -+ const char **error /* where to store error message */ ) -+{ -+ int nr_items; -+ int i; -+ reiser4_key prev; -+ unsigned old_offset; -+ tree_level level; -+ coord_t coord; -+ int result; -+ -+ assert("nikita-580", node != NULL); -+ assert("nikita-581", error != NULL); -+ assert("nikita-2948", znode_is_loaded(node)); -+ -+ if (ZF_ISSET(node, JNODE_HEARD_BANSHEE)) -+ return 0; -+ -+ assert("nikita-582", zdata(node) != NULL); -+ -+ nr_items = node40_num_of_items_internal(node); -+ if (nr_items < 0) { -+ *error = "Negative number of items"; -+ return -1; -+ } -+ -+ if (flags & REISER4_NODE_DKEYS) -+ prev = *znode_get_ld_key((znode *) node); -+ else -+ prev = *reiser4_min_key(); -+ -+ old_offset = 0; -+ coord_init_zero(&coord); -+ coord.node = (znode *) node; -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ level = znode_get_level(node); -+ for (i = 0; i < nr_items; i++) { -+ item_header40 *ih; -+ reiser4_key unit_key; -+ unsigned j; -+ -+ ih = node40_ih_at(node, (unsigned)i); -+ coord_set_item_pos(&coord, i); -+ if ((ih40_get_offset(ih) >= -+ znode_size(node) - nr_items * sizeof(item_header40)) || -+ (ih40_get_offset(ih) < sizeof(node40_header))) { -+ *error = "Offset is out of bounds"; -+ return -1; -+ } -+ if (ih40_get_offset(ih) <= old_offset) { -+ *error = "Offsets are in wrong order"; -+ return -1; -+ } -+ if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) { -+ *error = "Wrong offset of first item"; -+ return -1; -+ } -+ old_offset = ih40_get_offset(ih); -+ -+ if (keygt(&prev, &ih->key)) { -+ *error = "Keys are in wrong order"; -+ return -1; -+ } -+ if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) { -+ *error = "Wrong key of first unit"; -+ return -1; -+ } -+ prev = ih->key; -+ for (j = 0; j < coord_num_units(&coord); ++j) { -+ 
coord.unit_pos = j; -+ unit_key_by_coord(&coord, &unit_key); -+ if (keygt(&prev, &unit_key)) { -+ *error = "Unit keys are in wrong order"; -+ return -1; -+ } -+ prev = unit_key; -+ } -+ coord.unit_pos = 0; -+ if (level != TWIG_LEVEL && item_is_extent(&coord)) { -+ *error = "extent on the wrong level"; -+ return -1; -+ } -+ if (level == LEAF_LEVEL && item_is_internal(&coord)) { -+ *error = "internal item on the wrong level"; -+ return -1; -+ } -+ if (level != LEAF_LEVEL && -+ !item_is_internal(&coord) && !item_is_extent(&coord)) { -+ *error = "wrong item on the internal level"; -+ return -1; -+ } -+ if (level > TWIG_LEVEL && !item_is_internal(&coord)) { -+ *error = "non-internal item on the internal level"; -+ return -1; -+ } -+#if REISER4_DEBUG -+ if (item_plugin_by_coord(&coord)->b.check -+ && item_plugin_by_coord(&coord)->b.check(&coord, error)) -+ return -1; -+#endif -+ if (i) { -+ coord_t prev_coord; -+ /* two neighboring items can not be mergeable */ -+ coord_dup(&prev_coord, &coord); -+ coord_prev_item(&prev_coord); -+ if (are_items_mergeable(&prev_coord, &coord)) { -+ *error = "mergeable items in one node"; -+ return -1; -+ } -+ -+ } -+ } -+ -+ if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) { -+ coord_t coord; -+ item_plugin *iplug; -+ -+ coord_init_last_unit(&coord, node); -+ iplug = item_plugin_by_coord(&coord); -+ if ((item_is_extent(&coord) || item_is_tail(&coord)) && -+ iplug->s.file.append_key != NULL) { -+ reiser4_key mkey; -+ -+ iplug->s.file.append_key(&coord, &mkey); -+ set_key_offset(&mkey, get_key_offset(&mkey) - 1); -+ read_lock_dk(current_tree); -+ result = keygt(&mkey, znode_get_rd_key((znode *) node)); -+ read_unlock_dk(current_tree); -+ if (result) { -+ *error = "key of rightmost item is too large"; -+ return -1; -+ } -+ } -+ } -+ if (flags & REISER4_NODE_DKEYS) { -+ read_lock_tree(current_tree); -+ read_lock_dk(current_tree); -+ -+ flags |= REISER4_NODE_TREE_STABLE; -+ -+ if (keygt(&prev, znode_get_rd_key((znode *) node))) { -+ 
if (flags & REISER4_NODE_TREE_STABLE) { -+ *error = "Last key is greater than rdkey"; -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ return -1; -+ } -+ } -+ if (keygt -+ (znode_get_ld_key((znode *) node), -+ znode_get_rd_key((znode *) node))) { -+ *error = "ldkey is greater than rdkey"; -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ return -1; -+ } -+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && -+ (node->left != NULL) && -+ !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) && -+ ergo(flags & REISER4_NODE_TREE_STABLE, -+ !keyeq(znode_get_rd_key(node->left), -+ znode_get_ld_key((znode *) node))) -+ && ergo(!(flags & REISER4_NODE_TREE_STABLE), -+ keygt(znode_get_rd_key(node->left), -+ znode_get_ld_key((znode *) node)))) { -+ *error = "left rdkey or ldkey is wrong"; -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ return -1; -+ } -+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && -+ (node->right != NULL) && -+ !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) && -+ ergo(flags & REISER4_NODE_TREE_STABLE, -+ !keyeq(znode_get_rd_key((znode *) node), -+ znode_get_ld_key(node->right))) -+ && ergo(!(flags & REISER4_NODE_TREE_STABLE), -+ keygt(znode_get_rd_key((znode *) node), -+ znode_get_ld_key(node->right)))) { -+ *error = "rdkey or right ldkey is wrong"; -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ return -1; -+ } -+ -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.node.parse -+ look for description of this method in plugin/node/node.h */ -+int parse_node40(znode * node /* node to parse */ ) -+{ -+ node40_header *header; -+ int result; -+ d8 level; -+ -+ header = node40_node_header((znode *) node); -+ result = -EIO; -+ level = nh40_get_level(header); -+ if (unlikely(((__u8) znode_get_level(node)) != level)) -+ warning("nikita-494", "Wrong level found in node: %i != %i", -+ znode_get_level(node), level); -+ else if 
(unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC)) -+ warning("nikita-495", -+ "Wrong magic in tree node: want %x, got %x", -+ REISER4_NODE_MAGIC, nh40_get_magic(header)); -+ else { -+ node->nr_items = node40_num_of_items_internal(node); -+ result = 0; -+ } -+ return RETERR(result); -+} -+ -+/* plugin->u.node.init -+ look for description of this method in plugin/node/node.h */ -+int init_node40(znode * node /* node to initialise */ ) -+{ -+ node40_header *header; -+ -+ assert("nikita-570", node != NULL); -+ assert("nikita-572", zdata(node) != NULL); -+ -+ header = node40_node_header(node); -+ memset(header, 0, sizeof(node40_header)); -+ nh40_set_free_space(header, znode_size(node) - sizeof(node40_header)); -+ nh40_set_free_space_start(header, sizeof(node40_header)); -+ /* sane hypothesis: 0 in CPU format is 0 in disk format */ -+ /* items: 0 */ -+ save_plugin_id(node_plugin_to_plugin(node->nplug), -+ &header->common_header.plugin_id); -+ nh40_set_level(header, znode_get_level(node)); -+ nh40_set_magic(header, REISER4_NODE_MAGIC); -+ node->nr_items = 0; -+ nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb())); -+ -+ /* flags: 0 */ -+ return 0; -+} -+ -+#ifdef GUESS_EXISTS -+int guess_node40(const znode * node /* node to guess plugin of */ ) -+{ -+ node40_header *nethack; -+ -+ assert("nikita-1058", node != NULL); -+ nethack = node40_node_header(node); -+ return -+ (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) && -+ (plugin_by_disk_id(znode_get_tree(node), -+ REISER4_NODE_PLUGIN_TYPE, -+ &nethack->common_header.plugin_id)->h.id == -+ NODE40_ID); -+} -+#endif -+ -+/* plugin->u.node.chage_item_size -+ look for description of this method in plugin/node/node.h */ -+void change_item_size_node40(coord_t * coord, int by) -+{ -+ node40_header *nh; -+ item_header40 *ih; -+ char *item_data; -+ int item_length; -+ unsigned i; -+ -+ /* make sure that @item is coord of existing item */ -+ assert("vs-210", coord_is_existing_item(coord)); -+ -+ nh = 
node40_node_header(coord->node); -+ -+ item_data = item_by_coord_node40(coord); -+ item_length = length_by_coord_node40(coord); -+ -+ /* move item bodies */ -+ ih = node40_ih_at_coord(coord); -+ memmove(item_data + item_length + by, item_data + item_length, -+ nh40_get_free_space_start(node40_node_header(coord->node)) - -+ (ih40_get_offset(ih) + item_length)); -+ -+ /* update offsets of moved items */ -+ for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) { -+ ih = node40_ih_at(coord->node, i); -+ ih40_set_offset(ih, ih40_get_offset(ih) + by); -+ } -+ -+ /* update node header */ -+ nh40_set_free_space(nh, nh40_get_free_space(nh) - by); -+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by); -+} -+ -+static int should_notify_parent(const znode * node) -+{ -+ /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */ -+ return !disk_addr_eq(znode_get_block(node), -+ &znode_get_tree(node)->root_block); -+} -+ -+/* plugin->u.node.create_item -+ look for description of this method in plugin/node/node.h */ -+int -+create_item_node40(coord_t *target, const reiser4_key *key, -+ reiser4_item_data *data, carry_plugin_info *info) -+{ -+ node40_header *nh; -+ item_header40 *ih; -+ unsigned offset; -+ unsigned i; -+ -+ nh = node40_node_header(target->node); -+ -+ assert("vs-212", coord_is_between_items(target)); -+ /* node must have enough free space */ -+ assert("vs-254", -+ free_space_node40(target->node) >= -+ data->length + sizeof(item_header40)); -+ assert("vs-1410", data->length >= 0); -+ -+ if (coord_set_to_right(target)) -+ /* there are not items to the right of @target, so, new item -+ will be inserted after last one */ -+ coord_set_item_pos(target, nh40_get_num_items(nh)); -+ -+ if (target->item_pos < nh40_get_num_items(nh)) { -+ /* there are items to be moved to prepare space for new -+ item */ -+ ih = node40_ih_at_coord(target); -+ /* new item will start at this offset */ -+ offset = ih40_get_offset(ih); -+ -+ 
memmove(zdata(target->node) + offset + data->length, -+ zdata(target->node) + offset, -+ nh40_get_free_space_start(nh) - offset); -+ /* update headers of moved items */ -+ for (i = target->item_pos; i < nh40_get_num_items(nh); i++) { -+ ih = node40_ih_at(target->node, i); -+ ih40_set_offset(ih, ih40_get_offset(ih) + data->length); -+ } -+ -+ /* @ih is set to item header of the last item, move item headers */ -+ memmove(ih - 1, ih, -+ sizeof(item_header40) * (nh40_get_num_items(nh) - -+ target->item_pos)); -+ } else { -+ /* new item will start at this offset */ -+ offset = nh40_get_free_space_start(nh); -+ } -+ -+ /* make item header for the new item */ -+ ih = node40_ih_at_coord(target); -+ memcpy(&ih->key, key, sizeof(reiser4_key)); -+ ih40_set_offset(ih, offset); -+ save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id); -+ -+ /* update node header */ -+ nh40_set_free_space(nh, -+ nh40_get_free_space(nh) - data->length - -+ sizeof(item_header40)); -+ nh40_set_free_space_start(nh, -+ nh40_get_free_space_start(nh) + data->length); -+ node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1); -+ -+ /* FIXME: check how does create_item work when between is set to BEFORE_UNIT */ -+ target->unit_pos = 0; -+ target->between = AT_UNIT; -+ coord_clear_iplug(target); -+ -+ /* initialize item */ -+ if (data->iplug->b.init != NULL) { -+ data->iplug->b.init(target, NULL, data); -+ } -+ /* copy item body */ -+ if (data->iplug->b.paste != NULL) { -+ data->iplug->b.paste(target, data, info); -+ } else if (data->data != NULL) { -+ if (data->user) { -+ /* AUDIT: Are we really should not check that pointer -+ from userspace was valid and data bytes were -+ available? How will we return -EFAULT of some kind -+ without this check? 
*/ -+ assert("nikita-3038", reiser4_schedulable()); -+ /* copy data from user space */ -+ __copy_from_user(zdata(target->node) + offset, -+ (const char __user *)data->data, -+ (unsigned)data->length); -+ } else -+ /* copy from kernel space */ -+ memcpy(zdata(target->node) + offset, data->data, -+ (unsigned)data->length); -+ } -+ -+ if (target->item_pos == 0) { -+ /* left delimiting key has to be updated */ -+ prepare_for_update(NULL, target->node, info); -+ } -+ -+ if (item_plugin_by_coord(target)->b.create_hook != NULL) { -+ item_plugin_by_coord(target)->b.create_hook(target, data->arg); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.node.update_item_key -+ look for description of this method in plugin/node/node.h */ -+void -+update_item_key_node40(coord_t * target, const reiser4_key * key, -+ carry_plugin_info * info) -+{ -+ item_header40 *ih; -+ -+ ih = node40_ih_at_coord(target); -+ memcpy(&ih->key, key, sizeof(reiser4_key)); -+ -+ if (target->item_pos == 0) { -+ prepare_for_update(NULL, target->node, info); -+ } -+} -+ -+/* this bits encode cut mode */ -+#define CMODE_TAIL 1 -+#define CMODE_WHOLE 2 -+#define CMODE_HEAD 4 -+ -+struct cut40_info { -+ int mode; -+ pos_in_node_t tail_removed; /* position of item which gets tail removed */ -+ pos_in_node_t first_removed; /* position of first the leftmost item among items removed completely */ -+ pos_in_node_t removed_count; /* number of items removed completely */ -+ pos_in_node_t head_removed; /* position of item which gets head removed */ -+ -+ pos_in_node_t freed_space_start; -+ pos_in_node_t freed_space_end; -+ pos_in_node_t first_moved; -+ pos_in_node_t head_removed_location; -+}; -+ -+static void init_cinfo(struct cut40_info *cinfo) -+{ -+ cinfo->mode = 0; -+ cinfo->tail_removed = MAX_POS_IN_NODE; -+ cinfo->first_removed = MAX_POS_IN_NODE; -+ cinfo->removed_count = MAX_POS_IN_NODE; -+ cinfo->head_removed = MAX_POS_IN_NODE; -+ cinfo->freed_space_start = MAX_POS_IN_NODE; -+ cinfo->freed_space_end = MAX_POS_IN_NODE; 
-+ cinfo->first_moved = MAX_POS_IN_NODE; -+ cinfo->head_removed_location = MAX_POS_IN_NODE; -+} -+ -+/* complete cut_node40/kill_node40 content by removing the gap created by */ -+static void compact(znode * node, struct cut40_info *cinfo) -+{ -+ node40_header *nh; -+ item_header40 *ih; -+ pos_in_node_t freed; -+ pos_in_node_t pos, nr_items; -+ -+ assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE && -+ cinfo->freed_space_end != MAX_POS_IN_NODE && -+ cinfo->first_moved != MAX_POS_IN_NODE)); -+ assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start); -+ -+ nh = node40_node_header(node); -+ nr_items = nh40_get_num_items(nh); -+ -+ /* remove gap made up by removal */ -+ memmove(zdata(node) + cinfo->freed_space_start, -+ zdata(node) + cinfo->freed_space_end, -+ nh40_get_free_space_start(nh) - cinfo->freed_space_end); -+ -+ /* update item headers of moved items - change their locations */ -+ pos = cinfo->first_moved; -+ ih = node40_ih_at(node, pos); -+ if (cinfo->head_removed_location != MAX_POS_IN_NODE) { -+ assert("vs-1580", pos == cinfo->head_removed); -+ ih40_set_offset(ih, cinfo->head_removed_location); -+ pos++; -+ ih--; -+ } -+ -+ freed = cinfo->freed_space_end - cinfo->freed_space_start; -+ for (; pos < nr_items; pos++, ih--) { -+ assert("vs-1581", ih == node40_ih_at(node, pos)); -+ ih40_set_offset(ih, ih40_get_offset(ih) - freed); -+ } -+ -+ /* free space start moved to right */ -+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed); -+ -+ if (cinfo->removed_count != MAX_POS_IN_NODE) { -+ /* number of items changed. 
Remove item headers of those items */ -+ ih = node40_ih_at(node, nr_items - 1); -+ memmove(ih + cinfo->removed_count, ih, -+ sizeof(item_header40) * (nr_items - -+ cinfo->removed_count - -+ cinfo->first_removed)); -+ freed += sizeof(item_header40) * cinfo->removed_count; -+ node40_set_num_items(node, nh, nr_items - cinfo->removed_count); -+ } -+ -+ /* total amount of free space increased */ -+ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed); -+} -+ -+int shrink_item_node40(coord_t * coord, int delta) -+{ -+ node40_header *nh; -+ item_header40 *ih; -+ pos_in_node_t pos; -+ pos_in_node_t nr_items; -+ char *end; -+ znode *node; -+ int off; -+ -+ assert("nikita-3487", coord != NULL); -+ assert("nikita-3488", delta >= 0); -+ -+ node = coord->node; -+ nh = node40_node_header(node); -+ nr_items = nh40_get_num_items(nh); -+ -+ ih = node40_ih_at_coord(coord); -+ assert("nikita-3489", delta <= length_by_coord_node40(coord)); -+ off = ih40_get_offset(ih) + length_by_coord_node40(coord); -+ end = zdata(node) + off; -+ -+ /* remove gap made up by removal */ -+ memmove(end - delta, end, nh40_get_free_space_start(nh) - off); -+ -+ /* update item headers of moved items - change their locations */ -+ pos = coord->item_pos + 1; -+ ih = node40_ih_at(node, pos); -+ for (; pos < nr_items; pos++, ih--) { -+ assert("nikita-3490", ih == node40_ih_at(node, pos)); -+ ih40_set_offset(ih, ih40_get_offset(ih) - delta); -+ } -+ -+ /* free space start moved to left */ -+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta); -+ /* total amount of free space increased */ -+ nh40_set_free_space(nh, nh40_get_free_space(nh) + delta); -+ /* -+ * This method does _not_ changes number of items. Hence, it cannot -+ * make node empty. Also it doesn't remove items at all, which means -+ * that no keys have to be updated either. -+ */ -+ return 0; -+} -+ -+/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. 
There are 2 types -+ of cut. First is when a unit is removed from the middle of an item. In this case this function returns 1. All the -+ rest fits into second case: 0 or 1 of items getting tail cut, 0 or more items removed completely and 0 or 1 item -+ getting head cut. Function returns 0 in this case */ -+static int -+parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params) -+{ -+ reiser4_key left_key, right_key; -+ reiser4_key min_from_key, max_to_key; -+ const reiser4_key *from_key, *to_key; -+ -+ init_cinfo(cinfo); -+ -+ /* calculate minimal key stored in first item of items to be cut (params->from) */ -+ item_key_by_coord(params->from, &min_from_key); -+ /* and max key stored in last item of items to be cut (params->to) */ -+ max_item_key_by_coord(params->to, &max_to_key); -+ -+ /* if cut key range is not defined in input parameters - define it using cut coord range */ -+ if (params->from_key == NULL) { -+ assert("vs-1513", params->to_key == NULL); -+ unit_key_by_coord(params->from, &left_key); -+ from_key = &left_key; -+ max_unit_key_by_coord(params->to, &right_key); -+ to_key = &right_key; -+ } else { -+ from_key = params->from_key; -+ to_key = params->to_key; -+ } -+ -+ if (params->from->item_pos == params->to->item_pos) { -+ if (keylt(&min_from_key, from_key) -+ && keylt(to_key, &max_to_key)) -+ return 1; -+ -+ if (keygt(from_key, &min_from_key)) { -+ /* tail of item is to be cut cut */ -+ cinfo->tail_removed = params->from->item_pos; -+ cinfo->mode |= CMODE_TAIL; -+ } else if (keylt(to_key, &max_to_key)) { -+ /* head of item is to be cut */ -+ cinfo->head_removed = params->from->item_pos; -+ cinfo->mode |= CMODE_HEAD; -+ } else { -+ /* item is removed completely */ -+ cinfo->first_removed = params->from->item_pos; -+ cinfo->removed_count = 1; -+ cinfo->mode |= CMODE_WHOLE; -+ } -+ } else { -+ cinfo->first_removed = params->from->item_pos + 1; -+ cinfo->removed_count = -+ params->to->item_pos - params->from->item_pos - 1; -+ -+ if 
(keygt(from_key, &min_from_key)) { -+ /* first item is not cut completely */ -+ cinfo->tail_removed = params->from->item_pos; -+ cinfo->mode |= CMODE_TAIL; -+ } else { -+ cinfo->first_removed--; -+ cinfo->removed_count++; -+ } -+ if (keylt(to_key, &max_to_key)) { -+ /* last item is not cut completely */ -+ cinfo->head_removed = params->to->item_pos; -+ cinfo->mode |= CMODE_HEAD; -+ } else { -+ cinfo->removed_count++; -+ } -+ if (cinfo->removed_count) -+ cinfo->mode |= CMODE_WHOLE; -+ } -+ -+ return 0; -+} -+ -+static void -+call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count, -+ carry_kill_data * kdata) -+{ -+ coord_t coord; -+ item_plugin *iplug; -+ pos_in_node_t pos; -+ -+ coord.node = node; -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ for (pos = 0; pos < count; pos++) { -+ coord_set_item_pos(&coord, from + pos); -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ iplug = item_plugin_by_coord(&coord); -+ if (iplug->b.kill_hook) { -+ iplug->b.kill_hook(&coord, 0, coord_num_units(&coord), -+ kdata); -+ } -+ } -+} -+ -+/* this is used to kill item partially */ -+static pos_in_node_t -+kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data, -+ reiser4_key * smallest_removed, reiser4_key * new_first_key) -+{ -+ struct carry_kill_data *kdata; -+ item_plugin *iplug; -+ -+ kdata = data; -+ iplug = item_plugin_by_coord(coord); -+ -+ assert("vs-1524", iplug->b.kill_units); -+ return iplug->b.kill_units(coord, from, to, kdata, smallest_removed, -+ new_first_key); -+} -+ -+/* call item plugin to cut tail of file */ -+static pos_in_node_t -+kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed) -+{ -+ struct carry_kill_data *kdata; -+ pos_in_node_t to; -+ -+ kdata = data; -+ to = coord_last_unit_pos(coord); -+ return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed, -+ NULL); -+} -+ -+/* call item plugin to cut head of item */ -+static pos_in_node_t -+kill_head(coord_t * coord, void *data, 
reiser4_key * smallest_removed, -+ reiser4_key * new_first_key) -+{ -+ return kill_units(coord, 0, coord->unit_pos, data, smallest_removed, -+ new_first_key); -+} -+ -+/* this is used to cut item partially */ -+static pos_in_node_t -+cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data, -+ reiser4_key * smallest_removed, reiser4_key * new_first_key) -+{ -+ carry_cut_data *cdata; -+ item_plugin *iplug; -+ -+ cdata = data; -+ iplug = item_plugin_by_coord(coord); -+ assert("vs-302", iplug->b.cut_units); -+ return iplug->b.cut_units(coord, from, to, cdata, smallest_removed, -+ new_first_key); -+} -+ -+/* call item plugin to cut tail of file */ -+static pos_in_node_t -+cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed) -+{ -+ carry_cut_data *cdata; -+ pos_in_node_t to; -+ -+ cdata = data; -+ to = coord_last_unit_pos(cdata->params.from); -+ return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL); -+} -+ -+/* call item plugin to cut head of item */ -+static pos_in_node_t -+cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed, -+ reiser4_key * new_first_key) -+{ -+ return cut_units(coord, 0, coord->unit_pos, data, smallest_removed, -+ new_first_key); -+} -+ -+/* this returns 1 of key of first item changed, 0 - if it did not */ -+static int -+prepare_for_compact(struct cut40_info *cinfo, -+ const struct cut_kill_params *params, int is_cut, -+ void *data, carry_plugin_info * info) -+{ -+ znode *node; -+ item_header40 *ih; -+ pos_in_node_t freed; -+ pos_in_node_t item_pos; -+ coord_t coord; -+ reiser4_key new_first_key; -+ pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t, -+ void *, reiser4_key *, reiser4_key *); -+ pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *); -+ pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *, -+ reiser4_key *); -+ int retval; -+ -+ retval = 0; -+ -+ node = params->from->node; -+ -+ assert("vs-184", node == params->to->node); -+ 
assert("vs-312", !node_is_empty(node)); -+ assert("vs-297", -+ coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT); -+ -+ if (is_cut) { -+ kill_units_f = cut_units; -+ kill_tail_f = cut_tail; -+ kill_head_f = cut_head; -+ } else { -+ kill_units_f = kill_units; -+ kill_tail_f = kill_tail; -+ kill_head_f = kill_head; -+ } -+ -+ if (parse_cut(cinfo, params) == 1) { -+ /* cut from the middle of item */ -+ freed = -+ kill_units_f(params->from, params->from->unit_pos, -+ params->to->unit_pos, data, -+ params->smallest_removed, NULL); -+ -+ item_pos = params->from->item_pos; -+ ih = node40_ih_at(node, item_pos); -+ cinfo->freed_space_start = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos) - freed; -+ cinfo->freed_space_end = cinfo->freed_space_start + freed; -+ cinfo->first_moved = item_pos + 1; -+ } else { -+ assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE || -+ cinfo->first_removed != MAX_POS_IN_NODE || -+ cinfo->head_removed != MAX_POS_IN_NODE)); -+ -+ switch (cinfo->mode) { -+ case CMODE_TAIL: -+ /* one item gets cut partially from its end */ -+ assert("vs-1562", -+ cinfo->tail_removed == params->from->item_pos); -+ -+ freed = -+ kill_tail_f(params->from, data, -+ params->smallest_removed); -+ -+ item_pos = cinfo->tail_removed; -+ ih = node40_ih_at(node, item_pos); -+ cinfo->freed_space_start = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos) - -+ freed; -+ cinfo->freed_space_end = -+ cinfo->freed_space_start + freed; -+ cinfo->first_moved = cinfo->tail_removed + 1; -+ break; -+ -+ case CMODE_WHOLE: -+ /* one or more items get removed completely */ -+ assert("vs-1563", -+ cinfo->first_removed == params->from->item_pos); -+ assert("vs-1564", cinfo->removed_count > 0 -+ && cinfo->removed_count != MAX_POS_IN_NODE); -+ -+ /* call kill hook for all items removed completely */ -+ if (is_cut == 0) -+ call_kill_hooks(node, cinfo->first_removed, -+ cinfo->removed_count, data); -+ -+ item_pos = cinfo->first_removed; -+ ih = 
node40_ih_at(node, item_pos); -+ -+ if (params->smallest_removed) -+ memcpy(params->smallest_removed, &ih->key, -+ sizeof(reiser4_key)); -+ -+ cinfo->freed_space_start = ih40_get_offset(ih); -+ -+ item_pos += (cinfo->removed_count - 1); -+ ih -= (cinfo->removed_count - 1); -+ cinfo->freed_space_end = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos); -+ cinfo->first_moved = item_pos + 1; -+ if (cinfo->first_removed == 0) -+ /* key of first item of the node changes */ -+ retval = 1; -+ break; -+ -+ case CMODE_HEAD: -+ /* one item gets cut partially from its head */ -+ assert("vs-1565", -+ cinfo->head_removed == params->from->item_pos); -+ -+ freed = -+ kill_head_f(params->to, data, -+ params->smallest_removed, -+ &new_first_key); -+ -+ item_pos = cinfo->head_removed; -+ ih = node40_ih_at(node, item_pos); -+ cinfo->freed_space_start = ih40_get_offset(ih); -+ cinfo->freed_space_end = ih40_get_offset(ih) + freed; -+ cinfo->first_moved = cinfo->head_removed + 1; -+ -+ /* item head is removed, therefore, item key changed */ -+ coord.node = node; -+ coord_set_item_pos(&coord, item_pos); -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ update_item_key_node40(&coord, &new_first_key, NULL); -+ if (item_pos == 0) -+ /* key of first item of the node changes */ -+ retval = 1; -+ break; -+ -+ case CMODE_TAIL | CMODE_WHOLE: -+ /* one item gets cut from its end and one or more items get removed completely */ -+ assert("vs-1566", -+ cinfo->tail_removed == params->from->item_pos); -+ assert("vs-1567", -+ cinfo->first_removed == cinfo->tail_removed + 1); -+ assert("vs-1564", cinfo->removed_count > 0 -+ && cinfo->removed_count != MAX_POS_IN_NODE); -+ -+ freed = -+ kill_tail_f(params->from, data, -+ params->smallest_removed); -+ -+ item_pos = cinfo->tail_removed; -+ ih = node40_ih_at(node, item_pos); -+ cinfo->freed_space_start = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos) - -+ freed; -+ -+ /* call kill hook for all items removed completely */ -+ if 
(is_cut == 0) -+ call_kill_hooks(node, cinfo->first_removed, -+ cinfo->removed_count, data); -+ -+ item_pos += cinfo->removed_count; -+ ih -= cinfo->removed_count; -+ cinfo->freed_space_end = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos); -+ cinfo->first_moved = item_pos + 1; -+ break; -+ -+ case CMODE_WHOLE | CMODE_HEAD: -+ /* one or more items get removed completely and one item gets cut partially from its head */ -+ assert("vs-1568", -+ cinfo->first_removed == params->from->item_pos); -+ assert("vs-1564", cinfo->removed_count > 0 -+ && cinfo->removed_count != MAX_POS_IN_NODE); -+ assert("vs-1569", -+ cinfo->head_removed == -+ cinfo->first_removed + cinfo->removed_count); -+ -+ /* call kill hook for all items removed completely */ -+ if (is_cut == 0) -+ call_kill_hooks(node, cinfo->first_removed, -+ cinfo->removed_count, data); -+ -+ item_pos = cinfo->first_removed; -+ ih = node40_ih_at(node, item_pos); -+ -+ if (params->smallest_removed) -+ memcpy(params->smallest_removed, &ih->key, -+ sizeof(reiser4_key)); -+ -+ freed = -+ kill_head_f(params->to, data, NULL, &new_first_key); -+ -+ cinfo->freed_space_start = ih40_get_offset(ih); -+ -+ ih = node40_ih_at(node, cinfo->head_removed); -+ /* this is the most complex case. Item which got head removed and items which are to be moved -+ intact change their location differently. 
*/ -+ cinfo->freed_space_end = ih40_get_offset(ih) + freed; -+ cinfo->first_moved = cinfo->head_removed; -+ cinfo->head_removed_location = cinfo->freed_space_start; -+ -+ /* item head is removed, therefore, item key changed */ -+ coord.node = node; -+ coord_set_item_pos(&coord, cinfo->head_removed); -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ update_item_key_node40(&coord, &new_first_key, NULL); -+ -+ assert("vs-1579", cinfo->first_removed == 0); -+ /* key of first item of the node changes */ -+ retval = 1; -+ break; -+ -+ case CMODE_TAIL | CMODE_HEAD: -+ /* one item get cut from its end and its neighbor gets cut from its tail */ -+ impossible("vs-1576", "this can not happen currently"); -+ break; -+ -+ case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD: -+ impossible("vs-1577", "this can not happen currently"); -+ break; -+ default: -+ impossible("vs-1578", "unexpected cut mode"); -+ break; -+ } -+ } -+ return retval; -+} -+ -+/* plugin->u.node.kill -+ return value is number of items removed completely */ -+int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info) -+{ -+ znode *node; -+ struct cut40_info cinfo; -+ int first_key_changed; -+ -+ node = kdata->params.from->node; -+ -+ first_key_changed = -+ prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata, -+ info); -+ compact(node, &cinfo); -+ -+ if (info) { -+ /* it is not called by node40_shift, so we have to take care -+ of changes on upper levels */ -+ if (node_is_empty(node) -+ && !(kdata->flags & DELETE_RETAIN_EMPTY)) -+ /* all contents of node is deleted */ -+ prepare_removal_node40(node, info); -+ else if (first_key_changed) { -+ prepare_for_update(NULL, node, info); -+ } -+ } -+ -+ coord_clear_iplug(kdata->params.from); -+ coord_clear_iplug(kdata->params.to); -+ -+ znode_make_dirty(node); -+ return cinfo.removed_count == MAX_POS_IN_NODE ? 
0 : cinfo.removed_count; -+} -+ -+/* plugin->u.node.cut -+ return value is number of items removed completely */ -+int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info) -+{ -+ znode *node; -+ struct cut40_info cinfo; -+ int first_key_changed; -+ -+ node = cdata->params.from->node; -+ -+ first_key_changed = -+ prepare_for_compact(&cinfo, &cdata->params, 1 /* not cut */ , cdata, -+ info); -+ compact(node, &cinfo); -+ -+ if (info) { -+ /* it is not called by node40_shift, so we have to take care -+ of changes on upper levels */ -+ if (node_is_empty(node)) -+ /* all contents of node is deleted */ -+ prepare_removal_node40(node, info); -+ else if (first_key_changed) { -+ prepare_for_update(NULL, node, info); -+ } -+ } -+ -+ coord_clear_iplug(cdata->params.from); -+ coord_clear_iplug(cdata->params.to); -+ -+ znode_make_dirty(node); -+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count; -+} -+ -+/* this structure is used by shift method of node40 plugin */ -+struct shift_params { -+ shift_direction pend; /* when @pend == append - we are shifting to -+ left, when @pend == prepend - to right */ -+ coord_t wish_stop; /* when shifting to left this is last unit we -+ want shifted, when shifting to right - this -+ is set to unit we want to start shifting -+ from */ -+ znode *target; -+ int everything; /* it is set to 1 if everything we have to shift is -+ shifted, 0 - otherwise */ -+ -+ /* FIXME-VS: get rid of read_stop */ -+ -+ /* these are set by estimate_shift */ -+ coord_t real_stop; /* this will be set to last unit which will be -+ really shifted */ -+ -+ /* coordinate in source node before operation of unit which becomes -+ first after shift to left of last after shift to right */ -+ union { -+ coord_t future_first; -+ coord_t future_last; -+ } u; -+ -+ unsigned merging_units; /* number of units of first item which have to -+ be merged with last item of target node */ -+ unsigned merging_bytes; /* number of bytes in those units */ -+ 
-+	unsigned entire;	/* items shifted in their entirety */
-+	unsigned entire_bytes;	/* number of bytes in those items */
-+
-+	unsigned part_units;	/* number of units of partially copied item */
-+	unsigned part_bytes;	/* number of bytes in those units */
-+
-+	unsigned shift_bytes;	/* total number of bytes in items shifted (item
-+				   headers not included) */
-+
-+};
-+/* returns whatever the ->item_overhead() method of the node plugin of
-+   @item reports for @item->node (second argument is passed as NULL).
-+   estimate_shift() adds this value to an item length when it checks
-+   whether a whole item fits into the target node */
-+static int item_creation_overhead(coord_t *item)
-+{
-+	return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
-+}
-+
-+/* how many units are there in @source starting from source->unit_pos
-+   but not further than @stop_coord
-+
-+   @source:     item being estimated; asserted to be positioned at unit 0
-+                when @pend is SHIFT_LEFT and at its last unit otherwise
-+   @stop_coord: unit at which counting stops
-+   @pend:       direction of the shift
-+
-+   When @source and @stop_coord are in different items the whole item is
-+   wanted (last unit position + 1). Otherwise the count runs from the first
-+   unit up to @stop_coord (shift to left) or from @stop_coord up to
-+   source->unit_pos (shift to right), both ends included */
-+static int
-+wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
-+{
-+	if (pend == SHIFT_LEFT) {
-+		assert("vs-181", source->unit_pos == 0);
-+	} else {
-+		assert("vs-182",
-+		       source->unit_pos == coord_last_unit_pos(source));
-+	}
-+
-+	if (source->item_pos != stop_coord->item_pos) {
-+		/* @source and @stop_coord are different items */
-+		return coord_last_unit_pos(source) + 1;
-+	}
-+
-+	if (pend == SHIFT_LEFT) {
-+		return stop_coord->unit_pos + 1;	/* units 0 .. stop_coord->unit_pos */
-+	} else {
-+		return source->unit_pos - stop_coord->unit_pos + 1;	/* units stop_coord->unit_pos .. source->unit_pos */
-+	}
-+}
-+
-+/* this calculates what can be copied from @shift->wish_stop.node to
-+   @shift->target
-+
-+   NOTE(review): @ctx does not appear to be referenced anywhere in the body
-+   of this function - confirm before relying on it */
-+static void
-+estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
-+{
-+	unsigned target_free_space, size;
-+	pos_in_node_t stop_item;	/* item which estimating should not consider */
-+	unsigned want;	/* number of units of item we want shifted */
-+	coord_t source;	/* item being estimated */
-+	item_plugin *iplug;
-+
-+	/* shifting to left/right starts from first/last units of
-+	   @shift->wish_stop.node */
-+	if (shift->pend == SHIFT_LEFT) {
-+		coord_init_first_unit(&source, shift->wish_stop.node);
-+	} else {
-+		coord_init_last_unit(&source, shift->wish_stop.node);
-+	}
-+	shift->real_stop = source;
-+
-+	/* free space in target node and number of items in source */
-+	target_free_space =
znode_free_space(shift->target); -+ -+ shift->everything = 0; -+ if (!node_is_empty(shift->target)) { -+ /* target node is not empty, check for boundary items -+ mergeability */ -+ coord_t to; -+ -+ /* item we try to merge @source with */ -+ if (shift->pend == SHIFT_LEFT) { -+ coord_init_last_unit(&to, shift->target); -+ } else { -+ coord_init_first_unit(&to, shift->target); -+ } -+ -+ if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to, -+ &source) : -+ are_items_mergeable(&source, &to)) { -+ /* how many units of @source do we want to merge to -+ item @to */ -+ want = -+ wanted_units(&source, &shift->wish_stop, -+ shift->pend); -+ -+ /* how many units of @source we can merge to item -+ @to */ -+ iplug = item_plugin_by_coord(&source); -+ if (iplug->b.can_shift != NULL) -+ shift->merging_units = -+ iplug->b.can_shift(target_free_space, -+ &source, shift->target, -+ shift->pend, &size, -+ want); -+ else { -+ shift->merging_units = 0; -+ size = 0; -+ } -+ shift->merging_bytes = size; -+ shift->shift_bytes += size; -+ /* update stop coord to be set to last unit of @source -+ we can merge to @target */ -+ if (shift->merging_units) -+ /* at least one unit can be shifted */ -+ shift->real_stop.unit_pos = -+ (shift->merging_units - source.unit_pos - -+ 1) * shift->pend; -+ else { -+ /* nothing can be shifted */ -+ if (shift->pend == SHIFT_LEFT) -+ coord_init_before_first_item(&shift-> -+ real_stop, -+ source. 
-+ node); -+ else -+ coord_init_after_last_item(&shift-> -+ real_stop, -+ source.node); -+ } -+ assert("nikita-2081", shift->real_stop.unit_pos + 1); -+ -+ if (shift->merging_units != want) { -+ /* we could not copy as many as we want, so, -+ there is no reason for estimating any -+ longer */ -+ return; -+ } -+ -+ target_free_space -= size; -+ coord_add_item_pos(&source, shift->pend); -+ } -+ } -+ -+ /* number of item nothing of which we want to shift */ -+ stop_item = shift->wish_stop.item_pos + shift->pend; -+ -+ /* calculate how many items can be copied into given free -+ space as whole */ -+ for (; source.item_pos != stop_item; -+ coord_add_item_pos(&source, shift->pend)) { -+ if (shift->pend == SHIFT_RIGHT) -+ source.unit_pos = coord_last_unit_pos(&source); -+ -+ /* how many units of @source do we want to copy */ -+ want = wanted_units(&source, &shift->wish_stop, shift->pend); -+ -+ if (want == coord_last_unit_pos(&source) + 1) { -+ /* we want this item to be copied entirely */ -+ size = -+ item_length_by_coord(&source) + -+ item_creation_overhead(&source); -+ if (size <= target_free_space) { -+ /* item fits into target node as whole */ -+ target_free_space -= size; -+ shift->shift_bytes += -+ size - item_creation_overhead(&source); -+ shift->entire_bytes += -+ size - item_creation_overhead(&source); -+ shift->entire++; -+ -+ /* update shift->real_stop coord to be set to -+ last unit of @source we can merge to -+ @target */ -+ shift->real_stop = source; -+ if (shift->pend == SHIFT_LEFT) -+ shift->real_stop.unit_pos = -+ coord_last_unit_pos(&shift-> -+ real_stop); -+ else -+ shift->real_stop.unit_pos = 0; -+ continue; -+ } -+ } -+ -+ /* we reach here only for an item which does not fit into -+ target node in its entirety. This item may be either -+ partially shifted, or not shifted at all. We will have to -+ create new item in target node, so decrease amout of free -+ space by an item creation overhead. 
We can reach here also -+ if stop coord is in this item */ -+ if (target_free_space >= -+ (unsigned)item_creation_overhead(&source)) { -+ target_free_space -= item_creation_overhead(&source); -+ iplug = item_plugin_by_coord(&source); -+ if (iplug->b.can_shift) { -+ shift->part_units = iplug->b.can_shift(target_free_space, -+ &source, -+ NULL, /* target */ -+ shift->pend, -+ &size, -+ want); -+ } else { -+ target_free_space = 0; -+ shift->part_units = 0; -+ size = 0; -+ } -+ } else { -+ target_free_space = 0; -+ shift->part_units = 0; -+ size = 0; -+ } -+ shift->part_bytes = size; -+ shift->shift_bytes += size; -+ -+ /* set @shift->real_stop to last unit of @source we can merge -+ to @shift->target */ -+ if (shift->part_units) { -+ shift->real_stop = source; -+ shift->real_stop.unit_pos = -+ (shift->part_units - source.unit_pos - -+ 1) * shift->pend; -+ assert("nikita-2082", shift->real_stop.unit_pos + 1); -+ } -+ -+ if (want != shift->part_units) -+ /* not everything wanted were shifted */ -+ return; -+ break; -+ } -+ -+ shift->everything = 1; -+} -+ -+static void -+copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count, -+ shift_direction dir, unsigned free_space) -+{ -+ item_plugin *iplug; -+ -+ assert("nikita-1463", target != NULL); -+ assert("nikita-1464", source != NULL); -+ assert("nikita-1465", from + count <= coord_num_units(source)); -+ -+ iplug = item_plugin_by_coord(source); -+ assert("nikita-1468", iplug == item_plugin_by_coord(target)); -+ iplug->b.copy_units(target, source, from, count, dir, free_space); -+ -+ if (dir == SHIFT_RIGHT) { -+ /* FIXME-VS: this looks not necessary. 
update_item_key was -+ called already by copy_units method */ -+ reiser4_key split_key; -+ -+ assert("nikita-1469", target->unit_pos == 0); -+ -+ unit_key_by_coord(target, &split_key); -+ node_plugin_by_coord(target)->update_item_key(target, -+ &split_key, NULL); -+ } -+} -+ -+/* copy part of @shift->real_stop.node starting either from its beginning or -+ from its end and ending at @shift->real_stop to either the end or the -+ beginning of @shift->target */ -+static void copy(struct shift_params *shift) -+{ -+ node40_header *nh; -+ coord_t from; -+ coord_t to; -+ item_header40 *from_ih, *to_ih; -+ int free_space_start; -+ int new_items; -+ unsigned old_items; -+ int old_offset; -+ unsigned i; -+ -+ nh = node40_node_header(shift->target); -+ free_space_start = nh40_get_free_space_start(nh); -+ old_items = nh40_get_num_items(nh); -+ new_items = shift->entire + (shift->part_units ? 1 : 0); -+ assert("vs-185", -+ shift->shift_bytes == -+ shift->merging_bytes + shift->entire_bytes + shift->part_bytes); -+ -+ from = shift->wish_stop; -+ -+ coord_init_first_unit(&to, shift->target); -+ -+ /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty, -+ hence to.between is set to EMPTY_NODE above. Looks like we want it -+ to be AT_UNIT. -+ -+ Oh, wonders of ->betweeness... 
-+ -+ */ -+ to.between = AT_UNIT; -+ -+ if (shift->pend == SHIFT_LEFT) { -+ /* copying to left */ -+ -+ coord_set_item_pos(&from, 0); -+ from_ih = node40_ih_at(from.node, 0); -+ -+ coord_set_item_pos(&to, -+ node40_num_of_items_internal(to.node) - 1); -+ if (shift->merging_units) { -+ /* expand last item, so that plugin methods will see -+ correct data */ -+ free_space_start += shift->merging_bytes; -+ nh40_set_free_space_start(nh, -+ (unsigned)free_space_start); -+ nh40_set_free_space(nh, -+ nh40_get_free_space(nh) - -+ shift->merging_bytes); -+ -+ /* appending last item of @target */ -+ copy_units(&to, &from, 0, /* starting from 0-th unit */ -+ shift->merging_units, SHIFT_LEFT, -+ shift->merging_bytes); -+ coord_inc_item_pos(&from); -+ from_ih--; -+ coord_inc_item_pos(&to); -+ } -+ -+ to_ih = node40_ih_at(shift->target, old_items); -+ if (shift->entire) { -+ /* copy @entire items entirely */ -+ -+ /* copy item headers */ -+ memcpy(to_ih - shift->entire + 1, -+ from_ih - shift->entire + 1, -+ shift->entire * sizeof(item_header40)); -+ /* update item header offset */ -+ old_offset = ih40_get_offset(from_ih); -+ /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */ -+ for (i = 0; i < shift->entire; i++, to_ih--, from_ih--) -+ ih40_set_offset(to_ih, -+ ih40_get_offset(from_ih) - -+ old_offset + free_space_start); -+ -+ /* copy item bodies */ -+ memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */ -+ shift->entire_bytes); -+ -+ coord_add_item_pos(&from, (int)shift->entire); -+ coord_add_item_pos(&to, (int)shift->entire); -+ } -+ -+ nh40_set_free_space_start(nh, -+ free_space_start + -+ shift->shift_bytes - -+ shift->merging_bytes); -+ nh40_set_free_space(nh, -+ nh40_get_free_space(nh) - -+ (shift->shift_bytes - shift->merging_bytes + -+ sizeof(item_header40) * new_items)); -+ -+ /* update node header */ 
-+ node40_set_num_items(shift->target, nh, old_items + new_items); -+ assert("vs-170", -+ nh40_get_free_space(nh) < znode_size(shift->target)); -+ -+ if (shift->part_units) { -+ /* copy heading part (@part units) of @source item as -+ a new item into @target->node */ -+ -+ /* copy item header of partially copied item */ -+ coord_set_item_pos(&to, -+ node40_num_of_items_internal(to.node) -+ - 1); -+ memcpy(to_ih, from_ih, sizeof(item_header40)); -+ ih40_set_offset(to_ih, -+ nh40_get_free_space_start(nh) - -+ shift->part_bytes); -+ if (item_plugin_by_coord(&to)->b.init) -+ item_plugin_by_coord(&to)->b.init(&to, &from, -+ NULL); -+ copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT, -+ shift->part_bytes); -+ } -+ -+ } else { -+ /* copying to right */ -+ -+ coord_set_item_pos(&from, -+ node40_num_of_items_internal(from.node) - 1); -+ from_ih = node40_ih_at_coord(&from); -+ -+ coord_set_item_pos(&to, 0); -+ -+ /* prepare space for new items */ -+ memmove(zdata(to.node) + sizeof(node40_header) + -+ shift->shift_bytes, -+ zdata(to.node) + sizeof(node40_header), -+ free_space_start - sizeof(node40_header)); -+ /* update item headers of moved items */ -+ to_ih = node40_ih_at(to.node, 0); -+ /* first item gets @merging_bytes longer. 
free space appears -+ at its beginning */ -+ if (!node_is_empty(to.node)) -+ ih40_set_offset(to_ih, -+ ih40_get_offset(to_ih) + -+ shift->shift_bytes - -+ shift->merging_bytes); -+ -+ for (i = 1; i < old_items; i++) -+ ih40_set_offset(to_ih - i, -+ ih40_get_offset(to_ih - i) + -+ shift->shift_bytes); -+ -+ /* move item headers to make space for new items */ -+ memmove(to_ih - old_items + 1 - new_items, -+ to_ih - old_items + 1, -+ sizeof(item_header40) * old_items); -+ to_ih -= (new_items - 1); -+ -+ nh40_set_free_space_start(nh, -+ free_space_start + -+ shift->shift_bytes); -+ nh40_set_free_space(nh, -+ nh40_get_free_space(nh) - -+ (shift->shift_bytes + -+ sizeof(item_header40) * new_items)); -+ -+ /* update node header */ -+ node40_set_num_items(shift->target, nh, old_items + new_items); -+ assert("vs-170", -+ nh40_get_free_space(nh) < znode_size(shift->target)); -+ -+ if (shift->merging_units) { -+ coord_add_item_pos(&to, new_items); -+ to.unit_pos = 0; -+ to.between = AT_UNIT; -+ /* prepend first item of @to */ -+ copy_units(&to, &from, -+ coord_last_unit_pos(&from) - -+ shift->merging_units + 1, -+ shift->merging_units, SHIFT_RIGHT, -+ shift->merging_bytes); -+ coord_dec_item_pos(&from); -+ from_ih++; -+ } -+ -+ if (shift->entire) { -+ /* copy @entire items entirely */ -+ -+ /* copy item headers */ -+ memcpy(to_ih, from_ih, -+ shift->entire * sizeof(item_header40)); -+ -+ /* update item header offset */ -+ old_offset = -+ ih40_get_offset(from_ih + shift->entire - 1); -+ /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. 
*/ -+ for (i = 0; i < shift->entire; i++, to_ih++, from_ih++) -+ ih40_set_offset(to_ih, -+ ih40_get_offset(from_ih) - -+ old_offset + -+ sizeof(node40_header) + -+ shift->part_bytes); -+ /* copy item bodies */ -+ coord_add_item_pos(&from, -(int)(shift->entire - 1)); -+ memcpy(zdata(to.node) + sizeof(node40_header) + -+ shift->part_bytes, item_by_coord_node40(&from), -+ shift->entire_bytes); -+ coord_dec_item_pos(&from); -+ } -+ -+ if (shift->part_units) { -+ coord_set_item_pos(&to, 0); -+ to.unit_pos = 0; -+ to.between = AT_UNIT; -+ /* copy heading part (@part units) of @source item as -+ a new item into @target->node */ -+ -+ /* copy item header of partially copied item */ -+ memcpy(to_ih, from_ih, sizeof(item_header40)); -+ ih40_set_offset(to_ih, sizeof(node40_header)); -+ if (item_plugin_by_coord(&to)->b.init) -+ item_plugin_by_coord(&to)->b.init(&to, &from, -+ NULL); -+ copy_units(&to, &from, -+ coord_last_unit_pos(&from) - -+ shift->part_units + 1, shift->part_units, -+ SHIFT_RIGHT, shift->part_bytes); -+ } -+ } -+} -+ -+/* remove everything either before or after @fact_stop. 
Number of items -+ removed completely is returned */ -+static int delete_copied(struct shift_params *shift) -+{ -+ coord_t from; -+ coord_t to; -+ struct carry_cut_data cdata; -+ -+ if (shift->pend == SHIFT_LEFT) { -+ /* we were shifting to left, remove everything from the -+ beginning of @shift->wish_stop->node upto -+ @shift->wish_stop */ -+ coord_init_first_unit(&from, shift->real_stop.node); -+ to = shift->real_stop; -+ -+ /* store old coordinate of unit which will be first after -+ shift to left */ -+ shift->u.future_first = to; -+ coord_next_unit(&shift->u.future_first); -+ } else { -+ /* we were shifting to right, remove everything from -+ @shift->stop_coord upto to end of -+ @shift->stop_coord->node */ -+ from = shift->real_stop; -+ coord_init_last_unit(&to, from.node); -+ -+ /* store old coordinate of unit which will be last after -+ shift to right */ -+ shift->u.future_last = from; -+ coord_prev_unit(&shift->u.future_last); -+ } -+ -+ cdata.params.from = &from; -+ cdata.params.to = &to; -+ cdata.params.from_key = NULL; -+ cdata.params.to_key = NULL; -+ cdata.params.smallest_removed = NULL; -+ return cut_node40(&cdata, NULL); -+} -+ -+/* something was moved between @left and @right. Add carry operation to @info -+ list to have carry to update delimiting key between them */ -+static int -+prepare_for_update(znode * left, znode * right, carry_plugin_info * info) -+{ -+ carry_op *op; -+ carry_node *cn; -+ -+ if (info == NULL) -+ /* nowhere to send operation to. */ -+ return 0; -+ -+ if (!should_notify_parent(right)) -+ return 0; -+ -+ op = node_post_carry(info, COP_UPDATE, right, 1); -+ if (IS_ERR(op) || op == NULL) -+ return op ? 
PTR_ERR(op) : -EIO; -+ -+ if (left != NULL) { -+ carry_node *reference; -+ -+ if (info->doing) -+ reference = insert_carry_node(info->doing, -+ info->todo, left); -+ else -+ reference = op->node; -+ assert("nikita-2992", reference != NULL); -+ cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference); -+ if (IS_ERR(cn)) -+ return PTR_ERR(cn); -+ cn->parent = 1; -+ cn->node = left; -+ if (ZF_ISSET(left, JNODE_ORPHAN)) -+ cn->left_before = 1; -+ op->u.update.left = cn; -+ } else -+ op->u.update.left = NULL; -+ return 0; -+} -+ -+/* plugin->u.node.prepare_removal -+ to delete a pointer to @empty from the tree add corresponding carry -+ operation (delete) to @info list */ -+int prepare_removal_node40(znode * empty, carry_plugin_info * info) -+{ -+ carry_op *op; -+ reiser4_tree *tree; -+ -+ if (!should_notify_parent(empty)) -+ return 0; -+ /* already on a road to Styx */ -+ if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE)) -+ return 0; -+ op = node_post_carry(info, COP_DELETE, empty, 1); -+ if (IS_ERR(op) || op == NULL) -+ return RETERR(op ? 
PTR_ERR(op) : -EIO); -+ -+ op->u.delete.child = NULL; -+ op->u.delete.flags = 0; -+ -+ /* fare thee well */ -+ tree = znode_get_tree(empty); -+ read_lock_tree(tree); -+ write_lock_dk(tree); -+ znode_set_ld_key(empty, znode_get_rd_key(empty)); -+ if (znode_is_left_connected(empty) && empty->left) -+ znode_set_rd_key(empty->left, znode_get_rd_key(empty)); -+ write_unlock_dk(tree); -+ read_unlock_tree(tree); -+ -+ ZF_SET(empty, JNODE_HEARD_BANSHEE); -+ return 0; -+} -+ -+/* something were shifted from @insert_coord->node to @shift->target, update -+ @insert_coord correspondingly */ -+static void -+adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed, -+ int including_insert_coord) -+{ -+ /* item plugin was invalidated by shifting */ -+ coord_clear_iplug(insert_coord); -+ -+ if (node_is_empty(shift->wish_stop.node)) { -+ assert("vs-242", shift->everything); -+ if (including_insert_coord) { -+ if (shift->pend == SHIFT_RIGHT) { -+ /* set @insert_coord before first unit of -+ @shift->target node */ -+ coord_init_before_first_item(insert_coord, -+ shift->target); -+ } else { -+ /* set @insert_coord after last in target node */ -+ coord_init_after_last_item(insert_coord, -+ shift->target); -+ } -+ } else { -+ /* set @insert_coord inside of empty node. There is -+ only one possible coord within an empty -+ node. 
init_first_unit will set that coord */ -+ coord_init_first_unit(insert_coord, -+ shift->wish_stop.node); -+ } -+ return; -+ } -+ -+ if (shift->pend == SHIFT_RIGHT) { -+ /* there was shifting to right */ -+ if (shift->everything) { -+ /* everything wanted was shifted */ -+ if (including_insert_coord) { -+ /* @insert_coord is set before first unit of -+ @to node */ -+ coord_init_before_first_item(insert_coord, -+ shift->target); -+ insert_coord->between = BEFORE_UNIT; -+ } else { -+ /* @insert_coord is set after last unit of -+ @insert->node */ -+ coord_init_last_unit(insert_coord, -+ shift->wish_stop.node); -+ insert_coord->between = AFTER_UNIT; -+ } -+ } -+ return; -+ } -+ -+ /* there was shifting to left */ -+ if (shift->everything) { -+ /* everything wanted was shifted */ -+ if (including_insert_coord) { -+ /* @insert_coord is set after last unit in @to node */ -+ coord_init_after_last_item(insert_coord, shift->target); -+ } else { -+ /* @insert_coord is set before first unit in the same -+ node */ -+ coord_init_before_first_item(insert_coord, -+ shift->wish_stop.node); -+ } -+ return; -+ } -+ -+ /* FIXME-VS: the code below is complicated because with between == -+ AFTER_ITEM unit_pos is set to 0 */ -+ -+ if (!removed) { -+ /* no items were shifted entirely */ -+ assert("vs-195", shift->merging_units == 0 -+ || shift->part_units == 0); -+ -+ if (shift->real_stop.item_pos == insert_coord->item_pos) { -+ if (shift->merging_units) { -+ if (insert_coord->between == AFTER_UNIT) { -+ assert("nikita-1441", -+ insert_coord->unit_pos >= -+ shift->merging_units); -+ insert_coord->unit_pos -= -+ shift->merging_units; -+ } else if (insert_coord->between == BEFORE_UNIT) { -+ assert("nikita-2090", -+ insert_coord->unit_pos > -+ shift->merging_units); -+ insert_coord->unit_pos -= -+ shift->merging_units; -+ } -+ -+ assert("nikita-2083", -+ insert_coord->unit_pos + 1); -+ } else { -+ if (insert_coord->between == AFTER_UNIT) { -+ assert("nikita-1442", -+ insert_coord->unit_pos >= 
-+ shift->part_units); -+ insert_coord->unit_pos -= -+ shift->part_units; -+ } else if (insert_coord->between == BEFORE_UNIT) { -+ assert("nikita-2089", -+ insert_coord->unit_pos > -+ shift->part_units); -+ insert_coord->unit_pos -= -+ shift->part_units; -+ } -+ -+ assert("nikita-2084", -+ insert_coord->unit_pos + 1); -+ } -+ } -+ return; -+ } -+ -+ /* we shifted to left and there was no enough space for everything */ -+ switch (insert_coord->between) { -+ case AFTER_UNIT: -+ case BEFORE_UNIT: -+ if (shift->real_stop.item_pos == insert_coord->item_pos) -+ insert_coord->unit_pos -= shift->part_units; -+ case AFTER_ITEM: -+ coord_add_item_pos(insert_coord, -removed); -+ break; -+ default: -+ impossible("nikita-2087", "not ready"); -+ } -+ assert("nikita-2085", insert_coord->unit_pos + 1); -+} -+ -+static int call_shift_hooks(struct shift_params *shift) -+{ -+ unsigned i, shifted; -+ coord_t coord; -+ item_plugin *iplug; -+ -+ assert("vs-275", !node_is_empty(shift->target)); -+ -+ /* number of items shift touches */ -+ shifted = -+ shift->entire + (shift->merging_units ? 1 : 0) + -+ (shift->part_units ? 
1 : 0); -+ -+ if (shift->pend == SHIFT_LEFT) { -+ /* moved items are at the end */ -+ coord_init_last_unit(&coord, shift->target); -+ coord.unit_pos = 0; -+ -+ assert("vs-279", shift->pend == 1); -+ for (i = 0; i < shifted; i++) { -+ unsigned from, count; -+ -+ iplug = item_plugin_by_coord(&coord); -+ if (i == 0 && shift->part_units) { -+ assert("vs-277", -+ coord_num_units(&coord) == -+ shift->part_units); -+ count = shift->part_units; -+ from = 0; -+ } else if (i == shifted - 1 && shift->merging_units) { -+ count = shift->merging_units; -+ from = coord_num_units(&coord) - count; -+ } else { -+ count = coord_num_units(&coord); -+ from = 0; -+ } -+ -+ if (iplug->b.shift_hook) { -+ iplug->b.shift_hook(&coord, from, count, -+ shift->wish_stop.node); -+ } -+ coord_add_item_pos(&coord, -shift->pend); -+ } -+ } else { -+ /* moved items are at the beginning */ -+ coord_init_first_unit(&coord, shift->target); -+ -+ assert("vs-278", shift->pend == -1); -+ for (i = 0; i < shifted; i++) { -+ unsigned from, count; -+ -+ iplug = item_plugin_by_coord(&coord); -+ if (i == 0 && shift->part_units) { -+ assert("vs-277", -+ coord_num_units(&coord) == -+ shift->part_units); -+ count = coord_num_units(&coord); -+ from = 0; -+ } else if (i == shifted - 1 && shift->merging_units) { -+ count = shift->merging_units; -+ from = 0; -+ } else { -+ count = coord_num_units(&coord); -+ from = 0; -+ } -+ -+ if (iplug->b.shift_hook) { -+ iplug->b.shift_hook(&coord, from, count, -+ shift->wish_stop.node); -+ } -+ coord_add_item_pos(&coord, -shift->pend); -+ } -+ } -+ -+ return 0; -+} -+ -+/* shift to left is completed. 
Return 1 if unit @old was moved to left neighbor */ -+static int -+unit_moved_left(const struct shift_params *shift, const coord_t * old) -+{ -+ assert("vs-944", shift->real_stop.node == old->node); -+ -+ if (shift->real_stop.item_pos < old->item_pos) -+ return 0; -+ if (shift->real_stop.item_pos == old->item_pos) { -+ if (shift->real_stop.unit_pos < old->unit_pos) -+ return 0; -+ } -+ return 1; -+} -+ -+/* shift to right is completed. Return 1 if unit @old was moved to right -+ neighbor */ -+static int -+unit_moved_right(const struct shift_params *shift, const coord_t * old) -+{ -+ assert("vs-944", shift->real_stop.node == old->node); -+ -+ if (shift->real_stop.item_pos > old->item_pos) -+ return 0; -+ if (shift->real_stop.item_pos == old->item_pos) { -+ if (shift->real_stop.unit_pos > old->unit_pos) -+ return 0; -+ } -+ return 1; -+} -+ -+/* coord @old was set in node from which shift was performed. What was shifted -+ is stored in @shift. Update @old correspondingly to performed shift */ -+static coord_t *adjust_coord2(const struct shift_params *shift, -+ const coord_t * old, coord_t * new) -+{ -+ coord_clear_iplug(new); -+ new->between = old->between; -+ -+ coord_clear_iplug(new); -+ if (old->node == shift->target) { -+ if (shift->pend == SHIFT_LEFT) { -+ /* coord which is set inside of left neighbor does not -+ change during shift to left */ -+ coord_dup(new, old); -+ return new; -+ } -+ new->node = old->node; -+ coord_set_item_pos(new, -+ old->item_pos + shift->entire + -+ (shift->part_units ? 1 : 0)); -+ new->unit_pos = old->unit_pos; -+ if (old->item_pos == 0 && shift->merging_units) -+ new->unit_pos += shift->merging_units; -+ return new; -+ } -+ -+ assert("vs-977", old->node == shift->wish_stop.node); -+ if (shift->pend == SHIFT_LEFT) { -+ if (unit_moved_left(shift, old)) { -+ /* unit @old moved to left neighbor. 
Calculate its -+ coordinate there */ -+ new->node = shift->target; -+ coord_set_item_pos(new, -+ node_num_items(shift->target) - -+ shift->entire - -+ (shift->part_units ? 1 : 0) + -+ old->item_pos); -+ -+ new->unit_pos = old->unit_pos; -+ if (shift->merging_units) { -+ coord_dec_item_pos(new); -+ if (old->item_pos == 0) { -+ /* unit_pos only changes if item got -+ merged */ -+ new->unit_pos = -+ coord_num_units(new) - -+ (shift->merging_units - -+ old->unit_pos); -+ } -+ } -+ } else { -+ /* unit @old did not move to left neighbor. -+ -+ Use _nocheck, because @old is outside of its node. -+ */ -+ coord_dup_nocheck(new, old); -+ coord_add_item_pos(new, -+ -shift->u.future_first.item_pos); -+ if (new->item_pos == 0) -+ new->unit_pos -= shift->u.future_first.unit_pos; -+ } -+ } else { -+ if (unit_moved_right(shift, old)) { -+ /* unit @old moved to right neighbor */ -+ new->node = shift->target; -+ coord_set_item_pos(new, -+ old->item_pos - -+ shift->real_stop.item_pos); -+ if (new->item_pos == 0) { -+ /* unit @old might change unit pos */ -+ coord_set_item_pos(new, -+ old->unit_pos - -+ shift->real_stop.unit_pos); -+ } -+ } else { -+ /* unit @old did not move to right neighbor, therefore -+ it did not change */ -+ coord_dup(new, old); -+ } -+ } -+ coord_set_iplug(new, item_plugin_by_coord(new)); -+ return new; -+} -+ -+/* this is called when shift is completed (something of source node is copied -+ to target and deleted in source) to update all taps set in current -+ context */ -+static void update_taps(const struct shift_params *shift) -+{ -+ tap_t *tap; -+ coord_t new; -+ -+ for_all_taps(tap) { -+ /* update only taps set to nodes participating in shift */ -+ if (tap->coord->node == shift->wish_stop.node -+ || tap->coord->node == shift->target) -+ tap_to_coord(tap, -+ adjust_coord2(shift, tap->coord, &new)); -+ } -+} -+ -+#if REISER4_DEBUG -+ -+struct shift_check { -+ reiser4_key key; -+ __u16 plugin_id; -+ union { -+ __u64 bytes; -+ __u64 entries; -+ void *unused; 
-+ } u; -+}; -+ -+void *shift_check_prepare(const znode * left, const znode * right) -+{ -+ pos_in_node_t i, nr_items; -+ int mergeable; -+ struct shift_check *data; -+ item_header40 *ih; -+ -+ if (node_is_empty(left) || node_is_empty(right)) -+ mergeable = 0; -+ else { -+ coord_t l, r; -+ -+ coord_init_last_unit(&l, left); -+ coord_init_first_unit(&r, right); -+ mergeable = are_items_mergeable(&l, &r); -+ } -+ nr_items = -+ node40_num_of_items_internal(left) + -+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0); -+ data = -+ kmalloc(sizeof(struct shift_check) * nr_items, -+ reiser4_ctx_gfp_mask_get()); -+ if (data != NULL) { -+ coord_t coord; -+ pos_in_node_t item_pos; -+ -+ coord_init_first_unit(&coord, left); -+ i = 0; -+ -+ for (item_pos = 0; -+ item_pos < node40_num_of_items_internal(left); -+ item_pos++) { -+ -+ coord_set_item_pos(&coord, item_pos); -+ ih = node40_ih_at_coord(&coord); -+ -+ data[i].key = ih->key; -+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id)); -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ data[i].u.bytes = coord_num_units(&coord); -+ break; -+ case EXTENT_POINTER_ID: -+ data[i].u.bytes = -+ reiser4_extent_size(&coord, -+ coord_num_units(&coord)); -+ break; -+ case COMPOUND_DIR_ID: -+ data[i].u.entries = coord_num_units(&coord); -+ break; -+ default: -+ data[i].u.unused = NULL; -+ break; -+ } -+ i++; -+ } -+ -+ coord_init_first_unit(&coord, right); -+ -+ if (mergeable) { -+ assert("vs-1609", i != 0); -+ -+ ih = node40_ih_at_coord(&coord); -+ -+ assert("vs-1589", -+ data[i - 1].plugin_id == -+ le16_to_cpu(get_unaligned(&ih->plugin_id))); -+ switch (data[i - 1].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ data[i - 1].u.bytes += coord_num_units(&coord); -+ break; -+ case EXTENT_POINTER_ID: -+ data[i - 1].u.bytes += -+ reiser4_extent_size(&coord, -+ coord_num_units(&coord)); -+ break; -+ case COMPOUND_DIR_ID: -+ data[i - 1].u.entries += -+ coord_num_units(&coord); -+ break; -+ 
default: -+ impossible("vs-1605", "wrong mergeable item"); -+ break; -+ } -+ item_pos = 1; -+ } else -+ item_pos = 0; -+ for (; item_pos < node40_num_of_items_internal(right); -+ item_pos++) { -+ -+ assert("vs-1604", i < nr_items); -+ coord_set_item_pos(&coord, item_pos); -+ ih = node40_ih_at_coord(&coord); -+ -+ data[i].key = ih->key; -+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id)); -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ data[i].u.bytes = coord_num_units(&coord); -+ break; -+ case EXTENT_POINTER_ID: -+ data[i].u.bytes = -+ reiser4_extent_size(&coord, -+ coord_num_units(&coord)); -+ break; -+ case COMPOUND_DIR_ID: -+ data[i].u.entries = coord_num_units(&coord); -+ break; -+ default: -+ data[i].u.unused = NULL; -+ break; -+ } -+ i++; -+ } -+ assert("vs-1606", i == nr_items); -+ } -+ return data; -+} -+ -+void shift_check(void *vp, const znode * left, const znode * right) -+{ -+ pos_in_node_t i, nr_items; -+ coord_t coord; -+ __u64 last_bytes; -+ int mergeable; -+ item_header40 *ih; -+ pos_in_node_t item_pos; -+ struct shift_check *data; -+ -+ data = (struct shift_check *)vp; -+ -+ if (data == NULL) -+ return; -+ -+ if (node_is_empty(left) || node_is_empty(right)) -+ mergeable = 0; -+ else { -+ coord_t l, r; -+ -+ coord_init_last_unit(&l, left); -+ coord_init_first_unit(&r, right); -+ mergeable = are_items_mergeable(&l, &r); -+ } -+ -+ nr_items = -+ node40_num_of_items_internal(left) + -+ node40_num_of_items_internal(right) - (mergeable ? 
1 : 0); -+ -+ i = 0; -+ last_bytes = 0; -+ -+ coord_init_first_unit(&coord, left); -+ -+ for (item_pos = 0; item_pos < node40_num_of_items_internal(left); -+ item_pos++) { -+ -+ coord_set_item_pos(&coord, item_pos); -+ ih = node40_ih_at_coord(&coord); -+ -+ assert("vs-1611", i == item_pos); -+ assert("vs-1590", keyeq(&ih->key, &data[i].key)); -+ assert("vs-1591", -+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id); -+ if ((i < (node40_num_of_items_internal(left) - 1)) -+ || !mergeable) { -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ assert("vs-1592", -+ data[i].u.bytes == -+ coord_num_units(&coord)); -+ break; -+ case EXTENT_POINTER_ID: -+ assert("vs-1593", -+ data[i].u.bytes == -+ reiser4_extent_size(&coord, -+ coord_num_units -+ (&coord))); -+ break; -+ case COMPOUND_DIR_ID: -+ assert("vs-1594", -+ data[i].u.entries == -+ coord_num_units(&coord)); -+ break; -+ default: -+ break; -+ } -+ } -+ if (item_pos == (node40_num_of_items_internal(left) - 1) -+ && mergeable) { -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ last_bytes = coord_num_units(&coord); -+ break; -+ case EXTENT_POINTER_ID: -+ last_bytes = -+ reiser4_extent_size(&coord, -+ coord_num_units(&coord)); -+ break; -+ case COMPOUND_DIR_ID: -+ last_bytes = coord_num_units(&coord); -+ break; -+ default: -+ impossible("vs-1595", "wrong mergeable item"); -+ break; -+ } -+ } -+ i++; -+ } -+ -+ coord_init_first_unit(&coord, right); -+ if (mergeable) { -+ ih = node40_ih_at_coord(&coord); -+ -+ assert("vs-1589", -+ data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id))); -+ assert("vs-1608", last_bytes != 0); -+ switch (data[i - 1].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ assert("vs-1596", -+ data[i - 1].u.bytes == -+ last_bytes + coord_num_units(&coord)); -+ break; -+ -+ case EXTENT_POINTER_ID: -+ assert("vs-1597", -+ data[i - 1].u.bytes == -+ last_bytes + reiser4_extent_size(&coord, -+ coord_num_units -+ 
(&coord))); -+ break; -+ -+ case COMPOUND_DIR_ID: -+ assert("vs-1598", -+ data[i - 1].u.bytes == -+ last_bytes + coord_num_units(&coord)); -+ break; -+ default: -+ impossible("vs-1599", "wrong mergeable item"); -+ break; -+ } -+ item_pos = 1; -+ } else -+ item_pos = 0; -+ -+ for (; item_pos < node40_num_of_items_internal(right); item_pos++) { -+ -+ coord_set_item_pos(&coord, item_pos); -+ ih = node40_ih_at_coord(&coord); -+ -+ assert("vs-1612", keyeq(&ih->key, &data[i].key)); -+ assert("vs-1613", -+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id); -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ assert("vs-1600", -+ data[i].u.bytes == coord_num_units(&coord)); -+ break; -+ case EXTENT_POINTER_ID: -+ assert("vs-1601", -+ data[i].u.bytes == -+ reiser4_extent_size(&coord, -+ coord_num_units -+ (&coord))); -+ break; -+ case COMPOUND_DIR_ID: -+ assert("vs-1602", -+ data[i].u.entries == coord_num_units(&coord)); -+ break; -+ default: -+ break; -+ } -+ i++; -+ } -+ -+ assert("vs-1603", i == nr_items); -+ kfree(data); -+} -+ -+#endif -+ -+/* plugin->u.node.shift -+ look for description of this method in plugin/node/node.h */ -+int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child, /* if @from->node becomes empty - it will be -+ deleted from the tree if this is set to 1 */ -+ int including_stop_coord, carry_plugin_info * info) -+{ -+ struct shift_params shift; -+ int result; -+ znode *left, *right; -+ znode *source; -+ int target_empty; -+ -+ assert("nikita-2161", coord_check(from)); -+ -+ memset(&shift, 0, sizeof(shift)); -+ shift.pend = pend; -+ shift.wish_stop = *from; -+ shift.target = to; -+ -+ assert("nikita-1473", znode_is_write_locked(from->node)); -+ assert("nikita-1474", znode_is_write_locked(to)); -+ -+ source = from->node; -+ -+ /* set @shift.wish_stop to rightmost/leftmost unit among units we want -+ shifted */ -+ if (pend == SHIFT_LEFT) { -+ result = coord_set_to_left(&shift.wish_stop); 
-+ left = to; -+ right = from->node; -+ } else { -+ result = coord_set_to_right(&shift.wish_stop); -+ left = from->node; -+ right = to; -+ } -+ -+ if (result) { -+ /* move insertion coord even if there is nothing to move */ -+ if (including_stop_coord) { -+ /* move insertion coord (@from) */ -+ if (pend == SHIFT_LEFT) { -+ /* after last item in target node */ -+ coord_init_after_last_item(from, to); -+ } else { -+ /* before first item in target node */ -+ coord_init_before_first_item(from, to); -+ } -+ } -+ -+ if (delete_child && node_is_empty(shift.wish_stop.node)) -+ result = -+ prepare_removal_node40(shift.wish_stop.node, info); -+ else -+ result = 0; -+ /* there is nothing to shift */ -+ assert("nikita-2078", coord_check(from)); -+ return result; -+ } -+ -+ target_empty = node_is_empty(to); -+ -+ /* when first node plugin with item body compression is implemented, -+ this must be changed to call node specific plugin */ -+ -+ /* shift->stop_coord is updated to last unit which really will be -+ shifted */ -+ estimate_shift(&shift, get_current_context()); -+ if (!shift.shift_bytes) { -+ /* we could not shift anything */ -+ assert("nikita-2079", coord_check(from)); -+ return 0; -+ } -+ -+ copy(&shift); -+ -+ /* result value of this is important. It is used by adjust_coord below */ -+ result = delete_copied(&shift); -+ -+ assert("vs-1610", result >= 0); -+ assert("vs-1471", -+ ((reiser4_context *) current->journal_info)->magic == -+ context_magic); -+ -+ /* item which has been moved from one node to another might want to do -+ something on that event. 
This can be done by item's shift_hook -+ method, which will be now called for every moved items */ -+ call_shift_hooks(&shift); -+ -+ assert("vs-1472", -+ ((reiser4_context *) current->journal_info)->magic == -+ context_magic); -+ -+ update_taps(&shift); -+ -+ assert("vs-1473", -+ ((reiser4_context *) current->journal_info)->magic == -+ context_magic); -+ -+ /* adjust @from pointer in accordance with @including_stop_coord flag -+ and amount of data which was really shifted */ -+ adjust_coord(from, &shift, result, including_stop_coord); -+ -+ if (target_empty) -+ /* -+ * items were shifted into empty node. Update delimiting key. -+ */ -+ result = prepare_for_update(NULL, left, info); -+ -+ /* add update operation to @info, which is the list of operations to -+ be performed on a higher level */ -+ result = prepare_for_update(left, right, info); -+ if (!result && node_is_empty(source) && delete_child) { -+ /* all contents of @from->node is moved to @to and @from->node -+ has to be removed from the tree, so, on higher level we -+ will be removing the pointer to node @from->node */ -+ result = prepare_removal_node40(source, info); -+ } -+ assert("nikita-2080", coord_check(from)); -+ return result ? 
result : (int)shift.shift_bytes; -+} -+ -+/* plugin->u.node.fast_insert() -+ look for description of this method in plugin/node/node.h */ -+int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) -+{ -+ return 1; -+} -+ -+/* plugin->u.node.fast_paste() -+ look for description of this method in plugin/node/node.h */ -+int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) -+{ -+ return 1; -+} -+ -+/* plugin->u.node.fast_cut() -+ look for description of this method in plugin/node/node.h */ -+int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) -+{ -+ return 1; -+} -+ -+/* plugin->u.node.modify - not defined */ -+ -+/* plugin->u.node.max_item_size */ -+int max_item_size_node40(void) -+{ -+ return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) - -+ sizeof(item_header40); -+} -+ -+/* plugin->u.node.set_item_plugin */ -+int set_item_plugin_node40(coord_t *coord, item_id id) -+{ -+ item_header40 *ih; -+ -+ ih = node40_ih_at_coord(coord); -+ put_unaligned(cpu_to_le16(id), &ih->plugin_id); -+ coord->iplugid = id; -+ return 0; -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/node40.h linux-2.6.20/fs/reiser4/plugin/node/node40.h ---- linux-2.6.20.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/node/node40.h 2007-05-06 14:50:43.835018219 +0400 -@@ -0,0 +1,125 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined( __REISER4_NODE40_H__ ) -+#define __REISER4_NODE40_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+#include "node.h" -+ -+#include -+ -+/* format of node header for 40 node layouts. Keep bloat out of this struct. */ -+typedef struct node40_header { -+ /* identifier of node plugin. 
Must be located at the very beginning -+ of a node. */ -+ common_node_header common_header; /* this is 16 bits */ -+ /* number of items. Should be first element in the node header, -+ because we haven't yet finally decided whether it shouldn't go into -+ common_header. -+ */ -+/* NIKITA-FIXME-HANS: Create a macro such that if there is only one -+ * node format at compile time, and it is this one, accesses do not function dereference when -+ * accessing these fields (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */ -+ d16 nr_items; -+ /* free space in node measured in bytes */ -+ d16 free_space; -+ /* offset to start of free space in node */ -+ d16 free_space_start; -+ /* for reiser4_fsck. When information about what is a free -+ block is corrupted, and we try to recover everything even -+ if marked as freed, then old versions of data may -+ duplicate newer versions, and this field allows us to -+ restore the newer version. Also useful for when users -+ who don't have the new trashcan installed on their linux distro -+ delete the wrong files and send us desperate emails -+ offering $25 for them back. */ -+ -+ /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */ -+ d32 magic; -+ /* flushstamp is made of mk_id and write_counter. mk_id is an -+ id generated randomly at mkreiserfs time. So we can just -+ skip all nodes with different mk_id. write_counter is d64 -+ incrementing counter of writes on disk. It is used for -+ choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */ -+ -+ d32 mkfs_id; -+ d64 flush_id; -+ /* node flags to be used by fsck (reiser4ck or reiser4fsck?) 
-+ and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */ -+ d16 flags; -+ -+ /* 1 is leaf level, 2 is twig level, root is the numerically -+ largest level */ -+ d8 level; -+ -+ d8 pad; -+} PACKED node40_header; -+ -+/* item headers are not standard across all node layouts, pass -+ pos_in_node to functions instead */ -+typedef struct item_header40 { -+ /* key of item */ -+ /* 0 */ reiser4_key key; -+ /* offset from start of a node measured in 8-byte chunks */ -+ /* 24 */ d16 offset; -+ /* 26 */ d16 flags; -+ /* 28 */ d16 plugin_id; -+} PACKED item_header40; -+ -+size_t item_overhead_node40(const znode * node, flow_t * aflow); -+size_t free_space_node40(znode * node); -+node_search_result lookup_node40(znode * node, const reiser4_key * key, -+ lookup_bias bias, coord_t * coord); -+int num_of_items_node40(const znode * node); -+char *item_by_coord_node40(const coord_t * coord); -+int length_by_coord_node40(const coord_t * coord); -+item_plugin *plugin_by_coord_node40(const coord_t * coord); -+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key); -+size_t estimate_node40(znode * node); -+int check_node40(const znode * node, __u32 flags, const char **error); -+int parse_node40(znode * node); -+int init_node40(znode * node); -+#ifdef GUESS_EXISTS -+int guess_node40(const znode * node); -+#endif -+void change_item_size_node40(coord_t * coord, int by); -+int create_item_node40(coord_t * target, const reiser4_key * key, -+ reiser4_item_data * data, carry_plugin_info * info); -+void update_item_key_node40(coord_t * target, const reiser4_key * key, -+ carry_plugin_info * info); -+int kill_node40(struct carry_kill_data *, carry_plugin_info *); -+int cut_node40(struct carry_cut_data *, carry_plugin_info *); -+int shift_node40(coord_t * from, znode * to, shift_direction pend, -+ /* if @from->node becomes -+ empty - it will be deleted from -+ the tree if this is set to 1 -+ */ -+ int delete_child, int including_stop_coord, -+ 
carry_plugin_info * info); -+ -+int fast_insert_node40(const coord_t * coord); -+int fast_paste_node40(const coord_t * coord); -+int fast_cut_node40(const coord_t * coord); -+int max_item_size_node40(void); -+int prepare_removal_node40(znode * empty, carry_plugin_info * info); -+int set_item_plugin_node40(coord_t * coord, item_id id); -+int shrink_item_node40(coord_t * coord, int delta); -+ -+#if REISER4_DEBUG -+void *shift_check_prepare(const znode *left, const znode *right); -+void shift_check(void *vp, const znode *left, const znode *right); -+#endif -+ -+/* __REISER4_NODE40_H__ */ -+#endif -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/node.c linux-2.6.20/fs/reiser4/plugin/node/node.c ---- linux-2.6.20.orig/fs/reiser4/plugin/node/node.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/node/node.c 2007-05-06 14:50:43.835018219 +0400 -@@ -0,0 +1,131 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Node plugin interface. -+ -+ Description: The tree provides the abstraction of flows, which it -+ internally fragments into items which it stores in nodes. -+ -+ A key_atom is a piece of data bound to a single key. -+ -+ For reasonable space efficiency to be achieved it is often -+ necessary to store key_atoms in the nodes in the form of items, where -+ an item is a sequence of key_atoms of the same or similar type. It is -+ more space-efficient, because the item can implement (very) -+ efficient compression of key_atom's bodies using internal knowledge -+ about their semantics, and it can often avoid having a key for each -+ key_atom. Each type of item has specific operations implemented by its -+ item handler (see balance.c). 
-+ -+ Rationale: the rest of the code (specifically balancing routines) -+ accesses leaf level nodes through this interface. This way we can -+ implement various block layouts and even combine various layouts -+ within the same tree. Balancing/allocating algorithms should not -+ care about peculiarities of splitting/merging specific item types, -+ but rather should leave that to the item's item handler. -+ -+ Items, including those that provide the abstraction of flows, have -+ the property that if you move them in part or in whole to another -+ node, the balancing code invokes their is_left_mergeable() -+ item_operation to determine if they are mergeable with their new -+ neighbor in the node you have moved them to. For some items the -+ is_left_mergeable() function always returns null. -+ -+ When moving the bodies of items from one node to another: -+ -+ if a partial item is shifted to another node the balancing code invokes -+ an item handler method to handle the item splitting. -+ -+ if the balancing code needs to merge with an item in the node it -+ is shifting to, it will invoke an item handler method to handle -+ the item merging. -+ -+ if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy() -+ adjusting the item headers after the move is done using the node handler. -+*/ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "../plugin_header.h" -+#include "../item/item.h" -+#include "node.h" -+#include "../plugin.h" -+#include "../../znode.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../../reiser4.h" -+ -+/** -+ * leftmost_key_in_node - get the smallest key in node -+ * @node: -+ * @key: store result here -+ * -+ * Stores the leftmost key of @node in @key. 
-+ */ -+reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key) -+{ -+ assert("nikita-1634", node != NULL); -+ assert("nikita-1635", key != NULL); -+ -+ if (!node_is_empty(node)) { -+ coord_t first_item; -+ -+ coord_init_first_unit(&first_item, (znode *) node); -+ item_key_by_coord(&first_item, key); -+ } else -+ *key = *reiser4_max_key(); -+ return key; -+} -+ -+node_plugin node_plugins[LAST_NODE_ID] = { -+ [NODE40_ID] = { -+ .h = { -+ .type_id = REISER4_NODE_PLUGIN_TYPE, -+ .id = NODE40_ID, -+ .pops = NULL, -+ .label = "unified", -+ .desc = "unified node layout", -+ .linkage = {NULL, NULL} -+ }, -+ .item_overhead = item_overhead_node40, -+ .free_space = free_space_node40, -+ .lookup = lookup_node40, -+ .num_of_items = num_of_items_node40, -+ .item_by_coord = item_by_coord_node40, -+ .length_by_coord = length_by_coord_node40, -+ .plugin_by_coord = plugin_by_coord_node40, -+ .key_at = key_at_node40, -+ .estimate = estimate_node40, -+ .check = check_node40, -+ .parse = parse_node40, -+ .init = init_node40, -+#ifdef GUESS_EXISTS -+ .guess = guess_node40, -+#endif -+ .change_item_size = change_item_size_node40, -+ .create_item = create_item_node40, -+ .update_item_key = update_item_key_node40, -+ .cut_and_kill = kill_node40, -+ .cut = cut_node40, -+ .shift = shift_node40, -+ .shrink_item = shrink_item_node40, -+ .fast_insert = fast_insert_node40, -+ .fast_paste = fast_paste_node40, -+ .fast_cut = fast_cut_node40, -+ .max_item_size = max_item_size_node40, -+ .prepare_removal = prepare_removal_node40, -+ .set_item_plugin = set_item_plugin_node40 -+ } -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/node/node.h linux-2.6.20/fs/reiser4/plugin/node/node.h ---- linux-2.6.20.orig/fs/reiser4/plugin/node/node.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/node/node.h 
2007-05-06 14:50:43.835018219 +0400 -@@ -0,0 +1,272 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* We need a definition of the default node layout here. */ -+ -+/* Generally speaking, it is best to have free space in the middle of the -+ node so that two sets of things can grow towards it, and to have the -+ item bodies on the left so that the last one of them grows into free -+ space. We optimize for the case where we append new items to the end -+ of the node, or grow the last item, because it hurts nothing to so -+ optimize and it is a common special case to do massive insertions in -+ increasing key order (and one of the cases more likely to have a real user -+ notice the delay time for). -+ -+ formatted leaf default layout: (leaf1) -+ -+ |node header:item bodies:free space:key + pluginid + item offset| -+ -+ We grow towards the middle, optimizing layout for the case where we -+ append new items to the end of the node. The node header is fixed -+ length. Keys, and item offsets plus pluginids for the items -+ corresponding to them are in increasing key order, and are fixed -+ length. Item offsets are relative to start of node (16 bits creating -+ a node size limit of 64k, 12 bits might be a better choice....). Item -+ bodies are in decreasing key order. Item bodies have a variable size. -+ There is a one to one to one mapping of keys to item offsets to item -+ bodies. Item offsets consist of pointers to the zeroth byte of the -+ item body. Item length equals the start of the next item minus the -+ start of this item, except the zeroth item whose length equals the end -+ of the node minus the start of that item (plus a byte). In other -+ words, the item length is not recorded anywhere, and it does not need -+ to be since it is computable.
-+ -+ Leaf variable length items and keys layout : (lvar) -+ -+ |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies| -+ -+ We grow towards the middle, optimizing layout for the case where we -+ append new items to the end of the node. The node header is fixed -+ length. Keys and item offsets for the items corresponding to them are -+ in increasing key order, and keys are variable length. Item offsets -+ are relative to start of node (16 bits). Item bodies are in -+ decreasing key order. Item bodies have a variable size. There is a -+ one to one to one mapping of keys to item offsets to item bodies. -+ Item offsets consist of pointers to the zeroth byte of the item body. -+ Item length equals the start of the next item's key minus the start of -+ this item, except the zeroth item whose length equals the end of the -+ node minus the start of that item (plus a byte). -+ -+ leaf compressed keys layout: (lcomp) -+ -+ |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies| -+ -+ We grow towards the middle, optimizing layout for the case where we -+ append new items to the end of the node. The node header is fixed -+ length. Keys and item offsets for the items corresponding to them are -+ in increasing key order, and keys are variable length. The "key -+ inherit" field indicates how much of the key prefix is identical to -+ the previous key (stem compression as described in "Managing -+ Gigabytes" is used). key_inherit is a one byte integer. The -+ intra-node searches performed through this layout are linear searches, -+ and this is theorized to not hurt performance much due to the high -+ cost of processor stalls on modern CPUs, and the small number of keys -+ in a single node. Item offsets are relative to start of node (16 -+ bits). Item bodies are in decreasing key order. Item bodies have a -+ variable size. There is a one to one to one mapping of keys to item -+ offsets to item bodies. 
Item offsets consist of pointers to the -+ zeroth byte of the item body. Item length equals the start of the -+ next item minus the start of this item, except the zeroth item whose -+ length equals the end of the node minus the start of that item (plus a -+ byte). In other words, item length and key length is not recorded -+ anywhere, and it does not need to be since it is computable. -+ -+ internal node default layout: (idef1) -+ -+ just like ldef1 except that item bodies are either blocknrs of -+ children or extents, and moving them may require updating parent -+ pointers in the nodes that they point to. -+*/ -+ -+/* There is an inherent 3-way tradeoff between optimizing and -+ exchanging disks between different architectures and code -+ complexity. This is optimal and simple and inexchangeable. -+ Someone else can do the code for exchanging disks and make it -+ complex. It would not be that hard. Using other than the PAGE_SIZE -+ might be suboptimal. -+*/ -+ -+#if !defined( __REISER4_NODE_H__ ) -+#define __REISER4_NODE_H__ -+ -+#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE -+ -+#include "../../dformat.h" -+#include "../plugin_header.h" -+ -+#include -+ -+typedef enum { -+ NS_FOUND = 0, -+ NS_NOT_FOUND = -ENOENT -+} node_search_result; -+ -+/* Maximal possible space overhead for creation of new item in a node */ -+#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 ) -+ -+typedef enum { -+ REISER4_NODE_DKEYS = (1 << 0), -+ REISER4_NODE_TREE_STABLE = (1 << 1) -+} reiser4_node_check_flag; -+ -+/* cut and cut_and_kill have too long list of parameters. 
This structure is just to save some space on stack */ -+struct cut_list { -+ coord_t *from; -+ coord_t *to; -+ const reiser4_key *from_key; -+ const reiser4_key *to_key; -+ reiser4_key *smallest_removed; -+ carry_plugin_info *info; -+ __u32 flags; -+ struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */ -+ lock_handle *left; -+ lock_handle *right; -+}; -+ -+struct carry_cut_data; -+struct carry_kill_data; -+ -+/* The responsibility of the node plugin is to store and give access -+ to the sequence of items within the node. */ -+typedef struct node_plugin { -+ /* generic plugin fields */ -+ plugin_header h; -+ -+ /* calculates the amount of space that will be required to store an -+ item which is in addition to the space consumed by the item body. -+ (the space consumed by the item body can be gotten by calling -+ item->estimate) */ -+ size_t(*item_overhead) (const znode * node, flow_t * f); -+ -+ /* returns free space by looking into node (i.e., without using -+ znode->free_space). */ -+ size_t(*free_space) (znode * node); -+ /* search within the node for the one item which might -+ contain the key, invoking item->search_within to search within -+ that item to see if it is in there */ -+ node_search_result(*lookup) (znode * node, const reiser4_key * key, -+ lookup_bias bias, coord_t * coord); -+ /* number of items in node */ -+ int (*num_of_items) (const znode * node); -+ -+ /* store information about item in @coord in @data */ -+ /* break into several node ops, don't add any more uses of this before doing so */ -+ /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */ -+ char *(*item_by_coord) (const coord_t * coord); -+ int (*length_by_coord) (const coord_t * coord); -+ item_plugin *(*plugin_by_coord) (const coord_t * coord); -+ -+ /* store item key in @key */ -+ reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key); -+ /* conservatively estimate whether unit of what size can fit -+ into node.
This estimation should be performed without -+ actually looking into the node's content (free space is saved in -+ znode). */ -+ size_t(*estimate) (znode * node); -+ -+ /* performs every consistency check the node plugin author could -+ imagine. Optional. */ -+ int (*check) (const znode * node, __u32 flags, const char **error); -+ -+ /* Called when node is read into memory and node plugin is -+ already detected. This should read some data into znode (like free -+ space counter) and, optionally, check data consistency. -+ */ -+ int (*parse) (znode * node); -+ /* This method is called on a new node to initialise plugin specific -+ data (header, etc.) */ -+ int (*init) (znode * node); -+ /* Check whether @node content conforms to this plugin format. -+ Probably only useful after support for old V3.x formats is added. -+ Uncomment after 4.0 only. -+ */ -+ /* int ( *guess )( const znode *node ); */ -+#if REISER4_DEBUG -+ void (*print) (const char *prefix, const znode * node, __u32 flags); -+#endif -+ /* change size of @item by @by bytes. @item->node has enough free -+ space. When @by > 0 - free space is appended to end of item. When -+ @by < 0 - item is truncated - it is assumed that last @by bytes if -+ the item are freed already */ -+ void (*change_item_size) (coord_t * item, int by); -+ -+ /* create new item @length bytes long in coord @target */ -+ int (*create_item) (coord_t * target, const reiser4_key * key, -+ reiser4_item_data * data, carry_plugin_info * info); -+ -+ /* update key of item. */ -+ void (*update_item_key) (coord_t * target, const reiser4_key * key, -+ carry_plugin_info * info); -+ -+ int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *); -+ int (*cut) (struct carry_cut_data *, carry_plugin_info *); -+ -+ /* -+ * shrink item pointed to by @coord by @delta bytes. -+ */ -+ int (*shrink_item) (coord_t * coord, int delta); -+ -+ /* copy as much as possible but not more than up to @stop from -+ @stop->node to @target. 
If (pend == append) then data from beginning of -+ @stop->node are copied to the end of @target. If (pend == prepend) then -+ data from the end of @stop->node are copied to the beginning of -+ @target. Copied data are removed from @stop->node. Information -+ about what to do on upper level is stored in @info */ -+ int (*shift) (coord_t * stop, znode * target, shift_direction pend, -+ int delete_node, int including_insert_coord, -+ carry_plugin_info * info); -+ /* return true if this node allows skipping carry() in some situations -+ (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format -+ emulation doesn't. -+ -+ This will speed up insertions that don't require updates to the -+ parent, by bypassing initialisation of carry() structures. It's -+ believed that the majority of insertions will fit there. -+ -+ */ -+ int (*fast_insert) (const coord_t * coord); -+ int (*fast_paste) (const coord_t * coord); -+ int (*fast_cut) (const coord_t * coord); -+ /* this limits max size of item which can be inserted into a node and -+ number of bytes item in a node may be appended with */ -+ int (*max_item_size) (void); -+ int (*prepare_removal) (znode * empty, carry_plugin_info * info); -+ /* change plugin id of items which are in a node already. Currently it is used in tail conversion for regular -+ * files */ -+ int (*set_item_plugin) (coord_t * coord, item_id); -+} node_plugin; -+ -+typedef enum { -+ /* standard unified node layout used for both leaf and internal -+ nodes */ -+ NODE40_ID, -+ LAST_NODE_ID -+} reiser4_node_id; -+ -+extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key); -+#if REISER4_DEBUG -+extern void print_node_content(const char *prefix, const znode * node, -+ __u32 flags); -+#endif -+ -+extern void indent_znode(const znode * node); -+ -+typedef struct common_node_header { -+ /* -+ * identifier of node plugin. Must be located at the very beginning of -+ * a node.
-+ */ -+ __le16 plugin_id; -+} common_node_header; -+ -+/* __REISER4_NODE_H__ */ -+#endif -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/object.c linux-2.6.20/fs/reiser4/plugin/object.c ---- linux-2.6.20.orig/fs/reiser4/plugin/object.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/object.c 2007-05-06 14:50:43.835018219 +0400 -@@ -0,0 +1,516 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ * Examples of object plugins: file, directory, symlink, special file. -+ * -+ * Plugins associated with inode: -+ * -+ * Plugin of inode is plugin referenced by plugin-id field of on-disk -+ * stat-data. How we store this plugin in in-core inode is not -+ * important. Currently pointers are used, another variant is to store offsets -+ * and do array lookup on each access. -+ * -+ * Now, each inode has one selected plugin: object plugin that -+ * determines what type of file this object is: directory, regular etc. -+ * -+ * This main plugin can use other plugins that are thus subordinated to -+ * it. Directory instance of object plugin uses hash; regular file -+ * instance uses tail policy plugin. -+ * -+ * Object plugin is either taken from id in stat-data or guessed from -+ * i_mode bits. Once it is established we ask it to install its -+ * subordinate plugins, by looking again in stat-data or inheriting them -+ * from parent. -+ * -+ * How new inode is initialized during ->read_inode(): -+ * 1 read stat-data and initialize inode fields: i_size, i_mode, -+ * i_generation, capabilities etc. -+ * 2 read plugin id from stat data or try to guess plugin id -+ * from inode->i_mode bits if plugin id is missing. -+ * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields. 
-+ * -+ * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What -+ * if stat data does contain i_size, etc., due to it being an unusual plugin? -+ * -+ * 4 Call ->activate() method of object's plugin. Plugin is either read from -+ * stat-data or guessed from mode bits -+ * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized -+ * plugins from parent. -+ * -+ * Easy induction proves that on last step all plugins of inode would be -+ * initialized. -+ * -+ * When creating new object: -+ * 1 obtain object plugin id (see next period) -+ * NIKITA-FIXME-HANS: period? -+ * 2 ->install() this plugin -+ * 3 ->inherit() the rest from the parent -+ * -+ * We need some examples of creating an object with default and non-default -+ * plugin ids. Nikita, please create them. -+ */ -+ -+#include "../inode.h" -+ -+static int _bugop(void) -+{ -+ BUG_ON(1); -+ return 0; -+} -+ -+#define bugop ((void *)_bugop) -+ -+static int _dummyop(void) -+{ -+ return 0; -+} -+ -+#define dummyop ((void *)_dummyop) -+ -+static int change_file(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ /* cannot change object plugin of already existing object */ -+ if (memb == PSET_FILE) -+ return RETERR(-EINVAL); -+ -+ /* Change PSET_CREATE */ -+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin); -+} -+ -+static reiser4_plugin_ops file_plugin_ops = { -+ .change = change_file -+}; -+ -+/* -+ * Definitions of object plugins.
-+ */ -+ -+file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = { -+ [UNIX_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = UNIX_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_REGULAR_FILE), -+ .pops = &file_plugin_ops, -+ .label = "reg", -+ .desc = "regular file", -+ .linkage = {NULL, NULL}, -+ }, -+ .inode_ops = { -+ .permission = reiser4_permission_common, -+ .setattr = setattr_unix_file, -+ .getattr = reiser4_getattr_common -+ }, -+ .file_ops = { -+ .llseek = generic_file_llseek, -+ .read = read_unix_file, -+ .write = write_unix_file, -+ .aio_read = generic_file_aio_read, -+ .ioctl = ioctl_unix_file, -+ .mmap = mmap_unix_file, -+ .open = open_unix_file, -+ .release = release_unix_file, -+ .fsync = sync_unix_file, -+ .sendfile = sendfile_unix_file -+ }, -+ .as_ops = { -+ .writepage = reiser4_writepage, -+ .readpage = readpage_unix_file, -+ .sync_page = block_sync_page, -+ .writepages = writepages_unix_file, -+ .set_page_dirty = reiser4_set_page_dirty, -+ .readpages = readpages_unix_file, -+ .prepare_write = prepare_write_unix_file, -+ .commit_write = commit_write_unix_file, -+ .bmap = bmap_unix_file, -+ .invalidatepage = reiser4_invalidatepage, -+ .releasepage = reiser4_releasepage -+ }, -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .flow_by_inode = flow_by_inode_unix_file, -+ .key_by_inode = key_by_inode_and_offset_common, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_common, -+ .create_object = reiser4_create_object_common, -+ .delete_object = delete_object_unix_file, -+ .add_link = reiser4_add_link_common, -+ .rem_link = reiser4_rem_link_common, -+ .owns_item = owns_item_unix_file, -+ .can_add_link = can_add_link_common, -+ .detach = dummyop, -+ .bind = dummyop, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common -+ }, -+ .init_inode_data = init_inode_data_unix_file, -+ 
.cut_tree_worker = cut_tree_worker_common, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ } -+ }, -+ [DIRECTORY_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = DIRECTORY_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_DIRECTORY_FILE), -+ .pops = &file_plugin_ops, -+ .label = "dir", -+ .desc = "directory", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = {.create = NULL}, -+ .file_ops = {.owner = NULL}, -+ .as_ops = {.writepage = NULL}, -+ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .flow_by_inode = bugop, -+ .key_by_inode = bugop, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_common_dir, -+ .create_object = reiser4_create_object_common, -+ .delete_object = reiser4_delete_dir_common, -+ .add_link = reiser4_add_link_common, -+ .rem_link = rem_link_common_dir, -+ .owns_item = owns_item_common_dir, -+ .can_add_link = can_add_link_common, -+ .can_rem_link = can_rem_link_common_dir, -+ .detach = reiser4_detach_common_dir, -+ .bind = reiser4_bind_common_dir, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common_dir, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common_dir -+ }, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ }, -+ .init_inode_data = init_inode_ordering, -+ .cut_tree_worker = cut_tree_worker_common, -+ }, -+ [SYMLINK_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = SYMLINK_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_SYMLINK_FILE), -+ .pops = &file_plugin_ops, -+ .label = "symlink", -+ .desc = "symbolic link", -+ .linkage = {NULL,NULL} -+ }, -+ .inode_ops = { -+ .readlink = generic_readlink, -+ .follow_link = reiser4_follow_link_common, -+ .permission = reiser4_permission_common, 
-+ .setattr = reiser4_setattr_common, -+ .getattr = reiser4_getattr_common -+ }, -+ /* inode->i_fop of symlink is initialized by NULL in setup_inode_ops */ -+ .file_ops = {.owner = NULL}, -+ .as_ops = {.writepage = NULL}, -+ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_common, -+ .create_object = reiser4_create_symlink, -+ .delete_object = reiser4_delete_object_common, -+ .add_link = reiser4_add_link_common, -+ .rem_link = reiser4_rem_link_common, -+ .can_add_link = can_add_link_common, -+ .detach = dummyop, -+ .bind = dummyop, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common -+ }, -+ .init_inode_data = init_inode_ordering, -+ .cut_tree_worker = cut_tree_worker_common, -+ .destroy_inode = destroy_inode_symlink, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ } -+ }, -+ [SPECIAL_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = SPECIAL_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_SPECIAL_FILE), -+ .pops = &file_plugin_ops, -+ .label = "special", -+ .desc = -+ "special: fifo, device or socket", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = { -+ .permission = reiser4_permission_common, -+ .setattr = reiser4_setattr_common, -+ .getattr = reiser4_getattr_common -+ }, -+ /* file_ops of special files (sockets, block, char, fifo) are -+ initialized by init_special_inode. 
*/ -+ .file_ops = {.owner = NULL}, -+ .as_ops = {.writepage = NULL}, -+ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_common, -+ .create_object = reiser4_create_object_common, -+ .delete_object = reiser4_delete_object_common, -+ .add_link = reiser4_add_link_common, -+ .rem_link = reiser4_rem_link_common, -+ .owns_item = owns_item_common, -+ .can_add_link = can_add_link_common, -+ .detach = dummyop, -+ .bind = dummyop, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common -+ }, -+ .init_inode_data = init_inode_ordering, -+ .cut_tree_worker = cut_tree_worker_common, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ } -+ }, -+ [CRYPTCOMPRESS_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = CRYPTCOMPRESS_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_REGULAR_FILE), -+ .pops = &file_plugin_ops, -+ .label = "cryptcompress", -+ .desc = "cryptcompress file", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = { -+ .permission = reiser4_permission_common, -+ .setattr = prot_setattr_cryptcompress, -+ .getattr = reiser4_getattr_common -+ }, -+ .file_ops = { -+ .llseek = generic_file_llseek, -+ .read = prot_read_cryptcompress, -+ .write = prot_write_cryptcompress, -+ .aio_read = generic_file_aio_read, -+ .mmap = prot_mmap_cryptcompress, -+ .release = prot_release_cryptcompress, -+ .fsync = reiser4_sync_common, -+ .sendfile = prot_sendfile_cryptcompress -+ }, -+ .as_ops = { -+ .writepage = reiser4_writepage, -+ .readpage = readpage_cryptcompress, -+ .sync_page = block_sync_page, -+ .writepages = writepages_cryptcompress, -+ .set_page_dirty = reiser4_set_page_dirty, -+ .readpages = readpages_cryptcompress, -+ .prepare_write = prepare_write_common, -+ 
.invalidatepage = reiser4_invalidatepage, -+ .releasepage = reiser4_releasepage -+ }, -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .flow_by_inode = flow_by_inode_cryptcompress, -+ .key_by_inode = key_by_inode_cryptcompress, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_cryptcompress, -+ .create_object = create_cryptcompress, -+ .open_object = open_object_cryptcompress, -+ .delete_object = delete_object_cryptcompress, -+ .add_link = reiser4_add_link_common, -+ .rem_link = reiser4_rem_link_common, -+ .owns_item = owns_item_common, -+ .can_add_link = can_add_link_common, -+ .detach = dummyop, -+ .bind = dummyop, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common -+ }, -+ .init_inode_data = init_inode_data_cryptcompress, -+ .cut_tree_worker = cut_tree_worker_cryptcompress, -+ .destroy_inode = destroy_inode_cryptcompress, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ } -+ } -+}; -+ -+static int change_dir(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ /* cannot change dir plugin of already existing object */ -+ return RETERR(-EINVAL); -+} -+ -+static reiser4_plugin_ops dir_plugin_ops = { -+ .change = change_dir -+}; -+ -+/* -+ * definition of directory plugins -+ */ -+ -+dir_plugin dir_plugins[LAST_DIR_ID] = { -+ /* standard hashed directory plugin */ -+ [HASHED_DIR_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_DIR_PLUGIN_TYPE, -+ .id = HASHED_DIR_PLUGIN_ID, -+ .pops = &dir_plugin_ops, -+ .label = "dir", -+ .desc = "hashed directory", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = { -+ .create = reiser4_create_common, -+ .lookup = reiser4_lookup_common, -+ .link = reiser4_link_common, -+ .unlink = reiser4_unlink_common, -+ .symlink = reiser4_symlink_common, -+ .mkdir = 
reiser4_mkdir_common, -+ .rmdir = reiser4_unlink_common, -+ .mknod = reiser4_mknod_common, -+ .rename = reiser4_rename_common, -+ .permission = reiser4_permission_common, -+ .setattr = reiser4_setattr_common, -+ .getattr = reiser4_getattr_common -+ }, -+ .file_ops = { -+ .llseek = reiser4_llseek_dir_common, -+ .read = generic_read_dir, -+ .readdir = reiser4_readdir_common, -+ .release = reiser4_release_dir_common, -+ .fsync = reiser4_sync_common -+ }, -+ .as_ops = { -+ .writepage = bugop, -+ .sync_page = bugop, -+ .writepages = dummyop, -+ .set_page_dirty = bugop, -+ .readpages = bugop, -+ .prepare_write = bugop, -+ .commit_write = bugop, -+ .bmap = bugop, -+ .invalidatepage = bugop, -+ .releasepage = bugop -+ }, -+ .get_parent = get_parent_common, -+ .is_name_acceptable = is_name_acceptable_common, -+ .build_entry_key = build_entry_key_hashed, -+ .build_readdir_key = build_readdir_key_common, -+ .add_entry = reiser4_add_entry_common, -+ .rem_entry = reiser4_rem_entry_common, -+ .init = reiser4_dir_init_common, -+ .done = reiser4_dir_done_common, -+ .attach = reiser4_attach_common, -+ .detach = reiser4_detach_common, -+ .estimate = { -+ .add_entry = estimate_add_entry_common, -+ .rem_entry = estimate_rem_entry_common, -+ .unlink = dir_estimate_unlink_common -+ } -+ }, -+ /* hashed directory for which seekdir/telldir are guaranteed to -+ * work. Brain-damage. 
*/ -+ [SEEKABLE_HASHED_DIR_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_DIR_PLUGIN_TYPE, -+ .id = SEEKABLE_HASHED_DIR_PLUGIN_ID, -+ .pops = &dir_plugin_ops, -+ .label = "dir32", -+ .desc = "directory hashed with 31 bit hash", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = { -+ .create = reiser4_create_common, -+ .lookup = reiser4_lookup_common, -+ .link = reiser4_link_common, -+ .unlink = reiser4_unlink_common, -+ .symlink = reiser4_symlink_common, -+ .mkdir = reiser4_mkdir_common, -+ .rmdir = reiser4_unlink_common, -+ .mknod = reiser4_mknod_common, -+ .rename = reiser4_rename_common, -+ .permission = reiser4_permission_common, -+ .setattr = reiser4_setattr_common, -+ .getattr = reiser4_getattr_common -+ }, -+ .file_ops = { -+ .llseek = reiser4_llseek_dir_common, -+ .read = generic_read_dir, -+ .readdir = reiser4_readdir_common, -+ .release = reiser4_release_dir_common, -+ .fsync = reiser4_sync_common -+ }, -+ .as_ops = { -+ .writepage = bugop, -+ .sync_page = bugop, -+ .writepages = dummyop, -+ .set_page_dirty = bugop, -+ .readpages = bugop, -+ .prepare_write = bugop, -+ .commit_write = bugop, -+ .bmap = bugop, -+ .invalidatepage = bugop, -+ .releasepage = bugop -+ }, -+ .get_parent = get_parent_common, -+ .is_name_acceptable = is_name_acceptable_common, -+ .build_entry_key = build_entry_key_seekable, -+ .build_readdir_key = build_readdir_key_common, -+ .add_entry = reiser4_add_entry_common, -+ .rem_entry = reiser4_rem_entry_common, -+ .init = reiser4_dir_init_common, -+ .done = reiser4_dir_done_common, -+ .attach = reiser4_attach_common, -+ .detach = reiser4_detach_common, -+ .estimate = { -+ .add_entry = estimate_add_entry_common, -+ .rem_entry = estimate_rem_entry_common, -+ .unlink = dir_estimate_unlink_common -+ } -+ } -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/object.h linux-2.6.20/fs/reiser4/plugin/object.h ---- linux-2.6.20.orig/fs/reiser4/plugin/object.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/object.h 2007-05-06 14:50:43.839019469 +0400 -@@ -0,0 +1,121 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Declaration of object plugin functions. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ ) -+#define __FS_REISER4_PLUGIN_OBJECT_H__ -+ -+#include "../type_safe_hash.h" -+ -+/* common implementations of inode operations */ -+int reiser4_create_common(struct inode *parent, struct dentry *dentry, -+ int mode, struct nameidata *); -+struct dentry * reiser4_lookup_common(struct inode *parent, -+ struct dentry *dentry, -+ struct nameidata *nameidata); -+int reiser4_link_common(struct dentry *existing, struct inode *parent, -+ struct dentry *newname); -+int reiser4_unlink_common(struct inode *parent, struct dentry *victim); -+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode); -+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry, -+ const char *linkname); -+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry, -+ int mode, dev_t rdev); -+int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name, -+ struct inode *new_dir, struct dentry *new_name); -+void *reiser4_follow_link_common(struct dentry *, struct nameidata *data); -+int reiser4_permission_common(struct inode *, int mask, -+ struct nameidata *nameidata); -+int reiser4_setattr_common(struct dentry *, struct iattr *); -+int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *, -+ struct kstat *); -+ -+/* common implementations of file operations */ -+loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int 
origin); -+int reiser4_readdir_common(struct file *, void *dirent, filldir_t); -+int reiser4_release_dir_common(struct inode *, struct file *); -+int reiser4_sync_common(struct file *, struct dentry *, int datasync); -+ -+/* common implementations of address space operations */ -+int prepare_write_common(struct file *, struct page *, unsigned from, -+ unsigned to); -+ -+/* file plugin operations: common implementations */ -+int write_sd_by_inode_common(struct inode *); -+int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *); -+int set_plug_in_inode_common(struct inode *object, struct inode *parent, -+ reiser4_object_create_data *); -+int adjust_to_parent_common(struct inode *object, struct inode *parent, -+ struct inode *root); -+int adjust_to_parent_common_dir(struct inode *object, struct inode *parent, -+ struct inode *root); -+int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent, -+ struct inode *root); -+int reiser4_create_object_common(struct inode *object, struct inode *parent, -+ reiser4_object_create_data *); -+int reiser4_delete_object_common(struct inode *); -+int reiser4_delete_dir_common(struct inode *); -+int reiser4_add_link_common(struct inode *object, struct inode *parent); -+int reiser4_rem_link_common(struct inode *object, struct inode *parent); -+int rem_link_common_dir(struct inode *object, struct inode *parent); -+int owns_item_common(const struct inode *, const coord_t *); -+int owns_item_common_dir(const struct inode *, const coord_t *); -+int can_add_link_common(const struct inode *); -+int can_rem_link_common_dir(const struct inode *); -+int reiser4_detach_common_dir(struct inode *child, struct inode *parent); -+int reiser4_bind_common_dir(struct inode *child, struct inode *parent); -+int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value); -+reiser4_block_nr estimate_create_common(const struct inode *); -+reiser4_block_nr estimate_create_common_dir(const struct inode *); 
-+reiser4_block_nr estimate_update_common(const struct inode *); -+reiser4_block_nr estimate_unlink_common(const struct inode *, -+ const struct inode *); -+reiser4_block_nr estimate_unlink_common_dir(const struct inode *, -+ const struct inode *); -+char *wire_write_common(struct inode *, char *start); -+char *wire_read_common(char *addr, reiser4_object_on_wire *); -+struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *); -+int wire_size_common(struct inode *); -+void wire_done_common(reiser4_object_on_wire *); -+ -+/* dir plugin operations: common implementations */ -+struct dentry *get_parent_common(struct inode *child); -+int is_name_acceptable_common(const struct inode *, const char *name, int len); -+void build_entry_key_common(const struct inode *, -+ const struct qstr *qname, reiser4_key *); -+int build_readdir_key_common(struct file *dir, reiser4_key *); -+int reiser4_add_entry_common(struct inode *object, struct dentry *where, -+ reiser4_object_create_data *, reiser4_dir_entry_desc *); -+int reiser4_rem_entry_common(struct inode *object, struct dentry *where, -+ reiser4_dir_entry_desc *); -+int reiser4_dir_init_common(struct inode *object, struct inode *parent, -+ reiser4_object_create_data *); -+int reiser4_dir_done_common(struct inode *); -+int reiser4_attach_common(struct inode *child, struct inode *parent); -+int reiser4_detach_common(struct inode *object, struct inode *parent); -+reiser4_block_nr estimate_add_entry_common(const struct inode *); -+reiser4_block_nr estimate_rem_entry_common(const struct inode *); -+reiser4_block_nr dir_estimate_unlink_common(const struct inode *, -+ const struct inode *); -+ -+/* these are essential parts of common implementations, they are to make -+ customized implementations easier */ -+int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to); -+ -+/* merely useful functions */ -+int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *, -+ const 
reiser4_key *, int silent); -+ -+/* __FS_REISER4_PLUGIN_OBJECT_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin.c linux-2.6.20/fs/reiser4/plugin/plugin.c ---- linux-2.6.20.orig/fs/reiser4/plugin/plugin.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/plugin.c 2007-05-06 14:50:43.839019469 +0400 -@@ -0,0 +1,578 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Basic plugin infrastructure, lookup etc. */ -+ -+/* PLUGINS: -+ -+ Plugins are internal Reiser4 "modules" or "objects" used to increase -+ extensibility and allow external users to easily adapt reiser4 to -+ their needs. -+ -+ Plugins are classified into several disjoint "types". Plugins -+ belonging to the particular plugin type are termed "instances" of -+ this type. Currently the following types are present: -+ -+ . object plugin -+ . hash plugin -+ . tail plugin -+ . perm plugin -+ . item plugin -+ . node layout plugin -+ -+NIKITA-FIXME-HANS: update this list, and review this entire comment for currency -+ -+ Object (file) plugin determines how given file-system object serves -+ standard VFS requests for read, write, seek, mmap etc. Instances of -+ file plugins are: regular file, directory, symlink. Another example -+ of file plugin is audit plugin, that optionally records accesses to -+ underlying object and forwards requests to it. -+ -+ Hash plugins compute hashes used by reiser4 to store and locate -+ files within directories. Instances of hash plugin type are: r5, -+ tea, rupasov. -+ -+ Tail plugins (or, more precisely, tail policy plugins) determine -+ when last part of the file should be stored in a formatted item. -+ -+ Perm plugins control permissions granted for a process accessing a file. 
-+ -+ Scope and lookup: -+ -+ label such that pair ( type_label, plugin_label ) is unique. This -+ pair is a globally persistent and user-visible plugin -+ identifier. Internally kernel maintains plugins and plugin types in -+ arrays using an index into those arrays as plugin and plugin type -+ identifiers. File-system in turn, also maintains persistent -+ "dictionary" which is mapping from plugin label to numerical -+ identifier which is stored in file-system objects. That is, we -+ store the offset into the plugin array for that plugin type as the -+ plugin id in the stat data of the filesystem object. -+ -+ plugin_labels have meaning for the user interface that assigns -+ plugins to files, and may someday have meaning for dynamic loading of -+ plugins and for copying of plugins from one fs instance to -+ another by utilities like cp and tar. -+ -+ Internal kernel plugin type identifier (index in plugins[] array) is -+ of type reiser4_plugin_type. Set of available plugin types is -+ currently static, but dynamic loading doesn't seem to pose -+ insurmountable problems. -+ -+ Within each type plugins are addressed by the identifiers of type -+ reiser4_plugin_id (indices in -+ reiser4_plugin_type_data.builtin[]). Such identifiers are only -+ required to be unique within one type, not globally. -+ -+ Thus, plugin in memory is uniquely identified by the pair (type_id, -+ id). -+ -+ Usage: -+ -+ There exists only one instance of each plugin instance, but this -+ single instance can be associated with many entities (file-system -+ objects, items, nodes, transactions, file-descriptors etc.). Entity -+ to which plugin of given type is termed (due to the lack of -+ imagination) "subject" of this plugin type and, by abuse of -+ terminology, subject of particular instance of this type to which -+ it's attached currently. For example, inode is subject of object -+ plugin type. 
Inode representing directory is subject of directory -+ plugin, hash plugin type and some particular instance of hash plugin -+ type. Inode, representing regular file is subject of "regular file" -+ plugin, tail-policy plugin type etc. -+ -+ With each subject the plugin possibly stores some state. For example, -+ the state of a directory plugin (instance of object plugin type) is pointer -+ to hash plugin (if directories always use hashing that is). State of -+ audit plugin is file descriptor (struct file) of log file or some -+ magic value to do logging through printk(). -+ -+ Interface: -+ -+ In addition to a scalar identifier, each plugin type and plugin -+ proper has a "label": short string and a "description"---longer -+ descriptive string. Labels and descriptions of plugin types are -+ hard-coded into plugins[] array, declared and defined in -+ plugin.c. Label and description of plugin are stored in .label and -+ .desc fields of reiser4_plugin_header respectively. It's possible to -+ locate plugin by the pair of labels. -+ -+ Features: -+ -+ . user-level plugin manipulations: -+ + reiser4("filename/..file_plugin<='audit'"); -+ + write(open("filename/..file_plugin"), "audit", 8); -+ -+ . user level utilities lsplug and chplug to manipulate plugins. -+ Utilities are not of primary priority. Possibly they will be not -+ working on v4.0 -+ -+NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount option, do you agree? I don't think that specifying it at mount time, and then changing it with each mount, is a good model for usage. -+ -+ . mount option "plug" to set-up plugins of root-directory. -+ "plug=foo:bar" will set "bar" as default plugin of type "foo". -+ -+ Limitations: -+ -+ . each plugin type has to provide at least one builtin -+ plugin. This is technical limitation and it can be lifted in the -+ future. 
-+ -+ TODO: -+ -+ New plugin types/plugings: -+ Things we should be able to separately choose to inherit: -+ -+ security plugins -+ -+ stat data -+ -+ file bodies -+ -+ file plugins -+ -+ dir plugins -+ -+ . perm:acl -+ -+ d audi---audit plugin intercepting and possibly logging all -+ accesses to object. Requires to put stub functions in file_operations -+ in stead of generic_file_*. -+ -+NIKITA-FIXME-HANS: why make overflows a plugin? -+ . over---handle hash overflows -+ -+ . sqnt---handle different access patterns and instruments read-ahead -+ -+NIKITA-FIXME-HANS: describe the line below in more detail. -+ -+ . hier---handle inheritance of plugins along file-system hierarchy -+ -+ Different kinds of inheritance: on creation vs. on access. -+ Compatible/incompatible plugins. -+ Inheritance for multi-linked files. -+ Layered plugins. -+ Notion of plugin context is abandoned. -+ -+Each file is associated -+ with one plugin and dependant plugins (hash, etc.) are stored as -+ main plugin state. Now, if we have plugins used for regular files -+ but not for directories, how such plugins would be inherited? -+ . always store them with directories also -+ -+NIKTIA-FIXME-HANS: Do the line above. It is not exclusive of doing the line below which is also useful. -+ -+ . use inheritance hierarchy, independent of file-system namespace -+ -+*/ -+ -+#include "../debug.h" -+#include "../dformat.h" -+#include "plugin_header.h" -+#include "item/static_stat.h" -+#include "node/node.h" -+#include "security/perm.h" -+#include "space/space_allocator.h" -+#include "disk_format/disk_format.h" -+#include "plugin.h" -+#include "../reiser4.h" -+#include "../jnode.h" -+#include "../inode.h" -+ -+#include /* for struct super_block */ -+ -+/* public interface */ -+ -+/* initialise plugin sub-system. Just call this once on reiser4 startup. 
*/ -+int init_plugins(void); -+int setup_plugins(struct super_block *super, reiser4_plugin ** area); -+int locate_plugin(struct inode *inode, plugin_locator * loc); -+ -+/** -+ * init_plugins - initialize plugins -+ * -+ * Initializes plugin sub-system. It is part of reiser4 module -+ * initialization. For each plugin of each type init method is called and each -+ * plugin is put into list of plugins. -+ */ -+int init_plugins(void) -+{ -+ reiser4_plugin_type type_id; -+ -+ for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) { -+ reiser4_plugin_type_data *ptype; -+ int i; -+ -+ ptype = &plugins[type_id]; -+ assert("nikita-3508", ptype->label != NULL); -+ assert("nikita-3509", ptype->type_id == type_id); -+ -+ INIT_LIST_HEAD(&ptype->plugins_list); -+/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */ -+ for (i = 0; i < ptype->builtin_num; ++i) { -+ reiser4_plugin *plugin; -+ -+ plugin = plugin_at(ptype, i); -+ -+ if (plugin->h.label == NULL) -+ /* uninitialized slot encountered */ -+ continue; -+ assert("nikita-3445", plugin->h.type_id == type_id); -+ plugin->h.id = i; -+ if (plugin->h.pops != NULL && -+ plugin->h.pops->init != NULL) { -+ int result; -+ -+ result = plugin->h.pops->init(plugin); -+ if (result != 0) -+ return result; -+ } -+ INIT_LIST_HEAD(&plugin->h.linkage); -+ list_add_tail(&plugin->h.linkage, &ptype->plugins_list); -+ } -+ } -+ return 0; -+} -+ -+/* true if plugin type id is valid */ -+int is_plugin_type_valid(reiser4_plugin_type type) -+{ -+ /* "type" is unsigned, so no comparison with 0 is -+ necessary */ -+ return (type < REISER4_PLUGIN_TYPES); -+} -+ -+/* true if plugin id is valid */ -+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id) -+{ -+ assert("nikita-1653", is_plugin_type_valid(type)); -+ return id < plugins[type].builtin_num; -+} -+ -+/* return plugin by its @type and @id. -+ -+ Both arguments are checked for validness: this is supposed to be called -+ from user-level. 
-+ -+NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in -+user space, and passed to the filesystem by use of method files? Your -+comment really confused me on the first reading.... -+ -+*/ -+reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type -+ * unchecked */, -+ reiser4_plugin_id id /* plugin id, -+ * unchecked */) -+{ -+ if (is_plugin_type_valid(type)) { -+ if (is_plugin_id_valid(type, id)) -+ return plugin_at(&plugins[type], id); -+ else -+ /* id out of bounds */ -+ warning("nikita-2913", -+ "Invalid plugin id: [%i:%i]", type, id); -+ } else -+ /* type_id out of bounds */ -+ warning("nikita-2914", "Invalid type_id: %i", type); -+ return NULL; -+} -+ -+/** -+ * save_plugin_id - store plugin id in disk format -+ * @plugin: plugin to convert -+ * @area: where to store result -+ * -+ * Puts id of @plugin in little endian format to address @area. -+ */ -+int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ , -+ d16 *area /* where to store result */ ) -+{ -+ assert("nikita-1261", plugin != NULL); -+ assert("nikita-1262", area != NULL); -+ -+ put_unaligned(cpu_to_le16(plugin->h.id), area); -+ return 0; -+} -+ -+/* list of all plugins of given type */ -+struct list_head *get_plugin_list(reiser4_plugin_type type) -+{ -+ assert("nikita-1056", is_plugin_type_valid(type)); -+ return &plugins[type].plugins_list; -+} -+ -+static void update_pset_mask(reiser4_inode * info, pset_member memb) -+{ -+ struct dentry *rootdir; -+ reiser4_inode *root; -+ -+ assert("edward-1443", memb != PSET_FILE); -+ -+ rootdir = inode_by_reiser4_inode(info)->i_sb->s_root; -+ if (rootdir != NULL) { -+ root = reiser4_inode_data(rootdir->d_inode); -+ /* -+ * if inode is different from the default one, or we are -+ * changing plugin of root directory, update plugin_mask -+ */ -+ if (aset_get(info->pset, memb) != -+ aset_get(root->pset, memb) || -+ info == root) -+ info->plugin_mask |= (1 << memb); -+ else -+ info->plugin_mask &= ~(1 << memb); 
-+ } -+} -+ -+/* Get specified plugin set member from parent, -+ or from fs-defaults (if no parent is given) and -+ install the result to pset of @self */ -+int grab_plugin_pset(struct inode *self, -+ struct inode *ancestor, -+ pset_member memb) -+{ -+ reiser4_plugin *plug; -+ reiser4_inode *info; -+ int result = 0; -+ -+ /* Do not grab if initialised already. */ -+ info = reiser4_inode_data(self); -+ if (aset_get(info->pset, memb) != NULL) -+ return 0; -+ if (ancestor) { -+ reiser4_inode *parent; -+ -+ parent = reiser4_inode_data(ancestor); -+ plug = aset_get(parent->hset, memb) ? : -+ aset_get(parent->pset, memb); -+ } -+ else -+ plug = get_default_plugin(memb); -+ -+ result = set_plugin(&info->pset, memb, plug); -+ if (result == 0) { -+ if (!ancestor || self->i_sb->s_root->d_inode != self) -+ update_pset_mask(info, memb); -+ } -+ return result; -+} -+ -+/* Take missing pset members from root inode */ -+int finish_pset(struct inode *inode) -+{ -+ reiser4_plugin *plug; -+ reiser4_inode *root; -+ reiser4_inode *info; -+ pset_member memb; -+ int result = 0; -+ -+ root = reiser4_inode_data(inode->i_sb->s_root->d_inode); -+ info = reiser4_inode_data(inode); -+ -+ assert("edward-1455", root != NULL); -+ assert("edward-1456", info != NULL); -+ -+ /* file and directory plugins are already initialized. */ -+ for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) { -+ -+ /* Do not grab if initialised already. 
*/ -+ if (aset_get(info->pset, memb) != NULL) -+ continue; -+ -+ plug = aset_get(root->pset, memb); -+ result = set_plugin(&info->pset, memb, plug); -+ if (result != 0) -+ break; -+ } -+ if (result != 0) { -+ warning("nikita-3447", -+ "Cannot set up plugins for %lli", -+ (unsigned long long) -+ get_inode_oid(inode)); -+ } -+ return result; -+} -+ -+int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin * plug) -+{ -+ reiser4_inode *info; -+ int result = 0; -+ -+ if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) { -+ /* Changing pset in the root object. */ -+ return RETERR(-EINVAL); -+ } -+ -+ info = reiser4_inode_data(self); -+ if (plug->h.pops != NULL && plug->h.pops->change != NULL) -+ result = plug->h.pops->change(self, plug, memb); -+ else -+ result = aset_set_unsafe(&info->pset, memb, plug); -+ if (result == 0) { -+ __u16 oldmask = info->plugin_mask; -+ -+ update_pset_mask(info, memb); -+ if (oldmask != info->plugin_mask) -+ reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN); -+ } -+ return result; -+} -+ -+reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = { -+ /* C90 initializers */ -+ [REISER4_FILE_PLUGIN_TYPE] = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .label = "file", -+ .desc = "Object plugins", -+ .builtin_num = sizeof_array(file_plugins), -+ .builtin = file_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(file_plugin) -+ }, -+ [REISER4_DIR_PLUGIN_TYPE] = { -+ .type_id = REISER4_DIR_PLUGIN_TYPE, -+ .label = "dir", -+ .desc = "Directory plugins", -+ .builtin_num = sizeof_array(dir_plugins), -+ .builtin = dir_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(dir_plugin) -+ }, -+ [REISER4_HASH_PLUGIN_TYPE] = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .label = "hash", -+ .desc = "Directory hashes", -+ .builtin_num = sizeof_array(hash_plugins), -+ .builtin = hash_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(hash_plugin) -+ }, -+ [REISER4_FIBRATION_PLUGIN_TYPE] = { -+ .type_id = 
-+ REISER4_FIBRATION_PLUGIN_TYPE, -+ .label = "fibration", -+ .desc = "Directory fibrations", -+ .builtin_num = sizeof_array(fibration_plugins), -+ .builtin = fibration_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(fibration_plugin) -+ }, -+ [REISER4_CIPHER_PLUGIN_TYPE] = { -+ .type_id = REISER4_CIPHER_PLUGIN_TYPE, -+ .label = "cipher", -+ .desc = "Cipher plugins", -+ .builtin_num = sizeof_array(cipher_plugins), -+ .builtin = cipher_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(cipher_plugin) -+ }, -+ [REISER4_DIGEST_PLUGIN_TYPE] = { -+ .type_id = REISER4_DIGEST_PLUGIN_TYPE, -+ .label = "digest", -+ .desc = "Digest plugins", -+ .builtin_num = sizeof_array(digest_plugins), -+ .builtin = digest_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(digest_plugin) -+ }, -+ [REISER4_COMPRESSION_PLUGIN_TYPE] = { -+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .label = "compression", -+ .desc = "Compression plugins", -+ .builtin_num = sizeof_array(compression_plugins), -+ .builtin = compression_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(compression_plugin) -+ }, -+ [REISER4_FORMATTING_PLUGIN_TYPE] = { -+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, -+ .label = "formatting", -+ .desc = "Tail inlining policies", -+ .builtin_num = sizeof_array(formatting_plugins), -+ .builtin = formatting_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(formatting_plugin) -+ }, -+ [REISER4_PERM_PLUGIN_TYPE] = { -+ .type_id = REISER4_PERM_PLUGIN_TYPE, -+ .label = "perm", -+ .desc = "Permission checks", -+ .builtin_num = sizeof_array(perm_plugins), -+ .builtin = perm_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(perm_plugin) -+ }, -+ [REISER4_ITEM_PLUGIN_TYPE] = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .label = "item", -+ .desc = "Item handlers", -+ .builtin_num = sizeof_array(item_plugins), -+ .builtin = item_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(item_plugin) -+ }, -+ 
[REISER4_NODE_PLUGIN_TYPE] = { -+ .type_id = REISER4_NODE_PLUGIN_TYPE, -+ .label = "node", -+ .desc = "node layout handlers", -+ .builtin_num = sizeof_array(node_plugins), -+ .builtin = node_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(node_plugin) -+ }, -+ [REISER4_SD_EXT_PLUGIN_TYPE] = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .label = "sd_ext", -+ .desc = "Parts of stat-data", -+ .builtin_num = sizeof_array(sd_ext_plugins), -+ .builtin = sd_ext_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(sd_ext_plugin) -+ }, -+ [REISER4_FORMAT_PLUGIN_TYPE] = { -+ .type_id = REISER4_FORMAT_PLUGIN_TYPE, -+ .label = "disk_layout", -+ .desc = "defines filesystem on disk layout", -+ .builtin_num = sizeof_array(format_plugins), -+ .builtin = format_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(disk_format_plugin) -+ }, -+ [REISER4_JNODE_PLUGIN_TYPE] = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .label = "jnode", -+ .desc = "defines kind of jnode", -+ .builtin_num = sizeof_array(jnode_plugins), -+ .builtin = jnode_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(jnode_plugin) -+ }, -+ [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .label = "compression_mode", -+ .desc = "Defines compression mode", -+ .builtin_num = sizeof_array(compression_mode_plugins), -+ .builtin = compression_mode_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(compression_mode_plugin) -+ }, -+ [REISER4_CLUSTER_PLUGIN_TYPE] = { -+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, -+ .label = "cluster", -+ .desc = "Defines cluster size", -+ .builtin_num = sizeof_array(cluster_plugins), -+ .builtin = cluster_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(cluster_plugin) -+ } -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN 
linux-2.6.20.orig/fs/reiser4/plugin/plugin.h linux-2.6.20/fs/reiser4/plugin/plugin.h ---- linux-2.6.20.orig/fs/reiser4/plugin/plugin.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/plugin.h 2007-05-06 14:50:43.855024468 +0400 -@@ -0,0 +1,920 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Basic plugin data-types. -+ see fs/reiser4/plugin/plugin.c for details */ -+ -+#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ ) -+#define __FS_REISER4_PLUGIN_TYPES_H__ -+ -+#include "../forward.h" -+#include "../debug.h" -+#include "../dformat.h" -+#include "../key.h" -+#include "compress/compress.h" -+#include "crypto/cipher.h" -+#include "plugin_header.h" -+#include "item/static_stat.h" -+#include "item/internal.h" -+#include "item/sde.h" -+#include "item/cde.h" -+#include "item/item.h" -+#include "node/node.h" -+#include "node/node40.h" -+#include "security/perm.h" -+#include "fibration.h" -+ -+#include "space/bitmap.h" -+#include "space/space_allocator.h" -+ -+#include "disk_format/disk_format40.h" -+#include "disk_format/disk_format.h" -+ -+#include /* for struct super_block, address_space */ -+#include /* for struct page */ -+#include /* for struct buffer_head */ -+#include /* for struct dentry */ -+#include -+#include -+ -+typedef struct reiser4_object_on_wire reiser4_object_on_wire; -+ -+/* -+ * File plugin. Defines the set of methods that file plugins implement, some -+ * of which are optional. -+ * -+ * A file plugin offers to the caller an interface for IO ( writing to and/or -+ * reading from) to what the caller sees as one sequence of bytes. 
An IO to it -+ * may affect more than one physical sequence of bytes, or no physical sequence -+ * of bytes, it may affect sequences of bytes offered by other file plugins to -+ * the semantic layer, and the file plugin may invoke other plugins and -+ * delegate work to them, but its interface is structured for offering the -+ * caller the ability to read and/or write what the caller sees as being a -+ * single sequence of bytes. -+ * -+ * The file plugin must present a sequence of bytes to the caller, but it does -+ * not necessarily have to store a sequence of bytes, it does not necessarily -+ * have to support efficient tree traversal to any offset in the sequence of -+ * bytes (tail and extent items, whose keys contain offsets, do however provide -+ * efficient non-sequential lookup of any offset in the sequence of bytes). -+ * -+ * Directory plugins provide methods for selecting file plugins by resolving a -+ * name for them. -+ * -+ * The functionality other filesystems call an attribute, and rigidly tie -+ * together, we decompose into orthogonal selectable features of files. Using -+ * the terminology we will define next, an attribute is a perhaps constrained, -+ * perhaps static length, file whose parent has a uni-count-intra-link to it, -+ * which might be grandparent-major-packed, and whose parent has a deletion -+ * method that deletes it. -+ * -+ * File plugins can implement constraints. -+ * -+ * Files can be of variable length (e.g. regular unix files), or of static -+ * length (e.g. static sized attributes). -+ * -+ * An object may have many sequences of bytes, and many file plugins, but, it -+ * has exactly one objectid. It is usually desirable that an object has a -+ * deletion method which deletes every item with that objectid. Items cannot -+ * in general be found by just their objectids. 
This means that an object must -+ * have either a method built into its deletion plugin method for knowing what -+ * items need to be deleted, or links stored with the object that provide the -+ * plugin with a method for finding those items. Deleting a file within an -+ * object may or may not have the effect of deleting the entire object, -+ * depending on the file plugin's deletion method. -+ * -+ * LINK TAXONOMY: -+ * -+ * Many objects have a reference count, and when the reference count reaches 0 -+ * the object's deletion method is invoked. Some links embody a reference -+ * count increase ("countlinks"), and others do not ("nocountlinks"). -+ * -+ * Some links are bi-directional links ("bilinks"), and some are -+ * uni-directional("unilinks"). -+ * -+ * Some links are between parts of the same object ("intralinks"), and some are -+ * between different objects ("interlinks"). -+ * -+ * PACKING TAXONOMY: -+ * -+ * Some items of an object are stored with a major packing locality based on -+ * their object's objectid (e.g. unix directory items in plan A), and these are -+ * called "self-major-packed". -+ * -+ * Some items of an object are stored with a major packing locality based on -+ * their semantic parent object's objectid (e.g. unix file bodies in plan A), -+ * and these are called "parent-major-packed". -+ * -+ * Some items of an object are stored with a major packing locality based on -+ * their semantic grandparent, and these are called "grandparent-major-packed". -+ * Now carefully notice that we run into trouble with key length if we have to -+ * store a 8 byte major+minor grandparent based packing locality, an 8 byte -+ * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in -+ * a 24 byte key. One of these fields must be sacrificed if an item is to be -+ * grandparent-major-packed, and which to sacrifice is left to the item author -+ * choosing to make the item grandparent-major-packed. 
You cannot make tail -+ * items and extent items grandparent-major-packed, though you could make them -+ * self-major-packed (usually they are parent-major-packed). -+ * -+ * In the case of ACLs (which are composed of fixed length ACEs which consist -+ * of {subject-type, subject, and permission bitmask} triples), it makes sense -+ * to not have an offset field in the ACE item key, and to allow duplicate keys -+ * for ACEs. Thus, the set of ACES for a given file is found by looking for a -+ * key consisting of the objectid of the grandparent (thus grouping all ACLs in -+ * a directory together), the minor packing locality of ACE, the objectid of -+ * the file, and 0. -+ * -+ * IO involves moving data from one location to another, which means that two -+ * locations must be specified, source and destination. -+ * -+ * This source and destination can be in the filesystem, or they can be a -+ * pointer in the user process address space plus a byte count. -+ * -+ * If both source and destination are in the filesystem, then at least one of -+ * them must be representable as a pure stream of bytes (which we call a flow, -+ * and define as a struct containing a key, a data pointer, and a length). -+ * This may mean converting one of them into a flow. We provide a generic -+ * cast_into_flow() method, which will work for any plugin supporting -+ * read_flow(), though it is inefficiently implemented in that it temporarily -+ * stores the flow in a buffer (Question: what to do with huge flows that -+ * cannot fit into memory? Answer: we must not convert them all at once. ) -+ * -+ * Performing a write requires resolving the write request into a flow defining -+ * the source, and a method that performs the write, and a key that defines -+ * where in the tree the write is to go. 
-+ * -+ * Performing a read requires resolving the read request into a flow defining -+ * the target, and a method that performs the read, and a key that defines -+ * where in the tree the read is to come from. -+ * -+ * There will exist file plugins which have no pluginid stored on the disk for -+ * them, and which are only invoked by other plugins. -+ */ -+ -+/* This should be incremented with each new contributed -+ pair (plugin type, plugin id). -+ NOTE: Make sure there is a release of reiser4progs -+ with the corresponding version number */ -+#define PLUGIN_LIBRARY_VERSION 0 -+ -+ /* enumeration of fields within plugin_set */ -+typedef enum { -+ PSET_FILE, -+ PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements: -+ * inode.c:read_inode() depends on this. */ -+ PSET_PERM, -+ PSET_FORMATTING, -+ PSET_HASH, -+ PSET_FIBRATION, -+ PSET_SD, -+ PSET_DIR_ITEM, -+ PSET_CIPHER, -+ PSET_DIGEST, -+ PSET_COMPRESSION, -+ PSET_COMPRESSION_MODE, -+ PSET_CLUSTER, -+ PSET_CREATE, -+ PSET_LAST -+} pset_member; -+ -+/* builtin file-plugins */ -+typedef enum { -+ /* regular file */ -+ UNIX_FILE_PLUGIN_ID, -+ /* directory */ -+ DIRECTORY_FILE_PLUGIN_ID, -+ /* symlink */ -+ SYMLINK_FILE_PLUGIN_ID, -+ /* for objects completely handled by the VFS: fifos, devices, -+ sockets */ -+ SPECIAL_FILE_PLUGIN_ID, -+ /* regular cryptcompress file */ -+ CRYPTCOMPRESS_FILE_PLUGIN_ID, -+ /* number of file plugins. Used as size of arrays to hold -+ file plugins. */ -+ LAST_FILE_PLUGIN_ID -+} reiser4_file_id; -+ -+typedef struct file_plugin { -+ -+ /* generic fields */ -+ plugin_header h; -+ -+ struct inode_operations inode_ops; -+ struct file_operations file_ops; -+ struct address_space_operations as_ops; -+ -+ /* save inode cached stat-data onto disk. It was called -+ reiserfs_update_sd() in 3.x */ -+ int (*write_sd_by_inode) (struct inode *); -+ -+ /* -+ * private methods: These are optional. 
If used they will allow you to -+ * minimize the amount of code needed to implement a deviation from -+ * some other method that also uses them. -+ */ -+ -+ /* -+ * Construct flow into @flow according to user-supplied data. -+ * -+ * This is used by read/write methods to construct a flow to -+ * write/read. ->flow_by_inode() is plugin method, rather than single -+ * global implementation, because key in a flow used by plugin may -+ * depend on data in a @buf. -+ * -+ * NIKITA-FIXME-HANS: please create statistics on what functions are -+ * dereferenced how often for the mongo benchmark. You can supervise -+ * Elena doing this for you if that helps. Email me the list of the -+ * top 10, with their counts, and an estimate of the total number of -+ * CPU cycles spent dereferencing as a percentage of CPU cycles spent -+ * processing (non-idle processing). If the total percent is, say, -+ * less than 1%, it will make our coding discussions much easier, and -+ * keep me from questioning whether functions like the below are too -+ * frequently called to be dereferenced. If the total percent is more -+ * than 1%, perhaps private methods should be listed in a "required" -+ * comment at the top of each plugin (with stern language about how if -+ * the comment is missing it will not be accepted by the maintainer), -+ * and implemented using macros not dereferenced functions. How about -+ * replacing this whole private methods part of the struct with a -+ * thorough documentation of what the standard helper functions are for -+ * use in constructing plugins? I think users have been asking for -+ * that, though not in so many words. -+ */ -+ int (*flow_by_inode) (struct inode *, const char __user *buf, -+ int user, loff_t size, -+ loff_t off, rw_op op, flow_t *); -+ -+ /* -+ * Return the key used to retrieve an offset of a file. 
It is used by -+ * default implementation of ->flow_by_inode() method -+ * (common_build_flow()) and, among other things, to get to the extent -+ * from jnode of unformatted node. -+ */ -+ int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *); -+ -+ /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */ -+ /* -+ * set the plugin for a file. Called during file creation in creat() -+ * but not reiser4() unless an inode already exists for the file. -+ */ -+ int (*set_plug_in_inode) (struct inode *inode, struct inode *parent, -+ reiser4_object_create_data *); -+ -+ /* NIKITA-FIXME-HANS: comment and name seem to say different things, -+ * are you setting up the object itself also or just adjusting the -+ * parent?.... */ -+ /* set up plugins for new @object created in @parent. @root is root -+ directory. */ -+ int (*adjust_to_parent) (struct inode *object, struct inode *parent, -+ struct inode *root); -+ /* -+ * this does whatever is necessary to do when object is created. For -+ * instance, for unix files stat data is inserted. It is supposed to be -+ * called by create of struct inode_operations. -+ */ -+ int (*create_object) (struct inode *object, struct inode *parent, -+ reiser4_object_create_data *); -+ -+ /* this does whatever is necessary to do when object is opened */ -+ int (*open_object) (struct inode * inode, struct file * file); -+ /* -+ * this method should check REISER4_NO_SD and set REISER4_NO_SD on -+ * success. Deletion of an object usually includes removal of items -+ * building file body (for directories this is removal of "." and "..") -+ * and removal of stat-data item. -+ */ -+ int (*delete_object) (struct inode *); -+ -+ /* add link from @parent to @object */ -+ int (*add_link) (struct inode *object, struct inode *parent); -+ -+ /* remove link from @parent to @object */ -+ int (*rem_link) (struct inode *object, struct inode *parent); -+ -+ /* -+ * return true if item addressed by @coord belongs to @inode. 
This is -+ * used by read/write to properly slice flow into items in presence of -+ * multiple key assignment policies, because items of a file are not -+ * necessarily contiguous in a key space, for example, in a plan-b. -+ */ -+ int (*owns_item) (const struct inode *, const coord_t *); -+ -+ /* checks whether yet another hard links to this object can be -+ added */ -+ int (*can_add_link) (const struct inode *); -+ -+ /* checks whether hard links to this object can be removed */ -+ int (*can_rem_link) (const struct inode *); -+ -+ /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls -+ detach of directory plugin to remove ".." */ -+ int (*detach) (struct inode * child, struct inode * parent); -+ -+ /* called when @child was just looked up in the @parent. It is not -+ empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of -+ directory plugin */ -+ int (*bind) (struct inode * child, struct inode * parent); -+ -+ /* process safe-link during mount */ -+ int (*safelink) (struct inode * object, reiser4_safe_link_t link, -+ __u64 value); -+ -+ /* The couple of estimate methods for all file operations */ -+ struct { -+ reiser4_block_nr(*create) (const struct inode *); -+ reiser4_block_nr(*update) (const struct inode *); -+ reiser4_block_nr(*unlink) (const struct inode *, -+ const struct inode *); -+ } estimate; -+ -+ /* -+ * reiser4 specific part of inode has a union of structures which are -+ * specific to a plugin. This method is called when inode is read -+ * (read_inode) and when file is created (common_create_child) so that -+ * file plugin could initialize its inode data -+ */ -+ void (*init_inode_data) (struct inode *, reiser4_object_create_data *, -+ int); -+ -+ /* -+ * This method performs progressive deletion of items and whole nodes -+ * from right to left. 
-+ * -+ * @tap: the point deletion process begins from, -+ * @from_key: the beginning of the deleted key range, -+ * @to_key: the end of the deleted key range, -+ * @smallest_removed: the smallest removed key, -+ * -+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree -+ * operation was interrupted for allowing atom commit . -+ */ -+ int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed, struct inode *, -+ int, int *); -+ -+ /* called from ->destroy_inode() */ -+ void (*destroy_inode) (struct inode *); -+ -+ /* -+ * methods to serialize object identify. This is used, for example, by -+ * reiser4_{en,de}code_fh(). -+ */ -+ struct { -+ /* store object's identity at @area */ -+ char *(*write) (struct inode * inode, char *area); -+ /* parse object from wire to the @obj */ -+ char *(*read) (char *area, reiser4_object_on_wire * obj); -+ /* given object identity in @obj, find or create its dentry */ -+ struct dentry *(*get) (struct super_block * s, -+ reiser4_object_on_wire * obj); -+ /* how many bytes ->wire.write() consumes */ -+ int (*size) (struct inode * inode); -+ /* finish with object identify */ -+ void (*done) (reiser4_object_on_wire * obj); -+ } wire; -+} file_plugin; -+ -+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID]; -+ -+struct reiser4_object_on_wire { -+ file_plugin *plugin; -+ union { -+ struct { -+ obj_key_id key_id; -+ } std; -+ void *generic; -+ } u; -+}; -+ -+/* builtin dir-plugins */ -+typedef enum { -+ HASHED_DIR_PLUGIN_ID, -+ SEEKABLE_HASHED_DIR_PLUGIN_ID, -+ LAST_DIR_ID -+} reiser4_dir_id; -+ -+typedef struct dir_plugin { -+ /* generic fields */ -+ plugin_header h; -+ -+ struct inode_operations inode_ops; -+ struct file_operations file_ops; -+ struct address_space_operations as_ops; -+ -+ /* -+ * private methods: These are optional. 
If used they will allow you to -+ * minimize the amount of code needed to implement a deviation from -+ * some other method that uses them. You could logically argue that -+ * they should be a separate type of plugin. -+ */ -+ -+ struct dentry *(*get_parent) (struct inode * childdir); -+ -+ /* -+ * check whether "name" is acceptable name to be inserted into this -+ * object. Optionally implemented by directory-like objects. Can check -+ * for maximal length, reserved symbols etc -+ */ -+ int (*is_name_acceptable) (const struct inode * inode, const char *name, -+ int len); -+ -+ void (*build_entry_key) (const struct inode * dir /* directory where -+ * entry is (or will -+ * be) in.*/ , -+ const struct qstr * name /* name of file -+ * referenced by this -+ * entry */ , -+ reiser4_key * result /* resulting key of -+ * directory entry */ ); -+ int (*build_readdir_key) (struct file * dir, reiser4_key * result); -+ int (*add_entry) (struct inode * object, struct dentry * where, -+ reiser4_object_create_data * data, -+ reiser4_dir_entry_desc * entry); -+ int (*rem_entry) (struct inode * object, struct dentry * where, -+ reiser4_dir_entry_desc * entry); -+ -+ /* -+ * initialize directory structure for newly created object. For normal -+ * unix directories, insert dot and dotdot. 
-+ */ -+ int (*init) (struct inode * object, struct inode * parent, -+ reiser4_object_create_data * data); -+ -+ /* destroy directory */ -+ int (*done) (struct inode * child); -+ -+ /* called when @subdir was just looked up in the @dir */ -+ int (*attach) (struct inode * subdir, struct inode * dir); -+ int (*detach) (struct inode * subdir, struct inode * dir); -+ -+ struct { -+ reiser4_block_nr(*add_entry) (const struct inode *); -+ reiser4_block_nr(*rem_entry) (const struct inode *); -+ reiser4_block_nr(*unlink) (const struct inode *, -+ const struct inode *); -+ } estimate; -+} dir_plugin; -+ -+extern dir_plugin dir_plugins[LAST_DIR_ID]; -+ -+typedef struct formatting_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* returns non-zero iff file's tail has to be stored -+ in a direct item. */ -+ int (*have_tail) (const struct inode * inode, loff_t size); -+} formatting_plugin; -+ -+typedef struct hash_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* computes hash of the given name */ -+ __u64(*hash) (const unsigned char *name, int len); -+} hash_plugin; -+ -+typedef struct cipher_plugin { -+ /* generic fields */ -+ plugin_header h; -+ struct crypto_blkcipher * (*alloc) (void); -+ void (*free) (struct crypto_blkcipher * tfm); -+ /* Offset translator. For each offset this returns (k * offset), where -+ k (k >= 1) is an expansion factor of the cipher algorithm. -+ For all symmetric algorithms k == 1. For asymmetric algorithms (which -+ inflate data) offset translation guarantees that all disk cluster's -+ units will have keys smaller then next cluster's one. -+ */ -+ loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src); -+ /* Cipher algorithms can accept data only by chunks of cipher block -+ size. This method is to align any flow up to cipher block size when -+ we pass it to cipher algorithm. 
To align means to append padding of -+ special format specific to the cipher algorithm */ -+ int (*align_stream) (__u8 * tail, int clust_size, int blocksize); -+ /* low-level key manager (check, install, etc..) */ -+ int (*setkey) (struct crypto_tfm * tfm, const __u8 * key, -+ unsigned int keylen); -+ /* main text processing procedures */ -+ void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src); -+ void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src); -+} cipher_plugin; -+ -+typedef struct digest_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* fingerprint size in bytes */ -+ int fipsize; -+ struct crypto_hash * (*alloc) (void); -+ void (*free) (struct crypto_hash * tfm); -+} digest_plugin; -+ -+typedef struct compression_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int (*init) (void); -+ /* the maximum number of bytes the size of the "compressed" data can -+ * exceed the uncompressed data. */ -+ int (*overrun) (unsigned src_len); -+ coa_t(*alloc) (tfm_action act); -+ void (*free) (coa_t coa, tfm_action act); -+ /* minimal size of the flow we still try to compress */ -+ int (*min_size_deflate) (void); -+ __u32(*checksum) (char *data, __u32 length); -+ /* main transform procedures */ -+ void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len); -+ void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len); -+} compression_plugin; -+ -+typedef struct compression_mode_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* this is called when estimating compressibility -+ of a logical cluster by its content */ -+ int (*should_deflate) (struct inode * inode, cloff_t index); -+ /* this is called when results of compression should be saved */ -+ int (*accept_hook) (struct inode * inode, cloff_t index); -+ /* this is called when results of compression should be discarded */ -+ int (*discard_hook) (struct inode * inode, cloff_t index); 
-+} compression_mode_plugin; -+ -+typedef struct cluster_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int shift; -+} cluster_plugin; -+ -+typedef struct sd_ext_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int (*present) (struct inode * inode, char **area, int *len); -+ int (*absent) (struct inode * inode); -+ int (*save_len) (struct inode * inode); -+ int (*save) (struct inode * inode, char **area); -+ /* alignment requirement for this stat-data part */ -+ int alignment; -+} sd_ext_plugin; -+ -+/* this plugin contains methods to allocate objectid for newly created files, -+ to deallocate objectid when file gets removed, to report number of used and -+ free objectids */ -+typedef struct oid_allocator_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files, -+ __u64 oids); -+ /* used to report statfs->f_files */ -+ __u64(*oids_used) (reiser4_oid_allocator * map); -+ /* get next oid to use */ -+ __u64(*next_oid) (reiser4_oid_allocator * map); -+ /* used to report statfs->f_ffree */ -+ __u64(*oids_free) (reiser4_oid_allocator * map); -+ /* allocate new objectid */ -+ int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *); -+ /* release objectid */ -+ int (*release_oid) (reiser4_oid_allocator * map, oid_t); -+ /* how many pages to reserve in transaction for allocation of new -+ objectid */ -+ int (*oid_reserve_allocate) (reiser4_oid_allocator * map); -+ /* how many pages to reserve in transaction for freeing of an -+ objectid */ -+ int (*oid_reserve_release) (reiser4_oid_allocator * map); -+ void (*print_info) (const char *, reiser4_oid_allocator *); -+} oid_allocator_plugin; -+ -+/* disk layout plugin: this specifies super block, journal, bitmap (if there -+ are any) locations, etc */ -+typedef struct disk_format_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* replay journal, initialize super_info_data, etc */ -+ int (*init_format) (struct super_block *, void 
*data); -+ -+ /* key of root directory stat data */ -+ const reiser4_key *(*root_dir_key) (const struct super_block *); -+ -+ int (*release) (struct super_block *); -+ jnode *(*log_super) (struct super_block *); -+ int (*check_open) (const struct inode * object); -+ int (*version_update) (struct super_block *); -+} disk_format_plugin; -+ -+struct jnode_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int (*init) (jnode * node); -+ int (*parse) (jnode * node); -+ struct address_space *(*mapping) (const jnode * node); -+ unsigned long (*index) (const jnode * node); -+ jnode *(*clone) (jnode * node); -+}; -+ -+/* plugin instance. */ -+/* */ -+/* This is "wrapper" union for all types of plugins. Most of the code uses */ -+/* plugins of particular type (file_plugin, dir_plugin, etc.) rather than */ -+/* operates with pointers to reiser4_plugin. This union is only used in */ -+/* some generic code in plugin/plugin.c that operates on all */ -+/* plugins. Technically speaking purpose of this union is to add type */ -+/* safety to said generic code: each plugin type (file_plugin, for */ -+/* example), contains plugin_header as its first memeber. This first member */ -+/* is located at the same place in memory as .h member of */ -+/* reiser4_plugin. Generic code, obtains pointer to reiser4_plugin and */ -+/* looks in the .h which is header of plugin type located in union. This */ -+/* allows to avoid type-casts. 
*/ -+union reiser4_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* file plugin */ -+ file_plugin file; -+ /* directory plugin */ -+ dir_plugin dir; -+ /* hash plugin, used by directory plugin */ -+ hash_plugin hash; -+ /* fibration plugin used by directory plugin */ -+ fibration_plugin fibration; -+ /* cipher transform plugin, used by file plugin */ -+ cipher_plugin cipher; -+ /* digest transform plugin, used by file plugin */ -+ digest_plugin digest; -+ /* compression transform plugin, used by file plugin */ -+ compression_plugin compression; -+ /* tail plugin, used by file plugin */ -+ formatting_plugin formatting; -+ /* permission plugin */ -+ perm_plugin perm; -+ /* node plugin */ -+ node_plugin node; -+ /* item plugin */ -+ item_plugin item; -+ /* stat-data extension plugin */ -+ sd_ext_plugin sd_ext; -+ /* disk layout plugin */ -+ disk_format_plugin format; -+ /* object id allocator plugin */ -+ oid_allocator_plugin oid_allocator; -+ /* plugin for different jnode types */ -+ jnode_plugin jnode; -+ /* compression mode plugin, used by object plugin */ -+ compression_mode_plugin compression_mode; -+ /* cluster plugin, used by object plugin */ -+ cluster_plugin clust; -+ /* place-holder for new plugin types that can be registered -+ dynamically, and used by other dynamically loaded plugins. 
*/ -+ void *generic; -+}; -+ -+struct reiser4_plugin_ops { -+ /* called when plugin is initialized */ -+ int (*init) (reiser4_plugin * plugin); -+ /* called when plugin is unloaded */ -+ int (*done) (reiser4_plugin * plugin); -+ /* load given plugin from disk */ -+ int (*load) (struct inode * inode, -+ reiser4_plugin * plugin, char **area, int *len); -+ /* how many space is required to store this plugin's state -+ in stat-data */ -+ int (*save_len) (struct inode * inode, reiser4_plugin * plugin); -+ /* save persistent plugin-data to disk */ -+ int (*save) (struct inode * inode, reiser4_plugin * plugin, -+ char **area); -+ /* alignment requirement for on-disk state of this plugin -+ in number of bytes */ -+ int alignment; -+ /* install itself into given inode. This can return error -+ (e.g., you cannot change hash of non-empty directory). */ -+ int (*change) (struct inode * inode, reiser4_plugin * plugin, -+ pset_member memb); -+ /* install itself into given inode. This can return error -+ (e.g., you cannot change hash of non-empty directory). 
*/ -+ int (*inherit) (struct inode * inode, struct inode * parent, -+ reiser4_plugin * plugin); -+}; -+ -+/* functions implemented in fs/reiser4/plugin/plugin.c */ -+ -+/* stores plugin reference in reiser4-specific part of inode */ -+extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id); -+extern int setup_plugins(struct super_block *super, reiser4_plugin ** area); -+extern int init_plugins(void); -+ -+/* builtin plugins */ -+ -+/* builtin hash-plugins */ -+ -+typedef enum { -+ RUPASOV_HASH_ID, -+ R5_HASH_ID, -+ TEA_HASH_ID, -+ FNV1_HASH_ID, -+ DEGENERATE_HASH_ID, -+ LAST_HASH_ID -+} reiser4_hash_id; -+ -+/* builtin cipher plugins */ -+ -+typedef enum { -+ NONE_CIPHER_ID, -+ LAST_CIPHER_ID -+} reiser4_cipher_id; -+ -+/* builtin digest plugins */ -+ -+typedef enum { -+ SHA256_32_DIGEST_ID, -+ LAST_DIGEST_ID -+} reiser4_digest_id; -+ -+/* builtin compression mode plugins */ -+typedef enum { -+ NONE_COMPRESSION_MODE_ID, -+ LATTD_COMPRESSION_MODE_ID, -+ ULTIM_COMPRESSION_MODE_ID, -+ FORCE_COMPRESSION_MODE_ID, -+ CONVX_COMPRESSION_MODE_ID, -+ LAST_COMPRESSION_MODE_ID -+} reiser4_compression_mode_id; -+ -+/* builtin cluster plugins */ -+typedef enum { -+ CLUSTER_64K_ID, -+ CLUSTER_32K_ID, -+ CLUSTER_16K_ID, -+ CLUSTER_8K_ID, -+ CLUSTER_4K_ID, -+ LAST_CLUSTER_ID -+} reiser4_cluster_id; -+ -+/* builtin tail-plugins */ -+ -+typedef enum { -+ NEVER_TAILS_FORMATTING_ID, -+ ALWAYS_TAILS_FORMATTING_ID, -+ SMALL_FILE_FORMATTING_ID, -+ LAST_TAIL_FORMATTING_ID -+} reiser4_formatting_id; -+ -+/* compression/clustering specific data */ -+typedef struct compression_data { -+ reiser4_compression_id coa; /* id of the compression algorithm */ -+} compression_data_t; -+ -+typedef __u8 cluster_data_t; /* cluster info */ -+ -+/* data type used to pack parameters that we pass to vfs object creation -+ function create_object() */ -+struct reiser4_object_create_data { -+ /* plugin to control created object */ -+ reiser4_file_id id; -+ /* mode of regular file, directory or 
special file */ -+/* what happens if some other sort of perm plugin is in use? */ -+ int mode; -+ /* rdev of special file */ -+ dev_t rdev; -+ /* symlink target */ -+ const char *name; -+ /* add here something for non-standard objects you invent, like -+ query for interpolation file etc. */ -+ -+ crypto_stat_t * crypto; -+ compression_data_t *compression; -+ cluster_data_t *cluster; -+ -+ struct inode *parent; -+ struct dentry *dentry; -+}; -+ -+/* description of directory entry being created/destroyed/sought for -+ -+ It is passed down to the directory plugin and farther to the -+ directory item plugin methods. Creation of new directory is done in -+ several stages: first we search for an entry with the same name, then -+ create new one. reiser4_dir_entry_desc is used to store some information -+ collected at some stage of this process and required later: key of -+ item that we want to insert/delete and pointer to an object that will -+ be bound by the new directory entry. Probably some more fields will -+ be added there. -+ -+*/ -+struct reiser4_dir_entry_desc { -+ /* key of directory entry */ -+ reiser4_key key; -+ /* object bound by this entry. */ -+ struct inode *obj; -+}; -+ -+#define MAX_PLUGIN_TYPE_LABEL_LEN 32 -+#define MAX_PLUGIN_PLUG_LABEL_LEN 32 -+ -+/* used for interface with user-land: table-driven parsing in -+ reiser4(). */ -+typedef struct plugin_locator { -+ reiser4_plugin_type type_id; -+ reiser4_plugin_id id; -+ char type_label[MAX_PLUGIN_TYPE_LABEL_LEN]; -+ char plug_label[MAX_PLUGIN_PLUG_LABEL_LEN]; -+} plugin_locator; -+ -+extern int locate_plugin(struct inode *inode, plugin_locator * loc); -+ -+#define PLUGIN_BY_ID(TYPE,ID,FIELD) \ -+static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \ -+{ \ -+ reiser4_plugin *plugin = plugin_by_id ( ID, id ); \ -+ return plugin ? 
& plugin -> FIELD : NULL; \ -+} \ -+static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \ -+{ \ -+ reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \ -+ return plugin ? & plugin -> FIELD : NULL; \ -+} \ -+static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \ -+{ \ -+ reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \ -+ return plugin ? & plugin -> FIELD : NULL; \ -+} \ -+static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \ -+{ \ -+ return ( reiser4_plugin * ) plugin; \ -+} \ -+static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \ -+{ \ -+ return TYPE ## _to_plugin (plugin) -> h.id; \ -+} \ -+typedef struct { int foo; } TYPE ## _plugin_dummy -+ -+PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item); -+PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file); -+PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir); -+PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node); -+PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext); -+PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm); -+PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash); -+PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration); -+PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher); -+PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest); -+PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression); -+PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting); -+PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format); -+PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode); -+PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ compression_mode); -+PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust); -+ -+extern int save_plugin_id(reiser4_plugin * plugin, d16 * area); -+ -+extern struct list_head *get_plugin_list(reiser4_plugin_type type_id); -+ -+#define 
for_all_plugins(ptype, plugin) \ -+for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \ -+ get_plugin_list(ptype) != &plugin->h.linkage; \ -+ plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage)) -+ -+ -+extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, pset_member memb); -+extern int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin *plug); -+extern int finish_pset(struct inode *inode); -+ -+/* defined in fs/reiser4/plugin/object.c */ -+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID]; -+/* defined in fs/reiser4/plugin/object.c */ -+extern dir_plugin dir_plugins[LAST_DIR_ID]; -+/* defined in fs/reiser4/plugin/item/static_stat.c */ -+extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION]; -+/* defined in fs/reiser4/plugin/hash.c */ -+extern hash_plugin hash_plugins[LAST_HASH_ID]; -+/* defined in fs/reiser4/plugin/fibration.c */ -+extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID]; -+/* defined in fs/reiser4/plugin/crypt.c */ -+extern cipher_plugin cipher_plugins[LAST_CIPHER_ID]; -+/* defined in fs/reiser4/plugin/digest.c */ -+extern digest_plugin digest_plugins[LAST_DIGEST_ID]; -+/* defined in fs/reiser4/plugin/compress/compress.c */ -+extern compression_plugin compression_plugins[LAST_COMPRESSION_ID]; -+/* defined in fs/reiser4/plugin/compress/compression_mode.c */ -+extern compression_mode_plugin -+compression_mode_plugins[LAST_COMPRESSION_MODE_ID]; -+/* defined in fs/reiser4/plugin/cluster.c */ -+extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID]; -+/* defined in fs/reiser4/plugin/tail.c */ -+extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID]; -+/* defined in fs/reiser4/plugin/security/security.c */ -+extern perm_plugin perm_plugins[LAST_PERM_ID]; -+/* defined in fs/reiser4/plugin/item/item.c */ -+extern item_plugin item_plugins[LAST_ITEM_ID]; -+/* defined in fs/reiser4/plugin/node/node.c */ -+extern node_plugin 
node_plugins[LAST_NODE_ID]; -+/* defined in fs/reiser4/plugin/disk_format/disk_format.c */ -+extern disk_format_plugin format_plugins[LAST_FORMAT_ID]; -+ -+/* __FS_REISER4_PLUGIN_TYPES_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.20/fs/reiser4/plugin/plugin_header.h ---- linux-2.6.20.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/plugin_header.h 2007-05-06 14:50:43.855024468 +0400 -@@ -0,0 +1,144 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* plugin header. Data structures required by all plugin types. */ -+ -+#if !defined( __PLUGIN_HEADER_H__ ) -+#define __PLUGIN_HEADER_H__ -+ -+/* plugin data-types and constants */ -+ -+#include "../debug.h" -+#include "../dformat.h" -+ -+typedef enum { -+ REISER4_FILE_PLUGIN_TYPE, -+ REISER4_DIR_PLUGIN_TYPE, -+ REISER4_ITEM_PLUGIN_TYPE, -+ REISER4_NODE_PLUGIN_TYPE, -+ REISER4_HASH_PLUGIN_TYPE, -+ REISER4_FIBRATION_PLUGIN_TYPE, -+ REISER4_FORMATTING_PLUGIN_TYPE, -+ REISER4_PERM_PLUGIN_TYPE, -+ REISER4_SD_EXT_PLUGIN_TYPE, -+ REISER4_FORMAT_PLUGIN_TYPE, -+ REISER4_JNODE_PLUGIN_TYPE, -+ REISER4_CIPHER_PLUGIN_TYPE, -+ REISER4_DIGEST_PLUGIN_TYPE, -+ REISER4_COMPRESSION_PLUGIN_TYPE, -+ REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ REISER4_CLUSTER_PLUGIN_TYPE, -+ REISER4_PLUGIN_TYPES -+} reiser4_plugin_type; -+ -+typedef enum { -+ REISER4_DIRECTORY_FILE, -+ REISER4_REGULAR_FILE, -+ REISER4_SYMLINK_FILE, -+ REISER4_SPECIAL_FILE, -+} reiser4_plugin_group; -+ -+struct reiser4_plugin_ops; -+/* generic plugin operations, supported by each -+ plugin type. */ -+typedef struct reiser4_plugin_ops reiser4_plugin_ops; -+ -+/* the common part of all plugin instances. 
*/ -+typedef struct plugin_header { -+ /* plugin type */ -+ reiser4_plugin_type type_id; -+ /* id of this plugin */ -+ reiser4_plugin_id id; -+ /* bitmask of groups the plugin belongs to. */ -+ reiser4_plugin_groups groups; -+ /* plugin operations */ -+ reiser4_plugin_ops *pops; -+/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */ -+ /* short label of this plugin */ -+ const char *label; -+ /* descriptive string.. */ -+ const char *desc; -+ /* list linkage */ -+ struct list_head linkage; -+} plugin_header; -+ -+#define plugin_of_group(plug, group) (plug->h.groups & (1 << group)) -+ -+/* PRIVATE INTERFACES */ -+/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */ -+/* plugin type representation. */ -+typedef struct reiser4_plugin_type_data { -+ /* internal plugin type identifier. Should coincide with -+ index of this item in plugins[] array. */ -+ reiser4_plugin_type type_id; -+ /* short symbolic label of this plugin type. Should be no longer -+ than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. 
*/ -+ const char *label; -+ /* plugin type description longer than .label */ -+ const char *desc; -+ -+/* NIKITA-FIXME-HANS: define built-in */ -+ /* number of built-in plugin instances of this type */ -+ int builtin_num; -+ /* array of built-in plugins */ -+ void *builtin; -+ struct list_head plugins_list; -+ size_t size; -+} reiser4_plugin_type_data; -+ -+extern reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES]; -+ -+int is_plugin_type_valid(reiser4_plugin_type type); -+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id); -+ -+static inline reiser4_plugin *plugin_at(reiser4_plugin_type_data * ptype, int i) -+{ -+ char *builtin; -+ -+ builtin = ptype->builtin; -+ return (reiser4_plugin *) (builtin + i * ptype->size); -+} -+ -+/* return plugin by its @type_id and @id */ -+static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type, -+ reiser4_plugin_id id) -+{ -+ assert("nikita-1651", is_plugin_type_valid(type)); -+ assert("nikita-1652", is_plugin_id_valid(type, id)); -+ return plugin_at(&plugins[type], id); -+} -+ -+extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id, -+ reiser4_plugin_id id); -+ -+/** -+ * plugin_by_disk_id - get reiser4_plugin -+ * @type_id: plugin type id -+ * @did: plugin id in disk format -+ * -+ * Returns reiser4_plugin by plugin type id an dplugin_id. -+ */ -+static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG, -+ reiser4_plugin_type type_id, -+ __le16 *plugin_id) -+{ -+ /* -+ * what we should do properly is to maintain within each file-system a -+ * dictionary that maps on-disk plugin ids to "universal" ids. This -+ * dictionary will be resolved on mount time, so that this function -+ * will perform just one additional array lookup. 
-+ */ -+ return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id)); -+} -+ -+/* __PLUGIN_HEADER_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.20/fs/reiser4/plugin/plugin_set.c ---- linux-2.6.20.orig/fs/reiser4/plugin/plugin_set.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/plugin_set.c 2007-05-06 14:50:43.855024468 +0400 -@@ -0,0 +1,379 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+/* This file contains Reiser4 plugin set operations */ -+ -+/* plugin sets -+ * -+ * Each file in reiser4 is controlled by a whole set of plugins (file plugin, -+ * directory plugin, hash plugin, tail policy plugin, security plugin, etc.) -+ * assigned (inherited, deduced from mode bits, etc.) at creation time. This -+ * set of plugins (so called pset) is described by structure plugin_set (see -+ * plugin/plugin_set.h), which contains pointers to all required plugins. -+ * -+ * Children can inherit some pset members from their parent, however sometimes -+ * it is useful to specify members different from parent ones. Since object's -+ * pset can not be easily changed without fatal consequences, we use for this -+ * purpose another special plugin table (so called hset, or heir set) described -+ * by the same structure. -+ * -+ * Inode only stores a pointers to pset and hset. Different inodes with the -+ * same set of pset (hset) members point to the same pset (hset). This is -+ * archived by storing psets and hsets in global hash table. Races are avoided -+ * by simple (and efficient so far) solution of never recycling psets, even -+ * when last inode pointing to it is destroyed. 
-+ */ -+ -+#include "../debug.h" -+#include "../super.h" -+#include "plugin_set.h" -+ -+#include -+#include -+ -+/* slab for plugin sets */ -+static struct kmem_cache *plugin_set_slab; -+ -+static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = { -+ [0 ... 7] = SPIN_LOCK_UNLOCKED -+}; -+ -+/* hash table support */ -+ -+#define PS_TABLE_SIZE (32) -+ -+static inline plugin_set *cast_to(const unsigned long *a) -+{ -+ return container_of(a, plugin_set, hashval); -+} -+ -+static inline int pseq(const unsigned long *a1, const unsigned long *a2) -+{ -+ plugin_set *set1; -+ plugin_set *set2; -+ -+ /* make sure fields are not missed in the code below */ -+ cassert(sizeof *set1 == -+ sizeof set1->hashval + -+ sizeof set1->link + -+ sizeof set1->file + -+ sizeof set1->dir + -+ sizeof set1->perm + -+ sizeof set1->formatting + -+ sizeof set1->hash + -+ sizeof set1->fibration + -+ sizeof set1->sd + -+ sizeof set1->dir_item + -+ sizeof set1->cipher + -+ sizeof set1->digest + -+ sizeof set1->compression + -+ sizeof set1->compression_mode + -+ sizeof set1->cluster + -+ sizeof set1->create); -+ -+ set1 = cast_to(a1); -+ set2 = cast_to(a2); -+ return -+ set1->hashval == set2->hashval && -+ set1->file == set2->file && -+ set1->dir == set2->dir && -+ set1->perm == set2->perm && -+ set1->formatting == set2->formatting && -+ set1->hash == set2->hash && -+ set1->fibration == set2->fibration && -+ set1->sd == set2->sd && -+ set1->dir_item == set2->dir_item && -+ set1->cipher == set2->cipher && -+ set1->digest == set2->digest && -+ set1->compression == set2->compression && -+ set1->compression_mode == set2->compression_mode && -+ set1->cluster == set2->cluster && -+ set1->create == set2->create; -+} -+ -+#define HASH_FIELD(hash, set, field) \ -+({ \ -+ (hash) += (unsigned long)(set)->field >> 2; \ -+}) -+ -+static inline unsigned long calculate_hash(const plugin_set * set) -+{ -+ unsigned long result; -+ -+ result = 0; -+ HASH_FIELD(result, set, file); -+ HASH_FIELD(result, set, 
dir); -+ HASH_FIELD(result, set, perm); -+ HASH_FIELD(result, set, formatting); -+ HASH_FIELD(result, set, hash); -+ HASH_FIELD(result, set, fibration); -+ HASH_FIELD(result, set, sd); -+ HASH_FIELD(result, set, dir_item); -+ HASH_FIELD(result, set, cipher); -+ HASH_FIELD(result, set, digest); -+ HASH_FIELD(result, set, compression); -+ HASH_FIELD(result, set, compression_mode); -+ HASH_FIELD(result, set, cluster); -+ HASH_FIELD(result, set, create); -+ return result & (PS_TABLE_SIZE - 1); -+} -+ -+static inline unsigned long -+pshash(ps_hash_table * table, const unsigned long *a) -+{ -+ return *a; -+} -+ -+/* The hash table definition */ -+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get()) -+#define KFREE(ptr, size) kfree(ptr) -+TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash, -+ pseq); -+#undef KFREE -+#undef KMALLOC -+ -+static ps_hash_table ps_table; -+static plugin_set empty_set = { -+ .hashval = 0, -+ .file = NULL, -+ .dir = NULL, -+ .perm = NULL, -+ .formatting = NULL, -+ .hash = NULL, -+ .fibration = NULL, -+ .sd = NULL, -+ .dir_item = NULL, -+ .cipher = NULL, -+ .digest = NULL, -+ .compression = NULL, -+ .compression_mode = NULL, -+ .cluster = NULL, -+ .create = NULL, -+ .link = {NULL} -+}; -+ -+plugin_set *plugin_set_get_empty(void) -+{ -+ return &empty_set; -+} -+ -+void plugin_set_put(plugin_set * set) -+{ -+} -+ -+static inline unsigned long *pset_field(plugin_set * set, int offset) -+{ -+ return (unsigned long *)(((char *)set) + offset); -+} -+ -+static int plugin_set_field(plugin_set ** set, const unsigned long val, -+ const int offset) -+{ -+ unsigned long *spot; -+ spinlock_t *lock; -+ plugin_set replica; -+ plugin_set *twin; -+ plugin_set *psal; -+ plugin_set *orig; -+ -+ assert("nikita-2902", set != NULL); -+ assert("nikita-2904", *set != NULL); -+ -+ spot = pset_field(*set, offset); -+ if (unlikely(*spot == val)) -+ return 0; -+ -+ replica = *(orig = *set); -+ *pset_field(&replica, offset) = val; -+ 
replica.hashval = calculate_hash(&replica); -+ rcu_read_lock(); -+ twin = ps_hash_find(&ps_table, &replica.hashval); -+ if (unlikely(twin == NULL)) { -+ rcu_read_unlock(); -+ psal = kmem_cache_alloc(plugin_set_slab, -+ reiser4_ctx_gfp_mask_get()); -+ if (psal == NULL) -+ return RETERR(-ENOMEM); -+ *psal = replica; -+ lock = &plugin_set_lock[replica.hashval & 7]; -+ spin_lock(lock); -+ twin = ps_hash_find(&ps_table, &replica.hashval); -+ if (likely(twin == NULL)) { -+ *set = psal; -+ ps_hash_insert_rcu(&ps_table, psal); -+ } else { -+ *set = twin; -+ kmem_cache_free(plugin_set_slab, psal); -+ } -+ spin_unlock(lock); -+ } else { -+ rcu_read_unlock(); -+ *set = twin; -+ } -+ return 0; -+} -+ -+static struct { -+ int offset; -+ reiser4_plugin_groups groups; -+ reiser4_plugin_type type; -+} pset_descr[PSET_LAST] = { -+ [PSET_FILE] = { -+ .offset = offsetof(plugin_set, file), -+ .type = REISER4_FILE_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_DIR] = { -+ .offset = offsetof(plugin_set, dir), -+ .type = REISER4_DIR_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_PERM] = { -+ .offset = offsetof(plugin_set, perm), -+ .type = REISER4_PERM_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_FORMATTING] = { -+ .offset = offsetof(plugin_set, formatting), -+ .type = REISER4_FORMATTING_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_HASH] = { -+ .offset = offsetof(plugin_set, hash), -+ .type = REISER4_HASH_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_FIBRATION] = { -+ .offset = offsetof(plugin_set, fibration), -+ .type = REISER4_FIBRATION_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_SD] = { -+ .offset = offsetof(plugin_set, sd), -+ .type = REISER4_ITEM_PLUGIN_TYPE, -+ .groups = (1 << STAT_DATA_ITEM_TYPE) -+ }, -+ [PSET_DIR_ITEM] = { -+ .offset = offsetof(plugin_set, dir_item), -+ .type = REISER4_ITEM_PLUGIN_TYPE, -+ .groups = (1 << DIR_ENTRY_ITEM_TYPE) -+ }, -+ [PSET_CIPHER] = { -+ .offset = offsetof(plugin_set, cipher), -+ .type = REISER4_CIPHER_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_DIGEST] = { -+ 
.offset = offsetof(plugin_set, digest), -+ .type = REISER4_DIGEST_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_COMPRESSION] = { -+ .offset = offsetof(plugin_set, compression), -+ .type = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_COMPRESSION_MODE] = { -+ .offset = offsetof(plugin_set, compression_mode), -+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_CLUSTER] = { -+ .offset = offsetof(plugin_set, cluster), -+ .type = REISER4_CLUSTER_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_CREATE] = { -+ .offset = offsetof(plugin_set, create), -+ .type = REISER4_FILE_PLUGIN_TYPE, -+ .groups = (1 << REISER4_REGULAR_FILE) -+ } -+}; -+ -+#define DEFINE_PSET_OPS(PREFIX) \ -+ reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \ -+{ \ -+ if (memb > PSET_LAST) \ -+ return REISER4_PLUGIN_TYPES; \ -+ return pset_descr[memb].type; \ -+} \ -+ \ -+int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \ -+ reiser4_plugin * plugin) \ -+{ \ -+ assert("nikita-3492", set != NULL); \ -+ assert("nikita-3493", *set != NULL); \ -+ assert("nikita-3494", plugin != NULL); \ -+ assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \ -+ assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \ -+ \ -+ if (pset_descr[memb].groups) \ -+ if (!(pset_descr[memb].groups & plugin->h.groups)) \ -+ return -EINVAL; \ -+ \ -+ return plugin_set_field(set, \ -+ (unsigned long)plugin, pset_descr[memb].offset); \ -+} \ -+ \ -+reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \ -+{ \ -+ assert("nikita-3497", set != NULL); \ -+ assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \ -+ \ -+ return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \ -+} -+ -+DEFINE_PSET_OPS(aset); -+ -+int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) { -+ return plugin_set_field(set, -+ (unsigned long)plugin, pset_descr[memb].offset); -+} -+ -+/** -+ * init_plugin_set - create plugin set cache and 
hash table -+ * -+ * Initializes slab cache of plugin_set-s and their hash table. It is part of -+ * reiser4 module initialization. -+ */ -+int init_plugin_set(void) -+{ -+ int result; -+ -+ result = ps_hash_init(&ps_table, PS_TABLE_SIZE); -+ if (result == 0) { -+ plugin_set_slab = kmem_cache_create("plugin_set", -+ sizeof(plugin_set), 0, -+ SLAB_HWCACHE_ALIGN, -+ NULL, NULL); -+ if (plugin_set_slab == NULL) -+ result = RETERR(-ENOMEM); -+ } -+ return result; -+} -+ -+/** -+ * done_plugin_set - delete plugin_set cache and plugin_set hash table -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void done_plugin_set(void) -+{ -+ plugin_set *cur, *next; -+ -+ for_all_in_htable(&ps_table, ps, cur, next) { -+ ps_hash_remove(&ps_table, cur); -+ kmem_cache_free(plugin_set_slab, cur); -+ } -+ destroy_reiser4_cache(&plugin_set_slab); -+ ps_hash_done(&ps_table); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.20/fs/reiser4/plugin/plugin_set.h ---- linux-2.6.20.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/plugin_set.h 2007-05-06 14:50:43.855024468 +0400 -@@ -0,0 +1,77 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Reiser4 plugin set definition. 
-+ See fs/reiser4/plugin/plugin_set.c for details */ -+ -+#if !defined( __PLUGIN_SET_H__ ) -+#define __PLUGIN_SET_H__ -+ -+#include "../type_safe_hash.h" -+#include "plugin.h" -+ -+#include -+ -+struct plugin_set; -+typedef struct plugin_set plugin_set; -+ -+TYPE_SAFE_HASH_DECLARE(ps, plugin_set); -+ -+struct plugin_set { -+ unsigned long hashval; -+ /* plugin of file */ -+ file_plugin *file; -+ /* plugin of dir */ -+ dir_plugin *dir; -+ /* perm plugin for this file */ -+ perm_plugin *perm; -+ /* tail policy plugin. Only meaningful for regular files */ -+ formatting_plugin *formatting; -+ /* hash plugin. Only meaningful for directories. */ -+ hash_plugin *hash; -+ /* fibration plugin. Only meaningful for directories. */ -+ fibration_plugin *fibration; -+ /* plugin of stat-data */ -+ item_plugin *sd; -+ /* plugin of items a directory is built of */ -+ item_plugin *dir_item; -+ /* cipher plugin */ -+ cipher_plugin *cipher; -+ /* digest plugin */ -+ digest_plugin *digest; -+ /* compression plugin */ -+ compression_plugin *compression; -+ /* compression mode plugin */ -+ compression_mode_plugin *compression_mode; -+ /* cluster plugin */ -+ cluster_plugin *cluster; -+ /* this specifies file plugin of regular children. -+ only meaningful for directories */ -+ file_plugin *create; -+ ps_hash_link link; -+}; -+ -+extern plugin_set *plugin_set_get_empty(void); -+extern void plugin_set_put(plugin_set * set); -+ -+extern int init_plugin_set(void); -+extern void done_plugin_set(void); -+ -+extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb); -+extern int set_plugin(plugin_set ** set, pset_member memb, -+ reiser4_plugin * plugin); -+extern int aset_set_unsafe(plugin_set ** set, pset_member memb, -+ reiser4_plugin * plugin); -+extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb); -+ -+/* __PLUGIN_SET_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/security/Makefile linux-2.6.20/fs/reiser4/plugin/security/Makefile ---- linux-2.6.20.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/security/Makefile 2007-05-06 14:50:43.855024468 +0400 -@@ -0,0 +1,4 @@ -+obj-$(CONFIG_REISER4_FS) += security_plugins.o -+ -+security_plugins-objs := \ -+ perm.o -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/security/perm.c linux-2.6.20/fs/reiser4/plugin/security/perm.c ---- linux-2.6.20.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/security/perm.c 2007-05-06 14:50:43.859025718 +0400 -@@ -0,0 +1,44 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* -+ * this file contains implementation of permission plugins. 
Currently, only -+ * RWX_PERM_ID is implemented -+ */ -+ -+#include "../plugin.h" -+#include "../plugin_header.h" -+#include "../../debug.h" -+ -+perm_plugin perm_plugins[LAST_PERM_ID] = { -+ [NULL_PERM_ID] = { -+ .h = { -+ .type_id = REISER4_PERM_PLUGIN_TYPE, -+ .id = NULL_PERM_ID, -+ .pops = NULL, -+ .label = "null", -+ .desc = "stub permission plugin", -+ .linkage = {NULL, NULL} -+ }, -+ .read_ok = NULL, -+ .write_ok = NULL, -+ .lookup_ok = NULL, -+ .create_ok = NULL, -+ .link_ok = NULL, -+ .unlink_ok = NULL, -+ .delete_ok = NULL, -+ .mask_ok = NULL, -+ .setattr_ok = NULL, -+ .getattr_ok = NULL, -+ .rename_ok = NULL, -+ } -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/security/perm.h linux-2.6.20/fs/reiser4/plugin/security/perm.h ---- linux-2.6.20.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/security/perm.h 2007-05-06 14:50:43.859025718 +0400 -@@ -0,0 +1,82 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Perm (short for "permissions") plugins common stuff. */ -+ -+#if !defined( __REISER4_PERM_H__ ) -+#define __REISER4_PERM_H__ -+ -+#include "../../forward.h" -+#include "../plugin_header.h" -+ -+#include -+#include /* for struct file */ -+#include /* for struct dentry */ -+ -+/* interface for perm plugin. -+ -+ Perm plugin method can be implemented through: -+ -+ 1. consulting ->i_mode bits in stat data -+ -+ 2. obtaining acl from the tree and inspecting it -+ -+ 3. asking some kernel module or user-level program to authorize access. -+ -+ This allows for integration with things like capabilities, SELinux-style -+ secutiry contexts, etc. -+ -+*/ -+/* NIKITA-FIXME-HANS: define what this is targeted for. It does not seem to be intended for use with sys_reiser4. Explain. 
*/ -+typedef struct perm_plugin { -+ /* generic plugin fields */ -+ plugin_header h; -+ -+ /* check permissions for read/write */ -+ int (*read_ok) (struct file *file, const char __user *buf, -+ size_t size, loff_t *off); -+ int (*write_ok) (struct file *file, const char __user *buf, -+ size_t size, loff_t *off); -+ -+ /* check permissions for lookup */ -+ int (*lookup_ok) (struct inode * parent, struct dentry * dentry); -+ -+ /* check permissions for create */ -+ int (*create_ok) (struct inode * parent, struct dentry * dentry, -+ reiser4_object_create_data * data); -+ -+ /* check permissions for linking @where to @existing */ -+ int (*link_ok) (struct dentry * existing, struct inode * parent, -+ struct dentry * where); -+ -+ /* check permissions for unlinking @victim from @parent */ -+ int (*unlink_ok) (struct inode * parent, struct dentry * victim); -+ -+ /* check permissions for deletion of @object whose last reference is -+ by @parent */ -+ int (*delete_ok) (struct inode * parent, struct dentry * victim); -+ int (*mask_ok) (struct inode * inode, int mask); -+ /* check whether attribute change is acceptable */ -+ int (*setattr_ok) (struct dentry * dentry, struct iattr * attr); -+ -+ /* check whether stat(2) is allowed */ -+ int (*getattr_ok) (struct vfsmount * mnt UNUSED_ARG, -+ struct dentry * dentry, struct kstat * stat); -+ /* check whether rename(2) is allowed */ -+ int (*rename_ok) (struct inode * old_dir, struct dentry * old, -+ struct inode * new_dir, struct dentry * new); -+} perm_plugin; -+ -+typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id; -+ -+/* __REISER4_PERM_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.20/fs/reiser4/plugin/space/bitmap.c ---- linux-2.6.20.orig/fs/reiser4/plugin/space/bitmap.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/space/bitmap.c 2007-05-06 14:50:43.859025718 +0400 -@@ -0,0 +1,1585 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../txnmgr.h" -+#include "../../jnode.h" -+#include "../../block_alloc.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../plugin.h" -+#include "space_allocator.h" -+#include "bitmap.h" -+ -+#include -+#include /* for struct super_block */ -+#include -+#include -+ -+/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap -+ * blocks -+ -+ A useful optimization of reiser4 bitmap handling would be dynamic bitmap -+ blocks loading/unloading which is different from v3.x where all bitmap -+ blocks are loaded at mount time. -+ -+ To implement bitmap blocks unloading we need to count bitmap block usage -+ and detect currently unused blocks allowing them to be unloaded. It is not -+ a simple task since we allow several threads to modify one bitmap block -+ simultaneously. -+ -+ Briefly speaking, the following schema is proposed: we count in special -+ variable associated with each bitmap block. That is for counting of block -+ alloc/dealloc operations on that bitmap block. With a deferred block -+ deallocation feature of reiser4 all those operation will be represented in -+ atom dirty/deleted lists as jnodes for freshly allocated or deleted -+ nodes. -+ -+ So, we increment usage counter for each new node allocated or deleted, and -+ decrement it at atom commit one time for each node from the dirty/deleted -+ atom's list. 
Of course, freshly allocated node deletion and node reusing -+ from atom deleted (if we do so) list should decrement bitmap usage counter -+ also. -+ -+ This schema seems to be working but that reference counting is -+ not easy to debug. I think we should agree with Hans and do not implement -+ it in v4.0. Current code implements "on-demand" bitmap blocks loading only. -+ -+ For simplicity all bitmap nodes (both commit and working bitmap blocks) are -+ loaded into memory on fs mount time or each bitmap nodes are loaded at the -+ first access to it, the "dont_load_bitmap" mount option controls whether -+ bimtap nodes should be loaded at mount time. Dynamic unloading of bitmap -+ nodes currently is not supported. */ -+ -+#define CHECKSUM_SIZE 4 -+ -+#define BYTES_PER_LONG (sizeof(long)) -+ -+#if BITS_PER_LONG == 64 -+# define LONG_INT_SHIFT (6) -+#else -+# define LONG_INT_SHIFT (5) -+#endif -+ -+#define LONG_INT_MASK (BITS_PER_LONG - 1UL) -+ -+typedef unsigned long ulong_t; -+ -+#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE) -+#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3) -+ -+/* Block allocation/deallocation are done through special bitmap objects which -+ are allocated in an array at fs mount. */ -+struct bitmap_node { -+ struct mutex mutex; /* long term lock object */ -+ -+ jnode *wjnode; /* j-nodes for WORKING ... */ -+ jnode *cjnode; /* ... 
and COMMIT bitmap blocks */ -+ -+ bmap_off_t first_zero_bit; /* for skip_busy option implementation */ -+ -+ atomic_t loaded; /* a flag which shows that bnode is loaded -+ * already */ -+}; -+ -+static inline char *bnode_working_data(struct bitmap_node *bnode) -+{ -+ char *data; -+ -+ data = jdata(bnode->wjnode); -+ assert("zam-429", data != NULL); -+ -+ return data + CHECKSUM_SIZE; -+} -+ -+static inline char *bnode_commit_data(const struct bitmap_node *bnode) -+{ -+ char *data; -+ -+ data = jdata(bnode->cjnode); -+ assert("zam-430", data != NULL); -+ -+ return data + CHECKSUM_SIZE; -+} -+ -+static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode) -+{ -+ char *data; -+ -+ data = jdata(bnode->cjnode); -+ assert("vpf-261", data != NULL); -+ -+ return le32_to_cpu(get_unaligned((d32 *)data)); -+} -+ -+static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc) -+{ -+ char *data; -+ -+ data = jdata(bnode->cjnode); -+ assert("vpf-261", data != NULL); -+ -+ put_unaligned(cpu_to_le32(crc), (d32 *)data); -+} -+ -+/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having -+ * written the code, does this added abstraction still have */ -+/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the -+ * reiser4_space_allocator structure) */ -+/* ZAM-FIXME-HANS: I don't understand your english in comment above. */ -+/* FIXME-HANS(Zam): I don't understand the questions like "might be a union -+ * someday?". What they about? If there is a reason to have a union, it should -+ * be a union, if not, it should not be a union. "..might be someday" means no -+ * reason. 
*/ -+struct bitmap_allocator_data { -+ /* an array for bitmap blocks direct access */ -+ struct bitmap_node *bitmap; -+}; -+ -+#define get_barray(super) \ -+(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap) -+ -+#define get_bnode(super, i) (get_barray(super) + i) -+ -+/* allocate and initialize jnode with JNODE_BITMAP type */ -+static jnode *bnew(void) -+{ -+ jnode *jal = jalloc(); -+ -+ if (jal) -+ jnode_init(jal, current_tree, JNODE_BITMAP); -+ -+ return jal; -+} -+ -+/* this file contains: -+ - bitmap based implementation of space allocation plugin -+ - all the helper functions like set bit, find_first_zero_bit, etc */ -+ -+/* Audited by: green(2002.06.12) */ -+static int find_next_zero_bit_in_word(ulong_t word, int start_bit) -+{ -+ ulong_t mask = 1UL << start_bit; -+ int i = start_bit; -+ -+ while ((word & mask) != 0) { -+ mask <<= 1; -+ if (++i >= BITS_PER_LONG) -+ break; -+ } -+ -+ return i; -+} -+ -+#include -+ -+#if BITS_PER_LONG == 64 -+ -+#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3) -+#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1))) -+ -+static inline void reiser4_set_bit(int nr, void *addr) -+{ -+ ext2_set_bit(nr + OFF(addr), BASE(addr)); -+} -+ -+static inline void reiser4_clear_bit(int nr, void *addr) -+{ -+ ext2_clear_bit(nr + OFF(addr), BASE(addr)); -+} -+ -+static inline int reiser4_test_bit(int nr, void *addr) -+{ -+ return ext2_test_bit(nr + OFF(addr), BASE(addr)); -+} -+static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset, -+ int offset) -+{ -+ int off = OFF(addr); -+ -+ return ext2_find_next_zero_bit(BASE(addr), maxoffset + off, -+ offset + off) - off; -+} -+ -+#else -+ -+#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr) -+#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr) -+#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr) -+ -+#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \ 
-+ext2_find_next_zero_bit(addr, maxoffset, offset) -+#endif -+ -+/* Search for a set bit in the bit array [@start_offset, @max_offset[, offsets -+ * are counted from @addr, return the offset of the first bit if it is found, -+ * @maxoffset otherwise. */ -+static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset, -+ bmap_off_t start_offset) -+{ -+ ulong_t *base = addr; -+ /* start_offset is in bits, convert it to byte offset within bitmap. */ -+ int word_nr = start_offset >> LONG_INT_SHIFT; -+ /* bit number within the byte. */ -+ int bit_nr = start_offset & LONG_INT_MASK; -+ int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT; -+ -+ assert("zam-387", max_offset != 0); -+ -+ /* Unaligned @start_offset case. */ -+ if (bit_nr != 0) { -+ bmap_nr_t nr; -+ -+ nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr); -+ -+ if (nr < BITS_PER_LONG) -+ return (word_nr << LONG_INT_SHIFT) + nr; -+ -+ ++word_nr; -+ } -+ -+ /* Fast scan trough aligned words. */ -+ while (word_nr <= max_word_nr) { -+ if (base[word_nr] != 0) { -+ return (word_nr << LONG_INT_SHIFT) -+ + find_next_zero_bit_in_word(~(base[word_nr]), 0); -+ } -+ -+ ++word_nr; -+ } -+ -+ return max_offset; -+} -+ -+#if BITS_PER_LONG == 64 -+ -+static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset, -+ bmap_off_t start_offset) -+{ -+ bmap_off_t off = OFF(addr); -+ -+ return __reiser4_find_next_set_bit(BASE(addr), max_offset + off, -+ start_offset + off) - off; -+} -+ -+#else -+#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \ -+ __reiser4_find_next_set_bit(addr, max_offset, start_offset) -+#endif -+ -+/* search for the first set bit in single word. 
*/ -+static int find_last_set_bit_in_word(ulong_t word, int start_bit) -+{ -+ ulong_t bit_mask; -+ int nr = start_bit; -+ -+ assert("zam-965", start_bit < BITS_PER_LONG); -+ assert("zam-966", start_bit >= 0); -+ -+ bit_mask = (1UL << nr); -+ -+ while (bit_mask != 0) { -+ if (bit_mask & word) -+ return nr; -+ bit_mask >>= 1; -+ nr--; -+ } -+ return BITS_PER_LONG; -+} -+ -+/* Search bitmap for a set bit in backward direction from the end to the -+ * beginning of given region -+ * -+ * @result: result offset of the last set bit -+ * @addr: base memory address, -+ * @low_off: low end of the search region, edge bit included into the region, -+ * @high_off: high end of the search region, edge bit included into the region, -+ * -+ * @return: 0 - set bit was found, -1 otherwise. -+ */ -+static int -+reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off, -+ bmap_off_t high_off) -+{ -+ ulong_t *base = addr; -+ int last_word; -+ int first_word; -+ int last_bit; -+ int nr; -+ -+ assert("zam-962", high_off >= low_off); -+ -+ last_word = high_off >> LONG_INT_SHIFT; -+ last_bit = high_off & LONG_INT_MASK; -+ first_word = low_off >> LONG_INT_SHIFT; -+ -+ if (last_bit < BITS_PER_LONG) { -+ nr = find_last_set_bit_in_word(base[last_word], last_bit); -+ if (nr < BITS_PER_LONG) { -+ *result = (last_word << LONG_INT_SHIFT) + nr; -+ return 0; -+ } -+ --last_word; -+ } -+ while (last_word >= first_word) { -+ if (base[last_word] != 0x0) { -+ last_bit = -+ find_last_set_bit_in_word(base[last_word], -+ BITS_PER_LONG - 1); -+ assert("zam-972", last_bit < BITS_PER_LONG); -+ *result = (last_word << LONG_INT_SHIFT) + last_bit; -+ return 0; -+ } -+ --last_word; -+ } -+ -+ return -1; /* set bit not found */ -+} -+ -+/* Search bitmap for a clear bit in backward direction from the end to the -+ * beginning of given region */ -+static int -+reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off, -+ bmap_off_t high_off) -+{ -+ ulong_t *base = addr; -+ 
int last_word; -+ int first_word; -+ int last_bit; -+ int nr; -+ -+ last_word = high_off >> LONG_INT_SHIFT; -+ last_bit = high_off & LONG_INT_MASK; -+ first_word = low_off >> LONG_INT_SHIFT; -+ -+ if (last_bit < BITS_PER_LONG) { -+ nr = find_last_set_bit_in_word(~base[last_word], last_bit); -+ if (nr < BITS_PER_LONG) { -+ *result = (last_word << LONG_INT_SHIFT) + nr; -+ return 0; -+ } -+ --last_word; -+ } -+ while (last_word >= first_word) { -+ if (base[last_word] != (ulong_t) (-1)) { -+ *result = (last_word << LONG_INT_SHIFT) + -+ find_last_set_bit_in_word(~base[last_word], -+ BITS_PER_LONG - 1); -+ return 0; -+ } -+ --last_word; -+ } -+ -+ return -1; /* zero bit not found */ -+} -+ -+/* Audited by: green(2002.06.12) */ -+static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end) -+{ -+ int first_byte; -+ int last_byte; -+ -+ unsigned char first_byte_mask = 0xFF; -+ unsigned char last_byte_mask = 0xFF; -+ -+ assert("zam-410", start < end); -+ -+ first_byte = start >> 3; -+ last_byte = (end - 1) >> 3; -+ -+ if (last_byte > first_byte + 1) -+ memset(addr + first_byte + 1, 0, -+ (size_t) (last_byte - first_byte - 1)); -+ -+ first_byte_mask >>= 8 - (start & 0x7); -+ last_byte_mask <<= ((end - 1) & 0x7) + 1; -+ -+ if (first_byte == last_byte) { -+ addr[first_byte] &= (first_byte_mask | last_byte_mask); -+ } else { -+ addr[first_byte] &= first_byte_mask; -+ addr[last_byte] &= last_byte_mask; -+ } -+} -+ -+/* Audited by: green(2002.06.12) */ -+/* ZAM-FIXME-HANS: comment this */ -+static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end) -+{ -+ int first_byte; -+ int last_byte; -+ -+ unsigned char first_byte_mask = 0xFF; -+ unsigned char last_byte_mask = 0xFF; -+ -+ assert("zam-386", start < end); -+ -+ first_byte = start >> 3; -+ last_byte = (end - 1) >> 3; -+ -+ if (last_byte > first_byte + 1) -+ memset(addr + first_byte + 1, 0xFF, -+ (size_t) (last_byte - first_byte - 1)); -+ -+ first_byte_mask <<= start & 0x7; -+ last_byte_mask >>= 7 
- ((end - 1) & 0x7); -+ -+ if (first_byte == last_byte) { -+ addr[first_byte] |= (first_byte_mask & last_byte_mask); -+ } else { -+ addr[first_byte] |= first_byte_mask; -+ addr[last_byte] |= last_byte_mask; -+ } -+} -+ -+#define ADLER_BASE 65521 -+#define ADLER_NMAX 5552 -+ -+/* Calculates the adler32 checksum for the data pointed by `data` of the -+ length `len`. This function was originally taken from zlib, version 1.1.3, -+ July 9th, 1998. -+ -+ Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler -+ -+ This software is provided 'as-is', without any express or implied -+ warranty. In no event will the authors be held liable for any damages -+ arising from the use of this software. -+ -+ Permission is granted to anyone to use this software for any purpose, -+ including commercial applications, and to alter it and redistribute it -+ freely, subject to the following restrictions: -+ -+ 1. The origin of this software must not be misrepresented; you must not -+ claim that you wrote the original software. If you use this software -+ in a product, an acknowledgment in the product documentation would be -+ appreciated but is not required. -+ 2. Altered source versions must be plainly marked as such, and must not be -+ misrepresented as being the original software. -+ 3. This notice may not be removed or altered from any source distribution. -+ -+ Jean-loup Gailly Mark Adler -+ jloup@gzip.org madler@alumni.caltech.edu -+ -+ The above comment applies only to the reiser4_adler32 function. -+*/ -+ -+__u32 reiser4_adler32(char *data, __u32 len) -+{ -+ unsigned char *t = data; -+ __u32 s1 = 1; -+ __u32 s2 = 0; -+ int k; -+ -+ while (len > 0) { -+ k = len < ADLER_NMAX ? 
len : ADLER_NMAX; -+ len -= k; -+ -+ while (k--) { -+ s1 += *t++; -+ s2 += s1; -+ } -+ -+ s1 %= ADLER_BASE; -+ s2 %= ADLER_BASE; -+ } -+ return (s2 << 16) | s1; -+} -+ -+#define sb_by_bnode(bnode) \ -+ ((struct super_block *)jnode_get_tree(bnode->wjnode)->super) -+ -+static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size) -+{ -+ return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size)); -+} -+ -+static int -+bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size) -+{ -+ if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) { -+ bmap_nr_t bmap; -+ -+ bmap = bnode - get_bnode(sb_by_bnode(bnode), 0); -+ -+ warning("vpf-263", -+ "Checksum for the bitmap block %llu is incorrect", -+ bmap); -+ -+ return RETERR(-EIO); -+ } -+ -+ return 0; -+} -+ -+#define REISER4_CHECK_BMAP_CRC (0) -+ -+#if REISER4_CHECK_BMAP_CRC -+static int bnode_check_crc(const struct bitmap_node *bnode) -+{ -+ return bnode_check_adler32(bnode, -+ bmap_size(sb_by_bnode(bnode)->s_blocksize)); -+} -+ -+/* REISER4_CHECK_BMAP_CRC */ -+#else -+ -+#define bnode_check_crc(bnode) (0) -+ -+/* REISER4_CHECK_BMAP_CRC */ -+#endif -+ -+/* Recalculates the adler32 checksum for only 1 byte change. -+ adler - previous adler checksum -+ old_data, data - old, new byte values. -+ tail == (chunk - offset) : length, checksum was calculated for, - offset of -+ the changed byte within this chunk. -+ This function can be used for checksum calculation optimisation. -+*/ -+ -+static __u32 -+adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data, -+ __u32 tail) -+{ -+ __u32 delta = data - old_data + 2 * ADLER_BASE; -+ __u32 s1 = adler & 0xffff; -+ __u32 s2 = (adler >> 16) & 0xffff; -+ -+ s1 = (delta + s1) % ADLER_BASE; -+ s2 = (delta * tail + s2) % ADLER_BASE; -+ -+ return (s2 << 16) | s1; -+} -+ -+#define LIMIT(val, boundary) ((val) > (boundary) ? 
(boundary) : (val)) -+ -+/** -+ * get_nr_bitmap - calculate number of bitmap blocks -+ * @super: super block with initialized blocksize and block count -+ * -+ * Calculates number of bitmap blocks of a filesystem which uses bitmaps to -+ * maintain free disk space. It assumes that each bitmap addresses the same -+ * number of blocks which is calculated by bmap_block_count macro defined in -+ * above. Number of blocks in the filesystem has to be initialized in reiser4 -+ * private data of super block already so that it can be obtained via -+ * reiser4_block_count(). Unfortunately, number of blocks addressed by a bitmap -+ * is not power of 2 because 4 bytes are used for checksum. Therefore, we have -+ * to use special function to divide and modulo 64bits filesystem block -+ * counters. -+ * -+ * Example: suppose filesystem have 32768 blocks. Blocksize is 4096. Each bitmap -+ * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address -+ * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2. -+ */ -+static bmap_nr_t get_nr_bmap(const struct super_block *super) -+{ -+ u64 quotient; -+ -+ assert("zam-393", reiser4_block_count(super) != 0); -+ -+ quotient = reiser4_block_count(super) - 1; -+ do_div(quotient, bmap_bit_count(super->s_blocksize)); -+ return quotient + 1; -+} -+ -+/** -+ * parse_blocknr - calculate bitmap number and offset in it by block number -+ * @block: pointer to block number to calculate location in bitmap of -+ * @bmap: pointer where to store bitmap block number -+ * @offset: pointer where to store offset within bitmap block -+ * -+ * Calculates location of bit which is responsible for allocation/freeing of -+ * block @*block. That location is represented by bitmap block number and offset -+ * within that bitmap block. 
-+ */ -+static void -+parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap, -+ bmap_off_t *offset) -+{ -+ struct super_block *super = get_current_context()->super; -+ u64 quotient = *block; -+ -+ *offset = do_div(quotient, bmap_bit_count(super->s_blocksize)); -+ *bmap = quotient; -+ -+ assert("zam-433", *bmap < get_nr_bmap(super)); -+ assert("", *offset < bmap_bit_count(super->s_blocksize)); -+} -+ -+#if REISER4_DEBUG -+/* Audited by: green(2002.06.12) */ -+static void -+check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ -+ assert("zam-436", sb != NULL); -+ -+ assert("zam-455", start != NULL); -+ assert("zam-437", *start != 0); -+ assert("zam-541", !reiser4_blocknr_is_fake(start)); -+ assert("zam-441", *start < reiser4_block_count(sb)); -+ -+ if (len != NULL) { -+ assert("zam-438", *len != 0); -+ assert("zam-442", *start + *len <= reiser4_block_count(sb)); -+ } -+} -+ -+static void check_bnode_loaded(const struct bitmap_node *bnode) -+{ -+ assert("zam-485", bnode != NULL); -+ assert("zam-483", jnode_page(bnode->wjnode) != NULL); -+ assert("zam-484", jnode_page(bnode->cjnode) != NULL); -+ assert("nikita-2820", jnode_is_loaded(bnode->wjnode)); -+ assert("nikita-2821", jnode_is_loaded(bnode->cjnode)); -+} -+ -+#else -+ -+# define check_block_range(start, len) do { /* nothing */} while(0) -+# define check_bnode_loaded(bnode) do { /* nothing */} while(0) -+ -+#endif -+ -+/* modify bnode->first_zero_bit (if we free bits before); bnode should be -+ spin-locked */ -+static inline void -+adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset) -+{ -+ if (offset < bnode->first_zero_bit) -+ bnode->first_zero_bit = offset; -+} -+ -+/* return a physical disk address for logical bitmap number @bmap */ -+/* FIXME-VS: this is somehow related to disk layout? */ -+/* ZAM-FIXME-HANS: your answer is? 
Use not more than one function dereference -+ * per block allocation so that performance is not affected. Probably this -+ * whole file should be considered part of the disk layout plugin, and other -+ * disk layouts can use other defines and efficiency will not be significantly -+ * affected. */ -+ -+#define REISER4_FIRST_BITMAP_BLOCK \ -+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2) -+ -+/* Audited by: green(2002.06.12) */ -+static void -+get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap, -+ reiser4_block_nr * bnr) -+{ -+ -+ assert("zam-390", bmap < get_nr_bmap(super)); -+ -+#ifdef CONFIG_REISER4_BADBLOCKS -+#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff)) -+ /* Check if the diskmap have this already, first. */ -+ if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0) -+ return; /* Found it in diskmap */ -+#endif -+ /* FIXME_ZAM: before discussing of disk layouts and disk format -+ plugins I implement bitmap location scheme which is close to scheme -+ used in reiser 3.6 */ -+ if (bmap == 0) { -+ *bnr = REISER4_FIRST_BITMAP_BLOCK; -+ } else { -+ *bnr = bmap * bmap_bit_count(super->s_blocksize); -+ } -+} -+ -+/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */ -+/* Audited by: green(2002.06.12) */ -+static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr) -+{ -+ *bnr = -+ (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) | -+ REISER4_BITMAP_BLOCKS_STATUS_VALUE); -+} -+ -+/* bnode structure initialization */ -+static void -+init_bnode(struct bitmap_node *bnode, -+ struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG) -+{ -+ memset(bnode, 0, sizeof(struct bitmap_node)); -+ -+ mutex_init(&bnode->mutex); -+ atomic_set(&bnode->loaded, 0); -+} -+ -+static void release(jnode * node) -+{ -+ jrelse(node); -+ JF_SET(node, JNODE_HEARD_BANSHEE); -+ jput(node); -+} -+ -+/* This function is for internal bitmap.c use because it assumes that jnode is -+ in under full 
control of this thread */ -+static void done_bnode(struct bitmap_node *bnode) -+{ -+ if (bnode) { -+ atomic_set(&bnode->loaded, 0); -+ if (bnode->wjnode != NULL) -+ release(bnode->wjnode); -+ if (bnode->cjnode != NULL) -+ release(bnode->cjnode); -+ bnode->wjnode = bnode->cjnode = NULL; -+ } -+} -+ -+/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/ -+static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret, -+ jnode **wjnode_ret) -+{ -+ struct super_block *super; -+ jnode *cjnode; -+ jnode *wjnode; -+ bmap_nr_t bmap; -+ int ret; -+ -+ super = reiser4_get_current_sb(); -+ -+ *wjnode_ret = wjnode = bnew(); -+ if (wjnode == NULL) { -+ *cjnode_ret = NULL; -+ return RETERR(-ENOMEM); -+ } -+ -+ *cjnode_ret = cjnode = bnew(); -+ if (cjnode == NULL) -+ return RETERR(-ENOMEM); -+ -+ bmap = bnode - get_bnode(super, 0); -+ -+ get_working_bitmap_blocknr(bmap, &wjnode->blocknr); -+ get_bitmap_blocknr(super, bmap, &cjnode->blocknr); -+ -+ jref(cjnode); -+ jref(wjnode); -+ -+ /* load commit bitmap */ -+ ret = jload_gfp(cjnode, GFP_NOFS, 1); -+ -+ if (ret) -+ goto error; -+ -+ /* allocate memory for working bitmap block. Note that for -+ * bitmaps jinit_new() doesn't actually modifies node content, -+ * so parallel calls to this are ok. */ -+ ret = jinit_new(wjnode, GFP_NOFS); -+ -+ if (ret != 0) { -+ jrelse(cjnode); -+ goto error; -+ } -+ -+ return 0; -+ -+ error: -+ jput(cjnode); -+ jput(wjnode); -+ *wjnode_ret = *cjnode_ret = NULL; -+ return ret; -+ -+} -+ -+/* Check the bnode data on read. */ -+static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize) -+{ -+ void *data; -+ int ret; -+ -+ /* Check CRC */ -+ ret = bnode_check_adler32(bnode, blksize); -+ -+ if (ret) { -+ return ret; -+ } -+ -+ data = jdata(bnode->cjnode) + CHECKSUM_SIZE; -+ -+ /* Check the very first bit -- it must be busy. 
*/ -+ if (!reiser4_test_bit(0, data)) { -+ warning("vpf-1362", "The allocator block %llu is not marked " -+ "as used.", (unsigned long long)bnode->cjnode->blocknr); -+ -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+/* load bitmap blocks "on-demand" */ -+static int load_and_lock_bnode(struct bitmap_node *bnode) -+{ -+ int ret; -+ -+ jnode *cjnode; -+ jnode *wjnode; -+ -+ assert("nikita-3040", reiser4_schedulable()); -+ -+/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not -+ * need to be atomic, right? Just leave a comment that if bitmaps were -+ * unloadable, this would need to be atomic. */ -+ if (atomic_read(&bnode->loaded)) { -+ /* bitmap is already loaded, nothing to do */ -+ check_bnode_loaded(bnode); -+ mutex_lock(&bnode->mutex); -+ assert("nikita-2827", atomic_read(&bnode->loaded)); -+ return 0; -+ } -+ -+ ret = prepare_bnode(bnode, &cjnode, &wjnode); -+ if (ret == 0) { -+ mutex_lock(&bnode->mutex); -+ -+ if (!atomic_read(&bnode->loaded)) { -+ assert("nikita-2822", cjnode != NULL); -+ assert("nikita-2823", wjnode != NULL); -+ assert("nikita-2824", jnode_is_loaded(cjnode)); -+ assert("nikita-2825", jnode_is_loaded(wjnode)); -+ -+ bnode->wjnode = wjnode; -+ bnode->cjnode = cjnode; -+ -+ ret = check_struct_bnode(bnode, current_blocksize); -+ if (!ret) { -+ cjnode = wjnode = NULL; -+ atomic_set(&bnode->loaded, 1); -+ /* working bitmap is initialized by on-disk -+ * commit bitmap. This should be performed -+ * under mutex. */ -+ memcpy(bnode_working_data(bnode), -+ bnode_commit_data(bnode), -+ bmap_size(current_blocksize)); -+ } else -+ mutex_unlock(&bnode->mutex); -+ } else -+ /* race: someone already loaded bitmap while we were -+ * busy initializing data. 
*/ -+ check_bnode_loaded(bnode); -+ } -+ -+ if (wjnode != NULL) { -+ release(wjnode); -+ bnode->wjnode = NULL; -+ } -+ if (cjnode != NULL) { -+ release(cjnode); -+ bnode->cjnode = NULL; -+ } -+ -+ return ret; -+} -+ -+static void release_and_unlock_bnode(struct bitmap_node *bnode) -+{ -+ check_bnode_loaded(bnode); -+ mutex_unlock(&bnode->mutex); -+} -+ -+/* This function does all block allocation work but only for one bitmap -+ block.*/ -+/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap -+ block responsibility zone boundaries. This had no sense in v3.6 but may -+ have it in v4.x */ -+/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */ -+static int -+search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset, -+ bmap_off_t max_offset, int min_len, int max_len) -+{ -+ struct super_block *super = get_current_context()->super; -+ struct bitmap_node *bnode = get_bnode(super, bmap); -+ -+ char *data; -+ -+ bmap_off_t search_end; -+ bmap_off_t start; -+ bmap_off_t end; -+ -+ int set_first_zero_bit = 0; -+ -+ int ret; -+ -+ assert("zam-364", min_len > 0); -+ assert("zam-365", max_len >= min_len); -+ assert("zam-366", *offset <= max_offset); -+ -+ ret = load_and_lock_bnode(bnode); -+ -+ if (ret) -+ return ret; -+ -+ data = bnode_working_data(bnode); -+ -+ start = *offset; -+ -+ if (bnode->first_zero_bit >= start) { -+ start = bnode->first_zero_bit; -+ set_first_zero_bit = 1; -+ } -+ -+ while (start + min_len < max_offset) { -+ -+ start = -+ reiser4_find_next_zero_bit((long *)data, max_offset, start); -+ if (set_first_zero_bit) { -+ bnode->first_zero_bit = start; -+ set_first_zero_bit = 0; -+ } -+ if (start >= max_offset) -+ break; -+ -+ search_end = LIMIT(start + max_len, max_offset); -+ end = -+ reiser4_find_next_set_bit((long *)data, search_end, start); -+ if (end >= start + min_len) { -+ /* we can't trust find_next_set_bit result if set bit -+ was not fount, result may be bigger than -+ max_offset */ -+ if (end > search_end) -+ 
end = search_end; -+ -+ ret = end - start; -+ *offset = start; -+ -+ reiser4_set_bits(data, start, end); -+ -+ /* FIXME: we may advance first_zero_bit if [start, -+ end] region overlaps the first_zero_bit point */ -+ -+ break; -+ } -+ -+ start = end + 1; -+ } -+ -+ release_and_unlock_bnode(bnode); -+ -+ return ret; -+} -+ -+static int -+search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset, -+ bmap_off_t end_offset, int min_len, int max_len) -+{ -+ struct super_block *super = get_current_context()->super; -+ struct bitmap_node *bnode = get_bnode(super, bmap); -+ char *data; -+ bmap_off_t start; -+ int ret; -+ -+ assert("zam-958", min_len > 0); -+ assert("zam-959", max_len >= min_len); -+ assert("zam-960", *start_offset >= end_offset); -+ -+ ret = load_and_lock_bnode(bnode); -+ if (ret) -+ return ret; -+ -+ data = bnode_working_data(bnode); -+ start = *start_offset; -+ -+ while (1) { -+ bmap_off_t end, search_end; -+ -+ /* Find the beginning of the zero filled region */ -+ if (reiser4_find_last_zero_bit(&start, data, end_offset, start)) -+ break; -+ /* Is there more than `min_len' bits from `start' to -+ * `end_offset'? */ -+ if (start < end_offset + min_len - 1) -+ break; -+ -+ /* Do not search to `end_offset' if we need to find less than -+ * `max_len' zero bits. */ -+ if (end_offset + max_len - 1 < start) -+ search_end = start - max_len + 1; -+ else -+ search_end = end_offset; -+ -+ if (reiser4_find_last_set_bit(&end, data, search_end, start)) -+ end = search_end; -+ else -+ end++; -+ -+ if (end + min_len <= start + 1) { -+ if (end < search_end) -+ end = search_end; -+ ret = start - end + 1; -+ *start_offset = end; /* `end' is lowest offset */ -+ assert("zam-987", -+ reiser4_find_next_set_bit(data, start + 1, -+ end) >= start + 1); -+ reiser4_set_bits(data, end, start + 1); -+ break; -+ } -+ -+ if (end <= end_offset) -+ /* left search boundary reached. 
*/ -+ break; -+ start = end - 1; -+ } -+ -+ release_and_unlock_bnode(bnode); -+ return ret; -+} -+ -+/* allocate contiguous range of blocks in bitmap */ -+static int bitmap_alloc_forward(reiser4_block_nr * start, -+ const reiser4_block_nr * end, int min_len, -+ int max_len) -+{ -+ bmap_nr_t bmap, end_bmap; -+ bmap_off_t offset, end_offset; -+ int len; -+ -+ reiser4_block_nr tmp; -+ -+ struct super_block *super = get_current_context()->super; -+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize); -+ -+ parse_blocknr(start, &bmap, &offset); -+ -+ tmp = *end - 1; -+ parse_blocknr(&tmp, &end_bmap, &end_offset); -+ ++end_offset; -+ -+ assert("zam-358", end_bmap >= bmap); -+ assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset)); -+ -+ for (; bmap < end_bmap; bmap++, offset = 0) { -+ len = -+ search_one_bitmap_forward(bmap, &offset, max_offset, -+ min_len, max_len); -+ if (len != 0) -+ goto out; -+ } -+ -+ len = -+ search_one_bitmap_forward(bmap, &offset, end_offset, min_len, -+ max_len); -+ out: -+ *start = bmap * max_offset + offset; -+ return len; -+} -+ -+/* allocate contiguous range of blocks in bitmap (from @start to @end in -+ * backward direction) */ -+static int bitmap_alloc_backward(reiser4_block_nr * start, -+ const reiser4_block_nr * end, int min_len, -+ int max_len) -+{ -+ bmap_nr_t bmap, end_bmap; -+ bmap_off_t offset, end_offset; -+ int len; -+ struct super_block *super = get_current_context()->super; -+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize); -+ -+ parse_blocknr(start, &bmap, &offset); -+ parse_blocknr(end, &end_bmap, &end_offset); -+ -+ assert("zam-961", end_bmap <= bmap); -+ assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset)); -+ -+ for (; bmap > end_bmap; bmap--, offset = max_offset - 1) { -+ len = -+ search_one_bitmap_backward(bmap, &offset, 0, min_len, -+ max_len); -+ if (len != 0) -+ goto out; -+ } -+ -+ len = -+ search_one_bitmap_backward(bmap, &offset, end_offset, min_len, -+ max_len); 
-+ out: -+ *start = bmap * max_offset + offset; -+ return len; -+} -+ -+/* plugin->u.space_allocator.alloc_blocks() */ -+static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed, -+ reiser4_block_nr *start, reiser4_block_nr *len) -+{ -+ struct super_block *super = get_current_context()->super; -+ int actual_len; -+ -+ reiser4_block_nr search_start; -+ reiser4_block_nr search_end; -+ -+ assert("zam-398", super != NULL); -+ assert("zam-412", hint != NULL); -+ assert("zam-397", hint->blk <= reiser4_block_count(super)); -+ -+ if (hint->max_dist == 0) -+ search_end = reiser4_block_count(super); -+ else -+ search_end = -+ LIMIT(hint->blk + hint->max_dist, -+ reiser4_block_count(super)); -+ -+ /* We use @hint -> blk as a search start and search from it to the end -+ of the disk or in given region if @hint -> max_dist is not zero */ -+ search_start = hint->blk; -+ -+ actual_len = -+ bitmap_alloc_forward(&search_start, &search_end, 1, needed); -+ -+ /* There is only one bitmap search if max_dist was specified or first -+ pass was from the beginning of the bitmap. We also do one pass for -+ scanning bitmap in backward direction. 
*/ -+ if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) { -+ /* next step is a scanning from 0 to search_start */ -+ search_end = search_start; -+ search_start = 0; -+ actual_len = -+ bitmap_alloc_forward(&search_start, &search_end, 1, needed); -+ } -+ if (actual_len == 0) -+ return RETERR(-ENOSPC); -+ if (actual_len < 0) -+ return RETERR(actual_len); -+ *len = actual_len; -+ *start = search_start; -+ return 0; -+} -+ -+static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed, -+ reiser4_block_nr * start, -+ reiser4_block_nr * len) -+{ -+ reiser4_block_nr search_start; -+ reiser4_block_nr search_end; -+ int actual_len; -+ -+ ON_DEBUG(struct super_block *super = reiser4_get_current_sb()); -+ -+ assert("zam-969", super != NULL); -+ assert("zam-970", hint != NULL); -+ assert("zam-971", hint->blk <= reiser4_block_count(super)); -+ -+ search_start = hint->blk; -+ if (hint->max_dist == 0 || search_start <= hint->max_dist) -+ search_end = 0; -+ else -+ search_end = search_start - hint->max_dist; -+ -+ actual_len = -+ bitmap_alloc_backward(&search_start, &search_end, 1, needed); -+ if (actual_len == 0) -+ return RETERR(-ENOSPC); -+ if (actual_len < 0) -+ return RETERR(actual_len); -+ *len = actual_len; -+ *start = search_start; -+ return 0; -+} -+ -+/* plugin->u.space_allocator.alloc_blocks() */ -+int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator, -+ reiser4_blocknr_hint * hint, int needed, -+ reiser4_block_nr * start, reiser4_block_nr * len) -+{ -+ if (hint->backward) -+ return alloc_blocks_backward(hint, needed, start, len); -+ return alloc_blocks_forward(hint, needed, start, len); -+} -+ -+/* plugin->u.space_allocator.dealloc_blocks(). */ -+/* It just frees blocks in WORKING BITMAP. Usually formatted an unformatted -+ nodes deletion is deferred until transaction commit. 
However, deallocation -+ of temporary objects like wandered blocks and transaction commit records -+ requires immediate node deletion from WORKING BITMAP.*/ -+void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator, -+ reiser4_block_nr start, reiser4_block_nr len) -+{ -+ struct super_block *super = reiser4_get_current_sb(); -+ -+ bmap_nr_t bmap; -+ bmap_off_t offset; -+ -+ struct bitmap_node *bnode; -+ int ret; -+ -+ assert("zam-468", len != 0); -+ check_block_range(&start, &len); -+ -+ parse_blocknr(&start, &bmap, &offset); -+ -+ assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize)); -+ -+ bnode = get_bnode(super, bmap); -+ -+ assert("zam-470", bnode != NULL); -+ -+ ret = load_and_lock_bnode(bnode); -+ assert("zam-481", ret == 0); -+ -+ reiser4_clear_bits(bnode_working_data(bnode), offset, -+ (bmap_off_t) (offset + len)); -+ -+ adjust_first_zero_bit(bnode, offset); -+ -+ release_and_unlock_bnode(bnode); -+} -+ -+/* plugin->u.space_allocator.check_blocks(). 
*/ -+void reiser4_check_blocks_bitmap(const reiser4_block_nr * start, -+ const reiser4_block_nr * len, int desired) -+{ -+#if REISER4_DEBUG -+ struct super_block *super = reiser4_get_current_sb(); -+ -+ bmap_nr_t bmap; -+ bmap_off_t start_offset; -+ bmap_off_t end_offset; -+ -+ struct bitmap_node *bnode; -+ int ret; -+ -+ assert("zam-622", len != NULL); -+ check_block_range(start, len); -+ parse_blocknr(start, &bmap, &start_offset); -+ -+ end_offset = start_offset + *len; -+ assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize)); -+ -+ bnode = get_bnode(super, bmap); -+ -+ assert("nikita-2215", bnode != NULL); -+ -+ ret = load_and_lock_bnode(bnode); -+ assert("zam-626", ret == 0); -+ -+ assert("nikita-2216", jnode_is_loaded(bnode->wjnode)); -+ -+ if (desired) { -+ assert("zam-623", -+ reiser4_find_next_zero_bit(bnode_working_data(bnode), -+ end_offset, start_offset) -+ >= end_offset); -+ } else { -+ assert("zam-624", -+ reiser4_find_next_set_bit(bnode_working_data(bnode), -+ end_offset, start_offset) -+ >= end_offset); -+ } -+ -+ release_and_unlock_bnode(bnode); -+#endif -+} -+ -+/* conditional insertion of @node into atom's overwrite set if it was not there */ -+static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node) -+{ -+ assert("zam-546", atom != NULL); -+ assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT); -+ assert("zam-548", node != NULL); -+ -+ spin_lock_atom(atom); -+ spin_lock_jnode(node); -+ -+ if (node->atom == NULL) { -+ JF_SET(node, JNODE_OVRWR); -+ insert_into_atom_ovrwr_list(atom, node); -+ } else { -+ assert("zam-549", node->atom == atom); -+ } -+ -+ spin_unlock_jnode(node); -+ spin_unlock_atom(atom); -+} -+ -+/* an actor which applies delete set to COMMIT bitmap pages and link modified -+ pages in a single-linked list */ -+static int -+apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start, -+ const reiser4_block_nr * len, void *data) -+{ -+ -+ bmap_nr_t bmap; -+ bmap_off_t offset; -+ int ret; -+ 
-+ long long *blocks_freed_p = data; -+ -+ struct bitmap_node *bnode; -+ -+ struct super_block *sb = reiser4_get_current_sb(); -+ -+ check_block_range(start, len); -+ -+ parse_blocknr(start, &bmap, &offset); -+ -+ /* FIXME-ZAM: we assume that all block ranges are allocated by this -+ bitmap-based allocator and each block range can't go over a zone of -+ responsibility of one bitmap block; same assumption is used in -+ other journal hooks in bitmap code. */ -+ bnode = get_bnode(sb, bmap); -+ assert("zam-448", bnode != NULL); -+ -+ /* it is safe to unlock atom with is in ASTAGE_PRE_COMMIT */ -+ assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT); -+ ret = load_and_lock_bnode(bnode); -+ if (ret) -+ return ret; -+ -+ /* put bnode into atom's overwrite set */ -+ cond_add_to_overwrite_set(atom, bnode->cjnode); -+ -+ data = bnode_commit_data(bnode); -+ -+ ret = bnode_check_crc(bnode); -+ if (ret != 0) -+ return ret; -+ -+ if (len != NULL) { -+ /* FIXME-ZAM: a check that all bits are set should be there */ -+ assert("zam-443", -+ offset + *len <= bmap_bit_count(sb->s_blocksize)); -+ reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len)); -+ -+ (*blocks_freed_p) += *len; -+ } else { -+ reiser4_clear_bit(offset, data); -+ (*blocks_freed_p)++; -+ } -+ -+ bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize)); -+ -+ release_and_unlock_bnode(bnode); -+ -+ return 0; -+} -+ -+/* plugin->u.space_allocator.pre_commit_hook(). */ -+/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the -+ rest is done by transaction manager (allocate wandered locations for COMMIT -+ BITMAP blocks, copy COMMIT BITMAP blocks data). 
*/ -+/* Only one instance of this function can be running at one given time, because -+ only one transaction can be committed a time, therefore it is safe to access -+ some global variables without any locking */ -+ -+int reiser4_pre_commit_hook_bitmap(void) -+{ -+ struct super_block *super = reiser4_get_current_sb(); -+ txn_atom *atom; -+ -+ long long blocks_freed = 0; -+ -+ atom = get_current_atom_locked(); -+ assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT); -+ spin_unlock_atom(atom); -+ -+ { /* scan atom's captured list and find all freshly allocated nodes, -+ * mark corresponded bits in COMMIT BITMAP as used */ -+ struct list_head *head = ATOM_CLEAN_LIST(atom); -+ jnode *node = list_entry(head->next, jnode, capture_link); -+ -+ while (head != &node->capture_link) { -+ /* we detect freshly allocated jnodes */ -+ if (JF_ISSET(node, JNODE_RELOC)) { -+ int ret; -+ bmap_nr_t bmap; -+ -+ bmap_off_t offset; -+ bmap_off_t index; -+ struct bitmap_node *bn; -+ __u32 size = bmap_size(super->s_blocksize); -+ __u32 crc; -+ char byte; -+ -+ assert("zam-559", !JF_ISSET(node, JNODE_OVRWR)); -+ assert("zam-460", -+ !reiser4_blocknr_is_fake(&node->blocknr)); -+ -+ parse_blocknr(&node->blocknr, &bmap, &offset); -+ bn = get_bnode(super, bmap); -+ -+ index = offset >> 3; -+ assert("vpf-276", index < size); -+ -+ ret = bnode_check_crc(bnode); -+ if (ret != 0) -+ return ret; -+ -+ check_bnode_loaded(bn); -+ load_and_lock_bnode(bn); -+ -+ byte = *(bnode_commit_data(bn) + index); -+ reiser4_set_bit(offset, bnode_commit_data(bn)); -+ -+ crc = adler32_recalc(bnode_commit_crc(bn), byte, -+ *(bnode_commit_data(bn) + -+ index), -+ size - index), -+ bnode_set_commit_crc(bn, crc); -+ -+ release_and_unlock_bnode(bn); -+ -+ ret = bnode_check_crc(bn); -+ if (ret != 0) -+ return ret; -+ -+ /* working of this depends on how it inserts -+ new j-node into clean list, because we are -+ scanning the same list now. 
It is OK, if -+ insertion is done to the list front */ -+ cond_add_to_overwrite_set(atom, bn->cjnode); -+ } -+ -+ node = list_entry(node->capture_link.next, jnode, capture_link); -+ } -+ } -+ -+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap, -+ &blocks_freed, 0); -+ -+ blocks_freed -= atom->nr_blocks_allocated; -+ -+ { -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ sbinfo->blocks_free_committed += blocks_freed; -+ spin_unlock_reiser4_super(sbinfo); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.space_allocator.init_allocator -+ constructor of reiser4_space_allocator object. It is called on fs mount */ -+int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator, -+ struct super_block *super, void *arg) -+{ -+ struct bitmap_allocator_data *data = NULL; -+ bmap_nr_t bitmap_blocks_nr; -+ bmap_nr_t i; -+ -+ assert("nikita-3039", reiser4_schedulable()); -+ -+ /* getting memory for bitmap allocator private data holder */ -+ data = -+ kmalloc(sizeof(struct bitmap_allocator_data), -+ reiser4_ctx_gfp_mask_get()); -+ -+ if (data == NULL) -+ return RETERR(-ENOMEM); -+ -+ /* allocation and initialization for the array of bnodes */ -+ bitmap_blocks_nr = get_nr_bmap(super); -+ -+ /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps -+ which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17, -+ may I never meet someone who still uses the ia32 architecture when -+ storage devices of that size enter the market, and wants to use ia32 -+ with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and, -+ probably, another dynamic data structure should replace a static -+ array of bnodes. 
*/ -+ /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */ -+ data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr); -+ if (data->bitmap == NULL) { -+ kfree(data); -+ return RETERR(-ENOMEM); -+ } -+ -+ for (i = 0; i < bitmap_blocks_nr; i++) -+ init_bnode(data->bitmap + i, super, i); -+ -+ allocator->u.generic = data; -+ -+#if REISER4_DEBUG -+ get_super_private(super)->min_blocks_used += bitmap_blocks_nr; -+#endif -+ -+ /* Load all bitmap blocks at mount time. */ -+ if (!test_bit -+ (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) { -+ __u64 start_time, elapsed_time; -+ struct bitmap_node *bnode; -+ int ret; -+ -+ if (REISER4_DEBUG) -+ printk(KERN_INFO "loading reiser4 bitmap..."); -+ start_time = jiffies; -+ -+ for (i = 0; i < bitmap_blocks_nr; i++) { -+ bnode = data->bitmap + i; -+ ret = load_and_lock_bnode(bnode); -+ if (ret) { -+ reiser4_destroy_allocator_bitmap(allocator, -+ super); -+ return ret; -+ } -+ release_and_unlock_bnode(bnode); -+ } -+ -+ elapsed_time = jiffies - start_time; -+ if (REISER4_DEBUG) -+ printk("...done (%llu jiffies)\n", -+ (unsigned long long)elapsed_time); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.space_allocator.destroy_allocator -+ destructor. 
It is called on fs unmount */ -+int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator, -+ struct super_block *super) -+{ -+ bmap_nr_t bitmap_blocks_nr; -+ bmap_nr_t i; -+ -+ struct bitmap_allocator_data *data = allocator->u.generic; -+ -+ assert("zam-414", data != NULL); -+ assert("zam-376", data->bitmap != NULL); -+ -+ bitmap_blocks_nr = get_nr_bmap(super); -+ -+ for (i = 0; i < bitmap_blocks_nr; i++) { -+ struct bitmap_node *bnode = data->bitmap + i; -+ -+ mutex_lock(&bnode->mutex); -+ -+#if REISER4_DEBUG -+ if (atomic_read(&bnode->loaded)) { -+ jnode *wj = bnode->wjnode; -+ jnode *cj = bnode->cjnode; -+ -+ assert("zam-480", jnode_page(cj) != NULL); -+ assert("zam-633", jnode_page(wj) != NULL); -+ -+ assert("zam-634", -+ memcmp(jdata(wj), jdata(wj), -+ bmap_size(super->s_blocksize)) == 0); -+ -+ } -+#endif -+ done_bnode(bnode); -+ mutex_unlock(&bnode->mutex); -+ } -+ -+ vfree(data->bitmap); -+ kfree(data); -+ -+ allocator->u.generic = NULL; -+ -+ return 0; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.20/fs/reiser4/plugin/space/bitmap.h ---- linux-2.6.20.orig/fs/reiser4/plugin/space/bitmap.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/space/bitmap.h 2007-05-06 14:50:43.863026968 +0400 -@@ -0,0 +1,47 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__) -+#define __REISER4_PLUGIN_SPACE_BITMAP_H__ -+ -+#include "../../dformat.h" -+#include "../../block_alloc.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block */ -+/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. 
*/
-+/* declarations of functions implementing methods of space allocator plugin for
-+   bitmap based allocator. The functions themselves are in bitmap.c */
-+extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
-+					 struct super_block *, void *);
-+extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
-+					    struct super_block *);
-+extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
-+				       reiser4_blocknr_hint *, int needed,
-+				       reiser4_block_nr * start,
-+				       reiser4_block_nr * len);
-+extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
-+					const reiser4_block_nr *, int);
-+extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
-+					  reiser4_block_nr,
-+					  reiser4_block_nr);
-+extern int reiser4_pre_commit_hook_bitmap(void);
-+/* the remaining space allocator methods need no work in the bitmap allocator:
-+   they expand to empty statements */
-+#define reiser4_post_commit_hook_bitmap() do{}while(0)
-+#define reiser4_post_write_back_hook_bitmap() do{}while(0)
-+#define reiser4_print_info_bitmap(pref, al) do{}while(0)
-+
-+typedef __u64 bmap_nr_t;	/* number (index) of a bitmap block */
-+typedef __u32 bmap_off_t;	/* bit offset inside one bitmap block */
-+
-+#endif				/* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
-+
-+/* Make Linus happy.
-+   Local variables:
-+   c-indentation-style: "K&R"
-+   mode-name: "LC"
-+   c-basic-offset: 8
-+   tab-width: 8
-+   fill-column: 120
-+   scroll-step: 1
-+   End:
-+*/
-diff -urN linux-2.6.20.orig/fs/reiser4/plugin/space/Makefile linux-2.6.20/fs/reiser4/plugin/space/Makefile
---- linux-2.6.20.orig/fs/reiser4/plugin/space/Makefile	1970-01-01 03:00:00.000000000 +0300
-+++ linux-2.6.20/fs/reiser4/plugin/space/Makefile	2007-05-06 14:50:43.863026968 +0400
-@@ -0,0 +1,4 @@
-+obj-$(CONFIG_REISER4_FS) += space_plugins.o
-+
-+space_plugins-objs := \
-+	bitmap.o
-diff -urN linux-2.6.20.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.20/fs/reiser4/plugin/space/space_allocator.h
---- linux-2.6.20.orig/fs/reiser4/plugin/space/space_allocator.h	1970-01-01 03:00:00.000000000 +0300
-+++ linux-2.6.20/fs/reiser4/plugin/space/space_allocator.h	2007-05-06 14:50:43.863026968 +0400
-@@ -0,0 +1,80 @@
-+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
-+
-+#ifndef __SPACE_ALLOCATOR_H__
-+#define __SPACE_ALLOCATOR_H__
-+
-+#include "../../forward.h"
-+#include "bitmap.h"
-+/* DEF_SPACE_ALLOCATOR(allocator) generates the sa_*() inline wrappers. Each of
-+ * them forwards to the reiser4_<method>_<allocator>() function or macro of the
-+ * named allocator. The macro is instantiated exactly once below, for the bitmap
-+ * allocator.
-+ * NOTE(review): sa_destroy_allocator() and sa_pre_commit_hook() return void
-+ * although bitmap.h declares the functions they call as returning int, so that
-+ * result is dropped here - confirm this is intended. */
-+#define DEF_SPACE_ALLOCATOR(allocator)											\
-+															\
-+static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque)		\
-+{															\
-+	return reiser4_init_allocator_##allocator (al, s, opaque);							\
-+}															\
-+															\
-+static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s)				\
-+{															\
-+	reiser4_destroy_allocator_##allocator (al, s);									\
-+}															\
-+															\
-+static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint,				\
-+				   int needed, reiser4_block_nr * start, reiser4_block_nr * len)			\
-+{															\
-+	return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len);						\
-+}															\
-+static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len)	\
-+{															\
-+	reiser4_dealloc_blocks_##allocator (al, start, len);								\
-+}															\
-+															\
-+static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired)		\
-+{															\
-+	reiser4_check_blocks_##allocator (start, end, desired);								\
-+}															\
-+															\
-+static inline void sa_pre_commit_hook (void)										\
-+{															\
-+	reiser4_pre_commit_hook_##allocator ();										\
-+}															\
-+															\
-+static inline void sa_post_commit_hook (void)										\
-+{															\
-+	reiser4_post_commit_hook_##allocator ();									\
-+}															\
-+															\
-+static inline void sa_post_write_back_hook (void)									\
-+{															\
-+	reiser4_post_write_back_hook_##allocator();									\
-+}															\
-+															\
-+static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al)					\
-+{															\
-+	reiser4_print_info_##allocator (prefix, al);									\
-+}
-+
-+DEF_SPACE_ALLOCATOR(bitmap)
-+
-+/* this object is part of reiser4 private in-core super block */
-+struct reiser4_space_allocator {
-+	union {
-+		/* space allocators might use this pointer to reference their
-+		 * data. */
-+		void *generic;
-+	} u;
-+};
-+
-+/* __SPACE_ALLOCATOR_H__ */
-+#endif
-+
-+/* Make Linus happy.
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.20/fs/reiser4/plugin/tail_policy.c ---- linux-2.6.20.orig/fs/reiser4/plugin/tail_policy.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/plugin/tail_policy.c 2007-05-06 14:50:43.863026968 +0400 -@@ -0,0 +1,113 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Formatting policy plugins */ -+ -+/* -+ * Formatting policy plugin is used by object plugin (of regular file) to -+ * convert file between two representations. -+ * -+ * Currently following policies are implemented: -+ * never store file in formatted nodes -+ * always store file in formatted nodes -+ * store file in formatted nodes if file is smaller than 4 blocks (default) -+ */ -+ -+#include "../tree.h" -+#include "../inode.h" -+#include "../super.h" -+#include "object.h" -+#include "plugin.h" -+#include "node/node.h" -+#include "plugin_header.h" -+ -+#include -+#include /* For struct inode */ -+ -+/** -+ * have_formatting_never - -+ * @inode: -+ * @size: -+ * -+ * -+ */ -+/* Never store file's tail as direct item */ -+/* Audited by: green(2002.06.12) */ -+static int have_formatting_never(const struct inode *inode UNUSED_ARG -+ /* inode to operate on */ , -+ loff_t size UNUSED_ARG /* new object size */ ) -+{ -+ return 0; -+} -+ -+/* Always store file's tail as direct item */ -+/* Audited by: green(2002.06.12) */ -+static int -+have_formatting_always(const struct inode *inode UNUSED_ARG -+ /* inode to operate on */ , -+ loff_t size UNUSED_ARG /* new object size */ ) -+{ -+ return 1; -+} -+ -+/* This function makes test if we should store file denoted @inode as tails only or -+ as extents only. 
*/ -+static int -+have_formatting_default(const struct inode *inode UNUSED_ARG -+ /* inode to operate on */ , -+ loff_t size /* new object size */ ) -+{ -+ assert("umka-1253", inode != NULL); -+ -+ if (size > inode->i_sb->s_blocksize * 4) -+ return 0; -+ -+ return 1; -+} -+ -+/* tail plugins */ -+formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = { -+ [NEVER_TAILS_FORMATTING_ID] = { -+ .h = { -+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, -+ .id = NEVER_TAILS_FORMATTING_ID, -+ .pops = NULL, -+ .label = "never", -+ .desc = "Never store file's tail", -+ .linkage = {NULL, NULL} -+ }, -+ .have_tail = have_formatting_never -+ }, -+ [ALWAYS_TAILS_FORMATTING_ID] = { -+ .h = { -+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, -+ .id = ALWAYS_TAILS_FORMATTING_ID, -+ .pops = NULL, -+ .label = "always", -+ .desc = "Always store file's tail", -+ .linkage = {NULL, NULL} -+ }, -+ .have_tail = have_formatting_always -+ }, -+ [SMALL_FILE_FORMATTING_ID] = { -+ .h = { -+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, -+ .id = SMALL_FILE_FORMATTING_ID, -+ .pops = NULL, -+ .label = "4blocks", -+ .desc = "store files shorter than 4 blocks in tail items", -+ .linkage = {NULL, NULL} -+ }, -+ .have_tail = have_formatting_default -+ } -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/pool.c linux-2.6.20/fs/reiser4/pool.c ---- linux-2.6.20.orig/fs/reiser4/pool.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/pool.c 2007-05-06 14:50:43.863026968 +0400 -@@ -0,0 +1,234 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Fast pool allocation. -+ -+ There are situations when some sub-system normally asks memory allocator -+ for only few objects, but under some circumstances could require much -+ more. Typical and actually motivating example is tree balancing. 
It needs
-+   to keep track of the nodes that are involved in it, and it is well known
-+   that in a reasonably packed balanced tree most (92.938121 percent) of all
-+   balancings end after working with only a few nodes (3.141592 on
-+   average). But in rare cases balancing can involve many more nodes
-+   (3*tree_height+1 in the extreme situation).
-+
-+   On the one hand, we don't want to resort to dynamic allocation (slab,
-+   malloc(), etc.) to allocate the data structures required to keep track of
-+   nodes during balancing. On the other hand, we cannot statically allocate
-+   the required amount of space on the stack, because first: it is a useless
-+   waste of a precious resource, and second: this amount is unknown in advance
-+   (tree height can change).
-+
-+   Pools, implemented in this file, are a solution to this problem:
-+
-+   - some configurable amount of objects is statically preallocated on the
-+   stack
-+
-+   - if this preallocated pool is exhausted and more objects are requested,
-+   they are allocated dynamically.
-+
-+   Pools encapsulate the distinction between statically and dynamically
-+   allocated objects. Both allocation and recycling look exactly the same.
-+
-+   To keep track of dynamically allocated objects, pool adds its own linkage
-+   to each object.
-+
-+   NOTE-NIKITA This linkage also contains some balancing-specific data. This
-+   is not perfect. On the other hand, balancing is currently the only client
-+   of pool code.
-+
-+   NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
-+   functions in the style of tslist/tshash, i.e., make them unreadable, but
-+   type-safe.
-+ -+*/ -+ -+#include "debug.h" -+#include "pool.h" -+#include "super.h" -+ -+#include -+#include -+ -+/* initialize new pool object */ -+static void reiser4_init_pool_obj(reiser4_pool_header * h /* pool object to -+ * initialize */ ) -+{ -+ INIT_LIST_HEAD(&h->usage_linkage); -+ INIT_LIST_HEAD(&h->level_linkage); -+ INIT_LIST_HEAD(&h->extra_linkage); -+} -+ -+/* initialize new pool */ -+void reiser4_init_pool(reiser4_pool * pool /* pool to initialize */ , -+ size_t obj_size /* size of objects in @pool */ , -+ int num_of_objs /* number of preallocated objects */ , -+ char *data /* area for preallocated objects */ ) -+{ -+ reiser4_pool_header *h; -+ int i; -+ -+ assert("nikita-955", pool != NULL); -+ assert("nikita-1044", obj_size > 0); -+ assert("nikita-956", num_of_objs >= 0); -+ assert("nikita-957", data != NULL); -+ -+ memset(pool, 0, sizeof *pool); -+ pool->obj_size = obj_size; -+ pool->data = data; -+ INIT_LIST_HEAD(&pool->free); -+ INIT_LIST_HEAD(&pool->used); -+ INIT_LIST_HEAD(&pool->extra); -+ memset(data, 0, obj_size * num_of_objs); -+ for (i = 0; i < num_of_objs; ++i) { -+ h = (reiser4_pool_header *) (data + i * obj_size); -+ reiser4_init_pool_obj(h); -+ /* add pool header to the end of pool's free list */ -+ list_add_tail(&h->usage_linkage, &pool->free); -+ } -+} -+ -+/* release pool resources -+ -+ Release all resources acquired by this pool, specifically, dynamically -+ allocated objects. -+ -+*/ -+void reiser4_done_pool(reiser4_pool * pool UNUSED_ARG /* pool to destroy */ ) -+{ -+} -+ -+/* allocate carry object from pool -+ -+ First, try to get preallocated object. If this fails, resort to dynamic -+ allocation. 
-+ -+*/ -+static void *reiser4_pool_alloc(reiser4_pool * pool /* pool to allocate object -+ * from */ ) -+{ -+ reiser4_pool_header *result; -+ -+ assert("nikita-959", pool != NULL); -+ -+ if (!list_empty(&pool->free)) { -+ struct list_head *linkage; -+ -+ linkage = pool->free.next; -+ list_del(linkage); -+ INIT_LIST_HEAD(linkage); -+ result = list_entry(linkage, reiser4_pool_header, usage_linkage); -+ BUG_ON(!list_empty(&result->level_linkage) || -+ !list_empty(&result->extra_linkage)); -+ } else { -+ /* pool is empty. Extra allocations don't deserve dedicated -+ slab to be served from, as they are expected to be rare. */ -+ result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get()); -+ if (result != 0) { -+ reiser4_init_pool_obj(result); -+ list_add(&result->extra_linkage, &pool->extra); -+ } else -+ return ERR_PTR(RETERR(-ENOMEM)); -+ BUG_ON(!list_empty(&result->usage_linkage) || -+ !list_empty(&result->level_linkage)); -+ } -+ ++pool->objs; -+ list_add(&result->usage_linkage, &pool->used); -+ memset(result + 1, 0, pool->obj_size - sizeof *result); -+ return result; -+} -+ -+/* return object back to the pool */ -+void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h /* pool to return object back -+ * into */ ) -+{ -+ assert("nikita-961", h != NULL); -+ assert("nikita-962", pool != NULL); -+ -+ --pool->objs; -+ assert("nikita-963", pool->objs >= 0); -+ -+ list_del_init(&h->usage_linkage); -+ list_del_init(&h->level_linkage); -+ -+ if (list_empty(&h->extra_linkage)) -+ /* -+ * pool header is not an extra one. Push it onto free list -+ * using usage_linkage -+ */ -+ list_add(&h->usage_linkage, &pool->free); -+ else { -+ /* remove pool header from pool's extra list and kfree it */ -+ list_del(&h->extra_linkage); -+ kfree(h); -+ } -+} -+ -+/* add new object to the carry level list -+ -+ Carry level is FIFO most of the time, but not always. 
Complications arise -+ when make_space() function tries to go to the left neighbor and thus adds -+ carry node before existing nodes, and also, when updating delimiting keys -+ after moving data between two nodes, we want left node to be locked before -+ right node. -+ -+ Latter case is confusing at the first glance. Problem is that COP_UPDATE -+ opration that updates delimiting keys is sometimes called with two nodes -+ (when data are moved between two nodes) and sometimes with only one node -+ (when leftmost item is deleted in a node). In any case operation is -+ supplied with at least node whose left delimiting key is to be updated -+ (that is "right" node). -+ -+*/ -+reiser4_pool_header *reiser4_add_obj(reiser4_pool * pool /* pool from which to -+ * allocate new object -+ */, -+ struct list_head *list /* list where to add -+ * object */, -+ pool_ordering order /* where to add */, -+ reiser4_pool_header * reference -+ /* after (or before) which existing object -+ to add */) -+{ -+ reiser4_pool_header *result; -+ -+ assert("nikita-972", pool != NULL); -+ -+ result = reiser4_pool_alloc(pool); -+ if (IS_ERR(result)) -+ return result; -+ -+ assert("nikita-973", result != NULL); -+ -+ switch (order) { -+ case POOLO_BEFORE: -+ __list_add(&result->level_linkage, -+ reference->level_linkage.prev, -+ &reference->level_linkage); -+ break; -+ case POOLO_AFTER: -+ __list_add(&result->level_linkage, -+ &reference->level_linkage, -+ reference->level_linkage.next); -+ break; -+ case POOLO_LAST: -+ list_add_tail(&result->level_linkage, list); -+ break; -+ case POOLO_FIRST: -+ list_add(&result->level_linkage, list); -+ break; -+ default: -+ wrong_return_value("nikita-927", "order"); -+ } -+ return result; -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/pool.h linux-2.6.20/fs/reiser4/pool.h ---- linux-2.6.20.orig/fs/reiser4/pool.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/pool.h 2007-05-06 14:50:43.863026968 +0400 -@@ -0,0 +1,55 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Fast pool allocation */ -+ -+#ifndef __REISER4_POOL_H__ -+#define __REISER4_POOL_H__ -+ -+#include -+ -+typedef struct reiser4_pool { -+ size_t obj_size; -+ int objs; -+ char *data; -+ struct list_head free; -+ struct list_head used; -+ struct list_head extra; -+} reiser4_pool; -+ -+typedef struct reiser4_pool_header { -+ /* object is either on free or "used" lists */ -+ struct list_head usage_linkage; -+ struct list_head level_linkage; -+ struct list_head extra_linkage; -+} reiser4_pool_header; -+ -+typedef enum { -+ POOLO_BEFORE, -+ POOLO_AFTER, -+ POOLO_LAST, -+ POOLO_FIRST -+} pool_ordering; -+ -+/* pool manipulation functions */ -+ -+extern void reiser4_init_pool(reiser4_pool * pool, size_t obj_size, -+ int num_of_objs, char *data); -+extern void reiser4_done_pool(reiser4_pool * pool); -+extern void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h); -+reiser4_pool_header *reiser4_add_obj(reiser4_pool * pool, -+ struct list_head * list, -+ pool_ordering order, -+ reiser4_pool_header * reference); -+ -+/* __REISER4_POOL_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/readahead.c linux-2.6.20/fs/reiser4/readahead.c ---- linux-2.6.20.orig/fs/reiser4/readahead.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/readahead.c 2007-05-06 14:50:43.867028218 +0400 -@@ -0,0 +1,138 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "forward.h" -+#include "tree.h" -+#include "tree_walk.h" -+#include "super.h" -+#include "inode.h" -+#include "key.h" -+#include "znode.h" -+ -+#include /* for totalram_pages */ -+ -+void reiser4_init_ra_info(ra_info_t * rai) -+{ -+ rai->key_to_stop = *reiser4_min_key(); -+} -+ -+/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */ -+static inline int ra_adjacent_only(int flags) -+{ -+ return flags & RA_ADJACENT_ONLY; -+} -+ -+/* this is used by formatted_readahead to decide whether read for right neighbor of node is to be issued. It returns 1 -+ if right neighbor's first key is less or equal to readahead's stop key */ -+static int should_readahead_neighbor(znode * node, ra_info_t * info) -+{ -+ int result; -+ -+ read_lock_dk(znode_get_tree(node)); -+ result = keyle(znode_get_rd_key(node), &info->key_to_stop); -+ read_unlock_dk(znode_get_tree(node)); -+ return result; -+} -+ -+#define LOW_MEM_PERCENTAGE (5) -+ -+static int low_on_memory(void) -+{ -+ unsigned int freepages; -+ -+ freepages = nr_free_pages(); -+ return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100); -+} -+ -+/* start read for @node and for a few of its right neighbors */ -+void formatted_readahead(znode * node, ra_info_t * info) -+{ -+ ra_params_t *ra_params; -+ znode *cur; -+ int i; -+ int grn_flags; -+ lock_handle next_lh; -+ -+ /* do nothing if node block number has not been assigned to node (which means it is still in cache). 
*/ -+ if (reiser4_blocknr_is_fake(znode_get_block(node))) -+ return; -+ -+ ra_params = get_current_super_ra_params(); -+ -+ if (znode_page(node) == NULL) -+ jstartio(ZJNODE(node)); -+ -+ if (znode_get_level(node) != LEAF_LEVEL) -+ return; -+ -+ /* don't waste memory for read-ahead when low on memory */ -+ if (low_on_memory()) -+ return; -+ -+ /* We can have locked nodes on upper tree levels, in this situation lock -+ priorities do not help to resolve deadlocks, we have to use TRY_LOCK -+ here. */ -+ grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK); -+ -+ i = 0; -+ cur = zref(node); -+ init_lh(&next_lh); -+ while (i < ra_params->max) { -+ const reiser4_block_nr *nextblk; -+ -+ if (!should_readahead_neighbor(cur, info)) -+ break; -+ -+ if (reiser4_get_right_neighbor -+ (&next_lh, cur, ZNODE_READ_LOCK, grn_flags)) -+ break; -+ -+ nextblk = znode_get_block(next_lh.node); -+ if (reiser4_blocknr_is_fake(nextblk) || -+ (ra_adjacent_only(ra_params->flags) -+ && *nextblk != *znode_get_block(cur) + 1)) { -+ break; -+ } -+ -+ zput(cur); -+ cur = zref(next_lh.node); -+ done_lh(&next_lh); -+ if (znode_page(cur) == NULL) -+ jstartio(ZJNODE(cur)); -+ else -+ /* Do not scan read-ahead window if pages already -+ * allocated (and i/o already started). 
*/ -+ break; -+ -+ i++; -+ } -+ zput(cur); -+ done_lh(&next_lh); -+} -+ -+void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap) -+{ -+ reiser4_key *stop_key; -+ -+ assert("nikita-3542", dir != NULL); -+ assert("nikita-3543", tap != NULL); -+ -+ stop_key = &tap->ra_info.key_to_stop; -+ /* initialize readdir readahead information: include into readahead -+ * stat data of all files of the directory */ -+ set_key_locality(stop_key, get_inode_oid(dir)); -+ set_key_type(stop_key, KEY_SD_MINOR); -+ set_key_ordering(stop_key, get_key_ordering(reiser4_max_key())); -+ set_key_objectid(stop_key, get_key_objectid(reiser4_max_key())); -+ set_key_offset(stop_key, get_key_offset(reiser4_max_key())); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/readahead.h linux-2.6.20/fs/reiser4/readahead.h ---- linux-2.6.20.orig/fs/reiser4/readahead.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/readahead.h 2007-05-06 14:50:43.867028218 +0400 -@@ -0,0 +1,48 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#ifndef __READAHEAD_H__ -+#define __READAHEAD_H__ -+ -+#include "key.h" -+ -+typedef enum { -+ RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent. Default is NO (not only adjacent) */ -+} ra_global_flags; -+ -+/* reiser4 super block has a field of this type. It controls readahead during tree traversals */ -+typedef struct formatted_read_ahead_params { -+ unsigned long max; /* request not more than this amount of nodes. 
Default is totalram_pages / 4 */ -+ int flags; -+} ra_params_t; -+ -+typedef struct { -+ reiser4_key key_to_stop; -+} ra_info_t; -+ -+void formatted_readahead(znode *, ra_info_t *); -+void reiser4_init_ra_info(ra_info_t * rai); -+ -+struct reiser4_file_ra_state { -+ loff_t start; /* Current window */ -+ loff_t size; -+ loff_t next_size; /* Next window size */ -+ loff_t ahead_start; /* Ahead window */ -+ loff_t ahead_size; -+ loff_t max_window_size; /* Maximum readahead window */ -+ loff_t slow_start; /* enlarging r/a size algorithm. */ -+}; -+ -+extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap); -+ -+/* __READAHEAD_H__ */ -+#endif -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/README linux-2.6.20/fs/reiser4/README ---- linux-2.6.20.orig/fs/reiser4/README 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/README 2007-05-06 14:50:43.867028218 +0400 -@@ -0,0 +1,125 @@ -+[LICENSING] -+ -+Reiser4 is hereby licensed under the GNU General -+Public License version 2. -+ -+Source code files that contain the phrase "licensing governed by -+reiser4/README" are "governed files" throughout this file. Governed -+files are licensed under the GPL. The portions of them owned by Hans -+Reiser, or authorized to be licensed by him, have been in the past, -+and likely will be in the future, licensed to other parties under -+other licenses. If you add your code to governed files, and don't -+want it to be owned by Hans Reiser, put your copyright label on that -+code so the poor blight and his customers can keep things straight. -+All portions of governed files not labeled otherwise are owned by Hans -+Reiser, and by adding your code to it, widely distributing it to -+others or sending us a patch, and leaving the sentence in stating that -+licensing is governed by the statement in this file, you accept this. 
-+It will be a kindness if you identify whether Hans Reiser is allowed
-+to license code labeled as owned by you on your behalf other than
-+under the GPL, because he wants to know if it is okay to do so and put
-+a check in the mail to you (for non-trivial improvements) when he
-+makes his next sale. He makes no guarantees as to the amount if any,
-+though he feels motivated to motivate contributors, and you can surely
-+discuss this with him before or after contributing. You have the
-+right to decline to allow him to license your code contribution other
-+than under the GPL.
-+
-+Further licensing options are available for commercial and/or other
-+interests directly from Hans Reiser: reiser@namesys.com. If you interpret
-+the GPL as not allowing those additional licensing options, you read
-+it wrongly, and Richard Stallman agrees with me, when carefully read
-+you can see that those restrictions on additional terms do not apply
-+to the owner of the copyright, and my interpretation of this shall
-+govern for this license.
-+
-+[END LICENSING]
-+
-+Reiser4 is a file system based on dancing tree algorithms, and is
-+described at http://www.namesys.com
-+
-+mkfs.reiser4 and other utilities are on our webpage or wherever your
-+Linux provider put them. You really want to be running the latest
-+version off the website if you use fsck.
-+
-+Yes, if you update your reiser4 kernel module you do have to
-+recompile your kernel, most of the time. The errors you get will be
-+quite cryptic if you forget to do so.
-+
-+Hideous Commercial Pitch: Spread your development costs across other OS
-+vendors. Select from the best in the world, not the best in your
-+building, by buying from third party OS component suppliers. Leverage
-+the software component development power of the internet.
Be the most -+aggressive in taking advantage of the commercial possibilities of -+decentralized internet development, and add value through your branded -+integration that you sell as an operating system. Let your competitors -+be the ones to compete against the entire internet by themselves. Be -+hip, get with the new economic trend, before your competitors do. Send -+email to reiser@namesys.com -+ -+Hans Reiser was the primary architect of Reiser4, but a whole team -+chipped their ideas in. He invested everything he had into Namesys -+for 5.5 dark years of no money before Reiser3 finally started to work well -+enough to bring in money. He owns the copyright. -+ -+DARPA was the primary sponsor of Reiser4. DARPA does not endorse -+Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal -+opinion, unique in its willingness to invest into things more -+theoretical than the VC community can readily understand, and more -+longterm than allows them to be sure that they will be the ones to -+extract the economic benefits from. DARPA also integrated us into a -+security community that transformed our security worldview. -+ -+Vladimir Saveliev is our lead programmer, with us from the beginning, -+and he worked long hours writing the cleanest code. This is why he is -+now the lead programmer after years of commitment to our work. He -+always made the effort to be the best he could be, and to make his -+code the best that it could be. What resulted was quite remarkable. I -+don't think that money can ever motivate someone to work the way he -+did, he is one of the most selfless men I know. -+ -+Alexander Lyamin was our sysadmin, and helped to educate us in -+security issues. Moscow State University and IMT were very generous -+in the internet access they provided us, and in lots of other little -+ways that a generous institution can be. 
-+ -+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the -+locking code, the block allocator, and finished the flushing code. -+His code is always crystal clean and well structured. -+ -+Nikita Danilov wrote the core of the balancing code, the core of the -+plugins code, and the directory code. He worked a steady pace of long -+hours that produced a whole lot of well abstracted code. He is our -+senior computer scientist. -+ -+Vladimir Demidov wrote the parser. Writing an in kernel parser is -+something very few persons have the skills for, and it is thanks to -+him that we can say that the parser is really not so big compared to -+various bits of our other code, and making a parser work in the kernel -+was not so complicated as everyone would imagine mainly because it was -+him doing it... -+ -+Joshua McDonald wrote the transaction manager, and the flush code. -+The flush code unexpectedly turned out be extremely hairy for reasons -+you can read about on our web page, and he did a great job on an -+extremely difficult task. -+ -+Nina Reiser handled our accounting, government relations, and much -+more. -+ -+Ramon Reiser developed our website. -+ -+Beverly Palmer drew our graphics. -+ -+Vitaly Fertman developed librepair, userspace plugins repair code, fsck -+and worked with Umka on developing libreiser4 and userspace plugins. -+ -+Yury Umanets (aka Umka) developed libreiser4, userspace plugins and -+userspace tools (reiser4progs). -+ -+Oleg Drokin (aka Green) is the release manager who fixes everything. -+It is so nice to have someone like that on the team. He (plus Chris -+and Jeff) make it possible for the entire rest of the Namesys team to -+focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It -+is just amazing to watch his talent for spotting bugs in action. 
-+ -diff -urN linux-2.6.20.orig/fs/reiser4/reiser4.h linux-2.6.20/fs/reiser4/reiser4.h ---- linux-2.6.20.orig/fs/reiser4/reiser4.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/reiser4.h 2007-05-06 14:50:43.867028218 +0400 -@@ -0,0 +1,269 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* definitions of common constants used by reiser4 */ -+ -+#if !defined( __REISER4_H__ ) -+#define __REISER4_H__ -+ -+#include /* for HZ */ -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * reiser4 compilation options. -+ */ -+ -+#if defined(CONFIG_REISER4_DEBUG) -+/* turn on assertion checks */ -+#define REISER4_DEBUG (1) -+#else -+#define REISER4_DEBUG (0) -+#endif -+ -+#if defined(CONFIG_ZLIB_INFLATE) -+/* turn on zlib */ -+#define REISER4_ZLIB (1) -+#else -+#define REISER4_ZLIB (0) -+#endif -+ -+#if defined(CONFIG_CRYPTO_SHA256) -+#define REISER4_SHA256 (1) -+#else -+#define REISER4_SHA256 (0) -+#endif -+ -+/* -+ * Turn on large keys mode. In his mode (which is default), reiser4 key has 4 -+ * 8-byte components. In the old "small key" mode, it's 3 8-byte -+ * components. Additional component, referred to as "ordering" is used to -+ * order items from which given object is composed of. As such, ordering is -+ * placed between locality and objectid. For directory item ordering contains -+ * initial prefix of the file name this item is for. This sorts all directory -+ * items within given directory lexicographically (but see -+ * fibration.[ch]). For file body and stat-data, ordering contains initial -+ * prefix of the name file was initially created with. In the common case -+ * (files with single name) this allows to order file bodies and stat-datas in -+ * the same order as their respective directory entries, thus speeding up -+ * readdir. 
-+ * -+ * Note, that kernel can only mount file system with the same key size as one -+ * it is compiled for, so flipping this option may render your data -+ * inaccessible. -+ */ -+#define REISER4_LARGE_KEY (1) -+/*#define REISER4_LARGE_KEY (0)*/ -+ -+/*#define GUESS_EXISTS 1*/ -+ -+/* -+ * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation -+ * option -+ */ -+ -+extern const char *REISER4_SUPER_MAGIC_STRING; -+extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the -+ * beginning of device */ -+ -+/* here go tunable parameters that are not worth special entry in kernel -+ configuration */ -+ -+/* default number of slots in coord-by-key caches */ -+#define CBK_CACHE_SLOTS (16) -+/* how many elementary tree operation to carry on the next level */ -+#define CARRIES_POOL_SIZE (5) -+/* size of pool of preallocated nodes for carry process. */ -+#define NODES_LOCKED_POOL_SIZE (5) -+ -+#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT) -+#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT) -+#define REISER4_PASTE_FLAGS (COPI_GO_LEFT) -+#define REISER4_INSERT_FLAGS (COPI_GO_LEFT) -+ -+/* we are supporting reservation of disk space on uid basis */ -+#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0) -+/* we are supporting reservation of disk space for groups */ -+#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0) -+/* we are supporting reservation of disk space for root */ -+#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0) -+/* we use rapid flush mode, see flush.c for comments. */ -+#define REISER4_USE_RAPID_FLUSH (1) -+ -+/* -+ * set this to 0 if you don't want to use wait-for-flush in ->writepage(). 
-+ */ -+#define REISER4_USE_ENTD (1) -+ -+/* key allocation is Plan-A */ -+#define REISER4_PLANA_KEY_ALLOCATION (1) -+/* key allocation follows good old 3.x scheme */ -+#define REISER4_3_5_KEY_ALLOCATION (0) -+ -+/* size of hash-table for znodes */ -+#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13) -+ -+/* number of buckets in lnode hash-table */ -+#define LNODE_HTABLE_BUCKETS (1024) -+ -+/* some ridiculously high maximal limit on height of znode tree. This -+ is used in declaration of various per level arrays and -+ to allocate stattistics gathering array for per-level stats. */ -+#define REISER4_MAX_ZTREE_HEIGHT (8) -+ -+#define REISER4_PANIC_MSG_BUFFER_SIZE (1024) -+ -+/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements then, -+ sequential search is on average faster than binary. This is because -+ of better optimization and because sequential search is more CPU -+ cache friendly. This number (25) was found by experiments on dual AMD -+ Athlon(tm), 1400MHz. -+ -+ NOTE: testing in kernel has shown that binary search is more effective than -+ implied by results of the user level benchmarking. Probably because in the -+ node keys are separated by other data. So value was adjusted after few -+ tests. More thorough tuning is needed. -+*/ -+#define REISER4_SEQ_SEARCH_BREAK (3) -+ -+/* don't allow tree to be lower than this */ -+#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL) -+ -+/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to -+ * available memory. */ -+/* Default value of maximal atom size. Can be ovewritten by -+ tmgr.atom_max_size mount option. By default infinity. */ -+#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0)) -+ -+/* Default value of maximal atom age (in jiffies). After reaching this age -+ atom will be forced to commit, either synchronously or asynchronously. Can -+ be overwritten by tmgr.atom_max_age mount option. 
*/ -+#define REISER4_ATOM_MAX_AGE (600 * HZ) -+ -+/* sleeping period for ktxnmrgd */ -+#define REISER4_TXNMGR_TIMEOUT (5 * HZ) -+ -+/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */ -+#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000) -+ -+/* start complaining after that many restarts in coord_by_key(). -+ -+ This either means incredibly heavy contention for this part of a tree, or -+ some corruption or bug. -+*/ -+#define REISER4_CBK_ITERATIONS_LIMIT (100) -+ -+/* return -EIO after that many iterations in coord_by_key(). -+ -+ I have witnessed more than 800 iterations (in 30 thread test) before cbk -+ finished. --nikita -+*/ -+#define REISER4_MAX_CBK_ITERATIONS 500000 -+ -+/* put a per-inode limit on maximal number of directory entries with identical -+ keys in hashed directory. -+ -+ Disable this until inheritance interfaces stabilize: we need some way to -+ set per directory limit. -+*/ -+#define REISER4_USE_COLLISION_LIMIT (0) -+ -+/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it -+ will force them to be relocated. */ -+#define FLUSH_RELOCATE_THRESHOLD 64 -+/* If flush finds can find a block allocation closer than at most FLUSH_RELOCATE_DISTANCE -+ from the preceder it will relocate to that position. */ -+#define FLUSH_RELOCATE_DISTANCE 64 -+ -+/* If we have written this much or more blocks before encountering busy jnode -+ in flush list - abort flushing hoping that next time we get called -+ this jnode will be clean already, and we will save some seeks. */ -+#define FLUSH_WRITTEN_THRESHOLD 50 -+ -+/* The maximum number of nodes to scan left on a level during flush. */ -+#define FLUSH_SCAN_MAXNODES 10000 -+ -+/* per-atom limit of flushers */ -+#define ATOM_MAX_FLUSHERS (1) -+ -+/* default tracing buffer size */ -+#define REISER4_TRACE_BUF_SIZE (1 << 15) -+ -+/* what size units of IO we would like cp, etc., to use, in writing to -+ reiser4. In bytes. 
-+ -+ Can be overwritten by optimal_io_size mount option. -+*/ -+#define REISER4_OPTIMAL_IO_SIZE (64 * 1024) -+ -+/* see comments in inode.c:oid_to_uino() */ -+#define REISER4_UINO_SHIFT (1 << 30) -+ -+/* Mark function argument as unused to avoid compiler warnings. */ -+#define UNUSED_ARG __attribute__((unused)) -+ -+#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3) -+#define NONNULL __attribute__((nonnull)) -+#else -+#define NONNULL -+#endif -+ -+/* master super block offset in bytes.*/ -+#define REISER4_MASTER_OFFSET 65536 -+ -+/* size of VFS block */ -+#define VFS_BLKSIZE 512 -+/* number of bits in size of VFS block (512==2^9) */ -+#define VFS_BLKSIZE_BITS 9 -+ -+#define REISER4_I reiser4_inode_data -+ -+/* implication */ -+#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) ) -+/* logical equivalence */ -+#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) ) -+ -+#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0]))) -+ -+#define NOT_YET (0) -+ -+/** Reiser4 specific error codes **/ -+ -+#define REISER4_ERROR_CODE_BASE 500 -+ -+/* Neighbor is not available (side neighbor or parent) */ -+#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE) -+ -+/* Node was not found in cache */ -+#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1) -+ -+/* node has no free space enough for completion of balancing operation */ -+#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2) -+ -+/* repeat operation */ -+#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3) -+ -+/* deadlock happens */ -+#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4) -+ -+/* operation cannot be performed, because it would block and non-blocking mode -+ * was requested. */ -+#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5) -+ -+/* wait some event (depends on context), then repeat */ -+#define E_WAIT (REISER4_ERROR_CODE_BASE + 6) -+ -+#endif /* __REISER4_H__ */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/safe_link.c linux-2.6.20/fs/reiser4/safe_link.c ---- linux-2.6.20.orig/fs/reiser4/safe_link.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/safe_link.c 2007-05-06 14:50:43.867028218 +0400 -@@ -0,0 +1,351 @@ -+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Safe-links. */ -+ -+/* -+ * Safe-links are used to maintain file system consistency during operations -+ * that spawns multiple transactions. For example: -+ * -+ * 1. Unlink. UNIX supports "open-but-unlinked" files, that is files -+ * without user-visible names in the file system, but still opened by some -+ * active process. What happens here is that unlink proper (i.e., removal -+ * of the last file name) and file deletion (truncate of file body to zero -+ * and deletion of stat-data, that happens when last file descriptor is -+ * closed), may belong to different transactions T1 and T2. If a crash -+ * happens after T1 commit, but before T2 commit, on-disk file system has -+ * a file without name, that is, disk space leak. -+ * -+ * 2. Truncate. Truncate of large file may spawn multiple transactions. If -+ * system crashes while truncate was in-progress, file is left partially -+ * truncated, which violates "atomicity guarantees" of reiser4, viz. that -+ * every system is atomic. -+ * -+ * Safe-links address both above cases. Basically, safe-link is a way post -+ * some operation to be executed during commit of some other transaction than -+ * current one. (Another way to look at the safe-link is to interpret it as a -+ * logical logging.) -+ * -+ * Specifically, at the beginning of unlink safe-link in inserted in the -+ * tree. This safe-link is normally removed by file deletion code (during -+ * transaction T2 in the above terms). 
Truncate also inserts safe-link that is -+ * normally removed when truncate operation is finished. -+ * -+ * This means, that in the case of "clean umount" there are no safe-links in -+ * the tree. If safe-links are observed during mount, it means that (a) system -+ * was terminated abnormally, and (b) safe-link correspond to the "pending" -+ * (i.e., not finished) operations that were in-progress during system -+ * termination. Each safe-link record enough information to complete -+ * corresponding operation, and mount simply "replays" them (hence, the -+ * analogy with the logical logging). -+ * -+ * Safe-links are implemented as blackbox items (see -+ * plugin/item/blackbox.[ch]). -+ * -+ * For the reference: ext3 also has similar mechanism, it's called "an orphan -+ * list" there. -+ */ -+ -+#include "safe_link.h" -+#include "debug.h" -+#include "inode.h" -+ -+#include "plugin/item/blackbox.h" -+ -+#include -+ -+/* -+ * On-disk format of safe-link. -+ */ -+typedef struct safelink { -+ reiser4_key sdkey; /* key of stat-data for the file safe-link is -+ * for */ -+ d64 size; /* size to which file should be truncated */ -+} safelink_t; -+ -+/* -+ * locality where safe-link items are stored. Next to the objectid of root -+ * directory. -+ */ -+static oid_t safe_link_locality(reiser4_tree * tree) -+{ -+ return get_key_objectid(get_super_private(tree->super)->df_plug-> -+ root_dir_key(tree->super)) + 1; -+} -+ -+/* -+ Construct a key for the safe-link. Key has the following format: -+ -+| 60 | 4 | 64 | 4 | 60 | 64 | -++---------------+---+------------------+---+---------------+------------------+ -+| locality | 0 | 0 | 0 | objectid | link type | -++---------------+---+------------------+---+---------------+------------------+ -+| | | | | -+| 8 bytes | 8 bytes | 8 bytes | 8 bytes | -+ -+ This is in large keys format. In small keys format second 8 byte chunk is -+ out. Locality is a constant returned by safe_link_locality(). 
objectid is -+ an oid of a file on which operation protected by this safe-link is -+ performed. link-type is used to distinguish safe-links for different -+ operations. -+ -+ */ -+static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid, -+ reiser4_safe_link_t link, reiser4_key * key) -+{ -+ reiser4_key_init(key); -+ set_key_locality(key, safe_link_locality(tree)); -+ set_key_objectid(key, oid); -+ set_key_offset(key, link); -+ return key; -+} -+ -+/* -+ * how much disk space is necessary to insert and remove (in the -+ * error-handling path) safe-link. -+ */ -+static __u64 safe_link_tograb(reiser4_tree * tree) -+{ -+ return -+ /* insert safe link */ -+ estimate_one_insert_item(tree) + -+ /* remove safe link */ -+ estimate_one_item_removal(tree) + -+ /* drill to the leaf level during insertion */ -+ 1 + estimate_one_insert_item(tree) + -+ /* -+ * possible update of existing safe-link. Actually, if -+ * safe-link existed already (we failed to remove it), then no -+ * insertion is necessary, so this term is already "covered", -+ * but for simplicity let's left it. -+ */ -+ 1; -+} -+ -+/* -+ * grab enough disk space to insert and remove (in the error-handling path) -+ * safe-link. -+ */ -+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags) -+{ -+ int result; -+ -+ grab_space_enable(); -+ /* The sbinfo->delete_mutex can be taken here. -+ * safe_link_release() should be called before leaving reiser4 -+ * context. */ -+ result = -+ reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags); -+ grab_space_enable(); -+ return result; -+} -+ -+/* -+ * release unused disk space reserved by safe_link_grab(). -+ */ -+void safe_link_release(reiser4_tree * tree) -+{ -+ reiser4_release_reserved(tree->super); -+} -+ -+/* -+ * insert into tree safe-link for operation @link on inode @inode. 
-+ */ -+int safe_link_add(struct inode *inode, reiser4_safe_link_t link) -+{ -+ reiser4_key key; -+ safelink_t sl; -+ int length; -+ int result; -+ reiser4_tree *tree; -+ -+ build_sd_key(inode, &sl.sdkey); -+ length = sizeof sl.sdkey; -+ -+ if (link == SAFE_TRUNCATE) { -+ /* -+ * for truncate we have to store final file length also, -+ * expand item. -+ */ -+ length += sizeof(sl.size); -+ put_unaligned(cpu_to_le64(inode->i_size), &sl.size); -+ } -+ tree = reiser4_tree_by_inode(inode); -+ build_link_key(tree, get_inode_oid(inode), link, &key); -+ -+ result = store_black_box(tree, &key, &sl, length); -+ if (result == -EEXIST) -+ result = update_black_box(tree, &key, &sl, length); -+ return result; -+} -+ -+/* -+ * remove safe-link corresponding to the operation @link on inode @inode from -+ * the tree. -+ */ -+int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link) -+{ -+ reiser4_key key; -+ -+ return kill_black_box(tree, build_link_key(tree, oid, link, &key)); -+} -+ -+/* -+ * in-memory structure to keep information extracted from safe-link. This is -+ * used to iterate over all safe-links. -+ */ -+typedef struct { -+ reiser4_tree *tree; /* internal tree */ -+ reiser4_key key; /* safe-link key */ -+ reiser4_key sdkey; /* key of object stat-data */ -+ reiser4_safe_link_t link; /* safe-link type */ -+ oid_t oid; /* object oid */ -+ __u64 size; /* final size for truncate */ -+} safe_link_context; -+ -+/* -+ * start iterating over all safe-links. -+ */ -+static void safe_link_iter_begin(reiser4_tree * tree, safe_link_context * ctx) -+{ -+ ctx->tree = tree; -+ reiser4_key_init(&ctx->key); -+ set_key_locality(&ctx->key, safe_link_locality(tree)); -+ set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key())); -+ set_key_offset(&ctx->key, get_key_offset(reiser4_max_key())); -+} -+ -+/* -+ * return next safe-link. 
-+ */ -+static int safe_link_iter_next(safe_link_context * ctx) -+{ -+ int result; -+ safelink_t sl; -+ -+ result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0); -+ if (result == 0) { -+ ctx->oid = get_key_objectid(&ctx->key); -+ ctx->link = get_key_offset(&ctx->key); -+ ctx->sdkey = sl.sdkey; -+ if (ctx->link == SAFE_TRUNCATE) -+ ctx->size = le64_to_cpu(get_unaligned(&sl.size)); -+ } -+ return result; -+} -+ -+/* -+ * check are there any more safe-links left in the tree. -+ */ -+static int safe_link_iter_finished(safe_link_context * ctx) -+{ -+ return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree); -+} -+ -+/* -+ * finish safe-link iteration. -+ */ -+static void safe_link_iter_end(safe_link_context * ctx) -+{ -+ /* nothing special */ -+} -+ -+/* -+ * process single safe-link. -+ */ -+static int process_safelink(struct super_block *super, reiser4_safe_link_t link, -+ reiser4_key * sdkey, oid_t oid, __u64 size) -+{ -+ struct inode *inode; -+ int result; -+ -+ /* -+ * obtain object inode by reiser4_iget(), then call object plugin -+ * ->safelink() method to do actual work, then delete safe-link on -+ * success. -+ */ -+ inode = reiser4_iget(super, sdkey, 1); -+ if (!IS_ERR(inode)) { -+ file_plugin *fplug; -+ -+ fplug = inode_file_plugin(inode); -+ assert("nikita-3428", fplug != NULL); -+ assert("", oid == get_inode_oid(inode)); -+ if (fplug->safelink != NULL) { -+ /* reiser4_txn_restart_current is not necessary because -+ * mounting is signle thread. However, without it -+ * deadlock detection code will complain (see -+ * nikita-3361). 
*/ -+ reiser4_txn_restart_current(); -+ result = fplug->safelink(inode, link, size); -+ } else { -+ warning("nikita-3430", -+ "Cannot handle safelink for %lli", -+ (unsigned long long)oid); -+ reiser4_print_key("key", sdkey); -+ result = 0; -+ } -+ if (result != 0) { -+ warning("nikita-3431", -+ "Error processing safelink for %lli: %i", -+ (unsigned long long)oid, result); -+ } -+ reiser4_iget_complete(inode); -+ iput(inode); -+ if (result == 0) { -+ result = safe_link_grab(reiser4_get_tree(super), BA_CAN_COMMIT); -+ if (result == 0) -+ result = -+ safe_link_del(reiser4_get_tree(super), oid, link); -+ safe_link_release(reiser4_get_tree(super)); -+ /* -+ * restart transaction: if there was large number of -+ * safe-links, their processing may fail to fit into -+ * single transaction. -+ */ -+ if (result == 0) -+ reiser4_txn_restart_current(); -+ } -+ } else -+ result = PTR_ERR(inode); -+ return result; -+} -+ -+/* -+ * iterate over all safe-links in the file-system processing them one by one. -+ */ -+int process_safelinks(struct super_block *super) -+{ -+ safe_link_context ctx; -+ int result; -+ -+ if (rofs_super(super)) -+ /* do nothing on the read-only file system */ -+ return 0; -+ safe_link_iter_begin(&get_super_private(super)->tree, &ctx); -+ result = 0; -+ do { -+ result = safe_link_iter_next(&ctx); -+ if (safe_link_iter_finished(&ctx) || result == -ENOENT) { -+ result = 0; -+ break; -+ } -+ if (result == 0) -+ result = process_safelink(super, ctx.link, -+ &ctx.sdkey, ctx.oid, -+ ctx.size); -+ } while (result == 0); -+ safe_link_iter_end(&ctx); -+ return result; -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/safe_link.h linux-2.6.20/fs/reiser4/safe_link.h ---- linux-2.6.20.orig/fs/reiser4/safe_link.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/safe_link.h 2007-05-06 14:50:43.867028218 +0400 -@@ -0,0 +1,29 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Safe-links. See safe_link.c for details. */ -+ -+#if !defined( __FS_SAFE_LINK_H__ ) -+#define __FS_SAFE_LINK_H__ -+ -+#include "tree.h" -+ -+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags); -+void safe_link_release(reiser4_tree * tree); -+int safe_link_add(struct inode *inode, reiser4_safe_link_t link); -+int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link); -+ -+int process_safelinks(struct super_block *super); -+ -+/* __FS_SAFE_LINK_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/seal.c linux-2.6.20/fs/reiser4/seal.c ---- linux-2.6.20.orig/fs/reiser4/seal.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/seal.c 2007-05-06 14:50:43.871029467 +0400 -@@ -0,0 +1,218 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* Seals implementation. */ -+/* Seals are "weak" tree pointers. They are analogous to tree coords in -+ allowing to bypass tree traversal. But normal usage of coords implies that -+ node pointed to by coord is locked, whereas seals don't keep a lock (or -+ even a reference) to znode. In stead, each znode contains a version number, -+ increased on each znode modification. This version number is copied into a -+ seal when seal is created. 
Later, one can "validate" seal by calling -+ reiser4_seal_validate(). If znode is in cache and its version number is -+ still the same, seal is "pristine" and coord associated with it can be -+ re-used immediately. -+ -+ If, on the other hand, znode is out of cache, or it is obviously different -+ one from the znode seal was initially attached to (for example, it is on -+ the different level, or is being removed from the tree), seal is -+ irreparably invalid ("burned") and tree traversal has to be repeated. -+ -+ Otherwise, there is some hope, that while znode was modified (and seal was -+ "broken" as a result), key attached to the seal is still in the node. This -+ is checked by first comparing this key with delimiting keys of node and, if -+ key is ok, doing intra-node lookup. -+ -+ Znode version is maintained in the following way: -+ -+ there is reiser4_tree.znode_epoch counter. Whenever new znode is created, -+ znode_epoch is incremented and its new value is stored in ->version field -+ of new znode. Whenever znode is dirtied (which means it was probably -+ modified), znode_epoch is also incremented and its new value is stored in -+ znode->version. This is done so, because just incrementing znode->version -+ on each update is not enough: it may so happen, that znode get deleted, new -+ znode is allocated for the same disk block and gets the same version -+ counter, tricking seal code into false positive. -+*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "coord.h" -+#include "seal.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "jnode.h" -+#include "znode.h" -+#include "super.h" -+ -+static znode *seal_node(const seal_t * seal); -+static int seal_matches(const seal_t * seal, znode * node); -+ -+/* initialise seal. This can be called several times on the same seal. @coord -+ and @key can be NULL. 
*/ -+void reiser4_seal_init(seal_t * seal /* seal to initialise */ , -+ const coord_t * coord /* coord @seal will be -+ * attached to */ , -+ const reiser4_key * key UNUSED_ARG /* key @seal will be -+ * attached to */ ) -+{ -+ assert("nikita-1886", seal != NULL); -+ memset(seal, 0, sizeof *seal); -+ if (coord != NULL) { -+ znode *node; -+ -+ node = coord->node; -+ assert("nikita-1987", node != NULL); -+ spin_lock_znode(node); -+ seal->version = node->version; -+ assert("nikita-1988", seal->version != 0); -+ seal->block = *znode_get_block(node); -+#if REISER4_DEBUG -+ seal->coord1 = *coord; -+ if (key != NULL) -+ seal->key = *key; -+#endif -+ spin_unlock_znode(node); -+ } -+} -+ -+/* finish with seal */ -+void reiser4_seal_done(seal_t * seal /* seal to clear */ ) -+{ -+ assert("nikita-1887", seal != NULL); -+ seal->version = 0; -+} -+ -+/* true if seal was initialised */ -+int reiser4_seal_is_set(const seal_t * seal /* seal to query */ ) -+{ -+ assert("nikita-1890", seal != NULL); -+ return seal->version != 0; -+} -+ -+#if REISER4_DEBUG -+/* helper function for reiser4_seal_validate(). It checks that item at @coord -+ * has expected key. This is to detect cases where node was modified but wasn't -+ * marked dirty. */ -+static inline int check_seal_match(const coord_t * coord /* coord to check */ , -+ const reiser4_key * k /* expected key */ ) -+{ -+ reiser4_key ukey; -+ -+ return (coord->between != AT_UNIT) || -+ /* FIXME-VS: we only can compare keys for items whose units -+ represent exactly one key */ -+ ((coord_is_existing_unit(coord)) -+ && (item_is_extent(coord) -+ || keyeq(k, unit_key_by_coord(coord, &ukey)))) -+ || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord)) -+ && keyge(k, unit_key_by_coord(coord, &ukey))); -+} -+#endif -+ -+/* this is used by reiser4_seal_validate. It accepts return value of -+ * longterm_lock_znode and returns 1 if it can be interpreted as seal -+ * validation failure. 
For instance, when longterm_lock_znode returns -EINVAL, -+ * reiser4_seal_validate returns -E_REPEAT and caller will call tre search. -+ * We cannot do this in longterm_lock_znode(), because sometimes we want to -+ * distinguish between -EINVAL and -E_REPEAT. */ -+static int should_repeat(int return_code) -+{ -+ return return_code == -EINVAL; -+} -+ -+/* (re-)validate seal. -+ -+ Checks whether seal is pristine, and try to revalidate it if possible. -+ -+ If seal was burned, or broken irreparably, return -E_REPEAT. -+ -+ NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if key we are -+ looking for is in range of keys covered by the sealed node, but item wasn't -+ found by node ->lookup() method. Alternative is to return -ENOENT in this -+ case, but this would complicate callers logic. -+ -+*/ -+int reiser4_seal_validate(seal_t * seal /* seal to validate */, -+ coord_t * coord /* coord to validate against */, -+ const reiser4_key * key /* key to validate against */, -+ lock_handle * lh /* resulting lock handle */, -+ znode_lock_mode mode /* lock node */, -+ znode_lock_request request /* locking priority */) -+{ -+ znode *node; -+ int result; -+ -+ assert("nikita-1889", seal != NULL); -+ assert("nikita-1881", reiser4_seal_is_set(seal)); -+ assert("nikita-1882", key != NULL); -+ assert("nikita-1883", coord != NULL); -+ assert("nikita-1884", lh != NULL); -+ assert("nikita-1885", keyeq(&seal->key, key)); -+ assert("nikita-1989", coords_equal(&seal->coord1, coord)); -+ -+ /* obtain znode by block number */ -+ node = seal_node(seal); -+ if (node != NULL) { -+ /* znode was in cache, lock it */ -+ result = longterm_lock_znode(lh, node, mode, request); -+ zput(node); -+ if (result == 0) { -+ if (seal_matches(seal, node)) { -+ /* if seal version and znode version -+ coincide */ -+ ON_DEBUG(coord_update_v(coord)); -+ assert("nikita-1990", -+ node == seal->coord1.node); -+ assert("nikita-1898", -+ WITH_DATA_RET(coord->node, 1, -+ check_seal_match(coord, -+ 
key))); -+ } else -+ result = RETERR(-E_REPEAT); -+ } -+ if (result != 0) { -+ if (should_repeat(result)) -+ result = RETERR(-E_REPEAT); -+ /* unlock node on failure */ -+ done_lh(lh); -+ } -+ } else { -+ /* znode wasn't in cache */ -+ result = RETERR(-E_REPEAT); -+ } -+ return result; -+} -+ -+/* helpers functions */ -+ -+/* obtain reference to znode seal points to, if in cache */ -+static znode *seal_node(const seal_t * seal /* seal to query */ ) -+{ -+ assert("nikita-1891", seal != NULL); -+ return zlook(current_tree, &seal->block); -+} -+ -+/* true if @seal version and @node version coincide */ -+static int seal_matches(const seal_t * seal /* seal to check */ , -+ znode * node /* node to check */ ) -+{ -+ int result; -+ -+ assert("nikita-1991", seal != NULL); -+ assert("nikita-1993", node != NULL); -+ -+ spin_lock_znode(node); -+ result = (seal->version == node->version); -+ spin_unlock_znode(node); -+ return result; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/seal.h linux-2.6.20/fs/reiser4/seal.h ---- linux-2.6.20.orig/fs/reiser4/seal.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/seal.h 2007-05-06 14:50:43.871029467 +0400 -@@ -0,0 +1,49 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */ -+ -+#ifndef __SEAL_H__ -+#define __SEAL_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+ -+/* for __u?? types */ -+/*#include */ -+ -+/* seal. 
See comment at the top of seal.c */ -+typedef struct seal_s { -+ /* version of znode recorder at the time of seal creation */ -+ __u64 version; -+ /* block number of znode attached to this seal */ -+ reiser4_block_nr block; -+#if REISER4_DEBUG -+ /* coord this seal is attached to. For debugging. */ -+ coord_t coord1; -+ /* key this seal is attached to. For debugging. */ -+ reiser4_key key; -+#endif -+} seal_t; -+ -+extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *); -+extern void reiser4_seal_done(seal_t *); -+extern int reiser4_seal_is_set(const seal_t *); -+extern int reiser4_seal_validate(seal_t *, coord_t *, -+ const reiser4_key *, lock_handle *, -+ znode_lock_mode mode, znode_lock_request request); -+ -+/* __SEAL_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/search.c linux-2.6.20/fs/reiser4/search.c ---- linux-2.6.20.orig/fs/reiser4/search.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/search.c 2007-05-06 14:50:43.871029467 +0400 -@@ -0,0 +1,1611 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "seal.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "plugin/plugin.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "tree.h" -+#include "reiser4.h" -+#include "super.h" -+#include "inode.h" -+ -+#include -+ -+static const char *bias_name(lookup_bias bias); -+ -+/* tree searching algorithm, intranode searching algorithms are in -+ plugin/node/ */ -+ -+/* tree lookup cache -+ * -+ * The coord by key cache consists of small list of recently accessed nodes -+ * maintained according to the LRU 
discipline. Before doing real top-to-down -+ * tree traversal this cache is scanned for nodes that can contain key -+ * requested. -+ * -+ * The efficiency of coord cache depends heavily on locality of reference for -+ * tree accesses. Our user level simulations show reasonably good hit ratios -+ * for coord cache under most loads so far. -+ */ -+ -+/* Initialise coord cache slot */ -+static void cbk_cache_init_slot(cbk_cache_slot *slot) -+{ -+ assert("nikita-345", slot != NULL); -+ -+ INIT_LIST_HEAD(&slot->lru); -+ slot->node = NULL; -+} -+ -+/* Initialize coord cache */ -+int cbk_cache_init(cbk_cache *cache /* cache to init */ ) -+{ -+ int i; -+ -+ assert("nikita-346", cache != NULL); -+ -+ cache->slot = -+ kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots, -+ reiser4_ctx_gfp_mask_get()); -+ if (cache->slot == NULL) -+ return RETERR(-ENOMEM); -+ -+ INIT_LIST_HEAD(&cache->lru); -+ for (i = 0; i < cache->nr_slots; ++i) { -+ cbk_cache_init_slot(cache->slot + i); -+ list_add_tail(&((cache->slot + i)->lru), &cache->lru); -+ } -+ rwlock_init(&cache->guard); -+ return 0; -+} -+ -+/* free cbk cache data */ -+void cbk_cache_done(cbk_cache * cache /* cache to release */ ) -+{ -+ assert("nikita-2493", cache != NULL); -+ if (cache->slot != NULL) { -+ kfree(cache->slot); -+ cache->slot = NULL; -+ } -+} -+ -+/* macro to iterate over all cbk cache slots */ -+#define for_all_slots(cache, slot) \ -+ for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \ -+ &(cache)->lru != &(slot)->lru; \ -+ (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru)) -+ -+#if REISER4_DEBUG -+/* this function assures that [cbk-cache-invariant] invariant holds */ -+static int cbk_cache_invariant(const cbk_cache *cache) -+{ -+ cbk_cache_slot *slot; -+ int result; -+ int unused; -+ -+ if (cache->nr_slots == 0) -+ return 1; -+ -+ assert("nikita-2469", cache != NULL); -+ unused = 0; -+ result = 1; -+ read_lock(&((cbk_cache *)cache)->guard); -+ for_all_slots(cache, slot) { -+ /* in LRU first 
go all `used' slots followed by `unused' */ -+ if (unused && (slot->node != NULL)) -+ result = 0; -+ if (slot->node == NULL) -+ unused = 1; -+ else { -+ cbk_cache_slot *scan; -+ -+ /* all cached nodes are different */ -+ scan = slot; -+ while (result) { -+ scan = list_entry(scan->lru.next, cbk_cache_slot, lru); -+ if (&cache->lru == &scan->lru) -+ break; -+ if (slot->node == scan->node) -+ result = 0; -+ } -+ } -+ if (!result) -+ break; -+ } -+ read_unlock(&((cbk_cache *)cache)->guard); -+ return result; -+} -+ -+#endif -+ -+/* Remove references, if any, to @node from coord cache */ -+void cbk_cache_invalidate(const znode * node /* node to remove from cache */ , -+ reiser4_tree * tree /* tree to remove node from */ ) -+{ -+ cbk_cache_slot *slot; -+ cbk_cache *cache; -+ int i; -+ -+ assert("nikita-350", node != NULL); -+ assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree)); -+ -+ cache = &tree->cbk_cache; -+ assert("nikita-2470", cbk_cache_invariant(cache)); -+ -+ write_lock(&(cache->guard)); -+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) { -+ if (slot->node == node) { -+ list_move_tail(&slot->lru, &cache->lru); -+ slot->node = NULL; -+ break; -+ } -+ } -+ write_unlock(&(cache->guard)); -+ assert("nikita-2471", cbk_cache_invariant(cache)); -+} -+ -+/* add to the cbk-cache in the "tree" information about "node". This -+ can actually be update of existing slot in a cache. 
*/ -+static void cbk_cache_add(const znode *node /* node to add to the cache */ ) -+{ -+ cbk_cache *cache; -+ cbk_cache_slot *slot; -+ int i; -+ -+ assert("nikita-352", node != NULL); -+ -+ cache = &znode_get_tree(node)->cbk_cache; -+ assert("nikita-2472", cbk_cache_invariant(cache)); -+ -+ if (cache->nr_slots == 0) -+ return; -+ -+ write_lock(&(cache->guard)); -+ /* find slot to update/add */ -+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) { -+ /* oops, this node is already in a cache */ -+ if (slot->node == node) -+ break; -+ } -+ /* if all slots are used, reuse least recently used one */ -+ if (i == cache->nr_slots) { -+ slot = list_entry(cache->lru.prev, cbk_cache_slot, lru); -+ slot->node = (znode *) node; -+ } -+ list_move(&slot->lru, &cache->lru); -+ write_unlock(&(cache->guard)); -+ assert("nikita-2473", cbk_cache_invariant(cache)); -+} -+ -+static int setup_delimiting_keys(cbk_handle * h); -+static lookup_result coord_by_handle(cbk_handle * handle); -+static lookup_result traverse_tree(cbk_handle * h); -+static int cbk_cache_search(cbk_handle * h); -+ -+static level_lookup_result cbk_level_lookup(cbk_handle * h); -+static level_lookup_result cbk_node_lookup(cbk_handle * h); -+ -+/* helper functions */ -+ -+static void update_stale_dk(reiser4_tree * tree, znode * node); -+ -+/* release parent node during traversal */ -+static void put_parent(cbk_handle * h); -+/* check consistency of fields */ -+static int sanity_check(cbk_handle * h); -+/* release resources in handle */ -+static void hput(cbk_handle * h); -+ -+static level_lookup_result search_to_left(cbk_handle * h); -+ -+/* pack numerous (numberous I should say) arguments of coord_by_key() into -+ * cbk_handle */ -+static cbk_handle *cbk_pack(cbk_handle * handle, -+ reiser4_tree * tree, -+ const reiser4_key * key, -+ coord_t * coord, -+ lock_handle * active_lh, -+ lock_handle * parent_lh, -+ znode_lock_mode lock_mode, -+ lookup_bias bias, -+ tree_level lock_level, -+ tree_level 
stop_level, -+ __u32 flags, ra_info_t * info) -+{ -+ memset(handle, 0, sizeof *handle); -+ -+ handle->tree = tree; -+ handle->key = key; -+ handle->lock_mode = lock_mode; -+ handle->bias = bias; -+ handle->lock_level = lock_level; -+ handle->stop_level = stop_level; -+ handle->coord = coord; -+ /* set flags. See comment in tree.h:cbk_flags */ -+ handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK; -+ -+ handle->active_lh = active_lh; -+ handle->parent_lh = parent_lh; -+ handle->ra_info = info; -+ return handle; -+} -+ -+/* main tree lookup procedure -+ -+ Check coord cache. If key we are looking for is not found there, call cbk() -+ to do real tree traversal. -+ -+ As we have extents on the twig level, @lock_level and @stop_level can -+ be different from LEAF_LEVEL and each other. -+ -+ Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode -+ long term locks) while calling this. -+*/ -+lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search -+ * in. Usually this tree is -+ * part of file-system -+ * super-block */ , -+ const reiser4_key * key /* key to look for */ , -+ coord_t * coord /* where to store found -+ * position in a tree. Fields -+ * in "coord" are only valid if -+ * coord_by_key() returned -+ * "CBK_COORD_FOUND" */ , -+ lock_handle * lh, /* resulting lock handle */ -+ znode_lock_mode lock_mode /* type of lookup we -+ * want on node. Pass -+ * ZNODE_READ_LOCK here -+ * if you only want to -+ * read item found and -+ * ZNODE_WRITE_LOCK if -+ * you want to modify -+ * it */ , -+ lookup_bias bias /* what to return if coord -+ * with exactly the @key is -+ * not in the tree */ , -+ tree_level lock_level /* tree level where to start -+ * taking @lock type of -+ * locks */ , -+ tree_level stop_level /* tree level to stop. 
Pass -+ * LEAF_LEVEL or TWIG_LEVEL -+ * here Item being looked -+ * for has to be between -+ * @lock_level and -+ * @stop_level, inclusive */ , -+ __u32 flags /* search flags */ , -+ ra_info_t * -+ info -+ /* information about desired tree traversal readahead */ -+ ) -+{ -+ cbk_handle handle; -+ lock_handle parent_lh; -+ lookup_result result; -+ -+ init_lh(lh); -+ init_lh(&parent_lh); -+ -+ assert("nikita-3023", reiser4_schedulable()); -+ -+ assert("nikita-353", tree != NULL); -+ assert("nikita-354", key != NULL); -+ assert("nikita-355", coord != NULL); -+ assert("nikita-356", (bias == FIND_EXACT) -+ || (bias == FIND_MAX_NOT_MORE_THAN)); -+ assert("nikita-357", stop_level >= LEAF_LEVEL); -+ /* no locks can be held during tree traversal */ -+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack())); -+ -+ cbk_pack(&handle, -+ tree, -+ key, -+ coord, -+ lh, -+ &parent_lh, -+ lock_mode, bias, lock_level, stop_level, flags, info); -+ -+ result = coord_by_handle(&handle); -+ assert("nikita-3247", -+ ergo(!IS_CBKERR(result), coord->node == lh->node)); -+ return result; -+} -+ -+/* like coord_by_key(), but starts traversal from vroot of @object rather than -+ * from tree root. 
*/ -+lookup_result reiser4_object_lookup(struct inode * object, -+ const reiser4_key * key, -+ coord_t * coord, -+ lock_handle * lh, -+ znode_lock_mode lock_mode, -+ lookup_bias bias, -+ tree_level lock_level, -+ tree_level stop_level, __u32 flags, -+ ra_info_t * info) -+{ -+ cbk_handle handle; -+ lock_handle parent_lh; -+ lookup_result result; -+ -+ init_lh(lh); -+ init_lh(&parent_lh); -+ -+ assert("nikita-3023", reiser4_schedulable()); -+ -+ assert("nikita-354", key != NULL); -+ assert("nikita-355", coord != NULL); -+ assert("nikita-356", (bias == FIND_EXACT) -+ || (bias == FIND_MAX_NOT_MORE_THAN)); -+ assert("nikita-357", stop_level >= LEAF_LEVEL); -+ /* no locks can be held during tree search by key */ -+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack())); -+ -+ cbk_pack(&handle, -+ object != NULL ? reiser4_tree_by_inode(object) : current_tree, -+ key, -+ coord, -+ lh, -+ &parent_lh, -+ lock_mode, bias, lock_level, stop_level, flags, info); -+ handle.object = object; -+ -+ result = coord_by_handle(&handle); -+ assert("nikita-3247", -+ ergo(!IS_CBKERR(result), coord->node == lh->node)); -+ return result; -+} -+ -+/* lookup by cbk_handle. Common part of coord_by_key() and -+ reiser4_object_lookup(). */ -+static lookup_result coord_by_handle(cbk_handle * handle) -+{ -+ /* -+ * first check cbk_cache (which is look-aside cache for our tree) and -+ * of this fails, start traversal. -+ */ -+ /* first check whether "key" is in cache of recent lookups. */ -+ if (cbk_cache_search(handle) == 0) -+ return handle->result; -+ else -+ return traverse_tree(handle); -+} -+ -+/* Execute actor for each item (or unit, depending on @through_units_p), -+ starting from @coord, right-ward, until either: -+ -+ - end of the tree is reached -+ - unformatted node is met -+ - error occurred -+ - @actor returns 0 or less -+ -+ Error code, or last actor return value is returned. 
-+ -+ This is used by plugin/dir/hashe_dir.c:reiser4_find_entry() to move through -+ sequence of entries with identical keys and alikes. -+*/ -+int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ , -+ coord_t * coord /* coord to start from */ , -+ lock_handle * lh /* lock handle to start with and to -+ * update along the way */ , -+ tree_iterate_actor_t actor /* function to call on each -+ * item/unit */ , -+ void *arg /* argument to pass to @actor */ , -+ znode_lock_mode mode /* lock mode on scanned nodes */ , -+ int through_units_p /* call @actor on each item or on -+ * each unit */ ) -+{ -+ int result; -+ -+ assert("nikita-1143", tree != NULL); -+ assert("nikita-1145", coord != NULL); -+ assert("nikita-1146", lh != NULL); -+ assert("nikita-1147", actor != NULL); -+ -+ result = zload(coord->node); -+ coord_clear_iplug(coord); -+ if (result != 0) -+ return result; -+ if (!coord_is_existing_unit(coord)) { -+ zrelse(coord->node); -+ return -ENOENT; -+ } -+ while ((result = actor(tree, coord, lh, arg)) > 0) { -+ /* move further */ -+ if ((through_units_p && coord_next_unit(coord)) || -+ (!through_units_p && coord_next_item(coord))) { -+ do { -+ lock_handle couple; -+ -+ /* move to the next node */ -+ init_lh(&couple); -+ result = -+ reiser4_get_right_neighbor(&couple, -+ coord->node, -+ (int)mode, -+ GN_CAN_USE_UPPER_LEVELS); -+ zrelse(coord->node); -+ if (result == 0) { -+ -+ result = zload(couple.node); -+ if (result != 0) { -+ done_lh(&couple); -+ return result; -+ } -+ -+ coord_init_first_unit(coord, -+ couple.node); -+ done_lh(lh); -+ move_lh(lh, &couple); -+ } else -+ return result; -+ } while (node_is_empty(coord->node)); -+ } -+ -+ assert("nikita-1149", coord_is_existing_unit(coord)); -+ } -+ zrelse(coord->node); -+ return result; -+} -+ -+/* return locked uber znode for @tree */ -+int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode, -+ znode_lock_request pri, lock_handle * lh) -+{ -+ int result; -+ -+ result = longterm_lock_znode(lh, 
tree->uber, mode, pri); -+ return result; -+} -+ -+/* true if @key is strictly within @node -+ -+ we are looking for possibly non-unique key and it is item is at the edge of -+ @node. May be it is in the neighbor. -+*/ -+static int znode_contains_key_strict(znode * node /* node to check key -+ * against */ , -+ const reiser4_key * -+ key /* key to check */ , -+ int isunique) -+{ -+ int answer; -+ -+ assert("nikita-1760", node != NULL); -+ assert("nikita-1722", key != NULL); -+ -+ if (keyge(key, &node->rd_key)) -+ return 0; -+ -+ answer = keycmp(&node->ld_key, key); -+ -+ if (isunique) -+ return answer != GREATER_THAN; -+ else -+ return answer == LESS_THAN; -+} -+ -+/* -+ * Virtual Root (vroot) code. -+ * -+ * For given file system object (e.g., regular file or directory) let's -+ * define its "virtual root" as lowest in the tree (that is, furtherest -+ * from the tree root) node such that all body items of said object are -+ * located in a tree rooted at this node. -+ * -+ * Once vroot of object is found all tree lookups for items within body of -+ * this object ("object lookups") can be started from its vroot rather -+ * than from real root. This has following advantages: -+ * -+ * 1. amount of nodes traversed during lookup (and, hence, amount of -+ * key comparisons made) decreases, and -+ * -+ * 2. contention on tree root is decreased. This latter was actually -+ * motivating reason behind vroot, because spin lock of root node, -+ * which is taken when acquiring long-term lock on root node is the -+ * hottest lock in the reiser4. -+ * -+ * How to find vroot. -+ * -+ * When vroot of object F is not yet determined, all object lookups start -+ * from the root of the tree. At each tree level during traversal we have -+ * a node N such that a key we are looking for (which is the key inside -+ * object's body) is located within N. In function handle_vroot() called -+ * from cbk_level_lookup() we check whether N is possible vroot for -+ * F. 
Check is trivial---if neither leftmost nor rightmost item of N -+ * belongs to F (and we already have helpful ->owns_item() method of -+ * object plugin for this), then N is possible vroot of F. This, of -+ * course, relies on the assumption that each object occupies contiguous -+ * range of keys in the tree. -+ * -+ * Thus, traversing tree downward and checking each node as we go, we can -+ * find lowest such node, which, by definition, is vroot. -+ * -+ * How to track vroot. -+ * -+ * Nohow. If actual vroot changes, next object lookup will just restart -+ * from the actual tree root, refreshing object's vroot along the way. -+ * -+ */ -+ -+/* -+ * Check whether @node is possible vroot of @object. -+ */ -+static void handle_vroot(struct inode *object, znode * node) -+{ -+ file_plugin *fplug; -+ coord_t coord; -+ -+ fplug = inode_file_plugin(object); -+ assert("nikita-3353", fplug != NULL); -+ assert("nikita-3354", fplug->owns_item != NULL); -+ -+ if (unlikely(node_is_empty(node))) -+ return; -+ -+ coord_init_first_unit(&coord, node); -+ /* -+ * if leftmost item of @node belongs to @object, we cannot be sure -+ * that @node is vroot of @object, because, some items of @object are -+ * probably in the sub-tree rooted at the left neighbor of @node. -+ */ -+ if (fplug->owns_item(object, &coord)) -+ return; -+ coord_init_last_unit(&coord, node); -+ /* mutatis mutandis for the rightmost item */ -+ if (fplug->owns_item(object, &coord)) -+ return; -+ /* otherwise, @node is possible vroot of @object */ -+ inode_set_vroot(object, node); -+} -+ -+/* -+ * helper function used by traverse tree to start tree traversal not from the -+ * tree root, but from @h->object's vroot, if possible. -+ */ -+static int prepare_object_lookup(cbk_handle * h) -+{ -+ znode *vroot; -+ int result; -+ -+ vroot = inode_get_vroot(h->object); -+ if (vroot == NULL) { -+ /* -+ * object doesn't have known vroot, start from real tree root. 
-+ */ -+ return LOOKUP_CONT; -+ } -+ -+ h->level = znode_get_level(vroot); -+ /* take a long-term lock on vroot */ -+ h->result = longterm_lock_znode(h->active_lh, vroot, -+ cbk_lock_mode(h->level, h), -+ ZNODE_LOCK_LOPRI); -+ result = LOOKUP_REST; -+ if (h->result == 0) { -+ int isunique; -+ int inside; -+ -+ isunique = h->flags & CBK_UNIQUE; -+ /* check that key is inside vroot */ -+ read_lock_dk(h->tree); -+ inside = (znode_contains_key_strict(vroot, h->key, isunique) && -+ !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE)); -+ read_unlock_dk(h->tree); -+ if (inside) { -+ h->result = zload(vroot); -+ if (h->result == 0) { -+ /* search for key in vroot. */ -+ result = cbk_node_lookup(h); -+ zrelse(vroot); /*h->active_lh->node); */ -+ if (h->active_lh->node != vroot) { -+ result = LOOKUP_REST; -+ } else if (result == LOOKUP_CONT) { -+ move_lh(h->parent_lh, h->active_lh); -+ h->flags &= ~CBK_DKSET; -+ } -+ } -+ } -+ } -+ -+ zput(vroot); -+ -+ if (IS_CBKERR(h->result) || result == LOOKUP_REST) -+ hput(h); -+ return result; -+} -+ -+/* main function that handles common parts of tree traversal: starting -+ (fake znode handling), restarts, error handling, completion */ -+static lookup_result traverse_tree(cbk_handle * h /* search handle */ ) -+{ -+ int done; -+ int iterations; -+ int vroot_used; -+ -+ assert("nikita-365", h != NULL); -+ assert("nikita-366", h->tree != NULL); -+ assert("nikita-367", h->key != NULL); -+ assert("nikita-368", h->coord != NULL); -+ assert("nikita-369", (h->bias == FIND_EXACT) -+ || (h->bias == FIND_MAX_NOT_MORE_THAN)); -+ assert("nikita-370", h->stop_level >= LEAF_LEVEL); -+ assert("nikita-2949", !(h->flags & CBK_DKSET)); -+ assert("zam-355", lock_stack_isclean(get_current_lock_stack())); -+ -+ done = 0; -+ iterations = 0; -+ vroot_used = 0; -+ -+ /* loop for restarts */ -+ restart: -+ -+ assert("nikita-3024", reiser4_schedulable()); -+ -+ h->result = CBK_COORD_FOUND; -+ /* connect_znode() needs it */ -+ h->ld_key = *reiser4_min_key(); -+ h->rd_key = 
*reiser4_max_key(); -+ h->flags |= CBK_DKSET; -+ h->error = NULL; -+ -+ if (!vroot_used && h->object != NULL) { -+ vroot_used = 1; -+ done = prepare_object_lookup(h); -+ if (done == LOOKUP_REST) { -+ goto restart; -+ } else if (done == LOOKUP_DONE) -+ return h->result; -+ } -+ if (h->parent_lh->node == NULL) { -+ done = -+ get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI, -+ h->parent_lh); -+ -+ assert("nikita-1637", done != -E_DEADLOCK); -+ -+ h->block = h->tree->root_block; -+ h->level = h->tree->height; -+ h->coord->node = h->parent_lh->node; -+ -+ if (done != 0) -+ return done; -+ } -+ -+ /* loop descending a tree */ -+ while (!done) { -+ -+ if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) && -+ IS_POW(iterations))) { -+ warning("nikita-1481", "Too many iterations: %i", -+ iterations); -+ reiser4_print_key("key", h->key); -+ ++iterations; -+ } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) { -+ h->error = -+ "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring."; -+ h->result = RETERR(-EIO); -+ break; -+ } -+ switch (cbk_level_lookup(h)) { -+ case LOOKUP_CONT: -+ move_lh(h->parent_lh, h->active_lh); -+ continue; -+ default: -+ wrong_return_value("nikita-372", "cbk_level"); -+ case LOOKUP_DONE: -+ done = 1; -+ break; -+ case LOOKUP_REST: -+ hput(h); -+ /* deadlock avoidance is normal case. */ -+ if (h->result != -E_DEADLOCK) -+ ++iterations; -+ reiser4_preempt_point(); -+ goto restart; -+ } -+ } -+ /* that's all. 
The rest is error handling */ -+ if (unlikely(h->error != NULL)) { -+ warning("nikita-373", "%s: level: %i, " -+ "lock_level: %i, stop_level: %i " -+ "lock_mode: %s, bias: %s", -+ h->error, h->level, h->lock_level, h->stop_level, -+ lock_mode_name(h->lock_mode), bias_name(h->bias)); -+ reiser4_print_address("block", &h->block); -+ reiser4_print_key("key", h->key); -+ print_coord_content("coord", h->coord); -+ } -+ /* `unlikely' error case */ -+ if (unlikely(IS_CBKERR(h->result))) { -+ /* failure. do cleanup */ -+ hput(h); -+ } else { -+ assert("nikita-1605", WITH_DATA_RET -+ (h->coord->node, 1, -+ ergo((h->result == CBK_COORD_FOUND) && -+ (h->bias == FIND_EXACT) && -+ (!node_is_empty(h->coord->node)), -+ coord_is_existing_item(h->coord)))); -+ } -+ return h->result; -+} -+ -+/* find delimiting keys of child -+ -+ Determine left and right delimiting keys for child pointed to by -+ @parent_coord. -+ -+*/ -+static void find_child_delimiting_keys(znode * parent /* parent znode, passed -+ * locked */ , -+ const coord_t * parent_coord /* coord where -+ * pointer to -+ * child is -+ * stored */ , -+ reiser4_key * ld /* where to store left -+ * delimiting key */ , -+ reiser4_key * rd /* where to store right -+ * delimiting key */ ) -+{ -+ coord_t neighbor; -+ -+ assert("nikita-1484", parent != NULL); -+ assert_rw_locked(&(znode_get_tree(parent)->dk_lock)); -+ -+ coord_dup(&neighbor, parent_coord); -+ -+ if (neighbor.between == AT_UNIT) -+ /* imitate item ->lookup() behavior. 
*/ -+ neighbor.between = AFTER_UNIT; -+ -+ if (coord_set_to_left(&neighbor) == 0) -+ unit_key_by_coord(&neighbor, ld); -+ else { -+ assert("nikita-14851", 0); -+ *ld = *znode_get_ld_key(parent); -+ } -+ -+ coord_dup(&neighbor, parent_coord); -+ if (neighbor.between == AT_UNIT) -+ neighbor.between = AFTER_UNIT; -+ if (coord_set_to_right(&neighbor) == 0) -+ unit_key_by_coord(&neighbor, rd); -+ else -+ *rd = *znode_get_rd_key(parent); -+} -+ -+/* -+ * setup delimiting keys for a child -+ * -+ * @parent parent node -+ * -+ * @coord location in @parent where pointer to @child is -+ * -+ * @child child node -+ */ -+int -+set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child) -+{ -+ reiser4_tree *tree; -+ -+ assert("nikita-2952", -+ znode_get_level(parent) == znode_get_level(coord->node)); -+ -+ /* fast check without taking dk lock. This is safe, because -+ * JNODE_DKSET is never cleared once set. */ -+ if (!ZF_ISSET(child, JNODE_DKSET)) { -+ tree = znode_get_tree(parent); -+ write_lock_dk(tree); -+ if (likely(!ZF_ISSET(child, JNODE_DKSET))) { -+ find_child_delimiting_keys(parent, coord, -+ &child->ld_key, -+ &child->rd_key); -+ ON_DEBUG(child->ld_key_version = -+ atomic_inc_return(&delim_key_version); -+ child->rd_key_version = -+ atomic_inc_return(&delim_key_version);); -+ ZF_SET(child, JNODE_DKSET); -+ } -+ write_unlock_dk(tree); -+ return 1; -+ } -+ return 0; -+} -+ -+/* Perform tree lookup at one level. This is called from cbk_traverse() -+ function that drives lookup through tree and calls cbk_node_lookup() to -+ perform lookup within one node. -+ -+ See comments in a code. 
-+*/ -+static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ ) -+{ -+ int ret; -+ int setdk; -+ int ldkeyset = 0; -+ reiser4_key ldkey; -+ reiser4_key key; -+ znode *active; -+ -+ assert("nikita-3025", reiser4_schedulable()); -+ -+ /* acquire reference to @active node */ -+ active = -+ zget(h->tree, &h->block, h->parent_lh->node, h->level, -+ reiser4_ctx_gfp_mask_get()); -+ -+ if (IS_ERR(active)) { -+ h->result = PTR_ERR(active); -+ return LOOKUP_DONE; -+ } -+ -+ /* lock @active */ -+ h->result = longterm_lock_znode(h->active_lh, -+ active, -+ cbk_lock_mode(h->level, h), -+ ZNODE_LOCK_LOPRI); -+ /* longterm_lock_znode() acquires additional reference to znode (which -+ will be later released by longterm_unlock_znode()). Release -+ reference acquired by zget(). -+ */ -+ zput(active); -+ if (unlikely(h->result != 0)) -+ goto fail_or_restart; -+ -+ setdk = 0; -+ /* if @active is accessed for the first time, setup delimiting keys on -+ it. Delimiting keys are taken from the parent node. See -+ setup_delimiting_keys() for details. -+ */ -+ if (h->flags & CBK_DKSET) { -+ setdk = setup_delimiting_keys(h); -+ h->flags &= ~CBK_DKSET; -+ } else { -+ znode *parent; -+ -+ parent = h->parent_lh->node; -+ h->result = zload(parent); -+ if (unlikely(h->result != 0)) -+ goto fail_or_restart; -+ -+ if (!ZF_ISSET(active, JNODE_DKSET)) -+ setdk = set_child_delimiting_keys(parent, -+ h->coord, active); -+ else { -+ read_lock_dk(h->tree); -+ find_child_delimiting_keys(parent, h->coord, &ldkey, -+ &key); -+ read_unlock_dk(h->tree); -+ ldkeyset = 1; -+ } -+ zrelse(parent); -+ } -+ -+ /* this is ugly kludge. Reminder: this is necessary, because -+ ->lookup() method returns coord with ->between field probably set -+ to something different from AT_UNIT. 
-+ */ -+ h->coord->between = AT_UNIT; -+ -+ if (znode_just_created(active) && (h->coord->node != NULL)) { -+ write_lock_tree(h->tree); -+ /* if we are going to load znode right now, setup -+ ->in_parent: coord where pointer to this node is stored in -+ parent. -+ */ -+ coord_to_parent_coord(h->coord, &active->in_parent); -+ write_unlock_tree(h->tree); -+ } -+ -+ /* check connectedness without holding tree lock---false negatives -+ * will be re-checked by connect_znode(), and false positives are -+ * impossible---@active cannot suddenly turn into unconnected -+ * state. */ -+ if (!znode_is_connected(active)) { -+ h->result = connect_znode(h->coord, active); -+ if (unlikely(h->result != 0)) { -+ put_parent(h); -+ goto fail_or_restart; -+ } -+ } -+ -+ jload_prefetch(ZJNODE(active)); -+ -+ if (setdk) -+ update_stale_dk(h->tree, active); -+ -+ /* put_parent() cannot be called earlier, because connect_znode() -+ assumes parent node is referenced; */ -+ put_parent(h); -+ -+ if ((!znode_contains_key_lock(active, h->key) && -+ (h->flags & CBK_TRUST_DK)) -+ || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) { -+ /* 1. key was moved out of this node while this thread was -+ waiting for the lock. Restart. More elaborate solution is -+ to determine where key moved (to the left, or to the right) -+ and try to follow it through sibling pointers. -+ -+ 2. or, node itself is going to be removed from the -+ tree. Release lock and restart. -+ */ -+ h->result = -E_REPEAT; -+ } -+ if (h->result == -E_REPEAT) -+ return LOOKUP_REST; -+ -+ h->result = zload_ra(active, h->ra_info); -+ if (h->result) { -+ return LOOKUP_DONE; -+ } -+ -+ /* sanity checks */ -+ if (sanity_check(h)) { -+ zrelse(active); -+ return LOOKUP_DONE; -+ } -+ -+ /* check that key of leftmost item in the @active is the same as in -+ * its parent */ -+ if (ldkeyset && !node_is_empty(active) && -+ !keyeq(leftmost_key_in_node(active, &key), &ldkey)) { -+ warning("vs-3533", "Keys are inconsistent. 
Fsck?"); -+ reiser4_print_key("inparent", &ldkey); -+ reiser4_print_key("inchild", &key); -+ h->result = RETERR(-EIO); -+ zrelse(active); -+ return LOOKUP_DONE; -+ } -+ -+ if (h->object != NULL) -+ handle_vroot(h->object, active); -+ -+ ret = cbk_node_lookup(h); -+ -+ /* h->active_lh->node might change, but active is yet to be zrelsed */ -+ zrelse(active); -+ -+ return ret; -+ -+ fail_or_restart: -+ if (h->result == -E_DEADLOCK) -+ return LOOKUP_REST; -+ return LOOKUP_DONE; -+} -+ -+#if REISER4_DEBUG -+/* check left and right delimiting keys of a znode */ -+void check_dkeys(znode * node) -+{ -+ znode *left; -+ znode *right; -+ -+ read_lock_tree(current_tree); -+ read_lock_dk(current_tree); -+ -+ assert("vs-1710", znode_is_any_locked(node)); -+ assert("vs-1197", -+ !keygt(znode_get_ld_key(node), znode_get_rd_key(node))); -+ -+ left = node->left; -+ right = node->right; -+ -+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET) -+ && left != NULL && ZF_ISSET(left, JNODE_DKSET)) -+ /* check left neighbor. Note that left neighbor is not locked, -+ so it might get wrong delimiting keys therefore */ -+ assert("vs-1198", -+ (keyeq(znode_get_rd_key(left), znode_get_ld_key(node)) -+ || ZF_ISSET(left, JNODE_HEARD_BANSHEE))); -+ -+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET) -+ && right != NULL && ZF_ISSET(right, JNODE_DKSET)) -+ /* check right neighbor. 
Note that right neighbor is not -+ locked, so it might get wrong delimiting keys therefore */ -+ assert("vs-1199", -+ (keyeq(znode_get_rd_key(node), znode_get_ld_key(right)) -+ || ZF_ISSET(right, JNODE_HEARD_BANSHEE))); -+ -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+} -+#endif -+ -+/* true if @key is left delimiting key of @node */ -+static int key_is_ld(znode * node, const reiser4_key * key) -+{ -+ int ld; -+ -+ assert("nikita-1716", node != NULL); -+ assert("nikita-1758", key != NULL); -+ -+ read_lock_dk(znode_get_tree(node)); -+ assert("nikita-1759", znode_contains_key(node, key)); -+ ld = keyeq(znode_get_ld_key(node), key); -+ read_unlock_dk(znode_get_tree(node)); -+ return ld; -+} -+ -+/* Process one node during tree traversal. -+ -+ This is called by cbk_level_lookup(). */ -+static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ ) -+{ -+ /* node plugin of @active */ -+ node_plugin *nplug; -+ /* item plugin of item that was found */ -+ item_plugin *iplug; -+ /* search bias */ -+ lookup_bias node_bias; -+ /* node we are operating upon */ -+ znode *active; -+ /* tree we are searching in */ -+ reiser4_tree *tree; -+ /* result */ -+ int result; -+ -+ assert("nikita-379", h != NULL); -+ -+ active = h->active_lh->node; -+ tree = h->tree; -+ -+ nplug = active->nplug; -+ assert("nikita-380", nplug != NULL); -+ -+ ON_DEBUG(check_dkeys(active)); -+ -+ /* return item from "active" node with maximal key not greater than -+ "key" */ -+ node_bias = h->bias; -+ result = nplug->lookup(active, h->key, node_bias, h->coord); -+ if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) { -+ /* error occurred */ -+ h->result = result; -+ return LOOKUP_DONE; -+ } -+ if (h->level == h->stop_level) { -+ /* welcome to the stop level */ -+ assert("nikita-381", h->coord->node == active); -+ if (result == NS_FOUND) { -+ /* success of tree lookup */ -+ if (!(h->flags & CBK_UNIQUE) -+ && key_is_ld(active, h->key)) { -+ return 
search_to_left(h); -+ } else -+ h->result = CBK_COORD_FOUND; -+ } else { -+ h->result = CBK_COORD_NOTFOUND; -+ } -+ if (!(h->flags & CBK_IN_CACHE)) -+ cbk_cache_add(active); -+ return LOOKUP_DONE; -+ } -+ -+ if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) { -+ h->error = "not found on internal node"; -+ h->result = result; -+ return LOOKUP_DONE; -+ } -+ -+ assert("vs-361", h->level > h->stop_level); -+ -+ if (handle_eottl(h, &result)) { -+ assert("vs-1674", (result == LOOKUP_DONE || -+ result == LOOKUP_REST)); -+ return result; -+ } -+ -+ /* go down to next level */ -+ check_me("vs-12", zload(h->coord->node) == 0); -+ assert("nikita-2116", item_is_internal(h->coord)); -+ iplug = item_plugin_by_coord(h->coord); -+ iplug->s.internal.down_link(h->coord, h->key, &h->block); -+ zrelse(h->coord->node); -+ --h->level; -+ return LOOKUP_CONT; /* continue */ -+} -+ -+/* scan cbk_cache slots looking for a match for @h */ -+static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ ) -+{ -+ level_lookup_result llr; -+ znode *node; -+ reiser4_tree *tree; -+ cbk_cache_slot *slot; -+ cbk_cache *cache; -+ tree_level level; -+ int isunique; -+ const reiser4_key *key; -+ int result; -+ -+ assert("nikita-1317", h != NULL); -+ assert("nikita-1315", h->tree != NULL); -+ assert("nikita-1316", h->key != NULL); -+ -+ tree = h->tree; -+ cache = &tree->cbk_cache; -+ if (cache->nr_slots == 0) -+ /* size of cbk cache was set to 0 by mount time option. */ -+ return RETERR(-ENOENT); -+ -+ assert("nikita-2474", cbk_cache_invariant(cache)); -+ node = NULL; /* to keep gcc happy */ -+ level = h->level; -+ key = h->key; -+ isunique = h->flags & CBK_UNIQUE; -+ result = RETERR(-ENOENT); -+ -+ /* -+ * this is time-critical function and dragons had, hence, been settled -+ * here. -+ * -+ * Loop below scans cbk cache slots trying to find matching node with -+ * suitable range of delimiting keys and located at the h->level. 
-+ * -+ * Scan is done under cbk cache spin lock that protects slot->node -+ * pointers. If suitable node is found we want to pin it in -+ * memory. But slot->node can point to the node with x_count 0 -+ * (unreferenced). Such node can be recycled at any moment, or can -+ * already be in the process of being recycled (within jput()). -+ * -+ * As we found node in the cbk cache, it means that jput() hasn't yet -+ * called cbk_cache_invalidate(). -+ * -+ * We acquire reference to the node without holding tree lock, and -+ * later, check node's RIP bit. This avoids races with jput(). -+ */ -+ -+ rcu_read_lock(); -+ read_lock(&((cbk_cache *)cache)->guard); -+ -+ slot = list_entry(cache->lru.next, cbk_cache_slot, lru); -+ slot = list_entry(slot->lru.prev, cbk_cache_slot, lru); -+ BUG_ON(&slot->lru != &cache->lru);/*????*/ -+ while (1) { -+ -+ slot = list_entry(slot->lru.next, cbk_cache_slot, lru); -+ -+ if (&cache->lru != &slot->lru) -+ node = slot->node; -+ else -+ node = NULL; -+ -+ if (unlikely(node == NULL)) -+ break; -+ -+ /* -+ * this is (hopefully) the only place in the code where we are -+ * working with delimiting keys without holding dk lock. This -+ * is fine here, because this is only "guess" anyway---keys -+ * are rechecked under dk lock below. 
-+ */ -+ if (znode_get_level(node) == level && -+ /* reiser4_min_key < key < reiser4_max_key */ -+ znode_contains_key_strict(node, key, isunique)) { -+ zref(node); -+ result = 0; -+ spin_lock_prefetch(&tree->tree_lock); -+ break; -+ } -+ } -+ read_unlock(&((cbk_cache *)cache)->guard); -+ -+ assert("nikita-2475", cbk_cache_invariant(cache)); -+ -+ if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP))) -+ result = -ENOENT; -+ -+ rcu_read_unlock(); -+ -+ if (result != 0) { -+ h->result = CBK_COORD_NOTFOUND; -+ return RETERR(-ENOENT); -+ } -+ -+ result = -+ longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h), -+ ZNODE_LOCK_LOPRI); -+ zput(node); -+ if (result != 0) -+ return result; -+ result = zload(node); -+ if (result != 0) -+ return result; -+ -+ /* recheck keys */ -+ read_lock_dk(tree); -+ result = (znode_contains_key_strict(node, key, isunique) && -+ !ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ read_unlock_dk(tree); -+ if (result) { -+ /* do lookup inside node */ -+ llr = cbk_node_lookup(h); -+ /* if cbk_node_lookup() wandered to another node (due to eottl -+ or non-unique keys), adjust @node */ -+ /*node = h->active_lh->node; */ -+ -+ if (llr != LOOKUP_DONE) { -+ /* restart or continue on the next level */ -+ result = RETERR(-ENOENT); -+ } else if (IS_CBKERR(h->result)) -+ /* io or oom */ -+ result = RETERR(-ENOENT); -+ else { -+ /* good. Either item found or definitely not found. */ -+ result = 0; -+ -+ write_lock(&(cache->guard)); -+ if (slot->node == h->active_lh->node /*node */ ) { -+ /* if this node is still in cbk cache---move -+ its slot to the head of the LRU list. */ -+ list_move(&slot->lru, &cache->lru); -+ } -+ write_unlock(&(cache->guard)); -+ } -+ } else { -+ /* race. While this thread was waiting for the lock, node was -+ rebalanced and item we are looking for, shifted out of it -+ (if it ever was here). 
-+ -+ Continuing scanning is almost hopeless: node key range was -+ moved to, is almost certainly at the beginning of the LRU -+ list at this time, because it's hot, but restarting -+ scanning from the very beginning is complex. Just return, -+ so that cbk() will be performed. This is not that -+ important, because such races should be rare. Are they? -+ */ -+ result = RETERR(-ENOENT); /* -ERAUGHT */ -+ } -+ zrelse(node); -+ assert("nikita-2476", cbk_cache_invariant(cache)); -+ return result; -+} -+ -+/* look for item with given key in the coord cache -+ -+ This function, called by coord_by_key(), scans "coord cache" (&cbk_cache) -+ which is a small LRU list of znodes accessed lately. For each znode in -+ znode in this list, it checks whether key we are looking for fits into key -+ range covered by this node. If so, and in addition, node lies at allowed -+ level (this is to handle extents on a twig level), node is locked, and -+ lookup inside it is performed. -+ -+ we need a measurement of the cost of this cache search compared to the cost -+ of coord_by_key. -+ -+*/ -+static int cbk_cache_search(cbk_handle * h /* cbk handle */ ) -+{ -+ int result = 0; -+ tree_level level; -+ -+ /* add CBK_IN_CACHE to the handle flags. This means that -+ * cbk_node_lookup() assumes that cbk_cache is scanned and would add -+ * found node to the cache. */ -+ h->flags |= CBK_IN_CACHE; -+ for (level = h->stop_level; level <= h->lock_level; ++level) { -+ h->level = level; -+ result = cbk_cache_scan_slots(h); -+ if (result != 0) { -+ done_lh(h->active_lh); -+ done_lh(h->parent_lh); -+ } else { -+ assert("nikita-1319", !IS_CBKERR(h->result)); -+ break; -+ } -+ } -+ h->flags &= ~CBK_IN_CACHE; -+ return result; -+} -+ -+/* type of lock we want to obtain during tree traversal. On stop level -+ we want type of lock user asked for, on upper levels: read lock. 
*/ -+znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h) -+{ -+ assert("nikita-382", h != NULL); -+ -+ return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK; -+} -+ -+/* update outdated delimiting keys */ -+static void stale_dk(reiser4_tree * tree, znode * node) -+{ -+ znode *right; -+ -+ read_lock_tree(tree); -+ write_lock_dk(tree); -+ right = node->right; -+ -+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && -+ right && ZF_ISSET(right, JNODE_DKSET) && -+ !keyeq(znode_get_rd_key(node), znode_get_ld_key(right))) -+ znode_set_rd_key(node, znode_get_ld_key(right)); -+ -+ write_unlock_dk(tree); -+ read_unlock_tree(tree); -+} -+ -+/* check for possibly outdated delimiting keys, and update them if -+ * necessary. */ -+static void update_stale_dk(reiser4_tree * tree, znode * node) -+{ -+ znode *right; -+ reiser4_key rd; -+ -+ read_lock_tree(tree); -+ read_lock_dk(tree); -+ rd = *znode_get_rd_key(node); -+ right = node->right; -+ if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && -+ right && ZF_ISSET(right, JNODE_DKSET) && -+ !keyeq(&rd, znode_get_ld_key(right)))) { -+ assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET)); -+ read_unlock_dk(tree); -+ read_unlock_tree(tree); -+ stale_dk(tree, node); -+ return; -+ } -+ read_unlock_dk(tree); -+ read_unlock_tree(tree); -+} -+ -+/* -+ * handle searches a the non-unique key. -+ * -+ * Suppose that we are looking for an item with possibly non-unique key 100. -+ * -+ * Root node contains two pointers: one to a node with left delimiting key 0, -+ * and another to a node with left delimiting key 100. Item we interested in -+ * may well happen in the sub-tree rooted at the first pointer. -+ * -+ * To handle this search_to_left() is called when search reaches stop -+ * level. 
This function checks it is _possible_ that item we are looking for -+ * is in the left neighbor (this can be done by comparing delimiting keys) and -+ * if so, tries to lock left neighbor (this is low priority lock, so it can -+ * deadlock, tree traversal is just restarted if it did) and then checks -+ * whether left neighbor actually contains items with our key. -+ * -+ * Note that this is done on the stop level only. It is possible to try such -+ * left-check on each level, but as duplicate keys are supposed to be rare -+ * (very unlikely that more than one node is completely filled with items with -+ * duplicate keys), it sis cheaper to scan to the left on the stop level once. -+ * -+ */ -+static level_lookup_result search_to_left(cbk_handle * h /* search handle */ ) -+{ -+ level_lookup_result result; -+ coord_t *coord; -+ znode *node; -+ znode *neighbor; -+ -+ lock_handle lh; -+ -+ assert("nikita-1761", h != NULL); -+ assert("nikita-1762", h->level == h->stop_level); -+ -+ init_lh(&lh); -+ coord = h->coord; -+ node = h->active_lh->node; -+ assert("nikita-1763", coord_is_leftmost_unit(coord)); -+ -+ h->result = -+ reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode, -+ GN_CAN_USE_UPPER_LEVELS); -+ neighbor = NULL; -+ switch (h->result) { -+ case -E_DEADLOCK: -+ result = LOOKUP_REST; -+ break; -+ case 0:{ -+ node_plugin *nplug; -+ coord_t crd; -+ lookup_bias bias; -+ -+ neighbor = lh.node; -+ h->result = zload(neighbor); -+ if (h->result != 0) { -+ result = LOOKUP_DONE; -+ break; -+ } -+ -+ nplug = neighbor->nplug; -+ -+ coord_init_zero(&crd); -+ bias = h->bias; -+ h->bias = FIND_EXACT; -+ h->result = -+ nplug->lookup(neighbor, h->key, h->bias, &crd); -+ h->bias = bias; -+ -+ if (h->result == NS_NOT_FOUND) { -+ case -E_NO_NEIGHBOR: -+ h->result = CBK_COORD_FOUND; -+ if (!(h->flags & CBK_IN_CACHE)) -+ cbk_cache_add(node); -+ default: /* some other error */ -+ result = LOOKUP_DONE; -+ } else if (h->result == NS_FOUND) { -+ read_lock_dk(znode_get_tree(neighbor)); 
-+ h->rd_key = *znode_get_ld_key(node); -+ leftmost_key_in_node(neighbor, &h->ld_key); -+ read_unlock_dk(znode_get_tree(neighbor)); -+ h->flags |= CBK_DKSET; -+ -+ h->block = *znode_get_block(neighbor); -+ /* clear coord -> node so that cbk_level_lookup() -+ wouldn't overwrite parent hint in neighbor. -+ -+ Parent hint was set up by -+ reiser4_get_left_neighbor() -+ */ -+ /* FIXME: why do we have to spinlock here? */ -+ write_lock_tree(znode_get_tree(neighbor)); -+ h->coord->node = NULL; -+ write_unlock_tree(znode_get_tree(neighbor)); -+ result = LOOKUP_CONT; -+ } else { -+ result = LOOKUP_DONE; -+ } -+ if (neighbor != NULL) -+ zrelse(neighbor); -+ } -+ } -+ done_lh(&lh); -+ return result; -+} -+ -+/* debugging aid: return symbolic name of search bias */ -+static const char *bias_name(lookup_bias bias /* bias to get name of */ ) -+{ -+ if (bias == FIND_EXACT) -+ return "exact"; -+ else if (bias == FIND_MAX_NOT_MORE_THAN) -+ return "left-slant"; -+/* else if( bias == RIGHT_SLANT_BIAS ) */ -+/* return "right-bias"; */ -+ else { -+ static char buf[30]; -+ -+ sprintf(buf, "unknown: %i", bias); -+ return buf; -+ } -+} -+ -+#if REISER4_DEBUG -+/* debugging aid: print human readable information about @p */ -+void print_coord_content(const char *prefix /* prefix to print */ , -+ coord_t * p /* coord to print */ ) -+{ -+ reiser4_key key; -+ -+ if (p == NULL) { -+ printk("%s: null\n", prefix); -+ return; -+ } -+ if ((p->node != NULL) && znode_is_loaded(p->node) -+ && coord_is_existing_item(p)) -+ printk("%s: data: %p, length: %i\n", prefix, -+ item_body_by_coord(p), item_length_by_coord(p)); -+ if (znode_is_loaded(p->node)) { -+ item_key_by_coord(p, &key); -+ reiser4_print_key(prefix, &key); -+ } -+} -+ -+/* debugging aid: print human readable information about @block */ -+void reiser4_print_address(const char *prefix /* prefix to print */ , -+ const reiser4_block_nr * block /* block number to print */ ) -+{ -+ printk("%s: %s\n", prefix, sprint_address(block)); -+} -+#endif 
-+ -+/* return string containing human readable representation of @block */ -+char *sprint_address(const reiser4_block_nr * -+ block /* block number to print */ ) -+{ -+ static char address[30]; -+ -+ if (block == NULL) -+ sprintf(address, "null"); -+ else if (reiser4_blocknr_is_fake(block)) -+ sprintf(address, "%llx", (unsigned long long)(*block)); -+ else -+ sprintf(address, "%llu", (unsigned long long)(*block)); -+ return address; -+} -+ -+/* release parent node during traversal */ -+static void put_parent(cbk_handle * h /* search handle */ ) -+{ -+ assert("nikita-383", h != NULL); -+ if (h->parent_lh->node != NULL) { -+ longterm_unlock_znode(h->parent_lh); -+ } -+} -+ -+/* helper function used by coord_by_key(): release reference to parent znode -+ stored in handle before processing its child. */ -+static void hput(cbk_handle * h /* search handle */ ) -+{ -+ assert("nikita-385", h != NULL); -+ done_lh(h->parent_lh); -+ done_lh(h->active_lh); -+} -+ -+/* Helper function used by cbk(): update delimiting keys of child node (stored -+ in h->active_lh->node) using key taken from parent on the parent level. */ -+static int setup_delimiting_keys(cbk_handle * h /* search handle */ ) -+{ -+ znode *active; -+ reiser4_tree *tree; -+ -+ assert("nikita-1088", h != NULL); -+ -+ active = h->active_lh->node; -+ -+ /* fast check without taking dk lock. This is safe, because -+ * JNODE_DKSET is never cleared once set. */ -+ if (!ZF_ISSET(active, JNODE_DKSET)) { -+ tree = znode_get_tree(active); -+ write_lock_dk(tree); -+ if (!ZF_ISSET(active, JNODE_DKSET)) { -+ znode_set_ld_key(active, &h->ld_key); -+ znode_set_rd_key(active, &h->rd_key); -+ ZF_SET(active, JNODE_DKSET); -+ } -+ write_unlock_dk(tree); -+ return 1; -+ } -+ return 0; -+} -+ -+/* true if @block makes sense for the @tree. 
Used to detect corrupted node -+ * pointers */ -+static int -+block_nr_is_correct(reiser4_block_nr * block /* block number to check */ , -+ reiser4_tree * tree /* tree to check against */ ) -+{ -+ assert("nikita-757", block != NULL); -+ assert("nikita-758", tree != NULL); -+ -+ /* check to see if it exceeds the size of the device. */ -+ return reiser4_blocknr_is_sane_for(tree->super, block); -+} -+ -+/* check consistency of fields */ -+static int sanity_check(cbk_handle * h /* search handle */ ) -+{ -+ assert("nikita-384", h != NULL); -+ -+ if (h->level < h->stop_level) { -+ h->error = "Buried under leaves"; -+ h->result = RETERR(-EIO); -+ return LOOKUP_DONE; -+ } else if (!block_nr_is_correct(&h->block, h->tree)) { -+ h->error = "bad block number"; -+ h->result = RETERR(-EIO); -+ return LOOKUP_DONE; -+ } else -+ return 0; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/status_flags.c linux-2.6.20/fs/reiser4/status_flags.c ---- linux-2.6.20.orig/fs/reiser4/status_flags.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/status_flags.c 2007-05-06 14:50:43.875030717 +0400 -@@ -0,0 +1,175 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Functions that deal with reiser4 status block, query status and update it, if needed */ -+ -+#include -+#include -+#include -+#include -+#include "debug.h" -+#include "dformat.h" -+#include "status_flags.h" -+#include "super.h" -+ -+/* This is our end I/O handler that marks page uptodate if IO was successful. It also -+ unconditionally unlocks the page, so we can see that io was done. -+ We do not free bio, because we hope to reuse that. 
*/ -+static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done, -+ int err) -+{ -+ if (bio->bi_size) -+ return 1; -+ -+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) { -+ SetPageUptodate(bio->bi_io_vec->bv_page); -+ } else { -+ ClearPageUptodate(bio->bi_io_vec->bv_page); -+ SetPageError(bio->bi_io_vec->bv_page); -+ } -+ unlock_page(bio->bi_io_vec->bv_page); -+ return 0; -+} -+ -+/* Initialise status code. This is expected to be called from the disk format -+ code. block paremeter is where status block lives. */ -+int reiser4_status_init(reiser4_block_nr block) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ struct reiser4_status *statuspage; -+ struct bio *bio; -+ struct page *page; -+ -+ get_super_private(sb)->status_page = NULL; -+ get_super_private(sb)->status_bio = NULL; -+ -+ page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0); -+ if (!page) -+ return -ENOMEM; -+ -+ bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1); -+ if (bio != NULL) { -+ bio->bi_sector = block * (sb->s_blocksize >> 9); -+ bio->bi_bdev = sb->s_bdev; -+ bio->bi_io_vec[0].bv_page = page; -+ bio->bi_io_vec[0].bv_len = sb->s_blocksize; -+ bio->bi_io_vec[0].bv_offset = 0; -+ bio->bi_vcnt = 1; -+ bio->bi_size = sb->s_blocksize; -+ bio->bi_end_io = reiser4_status_endio; -+ } else { -+ __free_pages(page, 0); -+ return -ENOMEM; -+ } -+ lock_page(page); -+ submit_bio(READ, bio); -+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping); -+ wait_on_page_locked(page); -+ if (!PageUptodate(page)) { -+ warning("green-2007", -+ "I/O error while tried to read status page\n"); -+ return -EIO; -+ } -+ -+ statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0); -+ if (memcmp -+ (statuspage->magic, REISER4_STATUS_MAGIC, -+ sizeof(REISER4_STATUS_MAGIC))) { -+ /* Magic does not match. 
*/ -+ kunmap_atomic((char *)statuspage, KM_USER0); -+ warning("green-2008", "Wrong magic in status block\n"); -+ __free_pages(page, 0); -+ bio_put(bio); -+ return -EINVAL; -+ } -+ kunmap_atomic((char *)statuspage, KM_USER0); -+ -+ get_super_private(sb)->status_page = page; -+ get_super_private(sb)->status_bio = bio; -+ return 0; -+} -+ -+/* Query the status of fs. Returns if the FS can be safely mounted. -+ Also if "status" and "extended" parameters are given, it will fill -+ actual parts of status from disk there. */ -+int reiser4_status_query(u64 * status, u64 * extended) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ struct reiser4_status *statuspage; -+ int retval; -+ -+ if (!get_super_private(sb)->status_page) { // No status page? -+ return REISER4_STATUS_MOUNT_UNKNOWN; -+ } -+ statuspage = (struct reiser4_status *) -+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0); -+ switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work. -+ case REISER4_STATUS_OK: -+ retval = REISER4_STATUS_MOUNT_OK; -+ break; -+ case REISER4_STATUS_CORRUPTED: -+ retval = REISER4_STATUS_MOUNT_WARN; -+ break; -+ case REISER4_STATUS_DAMAGED: -+ case REISER4_STATUS_DESTROYED: -+ case REISER4_STATUS_IOERROR: -+ retval = REISER4_STATUS_MOUNT_RO; -+ break; -+ default: -+ retval = REISER4_STATUS_MOUNT_UNKNOWN; -+ break; -+ } -+ -+ if (status) -+ *status = le64_to_cpu(get_unaligned(&statuspage->status)); -+ if (extended) -+ *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status)); -+ -+ kunmap_atomic((char *)statuspage, KM_USER0); -+ return retval; -+} -+ -+/* This function should be called when something bad happens (e.g. from reiser4_panic). -+ It fills the status structure and tries to push it to disk. 
*/ -+int reiser4_status_write(__u64 status, __u64 extended_status, char *message) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ struct reiser4_status *statuspage; -+ struct bio *bio = get_super_private(sb)->status_bio; -+ -+ if (!get_super_private(sb)->status_page) { // No status page? -+ return -1; -+ } -+ statuspage = (struct reiser4_status *) -+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0); -+ -+ put_unaligned(cpu_to_le64(status), &statuspage->status); -+ put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status); -+ strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN); -+ -+ kunmap_atomic((char *)statuspage, KM_USER0); -+ bio->bi_bdev = sb->s_bdev; -+ bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page; -+ bio->bi_io_vec[0].bv_len = sb->s_blocksize; -+ bio->bi_io_vec[0].bv_offset = 0; -+ bio->bi_vcnt = 1; -+ bio->bi_size = sb->s_blocksize; -+ bio->bi_end_io = reiser4_status_endio; -+ lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page. -+ /* We can block now, but we have no other choice anyway */ -+ submit_bio(WRITE, bio); -+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping); -+ return 0; // We do not wait for io to finish. -+} -+ -+/* Frees the page with status and bio structure. 
Should be called by disk format at umount time */ -+int reiser4_status_finish(void) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ -+ __free_pages(get_super_private(sb)->status_page, 0); -+ get_super_private(sb)->status_page = NULL; -+ bio_put(get_super_private(sb)->status_bio); -+ get_super_private(sb)->status_bio = NULL; -+ return 0; -+} -diff -urN linux-2.6.20.orig/fs/reiser4/status_flags.h linux-2.6.20/fs/reiser4/status_flags.h ---- linux-2.6.20.orig/fs/reiser4/status_flags.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/status_flags.h 2007-05-06 14:50:43.875030717 +0400 -@@ -0,0 +1,43 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Here we declare structures and flags that store reiser4 status on disk. -+ The status that helps us to find out if the filesystem is valid or if it -+ contains some critical, or not so critical errors */ -+ -+#if !defined( __REISER4_STATUS_FLAGS_H__ ) -+#define __REISER4_STATUS_FLAGS_H__ -+ -+#include "dformat.h" -+/* These are major status flags */ -+#define REISER4_STATUS_OK 0 -+#define REISER4_STATUS_CORRUPTED 0x1 -+#define REISER4_STATUS_DAMAGED 0x2 -+#define REISER4_STATUS_DESTROYED 0x4 -+#define REISER4_STATUS_IOERROR 0x8 -+ -+/* Return values for reiser4_status_query() */ -+#define REISER4_STATUS_MOUNT_OK 0 -+#define REISER4_STATUS_MOUNT_WARN 1 -+#define REISER4_STATUS_MOUNT_RO 2 -+#define REISER4_STATUS_MOUNT_UNKNOWN -1 -+ -+#define REISER4_TEXTERROR_LEN 256 -+ -+#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl" -+/* We probably need to keep its size under sector size which is 512 bytes */ -+struct reiser4_status { -+ char magic[16]; -+ d64 status; /* Current FS state */ -+ d64 extended_status; /* Any additional info that might have sense in addition to "status". E.g. 
-+ last sector where io error happened if status is "io error encountered" */ -+ d64 stacktrace[10]; /* Last ten functional calls made (addresses) */ -+ char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */ -+}; -+ -+int reiser4_status_init(reiser4_block_nr block); -+int reiser4_status_query(u64 * status, u64 * extended); -+int reiser4_status_write(u64 status, u64 extended_status, char *message); -+int reiser4_status_finish(void); -+ -+#endif -diff -urN linux-2.6.20.orig/fs/reiser4/super.c linux-2.6.20/fs/reiser4/super.c ---- linux-2.6.20.orig/fs/reiser4/super.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/super.c 2007-05-06 14:50:43.875030717 +0400 -@@ -0,0 +1,316 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Super-block manipulations. */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "plugin/security/perm.h" -+#include "plugin/space/space_allocator.h" -+#include "plugin/plugin.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include /* for __u?? 
*/ -+#include /* for struct super_block */ -+ -+static __u64 reserved_for_gid(const struct super_block *super, gid_t gid); -+static __u64 reserved_for_uid(const struct super_block *super, uid_t uid); -+static __u64 reserved_for_root(const struct super_block *super); -+ -+/* Return reiser4-specific part of super block */ -+reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block -+ * queried */ ) -+{ -+ return (reiser4_super_info_data *) super->s_fs_info; -+} -+ -+/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */ -+long reiser4_statfs_type(const struct super_block *super UNUSED_ARG) -+{ -+ assert("nikita-448", super != NULL); -+ assert("nikita-449", is_reiser4_super(super)); -+ return (long)REISER4_SUPER_MAGIC; -+} -+ -+/* functions to read/modify fields of reiser4_super_info_data */ -+ -+/* get number of blocks in file system */ -+__u64 reiser4_block_count(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("vs-494", super != NULL); -+ assert("vs-495", is_reiser4_super(super)); -+ return get_super_private(super)->block_count; -+} -+ -+#if REISER4_DEBUG -+/* -+ * number of blocks in the current file system -+ */ -+__u64 reiser4_current_block_count(void) -+{ -+ return get_current_super_private()->block_count; -+} -+#endif /* REISER4_DEBUG */ -+ -+/* set number of block in filesystem */ -+void reiser4_set_block_count(const struct super_block *super, __u64 nr) -+{ -+ assert("vs-501", super != NULL); -+ assert("vs-502", is_reiser4_super(super)); -+ get_super_private(super)->block_count = nr; -+ /* -+ * The proper calculation of the reserved space counter (%5 of device -+ * block counter) we need a 64 bit division which is missing in Linux -+ * on i386 platform. Because we do not need a precise calculation here -+ * we can replace a div64 operation by this combination of -+ * multiplication and shift: 51. / (2^10) == .0498 . -+ * FIXME: this is a bug. 
It comes up only for very small filesystems -+ * which probably are never used. Nevertheless, it is a bug. Number of -+ * reserved blocks must be not less than maximal number of blocks which -+ * get grabbed with BA_RESERVED. -+ */ -+ get_super_private(super)->blocks_reserved = ((nr * 51) >> 10); -+} -+ -+/* amount of blocks used (allocated for data) in file system */ -+__u64 reiser4_data_blocks(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("nikita-452", super != NULL); -+ assert("nikita-453", is_reiser4_super(super)); -+ return get_super_private(super)->blocks_used; -+} -+ -+/* set number of block used in filesystem */ -+void reiser4_set_data_blocks(const struct super_block *super, __u64 nr) -+{ -+ assert("vs-503", super != NULL); -+ assert("vs-504", is_reiser4_super(super)); -+ get_super_private(super)->blocks_used = nr; -+} -+ -+/* amount of free blocks in file system */ -+__u64 reiser4_free_blocks(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("nikita-454", super != NULL); -+ assert("nikita-455", is_reiser4_super(super)); -+ return get_super_private(super)->blocks_free; -+} -+ -+/* set number of blocks free in filesystem */ -+void reiser4_set_free_blocks(const struct super_block *super, __u64 nr) -+{ -+ assert("vs-505", super != NULL); -+ assert("vs-506", is_reiser4_super(super)); -+ get_super_private(super)->blocks_free = nr; -+} -+ -+/* get mkfs unique identifier */ -+__u32 reiser4_mkfs_id(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("vpf-221", super != NULL); -+ assert("vpf-222", is_reiser4_super(super)); -+ return get_super_private(super)->mkfs_id; -+} -+ -+/* amount of free blocks in file system */ -+__u64 reiser4_free_committed_blocks(const struct super_block *super) -+{ -+ assert("vs-497", super != NULL); -+ assert("vs-498", is_reiser4_super(super)); -+ return get_super_private(super)->blocks_free_committed; -+} -+ -+/* amount of blocks in the file system 
reserved for @uid and @gid */ -+long reiser4_reserved_blocks(const struct super_block *super /* super block -+ queried */ , -+ uid_t uid /* user id */ , -+ gid_t gid /* group id */ ) -+{ -+ long reserved; -+ -+ assert("nikita-456", super != NULL); -+ assert("nikita-457", is_reiser4_super(super)); -+ -+ reserved = 0; -+ if (REISER4_SUPPORT_GID_SPACE_RESERVATION) -+ reserved += reserved_for_gid(super, gid); -+ if (REISER4_SUPPORT_UID_SPACE_RESERVATION) -+ reserved += reserved_for_uid(super, uid); -+ if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0)) -+ reserved += reserved_for_root(super); -+ return reserved; -+} -+ -+/* get/set value of/to grabbed blocks counter */ -+__u64 reiser4_grabbed_blocks(const struct super_block * super) -+{ -+ assert("zam-512", super != NULL); -+ assert("zam-513", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_grabbed; -+} -+ -+__u64 reiser4_flush_reserved(const struct super_block * super) -+{ -+ assert("vpf-285", super != NULL); -+ assert("vpf-286", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_flush_reserved; -+} -+ -+/* get/set value of/to counter of fake allocated formatted blocks */ -+__u64 reiser4_fake_allocated(const struct super_block * super) -+{ -+ assert("zam-516", super != NULL); -+ assert("zam-517", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_fake_allocated; -+} -+ -+/* get/set value of/to counter of fake allocated unformatted blocks */ -+__u64 reiser4_fake_allocated_unformatted(const struct super_block * super) -+{ -+ assert("zam-516", super != NULL); -+ assert("zam-517", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_fake_allocated_unformatted; -+} -+ -+/* get/set value of/to counter of clustered blocks */ -+__u64 reiser4_clustered_blocks(const struct super_block * super) -+{ -+ assert("edward-601", super != NULL); -+ assert("edward-602", is_reiser4_super(super)); -+ -+ return 
get_super_private(super)->blocks_clustered; -+} -+ -+/* space allocator used by this file system */ -+reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block -+ *super) -+{ -+ assert("nikita-1965", super != NULL); -+ assert("nikita-1966", is_reiser4_super(super)); -+ return &get_super_private(super)->space_allocator; -+} -+ -+/* return fake inode used to bind formatted nodes in the page cache */ -+struct inode *reiser4_get_super_fake(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("nikita-1757", super != NULL); -+ return get_super_private(super)->fake; -+} -+ -+/* return fake inode used to bind copied on capture nodes in the page cache */ -+struct inode *reiser4_get_cc_fake(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("nikita-1757", super != NULL); -+ return get_super_private(super)->cc; -+} -+ -+/* return fake inode used to bind bitmaps and journlal heads */ -+struct inode *reiser4_get_bitmap_fake(const struct super_block *super) -+{ -+ assert("nikita-17571", super != NULL); -+ return get_super_private(super)->bitmap; -+} -+ -+/* tree used by this file system */ -+reiser4_tree *reiser4_get_tree(const struct super_block * super /* super block -+ * queried */ ) -+{ -+ assert("nikita-460", super != NULL); -+ assert("nikita-461", is_reiser4_super(super)); -+ return &get_super_private(super)->tree; -+} -+ -+/* Check that @super is (looks like) reiser4 super block. This is mainly for -+ use in assertions. 
*/ -+int is_reiser4_super(const struct super_block *super /* super block -+ * queried */ ) -+{ -+ return -+ super != NULL && -+ get_super_private(super) != NULL && -+ super->s_op == &(get_super_private(super)->ops.super); -+} -+ -+int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f) -+{ -+ return test_bit((int)f, &get_super_private(super)->fs_flags); -+} -+ -+/* amount of blocks reserved for given group in file system */ -+static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super -+ * block -+ * queried */ , -+ gid_t gid UNUSED_ARG /* group id */ ) -+{ -+ return 0; -+} -+ -+/* amount of blocks reserved for given user in file system */ -+static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super -+ block -+ queried */ , -+ uid_t uid UNUSED_ARG /* user id */ ) -+{ -+ return 0; -+} -+ -+/* amount of blocks reserved for super user in file system */ -+static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super -+ block -+ queried */ ) -+{ -+ return 0; -+} -+ -+/* -+ * true if block number @blk makes sense for the file system at @super. -+ */ -+int -+reiser4_blocknr_is_sane_for(const struct super_block *super, -+ const reiser4_block_nr * blk) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ assert("nikita-2957", super != NULL); -+ assert("nikita-2958", blk != NULL); -+ -+ if (reiser4_blocknr_is_fake(blk)) -+ return 1; -+ -+ sbinfo = get_super_private(super); -+ return *blk < sbinfo->block_count; -+} -+ -+#if REISER4_DEBUG -+/* -+ * true, if block number @blk makes sense for the current file system -+ */ -+int reiser4_blocknr_is_sane(const reiser4_block_nr * blk) -+{ -+ return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk); -+} -+#endif /* REISER4_DEBUG */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/super.h linux-2.6.20/fs/reiser4/super.h ---- linux-2.6.20.orig/fs/reiser4/super.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/super.h 2007-05-06 14:50:43.875030717 +0400 -@@ -0,0 +1,464 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Super-block functions. See super.c for details. */ -+ -+#if !defined( __REISER4_SUPER_H__ ) -+#define __REISER4_SUPER_H__ -+ -+#include "tree.h" -+#include "entd.h" -+#include "wander.h" -+#include "fsdata.h" -+#include "plugin/object.h" -+#include "plugin/space/space_allocator.h" -+ -+/* -+ * Flush algorithms parameters. -+ */ -+typedef struct { -+ unsigned relocate_threshold; -+ unsigned relocate_distance; -+ unsigned written_threshold; -+ unsigned scan_maxnodes; -+} flush_params; -+ -+typedef enum { -+ /* -+ * True if this file system doesn't support hard-links (multiple names) -+ * for directories: this is default UNIX behavior. -+ * -+ * If hard-links on directoires are not allowed, file system is Acyclic -+ * Directed Graph (modulo dot, and dotdot, of course). -+ * -+ * This is used by reiser4_link(). -+ */ -+ REISER4_ADG = 0, -+ /* -+ * set if all nodes in internal tree have the same node layout plugin. -+ * If so, znode_guess_plugin() will return tree->node_plugin in stead -+ * of guessing plugin by plugin id stored in the node. -+ */ -+ REISER4_ONE_NODE_PLUGIN = 1, -+ /* if set, bsd gid assignment is supported. */ -+ REISER4_BSD_GID = 2, -+ /* [mac]_time are 32 bit in inode */ -+ REISER4_32_BIT_TIMES = 3, -+ /* load all bitmap blocks at mount time */ -+ REISER4_DONT_LOAD_BITMAP = 5, -+ /* enforce atomicity during write(2) */ -+ REISER4_ATOMIC_WRITE = 6, -+ /* don't use write barriers in the log writer code. 
*/ -+ REISER4_NO_WRITE_BARRIER = 7 -+} reiser4_fs_flag; -+ -+/* -+ * VFS related operation vectors. -+ */ -+typedef struct object_ops { -+ struct super_operations super; -+ struct dentry_operations dentry; -+ struct export_operations export; -+} object_ops; -+ -+/* reiser4-specific part of super block -+ -+ Locking -+ -+ Fields immutable after mount: -+ -+ ->oid* -+ ->space* -+ ->default_[ug]id -+ ->mkfs_id -+ ->trace_flags -+ ->debug_flags -+ ->fs_flags -+ ->df_plug -+ ->optimal_io_size -+ ->plug -+ ->flush -+ ->u (bad name) -+ ->txnmgr -+ ->ra_params -+ ->fsuid -+ ->journal_header -+ ->journal_footer -+ -+ Fields protected by ->lnode_guard -+ -+ ->lnode_htable -+ -+ Fields protected by per-super block spin lock -+ -+ ->block_count -+ ->blocks_used -+ ->blocks_free -+ ->blocks_free_committed -+ ->blocks_grabbed -+ ->blocks_fake_allocated_unformatted -+ ->blocks_fake_allocated -+ ->blocks_flush_reserved -+ ->eflushed -+ ->blocknr_hint_default -+ -+ After journal replaying during mount, -+ -+ ->last_committed_tx -+ -+ is protected by ->tmgr.commit_mutex -+ -+ Invariants involving this data-type: -+ -+ [sb-block-counts] -+ [sb-grabbed] -+ [sb-fake-allocated] -+*/ -+struct reiser4_super_info_data { -+ /* -+ * guard spinlock which protects reiser4 super block fields (currently -+ * blocks_free, blocks_free_committed) -+ */ -+ spinlock_t guard; -+ -+ /* next oid that will be returned by oid_allocate() */ -+ oid_t next_to_use; -+ /* total number of used oids */ -+ oid_t oids_in_use; -+ -+ /* space manager plugin */ -+ reiser4_space_allocator space_allocator; -+ -+ /* reiser4 internal tree */ -+ reiser4_tree tree; -+ -+ /* -+ * default user id used for light-weight files without their own -+ * stat-data. -+ */ -+ uid_t default_uid; -+ -+ /* -+ * default group id used for light-weight files without their own -+ * stat-data. -+ */ -+ gid_t default_gid; -+ -+ /* mkfs identifier generated at mkfs time. 
*/ -+ __u32 mkfs_id; -+ /* amount of blocks in a file system */ -+ __u64 block_count; -+ -+ /* inviolable reserve */ -+ __u64 blocks_reserved; -+ -+ /* amount of blocks used by file system data and meta-data. */ -+ __u64 blocks_used; -+ -+ /* -+ * amount of free blocks. This is "working" free blocks counter. It is -+ * like "working" bitmap, please see block_alloc.c for description. -+ */ -+ __u64 blocks_free; -+ -+ /* -+ * free block count for fs committed state. This is "commit" version of -+ * free block counter. -+ */ -+ __u64 blocks_free_committed; -+ -+ /* -+ * number of blocks reserved for further allocation, for all -+ * threads. -+ */ -+ __u64 blocks_grabbed; -+ -+ /* number of fake allocated unformatted blocks in tree. */ -+ __u64 blocks_fake_allocated_unformatted; -+ -+ /* number of fake allocated formatted blocks in tree. */ -+ __u64 blocks_fake_allocated; -+ -+ /* number of blocks reserved for flush operations. */ -+ __u64 blocks_flush_reserved; -+ -+ /* number of blocks reserved for cluster operations. */ -+ __u64 blocks_clustered; -+ -+ /* unique file-system identifier */ -+ __u32 fsuid; -+ -+ /* On-disk format version. If does not equal to the disk_format -+ plugin version, some format updates (e.g. enlarging plugin -+ set, etc) may have place on mount. */ -+ int version; -+ -+ /* file-system wide flags. 
See reiser4_fs_flag enum */ -+ unsigned long fs_flags; -+ -+ /* transaction manager */ -+ txn_mgr tmgr; -+ -+ /* ent thread */ -+ entd_context entd; -+ -+ /* fake inode used to bind formatted nodes */ -+ struct inode *fake; -+ /* inode used to bind bitmaps (and journal heads) */ -+ struct inode *bitmap; -+ /* inode used to bind copied on capture nodes */ -+ struct inode *cc; -+ -+ /* disk layout plugin */ -+ disk_format_plugin *df_plug; -+ -+ /* disk layout specific part of reiser4 super info data */ -+ union { -+ format40_super_info format40; -+ } u; -+ -+ /* value we return in st_blksize on stat(2) */ -+ unsigned long optimal_io_size; -+ -+ /* parameters for the flush algorithm */ -+ flush_params flush; -+ -+ /* pointers to jnodes for journal header and footer */ -+ jnode *journal_header; -+ jnode *journal_footer; -+ -+ journal_location jloc; -+ -+ /* head block number of last committed transaction */ -+ __u64 last_committed_tx; -+ -+ /* -+ * we remember last written location for using as a hint for new block -+ * allocation -+ */ -+ __u64 blocknr_hint_default; -+ -+ /* committed number of files (oid allocator state variable ) */ -+ __u64 nr_files_committed; -+ -+ ra_params_t ra_params; -+ -+ /* -+ * A mutex for serializing cut tree operation if out-of-free-space: -+ * the only one cut_tree thread is allowed to grab space from reserved -+ * area (it is 5% of disk space) -+ */ -+ struct mutex delete_mutex; -+ /* task owning ->delete_mutex */ -+ struct task_struct *delete_mutex_owner; -+ -+ /* Diskmap's blocknumber */ -+ __u64 diskmap_block; -+ -+ /* What to do in case of error */ -+ int onerror; -+ -+ /* operations for objects on this file system */ -+ object_ops ops; -+ -+ /* -+ * structure to maintain d_cursors. 
See plugin/file_ops_readdir.c for -+ * more details -+ */ -+ d_cursor_info d_info; -+ -+#ifdef CONFIG_REISER4_BADBLOCKS -+ /* Alternative master superblock offset (in bytes) */ -+ unsigned long altsuper; -+#endif -+ struct repacker *repacker; -+ struct page *status_page; -+ struct bio *status_bio; -+ -+#if REISER4_DEBUG -+ /* -+ * minimum used blocks value (includes super blocks, bitmap blocks and -+ * other fs reserved areas), depends on fs format and fs size. -+ */ -+ __u64 min_blocks_used; -+ -+ /* -+ * when debugging is on, all jnodes (including znodes, bitmaps, etc.) -+ * are kept on a list anchored at sbinfo->all_jnodes. This list is -+ * protected by sbinfo->all_guard spin lock. This lock should be taken -+ * with _irq modifier, because it is also modified from interrupt -+ * contexts (by RCU). -+ */ -+ spinlock_t all_guard; -+ /* list of all jnodes */ -+ struct list_head all_jnodes; -+#endif -+ struct dentry *debugfs_root; -+}; -+ -+extern reiser4_super_info_data *get_super_private_nocheck(const struct -+ super_block *super); -+ -+/* Return reiser4-specific part of super block */ -+static inline reiser4_super_info_data *get_super_private(const struct -+ super_block *super) -+{ -+ assert("nikita-447", super != NULL); -+ -+ return (reiser4_super_info_data *) super->s_fs_info; -+} -+ -+/* get ent context for the @super */ -+static inline entd_context *get_entd_context(struct super_block *super) -+{ -+ return &get_super_private(super)->entd; -+} -+ -+/* "Current" super-block: main super block used during current system -+ call. Reference to this super block is stored in reiser4_context. */ -+static inline struct super_block *reiser4_get_current_sb(void) -+{ -+ return get_current_context()->super; -+} -+ -+/* Reiser4-specific part of "current" super-block: main super block used -+ during current system call. Reference to this super block is stored in -+ reiser4_context. 
*/ -+static inline reiser4_super_info_data *get_current_super_private(void) -+{ -+ return get_super_private(reiser4_get_current_sb()); -+} -+ -+static inline ra_params_t *get_current_super_ra_params(void) -+{ -+ return &(get_current_super_private()->ra_params); -+} -+ -+/* -+ * true, if file system on @super is read-only -+ */ -+static inline int rofs_super(struct super_block *super) -+{ -+ return super->s_flags & MS_RDONLY; -+} -+ -+/* -+ * true, if @tree represents read-only file system -+ */ -+static inline int rofs_tree(reiser4_tree * tree) -+{ -+ return rofs_super(tree->super); -+} -+ -+/* -+ * true, if file system where @inode lives on, is read-only -+ */ -+static inline int rofs_inode(struct inode *inode) -+{ -+ return rofs_super(inode->i_sb); -+} -+ -+/* -+ * true, if file system where @node lives on, is read-only -+ */ -+static inline int rofs_jnode(jnode * node) -+{ -+ return rofs_tree(jnode_get_tree(node)); -+} -+ -+extern __u64 reiser4_current_block_count(void); -+ -+extern void build_object_ops(struct super_block *super, object_ops * ops); -+ -+#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */ -+ -+static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo) -+{ -+ spin_lock(&(sbinfo->guard)); -+} -+ -+static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo) -+{ -+ assert_spin_locked(&(sbinfo->guard)); -+ spin_unlock(&(sbinfo->guard)); -+} -+ -+extern __u64 reiser4_flush_reserved(const struct super_block *); -+extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f); -+extern long reiser4_statfs_type(const struct super_block *super); -+extern __u64 reiser4_block_count(const struct super_block *super); -+extern void reiser4_set_block_count(const struct super_block *super, __u64 nr); -+extern __u64 reiser4_data_blocks(const struct super_block *super); -+extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr); -+extern __u64 reiser4_free_blocks(const struct 
super_block *super); -+extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr); -+extern __u32 reiser4_mkfs_id(const struct super_block *super); -+ -+extern __u64 reiser4_free_committed_blocks(const struct super_block *super); -+ -+extern __u64 reiser4_grabbed_blocks(const struct super_block *); -+extern __u64 reiser4_fake_allocated(const struct super_block *); -+extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *); -+extern __u64 reiser4_clustered_blocks(const struct super_block *); -+ -+extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid, -+ gid_t gid); -+ -+extern reiser4_space_allocator * -+reiser4_get_space_allocator(const struct super_block *super); -+extern reiser4_oid_allocator * -+reiser4_get_oid_allocator(const struct super_block *super); -+extern struct inode *reiser4_get_super_fake(const struct super_block *super); -+extern struct inode *reiser4_get_cc_fake(const struct super_block *super); -+extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super); -+extern reiser4_tree *reiser4_get_tree(const struct super_block *super); -+extern int is_reiser4_super(const struct super_block *super); -+ -+extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk); -+extern int reiser4_blocknr_is_sane_for(const struct super_block *super, -+ const reiser4_block_nr * blk); -+extern int reiser4_fill_super(struct super_block *s, void *data, int silent); -+extern int reiser4_done_super(struct super_block *s); -+ -+/* step of fill super */ -+extern int reiser4_init_fs_info(struct super_block *); -+extern void reiser4_done_fs_info(struct super_block *); -+extern int reiser4_init_super_data(struct super_block *, char *opt_string); -+extern int reiser4_init_read_super(struct super_block *, int silent); -+extern int reiser4_init_root_inode(struct super_block *); -+extern reiser4_plugin *get_default_plugin(pset_member memb); -+ -+/* Maximal possible object id. 
*/ -+#define ABSOLUTE_MAX_OID ((oid_t)~0) -+ -+#define OIDS_RESERVED ( 1 << 16 ) -+int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next); -+oid_t oid_allocate(struct super_block *); -+int oid_release(struct super_block *, oid_t); -+oid_t oid_next(const struct super_block *); -+void oid_count_allocated(void); -+void oid_count_released(void); -+long oids_used(const struct super_block *); -+ -+#if REISER4_DEBUG -+void print_fs_info(const char *prefix, const struct super_block *); -+#endif -+ -+extern void destroy_reiser4_cache(struct kmem_cache **); -+ -+extern struct super_operations reiser4_super_operations; -+extern struct export_operations reiser4_export_operations; -+extern struct dentry_operations reiser4_dentry_operations; -+ -+/* __REISER4_SUPER_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/super_ops.c linux-2.6.20/fs/reiser4/super_ops.c ---- linux-2.6.20.orig/fs/reiser4/super_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/super_ops.c 2007-05-06 14:50:43.879031967 +0400 -@@ -0,0 +1,728 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "inode.h" -+#include "page_cache.h" -+#include "ktxnmgrd.h" -+#include "flush.h" -+#include "safe_link.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+/* slab cache for inodes */ -+static struct kmem_cache *inode_cache; -+ -+static struct dentry *reiser4_debugfs_root = NULL; -+ -+/** -+ * init_once - constructor for reiser4 inodes -+ * @obj: inode to be initialized -+ * @cache: cache @obj belongs to -+ * @flags: SLAB flags -+ * -+ * Initialization function to be called when new page is allocated by reiser4 -+ * inode cache. It is set on inode cache creation. 
-+ */ -+static void init_once(void *obj, struct kmem_cache *cache, unsigned long flags) -+{ -+ reiser4_inode_object *info; -+ -+ info = obj; -+ -+ if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == -+ SLAB_CTOR_CONSTRUCTOR) { -+ /* initialize vfs inode */ -+ inode_init_once(&info->vfs_inode); -+ -+ /* -+ * initialize reiser4 specific part fo inode. -+ * NOTE-NIKITA add here initializations for locks, list heads, -+ * etc. that will be added to our private inode part. -+ */ -+ INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode)); -+ init_rwsem(&info->p.conv_sem); -+ /* init semaphore which is used during inode loading */ -+ loading_init_once(&info->p); -+ INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p), -+ GFP_ATOMIC); -+#if REISER4_DEBUG -+ info->p.nr_jnodes = 0; -+#endif -+ } -+} -+ -+/** -+ * init_inodes - create znode cache -+ * -+ * Initializes slab cache of inodes. It is part of reiser4 module initialization. -+ */ -+static int init_inodes(void) -+{ -+ inode_cache = kmem_cache_create("reiser4_inode", -+ sizeof(reiser4_inode_object), -+ 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, init_once, NULL); -+ if (inode_cache == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+/** -+ * done_inodes - delete inode cache -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+static void done_inodes(void) -+{ -+ destroy_reiser4_cache(&inode_cache); -+} -+ -+/** -+ * reiser4_alloc_inode - alloc_inode of super operations -+ * @super: super block new inode is allocated for -+ * -+ * Allocates new inode, initializes reiser4 specific part of it. 
-+ */ -+static struct inode *reiser4_alloc_inode(struct super_block *super) -+{ -+ reiser4_inode_object *obj; -+ -+ assert("nikita-1696", super != NULL); -+ obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get()); -+ if (obj != NULL) { -+ reiser4_inode *info; -+ -+ info = &obj->p; -+ -+ info->pset = plugin_set_get_empty(); -+ info->hset = plugin_set_get_empty(); -+ info->extmask = 0; -+ info->locality_id = 0ull; -+ info->plugin_mask = 0; -+ info->heir_mask = 0; -+#if !REISER4_INO_IS_OID -+ info->oid_hi = 0; -+#endif -+ reiser4_seal_init(&info->sd_seal, NULL, NULL); -+ coord_init_invalid(&info->sd_coord, NULL); -+ info->flags = 0; -+ spin_lock_init(&info->guard); -+ /* this deals with info's loading semaphore */ -+ loading_alloc(info); -+ info->vroot = UBER_TREE_ADDR; -+ return &obj->vfs_inode; -+ } else -+ return NULL; -+} -+ -+/** -+ * reiser4_destroy_inode - destroy_inode of super operations -+ * @inode: inode being destroyed -+ * -+ * Puts reiser4 specific portion of inode, frees memory occupied by inode. -+ */ -+static void reiser4_destroy_inode(struct inode *inode) -+{ -+ reiser4_inode *info; -+ -+ info = reiser4_inode_data(inode); -+ -+ assert("vs-1220", inode_has_no_jnodes(info)); -+ -+ if (!is_bad_inode(inode) && is_inode_loaded(inode)) { -+ file_plugin *fplug = inode_file_plugin(inode); -+ if (fplug->destroy_inode != NULL) -+ fplug->destroy_inode(inode); -+ } -+ reiser4_dispose_cursors(inode); -+ if (info->pset) -+ plugin_set_put(info->pset); -+ if (info->hset) -+ plugin_set_put(info->hset); -+ -+ /* -+ * cannot add similar assertion about ->i_list as prune_icache return -+ * inode into slab with dangling ->list.{next,prev}. This is safe, -+ * because they are re-initialized in the new_inode(). 
-+ */ -+ assert("nikita-2895", list_empty(&inode->i_dentry)); -+ assert("nikita-2896", hlist_unhashed(&inode->i_hash)); -+ assert("nikita-2898", list_empty_careful(get_readdir_list(inode))); -+ -+ /* this deals with info's loading semaphore */ -+ loading_destroy(info); -+ -+ kmem_cache_free(inode_cache, -+ container_of(info, reiser4_inode_object, p)); -+} -+ -+/** -+ * reiser4_dirty_inode - dirty_inode of super operations -+ * @inode: inode being dirtied -+ * -+ * Updates stat data. -+ */ -+static void reiser4_dirty_inode(struct inode *inode) -+{ -+ int result; -+ -+ if (!is_in_reiser4_context()) -+ return; -+ assert("", !IS_RDONLY(inode)); -+ assert("", (inode_file_plugin(inode)->estimate.update(inode) <= -+ get_current_context()->grabbed_blocks)); -+ -+ result = reiser4_update_sd(inode); -+ if (result) -+ warning("", "failed to dirty inode for %llu: %d", -+ get_inode_oid(inode), result); -+} -+ -+/** -+ * reiser4_delete_inode - delete_inode of super operations -+ * @inode: inode to delete -+ * -+ * Calls file plugin's delete_object method to delete object items from -+ * filesystem tree and calls clear_inode. -+ */ -+static void reiser4_delete_inode(struct inode *inode) -+{ -+ reiser4_context *ctx; -+ file_plugin *fplug; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ warning("vs-15", "failed to init context"); -+ return; -+ } -+ -+ if (is_inode_loaded(inode)) { -+ fplug = inode_file_plugin(inode); -+ if (fplug != NULL && fplug->delete_object != NULL) -+ fplug->delete_object(inode); -+ } -+ -+ truncate_inode_pages(&inode->i_data, 0); -+ inode->i_blocks = 0; -+ clear_inode(inode); -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_put_super - put_super of super operations -+ * @super: super block to free -+ * -+ * Stops daemons, release resources, umounts in short. 
-+ */ -+static void reiser4_put_super(struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo; -+ reiser4_context *ctx; -+ -+ sbinfo = get_super_private(super); -+ assert("vs-1699", sbinfo); -+ -+ debugfs_remove(sbinfo->tmgr.debugfs_atom_count); -+ debugfs_remove(sbinfo->tmgr.debugfs_id_count); -+ debugfs_remove(sbinfo->debugfs_root); -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) { -+ warning("vs-17", "failed to init context"); -+ return; -+ } -+ -+ /* have disk format plugin to free its resources */ -+ if (get_super_private(super)->df_plug->release) -+ get_super_private(super)->df_plug->release(super); -+ -+ reiser4_done_formatted_fake(super); -+ -+ /* stop daemons: ktxnmgr and entd */ -+ reiser4_done_entd(super); -+ reiser4_done_ktxnmgrd(super); -+ reiser4_done_txnmgr(&sbinfo->tmgr); -+ -+ reiser4_done_fs_info(super); -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_write_super - write_super of super operations -+ * @super: super block to write -+ * -+ * Captures znode associated with super block, comit all transactions. -+ */ -+static void reiser4_write_super(struct super_block *super) -+{ -+ int ret; -+ reiser4_context *ctx; -+ -+ assert("vs-1700", !rofs_super(super)); -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) { -+ warning("vs-16", "failed to init context"); -+ return; -+ } -+ -+ ret = reiser4_capture_super_block(super); -+ if (ret != 0) -+ warning("vs-1701", -+ "reiser4_capture_super_block failed in write_super: %d", -+ ret); -+ ret = txnmgr_force_commit_all(super, 0); -+ if (ret != 0) -+ warning("jmacd-77113", -+ "txn_force failed in write_super: %d", ret); -+ -+ super->s_dirt = 0; -+ -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_statfs - statfs of super operations -+ * @super: super block of file system in queried -+ * @stafs: buffer to fill with statistics -+ * -+ * Returns information about filesystem. 
-+ */ -+static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs) -+{ -+ sector_t total; -+ sector_t reserved; -+ sector_t free; -+ sector_t forroot; -+ sector_t deleted; -+ reiser4_context *ctx; -+ struct super_block *super = dentry->d_sb; -+ -+ assert("nikita-408", super != NULL); -+ assert("nikita-409", statfs != NULL); -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ statfs->f_type = reiser4_statfs_type(super); -+ statfs->f_bsize = super->s_blocksize; -+ -+ /* -+ * 5% of total block space is reserved. This is needed for flush and -+ * for truncates (so that we are able to perform truncate/unlink even -+ * on the otherwise completely full file system). If this reservation -+ * is hidden from statfs(2), users will mistakenly guess that they -+ * have enough free space to complete some operation, which is -+ * frustrating. -+ * -+ * Another possible solution is to subtract ->blocks_reserved from -+ * ->f_bfree, but changing available space seems less intrusive than -+ * letting user to see 5% of disk space to be used directly after -+ * mkfs. -+ */ -+ total = reiser4_block_count(super); -+ reserved = get_super_private(super)->blocks_reserved; -+ deleted = txnmgr_count_deleted_blocks(); -+ free = reiser4_free_blocks(super) + deleted; -+ forroot = reiser4_reserved_blocks(super, 0, 0); -+ -+ /* -+ * These counters may be in inconsistent state because we take the -+ * values without keeping any global spinlock. Here we do a sanity -+ * check that free block counter does not exceed the number of all -+ * blocks. 
-+ */ -+ if (free > total) -+ free = total; -+ statfs->f_blocks = total - reserved; -+ /* make sure statfs->f_bfree is never larger than statfs->f_blocks */ -+ if (free > reserved) -+ free -= reserved; -+ else -+ free = 0; -+ statfs->f_bfree = free; -+ -+ if (free > forroot) -+ free -= forroot; -+ else -+ free = 0; -+ statfs->f_bavail = free; -+ -+ statfs->f_files = 0; -+ statfs->f_ffree = 0; -+ -+ /* maximal acceptable name length depends on directory plugin. */ -+ assert("nikita-3351", super->s_root->d_inode != NULL); -+ statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode); -+ reiser4_exit_context(ctx); -+ return 0; -+} -+ -+/** -+ * reiser4_clear_inode - clear_inode of super operation -+ * @inode: inode about to destroy -+ * -+ * Does sanity checks: being destroyed should have all jnodes detached. -+ */ -+static void reiser4_clear_inode(struct inode *inode) -+{ -+#if REISER4_DEBUG -+ reiser4_inode *r4_inode; -+ -+ r4_inode = reiser4_inode_data(inode); -+ if (!inode_has_no_jnodes(r4_inode)) -+ warning("vs-1732", "reiser4 inode has %ld jnodes\n", -+ r4_inode->nr_jnodes); -+#endif -+} -+ -+/** -+ * reiser4_sync_inodes - sync_inodes of super operations -+ * @super: -+ * @wbc: -+ * -+ * This method is called by background and non-backgound writeback. Reiser4's -+ * implementation uses generic_sync_sb_inodes to call reiser4_writepages for -+ * each of dirty inodes. Reiser4_writepages handles pages dirtied via shared -+ * mapping - dirty pages get into atoms. Writeout is called to flush some -+ * atoms. 
-+ */ -+static void reiser4_sync_inodes(struct super_block *super, -+ struct writeback_control *wbc) -+{ -+ reiser4_context *ctx; -+ long to_write; -+ -+ if (wbc->for_kupdate) -+ /* reiser4 has its own means of periodical write-out */ -+ return; -+ -+ to_write = wbc->nr_to_write; -+ assert("vs-49", wbc->older_than_this == NULL); -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) { -+ warning("vs-13", "failed to init context"); -+ return; -+ } -+ -+ /* -+ * call reiser4_writepages for each of dirty inodes to turn dirty pages -+ * into transactions if they were not yet. -+ */ -+ generic_sync_sb_inodes(super, wbc); -+ -+ /* flush goes here */ -+ wbc->nr_to_write = to_write; -+ reiser4_writeout(super, wbc); -+ -+ /* avoid recursive calls to ->sync_inodes */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_show_options - show_options of super operations -+ * @m: file where to write information -+ * @mnt: mount structure -+ * -+ * Makes reiser4 mount options visible in /proc/mounts. 
-+ */ -+static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt) -+{ -+ struct super_block *super; -+ reiser4_super_info_data *sbinfo; -+ -+ super = mnt->mnt_sb; -+ sbinfo = get_super_private(super); -+ -+ seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size); -+ seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age); -+ seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size); -+ seq_printf(m, ",atom_max_flushers=0x%x", -+ sbinfo->tmgr.atom_max_flushers); -+ seq_printf(m, ",cbk_cache_slots=0x%x", -+ sbinfo->tree.cbk_cache.nr_slots); -+ -+ return 0; -+} -+ -+struct super_operations reiser4_super_operations = { -+ .alloc_inode = reiser4_alloc_inode, -+ .destroy_inode = reiser4_destroy_inode, -+ .dirty_inode = reiser4_dirty_inode, -+ .delete_inode = reiser4_delete_inode, -+ .put_super = reiser4_put_super, -+ .write_super = reiser4_write_super, -+ .statfs = reiser4_statfs, -+ .clear_inode = reiser4_clear_inode, -+ .sync_inodes = reiser4_sync_inodes, -+ .show_options = reiser4_show_options -+}; -+ -+/** -+ * fill_super - initialize super block on mount -+ * @super: super block to fill -+ * @data: reiser4 specific mount option -+ * @silent: -+ * -+ * This is to be called by reiser4_get_sb. Mounts filesystem. 
-+ */ -+static int fill_super(struct super_block *super, void *data, int silent) -+{ -+ reiser4_context ctx; -+ int result; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("zam-989", super != NULL); -+ -+ super->s_op = NULL; -+ init_stack_context(&ctx, super); -+ -+ /* allocate reiser4 specific super block */ -+ if ((result = reiser4_init_fs_info(super)) != 0) -+ goto failed_init_sinfo; -+ -+ sbinfo = get_super_private(super); -+ /* initialize various reiser4 parameters, parse mount options */ -+ if ((result = reiser4_init_super_data(super, data)) != 0) -+ goto failed_init_super_data; -+ -+ /* read reiser4 master super block, initialize disk format plugin */ -+ if ((result = reiser4_init_read_super(super, silent)) != 0) -+ goto failed_init_read_super; -+ -+ /* initialize transaction manager */ -+ reiser4_init_txnmgr(&sbinfo->tmgr); -+ -+ /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */ -+ if ((result = reiser4_init_ktxnmgrd(super)) != 0) -+ goto failed_init_ktxnmgrd; -+ -+ /* initialize entd context and start kernel thread entd */ -+ if ((result = reiser4_init_entd(super)) != 0) -+ goto failed_init_entd; -+ -+ /* initialize address spaces for formatted nodes and bitmaps */ -+ if ((result = reiser4_init_formatted_fake(super)) != 0) -+ goto failed_init_formatted_fake; -+ -+ /* initialize disk format plugin */ -+ if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 ) -+ goto failed_init_disk_format; -+ -+ /* -+ * There are some 'committed' versions of reiser4 super block counters, -+ * which correspond to reiser4 on-disk state. 
These counters are -+ * initialized here -+ */ -+ sbinfo->blocks_free_committed = sbinfo->blocks_free; -+ sbinfo->nr_files_committed = oids_used(super); -+ -+ /* get inode of root directory */ -+ if ((result = reiser4_init_root_inode(super)) != 0) -+ goto failed_init_root_inode; -+ -+ if ((result = get_super_private(super)->df_plug->version_update(super)) != 0 ) -+ goto failed_update_format_version; -+ -+ process_safelinks(super); -+ reiser4_exit_context(&ctx); -+ -+ sbinfo->debugfs_root = debugfs_create_dir(super->s_id, -+ reiser4_debugfs_root); -+ if (sbinfo->debugfs_root) { -+ sbinfo->tmgr.debugfs_atom_count = -+ debugfs_create_u32("atom_count", S_IFREG|S_IRUSR, -+ sbinfo->debugfs_root, -+ &sbinfo->tmgr.atom_count); -+ sbinfo->tmgr.debugfs_id_count = -+ debugfs_create_u32("id_count", S_IFREG|S_IRUSR, -+ sbinfo->debugfs_root, -+ &sbinfo->tmgr.id_count); -+ } -+ return 0; -+ -+ failed_update_format_version: -+ failed_init_root_inode: -+ if (sbinfo->df_plug->release) -+ sbinfo->df_plug->release(super); -+ failed_init_disk_format: -+ reiser4_done_formatted_fake(super); -+ failed_init_formatted_fake: -+ reiser4_done_entd(super); -+ failed_init_entd: -+ reiser4_done_ktxnmgrd(super); -+ failed_init_ktxnmgrd: -+ reiser4_done_txnmgr(&sbinfo->tmgr); -+ failed_init_read_super: -+ failed_init_super_data: -+ reiser4_done_fs_info(super); -+ failed_init_sinfo: -+ reiser4_exit_context(&ctx); -+ return result; -+} -+ -+/** -+ * reiser4_get_sb - get_sb of file_system_type operations -+ * @fs_type: -+ * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc -+ * @dev_name: block device file name -+ * @data: specific mount options -+ * -+ * Reiser4 mount entry. 
-+ */ -+static int reiser4_get_sb(struct file_system_type *fs_type, int flags, -+ const char *dev_name, void *data, struct vfsmount *mnt) -+{ -+ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); -+} -+ -+/* structure describing the reiser4 filesystem implementation */ -+static struct file_system_type reiser4_fs_type = { -+ .owner = THIS_MODULE, -+ .name = "reiser4", -+ .fs_flags = FS_REQUIRES_DEV, -+ .get_sb = reiser4_get_sb, -+ .kill_sb = kill_block_super, -+ .next = NULL -+}; -+ -+void destroy_reiser4_cache(struct kmem_cache **cachep) -+{ -+ BUG_ON(*cachep == NULL); -+ kmem_cache_destroy(*cachep); -+ *cachep = NULL; -+} -+ -+/** -+ * init_reiser4 - reiser4 initialization entry point -+ * -+ * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called -+ * on kernel initialization or during reiser4 module load. -+ */ -+static int __init init_reiser4(void) -+{ -+ int result; -+ -+ printk(KERN_INFO -+ "Loading Reiser4. " -+ "See www.namesys.com for a description of Reiser4.\n"); -+ -+ /* initialize slab cache of inodes */ -+ if ((result = init_inodes()) != 0) -+ goto failed_inode_cache; -+ -+ /* initialize cache of znodes */ -+ if ((result = init_znodes()) != 0) -+ goto failed_init_znodes; -+ -+ /* initialize all plugins */ -+ if ((result = init_plugins()) != 0) -+ goto failed_init_plugins; -+ -+ /* initialize cache of plugin_set-s and plugin_set's hash table */ -+ if ((result = init_plugin_set()) != 0) -+ goto failed_init_plugin_set; -+ -+ /* initialize caches of txn_atom-s and txn_handle-s */ -+ if ((result = init_txnmgr_static()) != 0) -+ goto failed_init_txnmgr_static; -+ -+ /* initialize cache of jnodes */ -+ if ((result = init_jnodes()) != 0) -+ goto failed_init_jnodes; -+ -+ /* initialize cache of flush queues */ -+ if ((result = reiser4_init_fqs()) != 0) -+ goto failed_init_fqs; -+ -+ /* initialize cache of structures attached to dentry->d_fsdata */ -+ if ((result = reiser4_init_dentry_fsdata()) != 0) -+ goto 
failed_init_dentry_fsdata; -+ -+ /* initialize cache of structures attached to file->private_data */ -+ if ((result = reiser4_init_file_fsdata()) != 0) -+ goto failed_init_file_fsdata; -+ -+ /* -+ * initialize cache of d_cursors. See plugin/file_ops_readdir.c for -+ * more details -+ */ -+ if ((result = reiser4_init_d_cursor()) != 0) -+ goto failed_init_d_cursor; -+ -+ if ((result = register_filesystem(&reiser4_fs_type)) == 0) { -+ reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL); -+ return 0; -+ } -+ -+ reiser4_done_d_cursor(); -+ failed_init_d_cursor: -+ reiser4_done_file_fsdata(); -+ failed_init_file_fsdata: -+ reiser4_done_dentry_fsdata(); -+ failed_init_dentry_fsdata: -+ reiser4_done_fqs(); -+ failed_init_fqs: -+ done_jnodes(); -+ failed_init_jnodes: -+ done_txnmgr_static(); -+ failed_init_txnmgr_static: -+ done_plugin_set(); -+ failed_init_plugin_set: -+ failed_init_plugins: -+ done_znodes(); -+ failed_init_znodes: -+ done_inodes(); -+ failed_inode_cache: -+ return result; -+} -+ -+/** -+ * done_reiser4 - reiser4 exit entry point -+ * -+ * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown -+ * or at module unload. 
-+ */ -+static void __exit done_reiser4(void) -+{ -+ int result; -+ -+ debugfs_remove(reiser4_debugfs_root); -+ result = unregister_filesystem(&reiser4_fs_type); -+ BUG_ON(result != 0); -+ reiser4_done_d_cursor(); -+ reiser4_done_file_fsdata(); -+ reiser4_done_dentry_fsdata(); -+ reiser4_done_fqs(); -+ done_jnodes(); -+ done_txnmgr_static(); -+ done_plugin_set(); -+ done_znodes(); -+ destroy_reiser4_cache(&inode_cache); -+} -+ -+module_init(init_reiser4); -+module_exit(done_reiser4); -+ -+MODULE_DESCRIPTION("Reiser4 filesystem"); -+MODULE_AUTHOR("Hans Reiser "); -+ -+MODULE_LICENSE("GPL"); -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/tap.c linux-2.6.20/fs/reiser4/tap.c ---- linux-2.6.20.orig/fs/reiser4/tap.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/tap.c 2007-05-06 14:50:43.879031967 +0400 -@@ -0,0 +1,377 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ Tree Access Pointer (tap). -+ -+ tap is data structure combining coord and lock handle (mostly). It is -+ useful when one has to scan tree nodes (for example, in readdir, or flush), -+ for tap functions allow to move tap in either direction transparently -+ crossing unit/item/node borders. -+ -+ Tap doesn't provide automatic synchronization of its fields as it is -+ supposed to be per-thread object. 
-+*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "coord.h" -+#include "tree.h" -+#include "context.h" -+#include "tap.h" -+#include "znode.h" -+#include "tree_walk.h" -+ -+#if REISER4_DEBUG -+static int tap_invariant(const tap_t * tap); -+static void tap_check(const tap_t * tap); -+#else -+#define tap_check(tap) noop -+#endif -+ -+/** load node tap is pointing to, if not loaded already */ -+int reiser4_tap_load(tap_t * tap) -+{ -+ tap_check(tap); -+ if (tap->loaded == 0) { -+ int result; -+ -+ result = zload_ra(tap->coord->node, &tap->ra_info); -+ if (result != 0) -+ return result; -+ coord_clear_iplug(tap->coord); -+ } -+ ++tap->loaded; -+ tap_check(tap); -+ return 0; -+} -+ -+/** release node tap is pointing to. Dual to tap_load() */ -+void reiser4_tap_relse(tap_t * tap) -+{ -+ tap_check(tap); -+ if (tap->loaded > 0) { -+ --tap->loaded; -+ if (tap->loaded == 0) { -+ zrelse(tap->coord->node); -+ } -+ } -+ tap_check(tap); -+} -+ -+/** -+ * init tap to consist of @coord and @lh. Locks on nodes will be acquired with -+ * @mode -+ */ -+void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh, -+ znode_lock_mode mode) -+{ -+ tap->coord = coord; -+ tap->lh = lh; -+ tap->mode = mode; -+ tap->loaded = 0; -+ INIT_LIST_HEAD(&tap->linkage); -+ reiser4_init_ra_info(&tap->ra_info); -+} -+ -+/** add @tap to the per-thread list of all taps */ -+void reiser4_tap_monitor(tap_t * tap) -+{ -+ assert("nikita-2623", tap != NULL); -+ tap_check(tap); -+ list_add(&tap->linkage, reiser4_taps_list()); -+ tap_check(tap); -+} -+ -+/* duplicate @src into @dst. Copy lock handle. @dst is not initially -+ * loaded. 
*/ -+void reiser4_tap_copy(tap_t * dst, tap_t * src) -+{ -+ assert("nikita-3193", src != NULL); -+ assert("nikita-3194", dst != NULL); -+ -+ *dst->coord = *src->coord; -+ if (src->lh->node) -+ copy_lh(dst->lh, src->lh); -+ dst->mode = src->mode; -+ dst->loaded = 0; -+ INIT_LIST_HEAD(&dst->linkage); -+ dst->ra_info = src->ra_info; -+} -+ -+/** finish with @tap */ -+void reiser4_tap_done(tap_t * tap) -+{ -+ assert("nikita-2565", tap != NULL); -+ tap_check(tap); -+ if (tap->loaded > 0) -+ zrelse(tap->coord->node); -+ done_lh(tap->lh); -+ tap->loaded = 0; -+ list_del_init(&tap->linkage); -+ tap->coord->node = NULL; -+} -+ -+/** -+ * move @tap to the new node, locked with @target. Load @target, if @tap was -+ * already loaded. -+ */ -+int reiser4_tap_move(tap_t * tap, lock_handle * target) -+{ -+ int result = 0; -+ -+ assert("nikita-2567", tap != NULL); -+ assert("nikita-2568", target != NULL); -+ assert("nikita-2570", target->node != NULL); -+ assert("nikita-2569", tap->coord->node == tap->lh->node); -+ -+ tap_check(tap); -+ if (tap->loaded > 0) -+ result = zload_ra(target->node, &tap->ra_info); -+ -+ if (result == 0) { -+ if (tap->loaded > 0) -+ zrelse(tap->coord->node); -+ done_lh(tap->lh); -+ copy_lh(tap->lh, target); -+ tap->coord->node = target->node; -+ coord_clear_iplug(tap->coord); -+ } -+ tap_check(tap); -+ return result; -+} -+ -+/** -+ * move @tap to @target. Acquire lock on @target, if @tap was already -+ * loaded. 
-+ */ -+static int tap_to(tap_t * tap, znode * target) -+{ -+ int result; -+ -+ assert("nikita-2624", tap != NULL); -+ assert("nikita-2625", target != NULL); -+ -+ tap_check(tap); -+ result = 0; -+ if (tap->coord->node != target) { -+ lock_handle here; -+ -+ init_lh(&here); -+ result = longterm_lock_znode(&here, target, -+ tap->mode, ZNODE_LOCK_HIPRI); -+ if (result == 0) { -+ result = reiser4_tap_move(tap, &here); -+ done_lh(&here); -+ } -+ } -+ tap_check(tap); -+ return result; -+} -+ -+/** -+ * move @tap to given @target, loading and locking @target->node if -+ * necessary -+ */ -+int tap_to_coord(tap_t * tap, coord_t * target) -+{ -+ int result; -+ -+ tap_check(tap); -+ result = tap_to(tap, target->node); -+ if (result == 0) -+ coord_dup(tap->coord, target); -+ tap_check(tap); -+ return result; -+} -+ -+/** return list of all taps */ -+struct list_head *reiser4_taps_list(void) -+{ -+ return &get_current_context()->taps; -+} -+ -+/** helper function for go_{next,prev}_{item,unit,node}() */ -+int go_dir_el(tap_t * tap, sideof dir, int units_p) -+{ -+ coord_t dup; -+ coord_t *coord; -+ int result; -+ -+ int (*coord_dir) (coord_t *); -+ int (*get_dir_neighbor) (lock_handle *, znode *, int, int); -+ void (*coord_init) (coord_t *, const znode *); -+ ON_DEBUG(int (*coord_check) (const coord_t *)); -+ -+ assert("nikita-2556", tap != NULL); -+ assert("nikita-2557", tap->coord != NULL); -+ assert("nikita-2558", tap->lh != NULL); -+ assert("nikita-2559", tap->coord->node != NULL); -+ -+ tap_check(tap); -+ if (dir == LEFT_SIDE) { -+ coord_dir = units_p ? coord_prev_unit : coord_prev_item; -+ get_dir_neighbor = reiser4_get_left_neighbor; -+ coord_init = coord_init_last_unit; -+ } else { -+ coord_dir = units_p ? coord_next_unit : coord_next_item; -+ get_dir_neighbor = reiser4_get_right_neighbor; -+ coord_init = coord_init_first_unit; -+ } -+ ON_DEBUG(coord_check = -+ units_p ? 
coord_is_existing_unit : coord_is_existing_item); -+ assert("nikita-2560", coord_check(tap->coord)); -+ -+ coord = tap->coord; -+ coord_dup(&dup, coord); -+ if (coord_dir(&dup) != 0) { -+ do { -+ /* move to the left neighboring node */ -+ lock_handle dup; -+ -+ init_lh(&dup); -+ result = -+ get_dir_neighbor(&dup, coord->node, (int)tap->mode, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result == 0) { -+ result = reiser4_tap_move(tap, &dup); -+ if (result == 0) -+ coord_init(tap->coord, dup.node); -+ done_lh(&dup); -+ } -+ /* skip empty nodes */ -+ } while ((result == 0) && node_is_empty(coord->node)); -+ } else { -+ result = 0; -+ coord_dup(coord, &dup); -+ } -+ assert("nikita-2564", ergo(!result, coord_check(tap->coord))); -+ tap_check(tap); -+ return result; -+} -+ -+/** -+ * move @tap to the next unit, transparently crossing item and node -+ * boundaries -+ */ -+int go_next_unit(tap_t * tap) -+{ -+ return go_dir_el(tap, RIGHT_SIDE, 1); -+} -+ -+/** -+ * move @tap to the previous unit, transparently crossing item and node -+ * boundaries -+ */ -+int go_prev_unit(tap_t * tap) -+{ -+ return go_dir_el(tap, LEFT_SIDE, 1); -+} -+ -+/** -+ * @shift times apply @actor to the @tap. This is used to move @tap by -+ * @shift units (or items, or nodes) in either direction. 
-+ */ -+static int rewind_to(tap_t * tap, go_actor_t actor, int shift) -+{ -+ int result; -+ -+ assert("nikita-2555", shift >= 0); -+ assert("nikita-2562", tap->coord->node == tap->lh->node); -+ -+ tap_check(tap); -+ result = reiser4_tap_load(tap); -+ if (result != 0) -+ return result; -+ -+ for (; shift > 0; --shift) { -+ result = actor(tap); -+ assert("nikita-2563", tap->coord->node == tap->lh->node); -+ if (result != 0) -+ break; -+ } -+ reiser4_tap_relse(tap); -+ tap_check(tap); -+ return result; -+} -+ -+/** move @tap @shift units rightward */ -+int rewind_right(tap_t * tap, int shift) -+{ -+ return rewind_to(tap, go_next_unit, shift); -+} -+ -+/** move @tap @shift units leftward */ -+int rewind_left(tap_t * tap, int shift) -+{ -+ return rewind_to(tap, go_prev_unit, shift); -+} -+ -+#if REISER4_DEBUG -+/** debugging function: print @tap content in human readable form */ -+static void print_tap(const char *prefix, const tap_t * tap) -+{ -+ if (tap == NULL) { -+ printk("%s: null tap\n", prefix); -+ return; -+ } -+ printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix, -+ tap->loaded, (&tap->linkage == tap->linkage.next && -+ &tap->linkage == tap->linkage.prev), -+ tap->lh->node, -+ lock_mode_name(tap->mode)); -+ print_coord("\tcoord", tap->coord, 0); -+} -+ -+/** check [tap-sane] invariant */ -+static int tap_invariant(const tap_t * tap) -+{ -+ /* [tap-sane] invariant */ -+ -+ if (tap == NULL) -+ return 1; -+ /* tap->mode is one of -+ * -+ * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and -+ */ -+ if (tap->mode != ZNODE_NO_LOCK && -+ tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK) -+ return 2; -+ /* tap->coord != NULL, and */ -+ if (tap->coord == NULL) -+ return 3; -+ /* tap->lh != NULL, and */ -+ if (tap->lh == NULL) -+ return 4; -+ /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */ -+ if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node))) -+ return 5; -+ /* tap->coord->node == tap->lh->node if tap->lh->node 
is not 0 */ -+ if (tap->lh->node != NULL && tap->coord->node != tap->lh->node) -+ return 6; -+ return 0; -+} -+ -+/** debugging function: check internal @tap consistency */ -+static void tap_check(const tap_t * tap) -+{ -+ int result; -+ -+ result = tap_invariant(tap); -+ if (result != 0) { -+ print_tap("broken", tap); -+ reiser4_panic("nikita-2831", "tap broken: %i\n", result); -+ } -+} -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/tap.h linux-2.6.20/fs/reiser4/tap.h ---- linux-2.6.20.orig/fs/reiser4/tap.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/tap.h 2007-05-06 14:50:43.879031967 +0400 -@@ -0,0 +1,70 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Tree Access Pointers. See tap.c for more details. */ -+ -+#if !defined( __REISER4_TAP_H__ ) -+#define __REISER4_TAP_H__ -+ -+#include "forward.h" -+#include "readahead.h" -+ -+/** -+ tree_access_pointer aka tap. Data structure combining coord_t and lock -+ handle. -+ Invariants involving this data-type, see doc/lock-ordering for details: -+ -+ [tap-sane] -+ */ -+struct tree_access_pointer { -+ /* coord tap is at */ -+ coord_t *coord; -+ /* lock handle on ->coord->node */ -+ lock_handle *lh; -+ /* mode of lock acquired by this tap */ -+ znode_lock_mode mode; -+ /* incremented by reiser4_tap_load(). -+ Decremented by reiser4_tap_relse(). 
*/ -+ int loaded; -+ /* list of taps */ -+ struct list_head linkage; -+ /* read-ahead hint */ -+ ra_info_t ra_info; -+}; -+ -+typedef int (*go_actor_t) (tap_t * tap); -+ -+extern int reiser4_tap_load(tap_t * tap); -+extern void reiser4_tap_relse(tap_t * tap); -+extern void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh, -+ znode_lock_mode mode); -+extern void reiser4_tap_monitor(tap_t * tap); -+extern void reiser4_tap_copy(tap_t * dst, tap_t * src); -+extern void reiser4_tap_done(tap_t * tap); -+extern int reiser4_tap_move(tap_t * tap, lock_handle * target); -+extern int tap_to_coord(tap_t * tap, coord_t * target); -+ -+extern int go_dir_el(tap_t * tap, sideof dir, int units_p); -+extern int go_next_unit(tap_t * tap); -+extern int go_prev_unit(tap_t * tap); -+extern int rewind_right(tap_t * tap, int shift); -+extern int rewind_left(tap_t * tap, int shift); -+ -+extern struct list_head *reiser4_taps_list(void); -+ -+#define for_all_taps(tap) \ -+ for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \ -+ reiser4_taps_list() != &tap->linkage; \ -+ tap = list_entry(tap->linkage.next, tap_t, linkage)) -+ -+/* __REISER4_TAP_H__ */ -+#endif -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/tree.c linux-2.6.20/fs/reiser4/tree.c ---- linux-2.6.20.orig/fs/reiser4/tree.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/tree.c 2007-05-06 14:50:43.883033217 +0400 -@@ -0,0 +1,1876 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ * KEYS IN A TREE. -+ * -+ * The tree consists of nodes located on the disk. Node in the tree is either -+ * formatted or unformatted. Formatted node is one that has structure -+ * understood by the tree balancing and traversal code. 
Formatted nodes are -+ * further classified into leaf and internal nodes. Latter distinctions is -+ * (almost) of only historical importance: general structure of leaves and -+ * internal nodes is the same in Reiser4. Unformatted nodes contain raw data -+ * that are part of bodies of ordinary files and attributes. -+ * -+ * Each node in the tree spawns some interval in the key space. Key ranges for -+ * all nodes in the tree are disjoint. Actually, this only holds in some weak -+ * sense, because of the non-unique keys: intersection of key ranges for -+ * different nodes is either empty, or consists of exactly one key. -+ * -+ * Formatted node consists of a sequence of items. Each item spawns some -+ * interval in key space. Key ranges for all items in a tree are disjoint, -+ * modulo non-unique keys again. Items within nodes are ordered in the key -+ * order of the smallest key in a item. -+ * -+ * Particular type of item can be further split into units. Unit is piece of -+ * item that can be cut from item and moved into another item of the same -+ * time. Units are used by balancing code to repack data during balancing. -+ * -+ * Unit can be further split into smaller entities (for example, extent unit -+ * represents several pages, and it is natural for extent code to operate on -+ * particular pages and even bytes within one unit), but this is of no -+ * relevance to the generic balancing and lookup code. -+ * -+ * Although item is said to "spawn" range or interval of keys, it is not -+ * necessary that item contains piece of data addressable by each and every -+ * key in this range. For example, compound directory item, consisting of -+ * units corresponding to directory entries and keyed by hashes of file names, -+ * looks more as having "discrete spectrum": only some disjoint keys inside -+ * range occupied by this item really address data. 
-+ * -+ * No than less, each item always has well-defined least (minimal) key, that -+ * is recorded in item header, stored in the node this item is in. Also, item -+ * plugin can optionally define method ->max_key_inside() returning maximal -+ * key that can _possibly_ be located within this item. This method is used -+ * (mainly) to determine when given piece of data should be merged into -+ * existing item, in stead of creating new one. Because of this, even though -+ * ->max_key_inside() can be larger that any key actually located in the item, -+ * intervals -+ * -+ * [ reiser4_min_key( item ), ->max_key_inside( item ) ] -+ * -+ * are still disjoint for all items within the _same_ node. -+ * -+ * In memory node is represented by znode. It plays several roles: -+ * -+ * . something locks are taken on -+ * -+ * . something tracked by transaction manager (this is going to change) -+ * -+ * . something used to access node data -+ * -+ * . something used to maintain tree structure in memory: sibling and -+ * parental linkage. -+ * -+ * . something used to organize nodes into "slums" -+ * -+ * More on znodes see in znode.[ch] -+ * -+ * DELIMITING KEYS -+ * -+ * To simplify balancing, allow some flexibility in locking and speed up -+ * important coord cache optimization, we keep delimiting keys of nodes in -+ * memory. Depending on disk format (implemented by appropriate node plugin) -+ * node on disk can record both left and right delimiting key, only one of -+ * them, or none. Still, our balancing and tree traversal code keep both -+ * delimiting keys for a node that is in memory stored in the znode. When -+ * node is first brought into memory during tree traversal, its left -+ * delimiting key is taken from its parent, and its right delimiting key is -+ * either next key in its parent, or is right delimiting key of parent if -+ * node is the rightmost child of parent. -+ * -+ * Physical consistency of delimiting key is protected by special dk -+ * read-write lock. 
That is, delimiting keys can only be inspected or -+ * modified under this lock. But dk lock is only sufficient for fast -+ * "pessimistic" check, because to simplify code and to decrease lock -+ * contention, balancing (carry) only updates delimiting keys right before -+ * unlocking all locked nodes on the given tree level. For example, -+ * coord-by-key cache scans LRU list of recently accessed znodes. For each -+ * node it first does fast check under dk spin lock. If key looked for is -+ * not between delimiting keys for this node, next node is inspected and so -+ * on. If key is inside of the key range, long term lock is taken on node -+ * and key range is rechecked. -+ * -+ * COORDINATES -+ * -+ * To find something in the tree, you supply a key, and the key is resolved -+ * by coord_by_key() into a coord (coordinate) that is valid as long as the -+ * node the coord points to remains locked. As mentioned above trees -+ * consist of nodes that consist of items that consist of units. A unit is -+ * the smallest and indivisible piece of tree as far as balancing and tree -+ * search are concerned. Each node, item, and unit can be addressed by -+ * giving its level in the tree and the key occupied by this entity. A node -+ * knows what the key ranges are of the items within it, and how to find its -+ * items and invoke their item handlers, but it does not know how to access -+ * individual units within its items except through the item handlers. -+ * coord is a structure containing a pointer to the node, the ordinal number -+ * of the item within this node (a sort of item offset), and the ordinal -+ * number of the unit within this item. -+ * -+ * TREE LOOKUP -+ * -+ * There are two types of access to the tree: lookup and modification. -+ * -+ * Lookup is a search for the key in the tree. Search can look for either -+ * exactly the key given to it, or for the largest key that is not greater -+ * than the key given to it. 
This distinction is determined by "bias" -+ * parameter of search routine (coord_by_key()). coord_by_key() either -+ * returns error (key is not in the tree, or some kind of external error -+ * occurred), or successfully resolves key into coord. -+ * -+ * This resolution is done by traversing tree top-to-bottom from root level -+ * to the desired level. On levels above twig level (level one above the -+ * leaf level) nodes consist exclusively of internal items. Internal item is -+ * nothing more than pointer to the tree node on the child level. On twig -+ * level nodes consist of internal items intermixed with extent -+ * items. Internal items form normal search tree structure used by traversal -+ * to descent through the tree. -+ * -+ * TREE LOOKUP OPTIMIZATIONS -+ * -+ * Tree lookup described above is expensive even if all nodes traversed are -+ * already in the memory: for each node binary search within it has to be -+ * performed and binary searches are CPU consuming and tend to destroy CPU -+ * caches. -+ * -+ * Several optimizations are used to work around this: -+ * -+ * . cbk_cache (look-aside cache for tree traversals, see search.c for -+ * details) -+ * -+ * . seals (see seal.[ch]) -+ * -+ * . 
vroot (see search.c) -+ * -+ * General search-by-key is layered thusly: -+ * -+ * [check seal, if any] --ok--> done -+ * | -+ * failed -+ * | -+ * V -+ * [vroot defined] --no--> node = tree_root -+ * | | -+ * yes | -+ * | | -+ * V | -+ * node = vroot | -+ * | | -+ * | | -+ * | | -+ * V V -+ * [check cbk_cache for key] --ok--> done -+ * | -+ * failed -+ * | -+ * V -+ * [start tree traversal from node] -+ * -+ */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/static_stat.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "plugin/plugin.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "carry.h" -+#include "carry_ops.h" -+#include "tap.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "page_cache.h" -+#include "super.h" -+#include "reiser4.h" -+#include "inode.h" -+ -+#include /* for struct super_block */ -+#include -+ -+/* Disk address (block number) never ever used for any real tree node. This is -+ used as block number of "uber" znode. -+ -+ Invalid block addresses are 0 by tradition. -+ -+*/ -+const reiser4_block_nr UBER_TREE_ADDR = 0ull; -+ -+#define CUT_TREE_MIN_ITERATIONS 64 -+ -+static int find_child_by_addr(znode * parent, znode * child, coord_t * result); -+ -+/* return node plugin of coord->node */ -+node_plugin *node_plugin_by_coord(const coord_t * coord) -+{ -+ assert("vs-1", coord != NULL); -+ assert("vs-2", coord->node != NULL); -+ -+ return coord->node->nplug; -+} -+ -+/* insert item into tree. Fields of @coord are updated so that they can be -+ * used by consequent insert operation. 
*/ -+insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item -+ * into */ , -+ const reiser4_key * key /* key of new item */ , -+ reiser4_item_data * data /* parameters for item -+ * creation */ , -+ coord_t * coord /* resulting insertion coord */ , -+ lock_handle * lh /* resulting lock -+ * handle */ , -+ tree_level stop_level /** level where to insert */ , -+ __u32 flags /* insertion flags */ ) -+{ -+ int result; -+ -+ assert("nikita-358", tree != NULL); -+ assert("nikita-360", coord != NULL); -+ -+ result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK, -+ FIND_EXACT, stop_level, stop_level, -+ flags | CBK_FOR_INSERT, NULL /*ra_info */ ); -+ switch (result) { -+ default: -+ break; -+ case CBK_COORD_FOUND: -+ result = IBK_ALREADY_EXISTS; -+ break; -+ case CBK_COORD_NOTFOUND: -+ assert("nikita-2017", coord->node != NULL); -+ result = insert_by_coord(coord, data, key, lh, 0 /*flags */ ); -+ break; -+ } -+ return result; -+} -+ -+/* insert item by calling carry. Helper function called if short-cut -+ insertion failed */ -+static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */ -+ lock_handle * lh, /* lock handle of insertion -+ * node */ -+ reiser4_item_data * data, /* parameters of new -+ * item */ -+ const reiser4_key * key, /* key of new item */ -+ carry_opcode cop, /* carry operation to perform */ -+ cop_insert_flag flags -+ /* carry flags */ ) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ carry_insert_data *cdata; -+ carry_op *op; -+ -+ assert("umka-314", coord != NULL); -+ -+ /* allocate carry_pool and 3 carry_level-s */ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*cdata)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ op = reiser4_post_carry(lowest_level, cop, coord->node, 0); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_carry_pool(pool); -+ 
return RETERR(op ? PTR_ERR(op) : -EIO); -+ } -+ cdata = (carry_insert_data *) (lowest_level + 3); -+ cdata->coord = coord; -+ cdata->data = data; -+ cdata->key = key; -+ op->u.insert.d = cdata; -+ if (flags == 0) -+ flags = znode_get_tree(coord->node)->carry.insert_flags; -+ op->u.insert.flags = flags; -+ op->u.insert.type = COPT_ITEM_DATA; -+ op->u.insert.child = NULL; -+ if (lh != NULL) { -+ assert("nikita-3245", lh->node == coord->node); -+ lowest_level->track_type = CARRY_TRACK_CHANGE; -+ lowest_level->tracked = lh; -+ } -+ -+ result = reiser4_carry(lowest_level, NULL); -+ done_carry_pool(pool); -+ -+ return result; -+} -+ -+/* form carry queue to perform paste of @data with @key at @coord, and launch -+ its execution by calling carry(). -+ -+ Instruct carry to update @lh it after balancing insertion coord moves into -+ different block. -+ -+*/ -+static int paste_with_carry(coord_t * coord, /* coord of paste */ -+ lock_handle * lh, /* lock handle of node -+ * where item is -+ * pasted */ -+ reiser4_item_data * data, /* parameters of new -+ * item */ -+ const reiser4_key * key, /* key of new item */ -+ unsigned flags /* paste flags */ ) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ carry_insert_data *cdata; -+ carry_op *op; -+ -+ assert("umka-315", coord != NULL); -+ assert("umka-316", key != NULL); -+ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*cdata)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_carry_pool(pool); -+ return RETERR(op ? 
PTR_ERR(op) : -EIO); -+ } -+ cdata = (carry_insert_data *) (lowest_level + 3); -+ cdata->coord = coord; -+ cdata->data = data; -+ cdata->key = key; -+ op->u.paste.d = cdata; -+ if (flags == 0) -+ flags = znode_get_tree(coord->node)->carry.paste_flags; -+ op->u.paste.flags = flags; -+ op->u.paste.type = COPT_ITEM_DATA; -+ if (lh != NULL) { -+ lowest_level->track_type = CARRY_TRACK_CHANGE; -+ lowest_level->tracked = lh; -+ } -+ -+ result = reiser4_carry(lowest_level, NULL); -+ done_carry_pool(pool); -+ -+ return result; -+} -+ -+/* insert item at the given coord. -+ -+ First try to skip carry by directly calling ->create_item() method of node -+ plugin. If this is impossible (there is not enough free space in the node, -+ or leftmost item in the node is created), call insert_with_carry_by_coord() -+ that will do full carry(). -+ -+*/ -+insert_result insert_by_coord(coord_t * coord /* coord where to -+ * insert. coord->node has -+ * to be write locked by -+ * caller */ , -+ reiser4_item_data * data /* data to be -+ * inserted */ , -+ const reiser4_key * key /* key of new item */ , -+ lock_handle * lh /* lock handle of write -+ * lock on node */ , -+ __u32 flags /* insertion flags */ ) -+{ -+ unsigned item_size; -+ int result; -+ znode *node; -+ -+ assert("vs-247", coord != NULL); -+ assert("vs-248", data != NULL); -+ assert("vs-249", data->length >= 0); -+ assert("nikita-1191", znode_is_write_locked(coord->node)); -+ -+ node = coord->node; -+ coord_clear_iplug(coord); -+ result = zload(node); -+ if (result != 0) -+ return result; -+ -+ item_size = space_needed(node, NULL, data, 1); -+ if (item_size > znode_free_space(node) && -+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT) -+ && (flags & COPI_DONT_ALLOCATE)) { -+ /* we are forced to use free space of coord->node and new item -+ does not fit into it. -+ -+ Currently we get here only when we allocate and copy units -+ of extent item from a node to its left neighbor during -+ "squalloc"-ing. 
If @node (this is left neighbor) does not -+ have enough free space - we do not want to attempt any -+ shifting and allocations because we are in squeezing and -+ everything to the left of @node is tightly packed. -+ */ -+ result = -E_NODE_FULL; -+ } else if ((item_size <= znode_free_space(node)) && -+ !coord_is_before_leftmost(coord) && -+ (node_plugin_by_node(node)->fast_insert != NULL) -+ && node_plugin_by_node(node)->fast_insert(coord)) { -+ /* shortcut insertion without carry() overhead. -+ -+ Only possible if: -+ -+ - there is enough free space -+ -+ - insertion is not into the leftmost position in a node -+ (otherwise it would require updating of delimiting key in a -+ parent) -+ -+ - node plugin agrees with this -+ -+ */ -+ result = -+ node_plugin_by_node(node)->create_item(coord, key, data, -+ NULL); -+ znode_make_dirty(node); -+ } else { -+ /* otherwise do full-fledged carry(). */ -+ result = -+ insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT, -+ flags); -+ } -+ zrelse(node); -+ return result; -+} -+ -+/* @coord is set to leaf level and @data is to be inserted to twig level */ -+insert_result -+insert_extent_by_coord(coord_t * -+ coord -+ /* coord where to insert. coord->node * has to be write * locked by caller */ -+ , -+ reiser4_item_data * data /* data to be inserted */ , -+ const reiser4_key * key /* key of new item */ , -+ lock_handle * -+ lh /* lock handle of write lock on * node */ ) -+{ -+ assert("vs-405", coord != NULL); -+ assert("vs-406", data != NULL); -+ assert("vs-407", data->length > 0); -+ assert("vs-408", znode_is_write_locked(coord->node)); -+ assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL); -+ -+ return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT, -+ 0 /*flags */ ); -+} -+ -+/* Insert into the item at the given coord. -+ -+ First try to skip carry by directly calling ->paste() method of item -+ plugin. 
If this is impossible (there is not enough free space in the node, -+ or we are pasting into leftmost position in the node), call -+ paste_with_carry() that will do full carry(). -+ -+*/ -+/* paste_into_item */ -+int insert_into_item(coord_t * coord /* coord of pasting */ , -+ lock_handle * lh /* lock handle on node involved */ , -+ const reiser4_key * key /* key of unit being pasted */ , -+ reiser4_item_data * data /* parameters for new unit */ , -+ unsigned flags /* insert/paste flags */ ) -+{ -+ int result; -+ int size_change; -+ node_plugin *nplug; -+ item_plugin *iplug; -+ -+ assert("umka-317", coord != NULL); -+ assert("umka-318", key != NULL); -+ -+ iplug = item_plugin_by_coord(coord); -+ nplug = node_plugin_by_coord(coord); -+ -+ assert("nikita-1480", iplug == data->iplug); -+ -+ size_change = space_needed(coord->node, coord, data, 0); -+ if (size_change > (int)znode_free_space(coord->node) && -+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT) -+ && (flags & COPI_DONT_ALLOCATE)) { -+ /* we are forced to use free space of coord->node and new data -+ does not fit into it. */ -+ return -E_NODE_FULL; -+ } -+ -+ /* shortcut paste without carry() overhead. -+ -+ Only possible if: -+ -+ - there is enough free space -+ -+ - paste is not into the leftmost unit in a node (otherwise -+ it would require updating of delimiting key in a parent) -+ -+ - node plugin agrees with this -+ -+ - item plugin agrees with us -+ */ -+ if (size_change <= (int)znode_free_space(coord->node) && -+ (coord->item_pos != 0 || -+ coord->unit_pos != 0 || coord->between == AFTER_UNIT) && -+ coord->unit_pos != 0 && nplug->fast_paste != NULL && -+ nplug->fast_paste(coord) && -+ iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) { -+ if (size_change > 0) -+ nplug->change_item_size(coord, size_change); -+ /* NOTE-NIKITA: huh? where @key is used? 
*/ -+ result = iplug->b.paste(coord, data, NULL); -+ if (size_change < 0) -+ nplug->change_item_size(coord, size_change); -+ znode_make_dirty(coord->node); -+ } else -+ /* otherwise do full-fledged carry(). */ -+ result = paste_with_carry(coord, lh, data, key, flags); -+ return result; -+} -+ -+/* this either appends or truncates item @coord */ -+int reiser4_resize_item(coord_t * coord /* coord of item being resized */ , -+ reiser4_item_data * data /* parameters of resize */ , -+ reiser4_key * key /* key of new unit */ , -+ lock_handle * lh /* lock handle of node -+ * being modified */ , -+ cop_insert_flag flags /* carry flags */ ) -+{ -+ int result; -+ znode *node; -+ -+ assert("nikita-362", coord != NULL); -+ assert("nikita-363", data != NULL); -+ assert("vs-245", data->length != 0); -+ -+ node = coord->node; -+ coord_clear_iplug(coord); -+ result = zload(node); -+ if (result != 0) -+ return result; -+ -+ if (data->length < 0) -+ result = node_plugin_by_coord(coord)->shrink_item(coord, -+ -data->length); -+ else -+ result = insert_into_item(coord, lh, key, data, flags); -+ -+ zrelse(node); -+ return result; -+} -+ -+/* insert flow @f */ -+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ reiser4_item_data *data; -+ carry_op *op; -+ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*data)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node, -+ 0 /* operate directly on coord -> node */ ); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_carry_pool(pool); -+ return RETERR(op ? 
PTR_ERR(op) : -EIO); -+ } -+ -+ /* these are permanent during insert_flow */ -+ data = (reiser4_item_data *) (lowest_level + 3); -+ data->user = 1; -+ data->iplug = item_plugin_by_id(FORMATTING_ID); -+ data->arg = NULL; -+ /* data.length and data.data will be set before calling paste or -+ insert */ -+ data->length = 0; -+ data->data = NULL; -+ -+ op->u.insert_flow.flags = 0; -+ op->u.insert_flow.insert_point = coord; -+ op->u.insert_flow.flow = f; -+ op->u.insert_flow.data = data; -+ op->u.insert_flow.new_nodes = 0; -+ -+ lowest_level->track_type = CARRY_TRACK_CHANGE; -+ lowest_level->tracked = lh; -+ -+ result = reiser4_carry(lowest_level, NULL); -+ done_carry_pool(pool); -+ -+ return result; -+} -+ -+/* Given a coord in parent node, obtain a znode for the corresponding child */ -+znode *child_znode(const coord_t * parent_coord /* coord of pointer to -+ * child */ , -+ znode * parent /* parent of child */ , -+ int incore_p /* if !0 only return child if already in -+ * memory */ , -+ int setup_dkeys_p /* if !0 update delimiting keys of -+ * child */ ) -+{ -+ znode *child; -+ -+ assert("nikita-1374", parent_coord != NULL); -+ assert("nikita-1482", parent != NULL); -+#if REISER4_DEBUG -+ if (setup_dkeys_p) -+ assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock)); -+#endif -+ assert("nikita-2947", znode_is_any_locked(parent)); -+ -+ if (znode_get_level(parent) <= LEAF_LEVEL) { -+ /* trying to get child of leaf node */ -+ warning("nikita-1217", "Child of maize?"); -+ return ERR_PTR(RETERR(-EIO)); -+ } -+ if (item_is_internal(parent_coord)) { -+ reiser4_block_nr addr; -+ item_plugin *iplug; -+ reiser4_tree *tree; -+ -+ iplug = item_plugin_by_coord(parent_coord); -+ assert("vs-512", iplug->s.internal.down_link); -+ iplug->s.internal.down_link(parent_coord, NULL, &addr); -+ -+ tree = znode_get_tree(parent); -+ if (incore_p) -+ child = zlook(tree, &addr); -+ else -+ child = -+ zget(tree, &addr, parent, -+ znode_get_level(parent) - 1, -+ reiser4_ctx_gfp_mask_get()); -+ 
if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p) -+ set_child_delimiting_keys(parent, parent_coord, child); -+ } else { -+ warning("nikita-1483", "Internal item expected"); -+ child = ERR_PTR(RETERR(-EIO)); -+ } -+ return child; -+} -+ -+/* remove znode from transaction */ -+static void uncapture_znode(znode * node) -+{ -+ struct page *page; -+ -+ assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ -+ if (!reiser4_blocknr_is_fake(znode_get_block(node))) { -+ int ret; -+ -+ /* An already allocated block goes right to the atom's delete set. */ -+ ret = -+ reiser4_dealloc_block(znode_get_block(node), 0, -+ BA_DEFER | BA_FORMATTED); -+ if (ret) -+ warning("zam-942", -+ "can\'t add a block (%llu) number to atom's delete set\n", -+ (unsigned long long)(*znode_get_block(node))); -+ -+ spin_lock_znode(node); -+ /* Here we return flush reserved block which was reserved at the -+ * moment when this allocated node was marked dirty and still -+ * not used by flush in node relocation procedure. */ -+ if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) { -+ txn_atom *atom; -+ -+ atom = jnode_get_atom(ZJNODE(node)); -+ assert("zam-939", atom != NULL); -+ spin_unlock_znode(node); -+ flush_reserved2grabbed(atom, (__u64) 1); -+ spin_unlock_atom(atom); -+ } else -+ spin_unlock_znode(node); -+ } else { -+ /* znode has assigned block which is counted as "fake -+ allocated". Return it back to "free blocks") */ -+ fake_allocated2free((__u64) 1, BA_FORMATTED); -+ } -+ -+ /* -+ * uncapture page from transaction. There is a possibility of a race -+ * with ->releasepage(): reiser4_releasepage() detaches page from this -+ * jnode and we have nothing to uncapture. To avoid this, get -+ * reference of node->pg under jnode spin lock. reiser4_uncapture_page() -+ * will deal with released page itself. 
-+ */ -+ spin_lock_znode(node); -+ page = znode_page(node); -+ if (likely(page != NULL)) { -+ /* -+ * reiser4_uncapture_page() can only be called when we are sure -+ * that znode is pinned in memory, which we are, because -+ * forget_znode() is only called from longterm_unlock_znode(). -+ */ -+ page_cache_get(page); -+ spin_unlock_znode(node); -+ lock_page(page); -+ reiser4_uncapture_page(page); -+ unlock_page(page); -+ page_cache_release(page); -+ } else { -+ txn_atom *atom; -+ -+ /* handle "flush queued" znodes */ -+ while (1) { -+ atom = jnode_get_atom(ZJNODE(node)); -+ assert("zam-943", atom != NULL); -+ -+ if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED) -+ || !atom->nr_running_queues) -+ break; -+ -+ spin_unlock_znode(node); -+ reiser4_atom_wait_event(atom); -+ spin_lock_znode(node); -+ } -+ -+ reiser4_uncapture_block(ZJNODE(node)); -+ spin_unlock_atom(atom); -+ zput(node); -+ } -+} -+ -+/* This is called from longterm_unlock_znode() when last lock is released from -+ the node that has been removed from the tree. At this point node is removed -+ from sibling list and its lock is invalidated. */ -+void forget_znode(lock_handle * handle) -+{ -+ znode *node; -+ reiser4_tree *tree; -+ -+ assert("umka-319", handle != NULL); -+ -+ node = handle->node; -+ tree = znode_get_tree(node); -+ -+ assert("vs-164", znode_is_write_locked(node)); -+ assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ assert_rw_locked(&(node->lock.guard)); -+ -+ /* We assume that this node was detached from its parent before -+ * unlocking, it gives no way to reach this node from parent through a -+ * down link. The node should have no children and, thereby, can't be -+ * reached from them by their parent pointers. The only way to obtain a -+ * reference to the node is to use sibling pointers from its left and -+ * right neighbors. In the next several lines we remove the node from -+ * the sibling list. 
*/ -+ -+ write_lock_tree(tree); -+ sibling_list_remove(node); -+ znode_remove(node, tree); -+ write_unlock_tree(tree); -+ -+ /* Here we set JNODE_DYING and cancel all pending lock requests. It -+ * forces all lock requestor threads to repeat iterations of getting -+ * lock on a child, neighbor or parent node. But, those threads can't -+ * come to this node again, because this node is no longer a child, -+ * neighbor or parent of any other node. This order of znode -+ * invalidation does not allow other threads to waste cpu time is a busy -+ * loop, trying to lock dying object. The exception is in the flush -+ * code when we take node directly from atom's capture list.*/ -+ reiser4_invalidate_lock(handle); -+ uncapture_znode(node); -+} -+ -+/* Check that internal item at @pointer really contains pointer to @child. */ -+int check_tree_pointer(const coord_t * pointer /* would-be pointer to -+ * @child */ , -+ const znode * child /* child znode */ ) -+{ -+ assert("nikita-1016", pointer != NULL); -+ assert("nikita-1017", child != NULL); -+ assert("nikita-1018", pointer->node != NULL); -+ -+ assert("nikita-1325", znode_is_any_locked(pointer->node)); -+ -+ assert("nikita-2985", -+ znode_get_level(pointer->node) == znode_get_level(child) + 1); -+ -+ coord_clear_iplug((coord_t *) pointer); -+ -+ if (coord_is_existing_unit(pointer)) { -+ item_plugin *iplug; -+ reiser4_block_nr addr; -+ -+ if (item_is_internal(pointer)) { -+ iplug = item_plugin_by_coord(pointer); -+ assert("vs-513", iplug->s.internal.down_link); -+ iplug->s.internal.down_link(pointer, NULL, &addr); -+ /* check that cached value is correct */ -+ if (disk_addr_eq(&addr, znode_get_block(child))) { -+ return NS_FOUND; -+ } -+ } -+ } -+ /* warning ("jmacd-1002", "tree pointer incorrect"); */ -+ return NS_NOT_FOUND; -+} -+ -+/* find coord of pointer to new @child in @parent. -+ -+ Find the &coord_t in the @parent where pointer to a given @child will -+ be in. 
-+ -+*/ -+int find_new_child_ptr(znode * parent /* parent znode, passed locked */ , -+ znode * -+ child UNUSED_ARG /* child znode, passed locked */ , -+ znode * left /* left brother of new node */ , -+ coord_t * result /* where result is stored in */ ) -+{ -+ int ret; -+ -+ assert("nikita-1486", parent != NULL); -+ assert("nikita-1487", child != NULL); -+ assert("nikita-1488", result != NULL); -+ -+ ret = find_child_ptr(parent, left, result); -+ if (ret != NS_FOUND) { -+ warning("nikita-1489", "Cannot find brother position: %i", ret); -+ return RETERR(-EIO); -+ } else { -+ result->between = AFTER_UNIT; -+ return RETERR(NS_NOT_FOUND); -+ } -+} -+ -+/* find coord of pointer to @child in @parent. -+ -+ Find the &coord_t in the @parent where pointer to a given @child is in. -+ -+*/ -+int find_child_ptr(znode * parent /* parent znode, passed locked */ , -+ znode * child /* child znode, passed locked */ , -+ coord_t * result /* where result is stored in */ ) -+{ -+ int lookup_res; -+ node_plugin *nplug; -+ /* left delimiting key of a child */ -+ reiser4_key ld; -+ reiser4_tree *tree; -+ -+ assert("nikita-934", parent != NULL); -+ assert("nikita-935", child != NULL); -+ assert("nikita-936", result != NULL); -+ assert("zam-356", znode_is_loaded(parent)); -+ -+ coord_init_zero(result); -+ result->node = parent; -+ -+ nplug = parent->nplug; -+ assert("nikita-939", nplug != NULL); -+ -+ tree = znode_get_tree(parent); -+ /* NOTE-NIKITA taking read-lock on tree here assumes that @result is -+ * not aliased to ->in_parent of some znode. Otherwise, -+ * parent_coord_to_coord() below would modify data protected by tree -+ * lock. */ -+ read_lock_tree(tree); -+ /* fast path. Try to use cached value. Lock tree to keep -+ node->pos_in_parent and pos->*_blocknr consistent. 
*/ -+ if (child->in_parent.item_pos + 1 != 0) { -+ parent_coord_to_coord(&child->in_parent, result); -+ if (check_tree_pointer(result, child) == NS_FOUND) { -+ read_unlock_tree(tree); -+ return NS_FOUND; -+ } -+ -+ child->in_parent.item_pos = (unsigned short)~0; -+ } -+ read_unlock_tree(tree); -+ -+ /* is above failed, find some key from @child. We are looking for the -+ least key in a child. */ -+ read_lock_dk(tree); -+ ld = *znode_get_ld_key(child); -+ read_unlock_dk(tree); -+ /* -+ * now, lookup parent with key just found. Note, that left delimiting -+ * key doesn't identify node uniquely, because (in extremely rare -+ * case) two nodes can have equal left delimiting keys, if one of them -+ * is completely filled with directory entries that all happened to be -+ * hash collision. But, we check block number in check_tree_pointer() -+ * and, so, are safe. -+ */ -+ lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result); -+ /* update cached pos_in_node */ -+ if (lookup_res == NS_FOUND) { -+ write_lock_tree(tree); -+ coord_to_parent_coord(result, &child->in_parent); -+ write_unlock_tree(tree); -+ lookup_res = check_tree_pointer(result, child); -+ } -+ if (lookup_res == NS_NOT_FOUND) -+ lookup_res = find_child_by_addr(parent, child, result); -+ return lookup_res; -+} -+ -+/* find coord of pointer to @child in @parent by scanning -+ -+ Find the &coord_t in the @parent where pointer to a given @child -+ is in by scanning all internal items in @parent and comparing block -+ numbers in them with that of @child. 
-+ -+*/ -+static int find_child_by_addr(znode * parent /* parent znode, passed locked */ , -+ znode * child /* child znode, passed locked */ , -+ coord_t * result /* where result is stored in */ ) -+{ -+ int ret; -+ -+ assert("nikita-1320", parent != NULL); -+ assert("nikita-1321", child != NULL); -+ assert("nikita-1322", result != NULL); -+ -+ ret = NS_NOT_FOUND; -+ -+ for_all_units(result, parent) { -+ if (check_tree_pointer(result, child) == NS_FOUND) { -+ write_lock_tree(znode_get_tree(parent)); -+ coord_to_parent_coord(result, &child->in_parent); -+ write_unlock_tree(znode_get_tree(parent)); -+ ret = NS_FOUND; -+ break; -+ } -+ } -+ return ret; -+} -+ -+/* true, if @addr is "unallocated block number", which is just address, with -+ highest bit set. */ -+int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to -+ * check */ ) -+{ -+ assert("nikita-1766", addr != NULL); -+ cassert(sizeof(reiser4_block_nr) == 8); -+ return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) == -+ REISER4_UNALLOCATED_STATUS_VALUE; -+} -+ -+/* returns true if removing bytes of given range of key [from_key, to_key] -+ causes removing of whole item @from */ -+static int -+item_removed_completely(coord_t * from, const reiser4_key * from_key, -+ const reiser4_key * to_key) -+{ -+ item_plugin *iplug; -+ reiser4_key key_in_item; -+ -+ assert("umka-325", from != NULL); -+ assert("", item_is_extent(from)); -+ -+ /* check first key just for case */ -+ item_key_by_coord(from, &key_in_item); -+ if (keygt(from_key, &key_in_item)) -+ return 0; -+ -+ /* check last key */ -+ iplug = item_plugin_by_coord(from); -+ assert("vs-611", iplug && iplug->s.file.append_key); -+ -+ iplug->s.file.append_key(from, &key_in_item); -+ set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1); -+ -+ if (keylt(to_key, &key_in_item)) -+ /* last byte is not removed */ -+ return 0; -+ return 1; -+} -+ -+/* helper function for prepare_twig_kill(): @left and @right are formatted -+ * neighbors of extent item 
being completely removed. Load and lock neighbors -+ * and store lock handles into @cdata for later use by kill_hook_extent() */ -+static int -+prepare_children(znode * left, znode * right, carry_kill_data * kdata) -+{ -+ int result; -+ int left_loaded; -+ int right_loaded; -+ -+ result = 0; -+ left_loaded = right_loaded = 0; -+ -+ if (left != NULL) { -+ result = zload(left); -+ if (result == 0) { -+ left_loaded = 1; -+ result = longterm_lock_znode(kdata->left, left, -+ ZNODE_READ_LOCK, -+ ZNODE_LOCK_LOPRI); -+ } -+ } -+ if (result == 0 && right != NULL) { -+ result = zload(right); -+ if (result == 0) { -+ right_loaded = 1; -+ result = longterm_lock_znode(kdata->right, right, -+ ZNODE_READ_LOCK, -+ ZNODE_LOCK_HIPRI | -+ ZNODE_LOCK_NONBLOCK); -+ } -+ } -+ if (result != 0) { -+ done_lh(kdata->left); -+ done_lh(kdata->right); -+ if (left_loaded != 0) -+ zrelse(left); -+ if (right_loaded != 0) -+ zrelse(right); -+ } -+ return result; -+} -+ -+static void done_children(carry_kill_data * kdata) -+{ -+ if (kdata->left != NULL && kdata->left->node != NULL) { -+ zrelse(kdata->left->node); -+ done_lh(kdata->left); -+ } -+ if (kdata->right != NULL && kdata->right->node != NULL) { -+ zrelse(kdata->right->node); -+ done_lh(kdata->right); -+ } -+} -+ -+/* part of cut_node. It is called when cut_node is called to remove or cut part -+ of extent item. When head of that item is removed - we have to update right -+ delimiting of left neighbor of extent. When item is removed completely - we -+ have to set sibling link between left and right neighbor of removed -+ extent. This may return -E_DEADLOCK because of trying to get left neighbor -+ locked. 
So, caller should repeat an attempt -+*/ -+/* Audited by: umka (2002.06.16) */ -+static int -+prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor) -+{ -+ int result; -+ reiser4_key key; -+ lock_handle left_lh; -+ lock_handle right_lh; -+ coord_t left_coord; -+ coord_t *from; -+ znode *left_child; -+ znode *right_child; -+ reiser4_tree *tree; -+ int left_zloaded_here, right_zloaded_here; -+ -+ from = kdata->params.from; -+ assert("umka-326", from != NULL); -+ assert("umka-327", kdata->params.to != NULL); -+ -+ /* for one extent item only yet */ -+ assert("vs-591", item_is_extent(from)); -+ assert("vs-592", from->item_pos == kdata->params.to->item_pos); -+ -+ if ((kdata->params.from_key -+ && keygt(kdata->params.from_key, item_key_by_coord(from, &key))) -+ || from->unit_pos != 0) { -+ /* head of item @from is not removed, there is nothing to -+ worry about */ -+ return 0; -+ } -+ -+ result = 0; -+ left_zloaded_here = 0; -+ right_zloaded_here = 0; -+ -+ left_child = right_child = NULL; -+ -+ coord_dup(&left_coord, from); -+ init_lh(&left_lh); -+ init_lh(&right_lh); -+ if (coord_prev_unit(&left_coord)) { -+ /* @from is leftmost item in its node */ -+ if (!locked_left_neighbor) { -+ result = -+ reiser4_get_left_neighbor(&left_lh, from->node, -+ ZNODE_READ_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ switch (result) { -+ case 0: -+ break; -+ case -E_NO_NEIGHBOR: -+ /* there is no formatted node to the left of -+ from->node */ -+ warning("vs-605", -+ "extent item has smallest key in " -+ "the tree and it is about to be removed"); -+ return 0; -+ case -E_DEADLOCK: -+ /* need to restart */ -+ default: -+ return result; -+ } -+ -+ /* we have acquired left neighbor of from->node */ -+ result = zload(left_lh.node); -+ if (result) -+ goto done; -+ -+ locked_left_neighbor = left_lh.node; -+ } else { -+ /* squalloc_right_twig_cut should have supplied locked -+ * left neighbor */ -+ assert("vs-834", -+ znode_is_write_locked(locked_left_neighbor)); -+ result = 
zload(locked_left_neighbor); -+ if (result) -+ return result; -+ } -+ -+ left_zloaded_here = 1; -+ coord_init_last_unit(&left_coord, locked_left_neighbor); -+ } -+ -+ if (!item_is_internal(&left_coord)) { -+ /* what else but extent can be on twig level */ -+ assert("vs-606", item_is_extent(&left_coord)); -+ -+ /* there is no left formatted child */ -+ if (left_zloaded_here) -+ zrelse(locked_left_neighbor); -+ done_lh(&left_lh); -+ return 0; -+ } -+ -+ tree = znode_get_tree(left_coord.node); -+ left_child = child_znode(&left_coord, left_coord.node, 1, 0); -+ -+ if (IS_ERR(left_child)) { -+ result = PTR_ERR(left_child); -+ goto done; -+ } -+ -+ /* left child is acquired, calculate new right delimiting key for it -+ and get right child if it is necessary */ -+ if (item_removed_completely -+ (from, kdata->params.from_key, kdata->params.to_key)) { -+ /* try to get right child of removed item */ -+ coord_t right_coord; -+ -+ assert("vs-607", -+ kdata->params.to->unit_pos == -+ coord_last_unit_pos(kdata->params.to)); -+ coord_dup(&right_coord, kdata->params.to); -+ if (coord_next_unit(&right_coord)) { -+ /* @to is rightmost unit in the node */ -+ result = -+ reiser4_get_right_neighbor(&right_lh, from->node, -+ ZNODE_READ_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ switch (result) { -+ case 0: -+ result = zload(right_lh.node); -+ if (result) -+ goto done; -+ -+ right_zloaded_here = 1; -+ coord_init_first_unit(&right_coord, -+ right_lh.node); -+ item_key_by_coord(&right_coord, &key); -+ break; -+ -+ case -E_NO_NEIGHBOR: -+ /* there is no formatted node to the right of -+ from->node */ -+ read_lock_dk(tree); -+ key = *znode_get_rd_key(from->node); -+ read_unlock_dk(tree); -+ right_coord.node = NULL; -+ result = 0; -+ break; -+ default: -+ /* real error */ -+ goto done; -+ } -+ } else { -+ /* there is an item to the right of @from - take its key */ -+ item_key_by_coord(&right_coord, &key); -+ } -+ -+ /* try to get right child of @from */ -+ if (right_coord.node && /* there is right 
neighbor of @from */ -+ item_is_internal(&right_coord)) { /* it is internal item */ -+ right_child = child_znode(&right_coord, -+ right_coord.node, 1, 0); -+ -+ if (IS_ERR(right_child)) { -+ result = PTR_ERR(right_child); -+ goto done; -+ } -+ -+ } -+ /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and -+ update of right delimiting key of left_child */ -+ result = prepare_children(left_child, right_child, kdata); -+ } else { -+ /* head of item @to is removed. left_child has to get right delimting key update. Prepare it for that */ -+ result = prepare_children(left_child, NULL, kdata); -+ } -+ -+ done: -+ if (right_child) -+ zput(right_child); -+ if (right_zloaded_here) -+ zrelse(right_lh.node); -+ done_lh(&right_lh); -+ -+ if (left_child) -+ zput(left_child); -+ if (left_zloaded_here) -+ zrelse(locked_left_neighbor); -+ done_lh(&left_lh); -+ return result; -+} -+ -+/* this is used to remove part of node content between coordinates @from and @to. 
Units to which @from and @to are set -+ are to be cut completely */ -+/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */ -+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */ -+ const reiser4_key * to_key, /* last key to be removed */ -+ reiser4_key * -+ smallest_removed /* smallest key actually removed */ ) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ carry_cut_data *cut_data; -+ carry_op *op; -+ -+ assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT); -+ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*cut_data)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0); -+ assert("vs-1509", op != 0); -+ if (IS_ERR(op)) { -+ done_carry_pool(pool); -+ return PTR_ERR(op); -+ } -+ -+ cut_data = (carry_cut_data *) (lowest_level + 3); -+ cut_data->params.from = from; -+ cut_data->params.to = to; -+ cut_data->params.from_key = from_key; -+ cut_data->params.to_key = to_key; -+ cut_data->params.smallest_removed = smallest_removed; -+ -+ op->u.cut_or_kill.is_cut = 1; -+ op->u.cut_or_kill.u.cut = cut_data; -+ -+ result = reiser4_carry(lowest_level, NULL); -+ done_carry_pool(pool); -+ -+ return result; -+} -+ -+/* cut part of the node -+ -+ Cut part or whole content of node. -+ -+ cut data between @from and @to of @from->node and call carry() to make -+ corresponding changes in the tree. @from->node may become empty. If so - -+ pointer to it will be removed. Neighboring nodes are not changed. 
Smallest -+ removed key is stored in @smallest_removed -+ -+*/ -+int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */ -+ coord_t * to, /* coord of the last unit/item that will be eliminated */ -+ const reiser4_key * from_key, /* first key to be removed */ -+ const reiser4_key * to_key, /* last key to be removed */ -+ reiser4_key * smallest_removed, /* smallest key actually removed */ -+ znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor -+ * locked (in squalloc_right_twig_cut, namely) */ -+ struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to -+ invalidate pages together with item pointing to them */ -+ int truncate) -+{ /* this call is made for file truncate) */ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ carry_kill_data *kdata; -+ lock_handle *left_child; -+ lock_handle *right_child; -+ carry_op *op; -+ -+ assert("umka-328", from != NULL); -+ assert("vs-316", !node_is_empty(from->node)); -+ assert("nikita-1812", coord_is_existing_unit(from) -+ && coord_is_existing_unit(to)); -+ -+ /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */ -+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(carry_kill_data) + -+ 2 * sizeof(lock_handle) + -+ 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ kdata = (carry_kill_data *) (lowest_level + 3); -+ left_child = (lock_handle *) (kdata + 1); -+ right_child = left_child + 1; -+ -+ init_lh(left_child); -+ init_lh(right_child); -+ -+ kdata->params.from = from; -+ kdata->params.to = to; -+ kdata->params.from_key = from_key; -+ kdata->params.to_key = to_key; -+ kdata->params.smallest_removed = smallest_removed; -+ kdata->params.truncate = truncate; -+ kdata->flags = 0; 
-+ kdata->inode = inode; -+ kdata->left = left_child; -+ kdata->right = right_child; -+ /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */ -+ kdata->buf = (char *)(right_child + 1); -+ -+ if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) { -+ /* left child of extent item may have to get updated right -+ delimiting key and to get linked with right child of extent -+ @from if it will be removed completely */ -+ result = prepare_twig_kill(kdata, locked_left_neighbor); -+ if (result) { -+ done_children(kdata); -+ done_carry_pool(pool); -+ return result; -+ } -+ } -+ -+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_children(kdata); -+ done_carry_pool(pool); -+ return RETERR(op ? PTR_ERR(op) : -EIO); -+ } -+ -+ op->u.cut_or_kill.is_cut = 0; -+ op->u.cut_or_kill.u.kill = kdata; -+ -+ result = reiser4_carry(lowest_level, NULL); -+ -+ done_children(kdata); -+ done_carry_pool(pool); -+ return result; -+} -+ -+void -+fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate) -+{ -+ if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) { -+ pgoff_t start_pg, end_pg; -+ -+ start_pg = start >> PAGE_CACHE_SHIFT; -+ end_pg = (end - 1) >> PAGE_CACHE_SHIFT; -+ -+ if ((start & (PAGE_CACHE_SIZE - 1)) == 0) { -+ /* -+ * kill up to the page boundary. -+ */ -+ assert("vs-123456", start_pg == end_pg); -+ reiser4_invalidate_pages(inode->i_mapping, start_pg, 1, -+ truncate); -+ } else if (start_pg != end_pg) { -+ /* -+ * page boundary is within killed portion of node. -+ */ -+ assert("vs-654321", end_pg - start_pg == 1); -+ reiser4_invalidate_pages(inode->i_mapping, end_pg, -+ end_pg - start_pg, 1); -+ } -+ } -+ inode_sub_bytes(inode, end - start); -+} -+ -+/** -+ * Delete whole @node from the reiser4 tree without loading it. 
-+ * -+ * @left: locked left neighbor, -+ * @node: node to be deleted, -+ * @smallest_removed: leftmost key of deleted node, -+ * @object: inode pointer, if we truncate a file body. -+ * @truncate: true if called for file truncate. -+ * -+ * @return: 0 if success, error code otherwise. -+ * -+ * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it -+ * contains the right value of the smallest removed key from the previous -+ * cut_worker() iteration. This is needed for proper accounting of -+ * "i_blocks" and "i_bytes" fields of the @object. -+ */ -+int reiser4_delete_node(znode * node, reiser4_key * smallest_removed, -+ struct inode *object, int truncate) -+{ -+ lock_handle parent_lock; -+ coord_t cut_from; -+ coord_t cut_to; -+ reiser4_tree *tree; -+ int ret; -+ -+ assert("zam-937", node != NULL); -+ assert("zam-933", znode_is_write_locked(node)); -+ assert("zam-999", smallest_removed != NULL); -+ -+ init_lh(&parent_lock); -+ -+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK); -+ if (ret) -+ return ret; -+ -+ assert("zam-934", !znode_above_root(parent_lock.node)); -+ -+ ret = zload(parent_lock.node); -+ if (ret) -+ goto failed_nozrelse; -+ -+ ret = find_child_ptr(parent_lock.node, node, &cut_from); -+ if (ret) -+ goto failed; -+ -+ /* decrement child counter and set parent pointer to NULL before -+ deleting the list from parent node because of checks in -+ internal_kill_item_hook (we can delete the last item from the parent -+ node, the parent node is going to be deleted and its c_count should -+ be zero). */ -+ -+ tree = znode_get_tree(node); -+ write_lock_tree(tree); -+ init_parent_coord(&node->in_parent, NULL); -+ --parent_lock.node->c_count; -+ write_unlock_tree(tree); -+ -+ assert("zam-989", item_is_internal(&cut_from)); -+ -+ /* @node should be deleted after unlocking. */ -+ ZF_SET(node, JNODE_HEARD_BANSHEE); -+ -+ /* remove a pointer from the parent node to the node being deleted. 
*/ -+ coord_dup(&cut_to, &cut_from); -+ /* FIXME: shouldn't this be kill_node_content */ -+ ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL); -+ if (ret) -+ /* FIXME(Zam): Should we re-connect the node to its parent if -+ * cut_node fails? */ -+ goto failed; -+ -+ { -+ reiser4_tree *tree = current_tree; -+ __u64 start_offset = 0, end_offset = 0; -+ -+ read_lock_tree(tree); -+ write_lock_dk(tree); -+ if (object) { -+ /* We use @smallest_removed and the left delimiting of -+ * the current node for @object->i_blocks, i_bytes -+ * calculation. We assume that the items after the -+ * *@smallest_removed key have been deleted from the -+ * file body. */ -+ start_offset = get_key_offset(znode_get_ld_key(node)); -+ end_offset = get_key_offset(smallest_removed); -+ } -+ -+ assert("zam-1021", znode_is_connected(node)); -+ if (node->left) -+ znode_set_rd_key(node->left, znode_get_rd_key(node)); -+ -+ *smallest_removed = *znode_get_ld_key(node); -+ -+ write_unlock_dk(tree); -+ read_unlock_tree(tree); -+ -+ if (object) { -+ /* we used to perform actions which are to be performed on items on their removal from tree in -+ special item method - kill_hook. Here for optimization reasons we avoid reading node -+ containing item we remove and can not call item's kill hook. Instead we call function which -+ does exactly the same things as tail kill hook in assumption that node we avoid reading -+ contains only one item and that item is a tail one. */ -+ fake_kill_hook_tail(object, start_offset, end_offset, -+ truncate); -+ } -+ } -+ failed: -+ zrelse(parent_lock.node); -+ failed_nozrelse: -+ done_lh(&parent_lock); -+ -+ return ret; -+} -+ -+static int can_delete(const reiser4_key *key, znode *node) -+{ -+ int result; -+ -+ read_lock_dk(current_tree); -+ result = keyle(key, znode_get_ld_key(node)); -+ read_unlock_dk(current_tree); -+ return result; -+} -+ -+/** -+ * This subroutine is not optimal but implementation seems to -+ * be easier). 
-+ * -+ * @tap: the point deletion process begins from, -+ * @from_key: the beginning of the deleted key range, -+ * @to_key: the end of the deleted key range, -+ * @smallest_removed: the smallest removed key, -+ * @truncate: true if called for file truncate. -+ * @progress: return true if a progress in file items deletions was made, -+ * @smallest_removed value is actual in that case. -+ * -+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long -+ * reiser4_cut_tree operation was interrupted for allowing atom commit. -+ */ -+int -+cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed, struct inode *object, -+ int truncate, int *progress) -+{ -+ lock_handle next_node_lock; -+ coord_t left_coord; -+ int result; -+ -+ assert("zam-931", tap->coord->node != NULL); -+ assert("zam-932", znode_is_write_locked(tap->coord->node)); -+ -+ *progress = 0; -+ init_lh(&next_node_lock); -+ -+ while (1) { -+ znode *node; /* node from which items are cut */ -+ node_plugin *nplug; /* node plugin for @node */ -+ -+ node = tap->coord->node; -+ -+ /* Move next_node_lock to the next node on the left. */ -+ result = -+ reiser4_get_left_neighbor(&next_node_lock, node, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result != 0 && result != -E_NO_NEIGHBOR) -+ break; -+ /* Check can we delete the node as a whole. 
*/ -+ if (*progress && znode_get_level(node) == LEAF_LEVEL && -+ can_delete(from_key, node)) { -+ result = reiser4_delete_node(node, smallest_removed, -+ object, truncate); -+ } else { -+ result = reiser4_tap_load(tap); -+ if (result) -+ return result; -+ -+ /* Prepare the second (right) point for cut_node() */ -+ if (*progress) -+ coord_init_last_unit(tap->coord, node); -+ -+ else if (item_plugin_by_coord(tap->coord)->b.lookup == -+ NULL) -+ /* set rightmost unit for the items without lookup method */ -+ tap->coord->unit_pos = -+ coord_last_unit_pos(tap->coord); -+ -+ nplug = node->nplug; -+ -+ assert("vs-686", nplug); -+ assert("vs-687", nplug->lookup); -+ -+ /* left_coord is leftmost unit cut from @node */ -+ result = nplug->lookup(node, from_key, -+ FIND_MAX_NOT_MORE_THAN, -+ &left_coord); -+ -+ if (IS_CBKERR(result)) -+ break; -+ -+ /* adjust coordinates so that they are set to existing units */ -+ if (coord_set_to_right(&left_coord) -+ || coord_set_to_left(tap->coord)) { -+ result = 0; -+ break; -+ } -+ -+ if (coord_compare(&left_coord, tap->coord) == -+ COORD_CMP_ON_RIGHT) { -+ /* keys from @from_key to @to_key are not in the tree */ -+ result = 0; -+ break; -+ } -+ -+ if (left_coord.item_pos != tap->coord->item_pos) { -+ /* do not allow to cut more than one item. It is added to solve problem of truncating -+ partially converted files. If file is partially converted there may exist a twig node -+ containing both internal item or items pointing to leaf nodes with formatting items -+ and extent item. 
We do not want to kill internal items being at twig node here -+ because cut_tree_worker assumes killing them from level level */ -+ coord_dup(&left_coord, tap->coord); -+ assert("vs-1652", -+ coord_is_existing_unit(&left_coord)); -+ left_coord.unit_pos = 0; -+ } -+ -+ /* cut data from one node */ -+ // *smallest_removed = *reiser4_min_key(); -+ result = -+ kill_node_content(&left_coord, tap->coord, from_key, -+ to_key, smallest_removed, -+ next_node_lock.node, object, -+ truncate); -+ reiser4_tap_relse(tap); -+ } -+ if (result) -+ break; -+ -+ ++(*progress); -+ -+ /* Check whether all items with keys >= from_key were removed -+ * from the tree. */ -+ if (keyle(smallest_removed, from_key)) -+ /* result = 0; */ -+ break; -+ -+ if (next_node_lock.node == NULL) -+ break; -+ -+ result = reiser4_tap_move(tap, &next_node_lock); -+ done_lh(&next_node_lock); -+ if (result) -+ break; -+ -+ /* Break long reiser4_cut_tree operation (deletion of a large -+ file) if atom requires commit. */ -+ if (*progress > CUT_TREE_MIN_ITERATIONS -+ && current_atom_should_commit()) { -+ result = -E_REPEAT; -+ break; -+ } -+ } -+ done_lh(&next_node_lock); -+ // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key())); -+ return result; -+} -+ -+/* there is a fundamental problem with optimizing deletes: VFS does it -+ one file at a time. Another problem is that if an item can be -+ anything, then deleting items must be done one at a time. It just -+ seems clean to writes this to specify a from and a to key, and cut -+ everything between them though. */ -+ -+/* use this function with care if deleting more than what is part of a single file. */ -+/* do not use this when cutting a single item, it is suboptimal for that */ -+ -+/* You are encouraged to write plugin specific versions of this. It -+ cannot be optimal for all plugins because it works item at a time, -+ and some plugins could sometimes work node at a time. 
Regular files -+ however are not optimizable to work node at a time because of -+ extents needing to free the blocks they point to. -+ -+ Optimizations compared to v3 code: -+ -+ It does not balance (that task is left to memory pressure code). -+ -+ Nodes are deleted only if empty. -+ -+ Uses extents. -+ -+ Performs read-ahead of formatted nodes whose contents are part of -+ the deletion. -+*/ -+ -+/** -+ * Delete everything from the reiser4 tree between two keys: @from_key and -+ * @to_key. -+ * -+ * @from_key: the beginning of the deleted key range, -+ * @to_key: the end of the deleted key range, -+ * @smallest_removed: the smallest removed key, -+ * @object: owner of cutting items. -+ * @truncate: true if called for file truncate. -+ * @progress: return true if a progress in file items deletions was made, -+ * @smallest_removed value is actual in that case. -+ * -+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree -+ * operation was interrupted for allowing atom commit . -+ */ -+ -+int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed_p, -+ struct inode *object, int truncate, int *progress) -+{ -+ lock_handle lock; -+ int result; -+ tap_t tap; -+ coord_t right_coord; -+ reiser4_key smallest_removed; -+ int (*cut_tree_worker) (tap_t *, const reiser4_key *, -+ const reiser4_key *, reiser4_key *, -+ struct inode *, int, int *); -+ STORE_COUNTERS; -+ -+ assert("umka-329", tree != NULL); -+ assert("umka-330", from_key != NULL); -+ assert("umka-331", to_key != NULL); -+ assert("zam-936", keyle(from_key, to_key)); -+ -+ if (smallest_removed_p == NULL) -+ smallest_removed_p = &smallest_removed; -+ -+ init_lh(&lock); -+ -+ do { -+ /* Find rightmost item to cut away from the tree. 
*/ -+ result = reiser4_object_lookup(object, to_key, &right_coord, -+ &lock, ZNODE_WRITE_LOCK, -+ FIND_MAX_NOT_MORE_THAN, -+ TWIG_LEVEL, LEAF_LEVEL, -+ CBK_UNIQUE, NULL /*ra_info */); -+ if (result != CBK_COORD_FOUND) -+ break; -+ if (object == NULL -+ || inode_file_plugin(object)->cut_tree_worker == NULL) -+ cut_tree_worker = cut_tree_worker_common; -+ else -+ cut_tree_worker = -+ inode_file_plugin(object)->cut_tree_worker; -+ reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK); -+ result = -+ cut_tree_worker(&tap, from_key, to_key, smallest_removed_p, -+ object, truncate, progress); -+ reiser4_tap_done(&tap); -+ -+ reiser4_preempt_point(); -+ -+ } while (0); -+ -+ done_lh(&lock); -+ -+ if (result) { -+ switch (result) { -+ case -E_NO_NEIGHBOR: -+ result = 0; -+ break; -+ case -E_DEADLOCK: -+ result = -E_REPEAT; -+ case -E_REPEAT: -+ case -ENOMEM: -+ case -ENOENT: -+ break; -+ default: -+ warning("nikita-2861", "failure: %i", result); -+ } -+ } -+ -+ CHECK_COUNTERS; -+ return result; -+} -+ -+/* repeat reiser4_cut_tree_object until everything is deleted. -+ * unlike cut_file_items, it does not end current transaction if -E_REPEAT -+ * is returned by cut_tree_object. 
*/ -+int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from, -+ const reiser4_key * to, struct inode *inode, int truncate) -+{ -+ int result; -+ int progress; -+ -+ do { -+ result = reiser4_cut_tree_object(tree, from, to, NULL, -+ inode, truncate, &progress); -+ } while (result == -E_REPEAT); -+ -+ return result; -+} -+ -+/* finishing reiser4 initialization */ -+int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being -+ * initialized */ , -+ const reiser4_block_nr * root_block /* address of a root block -+ * on a disk */ , -+ tree_level height /* height of a tree */ , -+ node_plugin * nplug /* default node plugin */ ) -+{ -+ int result; -+ -+ assert("nikita-306", tree != NULL); -+ assert("nikita-307", root_block != NULL); -+ assert("nikita-308", height > 0); -+ assert("nikita-309", nplug != NULL); -+ assert("zam-587", tree->super != NULL); -+ -+ tree->root_block = *root_block; -+ tree->height = height; -+ tree->estimate_one_insert = calc_estimate_one_insert(height); -+ tree->nplug = nplug; -+ -+ tree->znode_epoch = 1ull; -+ -+ cbk_cache_init(&tree->cbk_cache); -+ -+ result = znodes_tree_init(tree); -+ if (result == 0) -+ result = jnodes_tree_init(tree); -+ if (result == 0) { -+ tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0, -+ reiser4_ctx_gfp_mask_get()); -+ if (IS_ERR(tree->uber)) { -+ result = PTR_ERR(tree->uber); -+ tree->uber = NULL; -+ } -+ } -+ return result; -+} -+ -+/* release resources associated with @tree */ -+void reiser4_done_tree(reiser4_tree * tree /* tree to release */ ) -+{ -+ if (tree == NULL) -+ return; -+ -+ if (tree->uber != NULL) { -+ zput(tree->uber); -+ tree->uber = NULL; -+ } -+ znodes_tree_done(tree); -+ jnodes_tree_done(tree); -+ cbk_cache_done(&tree->cbk_cache); -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/tree.h linux-2.6.20/fs/reiser4/tree.h ---- linux-2.6.20.orig/fs/reiser4/tree.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/tree.h 2007-05-06 14:50:43.883033217 +0400 -@@ -0,0 +1,577 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Tree operations. See fs/reiser4/tree.c for comments */ -+ -+#if !defined( __REISER4_TREE_H__ ) -+#define __REISER4_TREE_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "plugin/node/node.h" -+#include "plugin/plugin.h" -+#include "znode.h" -+#include "tap.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block */ -+#include -+#include /* for struct task_struct */ -+ -+/* fictive block number never actually used */ -+extern const reiser4_block_nr UBER_TREE_ADDR; -+ -+/* &cbk_cache_slot - entry in a coord cache. -+ -+ This is entry in a coord_by_key (cbk) cache, represented by -+ &cbk_cache. -+ -+*/ -+typedef struct cbk_cache_slot { -+ /* cached node */ -+ znode *node; -+ /* linkage to the next cbk cache slot in a LRU order */ -+ struct list_head lru; -+} cbk_cache_slot; -+ -+/* &cbk_cache - coord cache. This is part of reiser4_tree. -+ -+ cbk_cache is supposed to speed up tree lookups by caching results of recent -+ successful lookups (we don't cache negative results as dentry cache -+ does). Cache consists of relatively small number of entries kept in a LRU -+ order. Each entry (&cbk_cache_slot) contains a pointer to znode, from -+ which we can obtain a range of keys that covered by this znode. Before -+ embarking into real tree traversal we scan cbk_cache slot by slot and for -+ each slot check whether key we are looking for is between minimal and -+ maximal keys for node pointed to by this slot. 
If no match is found, real -+ tree traversal is performed and if result is successful, appropriate entry -+ is inserted into cache, possibly pulling least recently used entry out of -+ it. -+ -+ Tree spin lock is used to protect coord cache. If contention for this -+ lock proves to be too high, more finer grained locking can be added. -+ -+ Invariants involving parts of this data-type: -+ -+ [cbk-cache-invariant] -+*/ -+typedef struct cbk_cache { -+ /* serializator */ -+ rwlock_t guard; -+ int nr_slots; -+ /* head of LRU list of cache slots */ -+ struct list_head lru; -+ /* actual array of slots */ -+ cbk_cache_slot *slot; -+} cbk_cache; -+ -+/* level_lookup_result - possible outcome of looking up key at some level. -+ This is used by coord_by_key when traversing tree downward. */ -+typedef enum { -+ /* continue to the next level */ -+ LOOKUP_CONT, -+ /* done. Either required item was found, or we can prove it -+ doesn't exist, or some error occurred. */ -+ LOOKUP_DONE, -+ /* restart traversal from the root. Infamous "repetition". */ -+ LOOKUP_REST -+} level_lookup_result; -+ -+/* This is representation of internal reiser4 tree where all file-system -+ data and meta-data are stored. This structure is passed to all tree -+ manipulation functions. It's different from the super block because: -+ we don't want to limit ourselves to strictly one to one mapping -+ between super blocks and trees, and, because they are logically -+ different: there are things in a super block that have no relation to -+ the tree (bitmaps, journalling area, mount options, etc.) and there -+ are things in a tree that bear no relation to the super block, like -+ tree of znodes. -+ -+ At this time, there is only one tree -+ per filesystem, and this struct is part of the super block. We only -+ call the super block the super block for historical reasons (most -+ other filesystems call the per filesystem metadata the super block). 
-+*/ -+ -+struct reiser4_tree { -+ /* block_nr == 0 is fake znode. Write lock it, while changing -+ tree height. */ -+ /* disk address of root node of a tree */ -+ reiser4_block_nr root_block; -+ -+ /* level of the root node. If this is 1, tree consists of root -+ node only */ -+ tree_level height; -+ -+ /* -+ * this is cached here avoid calling plugins through function -+ * dereference all the time. -+ */ -+ __u64 estimate_one_insert; -+ -+ /* cache of recent tree lookup results */ -+ cbk_cache cbk_cache; -+ -+ /* hash table to look up znodes by block number. */ -+ z_hash_table zhash_table; -+ z_hash_table zfake_table; -+ /* hash table to look up jnodes by inode and offset. */ -+ j_hash_table jhash_table; -+ -+ /* lock protecting: -+ - parent pointers, -+ - sibling pointers, -+ - znode hash table -+ - coord cache -+ */ -+ /* NOTE: The "giant" tree lock can be replaced by more spin locks, -+ hoping they will be less contented. We can use one spin lock per one -+ znode hash bucket. With adding of some code complexity, sibling -+ pointers can be protected by both znode spin locks. However it looks -+ more SMP scalable we should test this locking change on n-ways (n > -+ 4) SMP machines. Current 4-ways machine test does not show that tree -+ lock is contented and it is a bottleneck (2003.07.25). */ -+ -+ rwlock_t tree_lock; -+ -+ /* lock protecting delimiting keys */ -+ rwlock_t dk_lock; -+ -+ /* spin lock protecting znode_epoch */ -+ spinlock_t epoch_lock; -+ /* version stamp used to mark znode updates. See seal.[ch] for more -+ * information. 
*/ -+ __u64 znode_epoch; -+ -+ znode *uber; -+ node_plugin *nplug; -+ struct super_block *super; -+ struct { -+ /* carry flags used for insertion of new nodes */ -+ __u32 new_node_flags; -+ /* carry flags used for insertion of new extents */ -+ __u32 new_extent_flags; -+ /* carry flags used for paste operations */ -+ __u32 paste_flags; -+ /* carry flags used for insert operations */ -+ __u32 insert_flags; -+ } carry; -+}; -+ -+extern int reiser4_init_tree(reiser4_tree * tree, -+ const reiser4_block_nr * root_block, -+ tree_level height, node_plugin * default_plugin); -+extern void reiser4_done_tree(reiser4_tree * tree); -+ -+/* cbk flags: options for coord_by_key() */ -+typedef enum { -+ /* coord_by_key() is called for insertion. This is necessary because -+ of extents being located at the twig level. For explanation, see -+ comment just above is_next_item_internal(). -+ */ -+ CBK_FOR_INSERT = (1 << 0), -+ /* coord_by_key() is called with key that is known to be unique */ -+ CBK_UNIQUE = (1 << 1), -+ /* coord_by_key() can trust delimiting keys. This options is not user -+ accessible. coord_by_key() will set it automatically. It will be -+ only cleared by special-case in extents-on-the-twig-level handling -+ where it is necessary to insert item with a key smaller than -+ leftmost key in a node. This is necessary because of extents being -+ located at the twig level. For explanation, see comment just above -+ is_next_item_internal(). -+ */ -+ CBK_TRUST_DK = (1 << 2), -+ CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */ -+ CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */ -+ CBK_DKSET = (1 << 5), -+ CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */ -+ CBK_IN_CACHE = (1 << 7), /* node is already in cache */ -+ CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock in stead of long term -+ * lock */ -+} cbk_flags; -+ -+/* insertion outcome. 
IBK = insert by key */ -+typedef enum { -+ IBK_INSERT_OK = 0, -+ IBK_ALREADY_EXISTS = -EEXIST, -+ IBK_IO_ERROR = -EIO, -+ IBK_NO_SPACE = -E_NODE_FULL, -+ IBK_OOM = -ENOMEM -+} insert_result; -+ -+#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND) -+ -+typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord, -+ lock_handle * lh, void *arg); -+extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord, -+ lock_handle * lh, -+ tree_iterate_actor_t actor, void *arg, -+ znode_lock_mode mode, int through_units_p); -+extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode, -+ znode_lock_request pri, lock_handle * lh); -+ -+/* return node plugin of @node */ -+static inline node_plugin *node_plugin_by_node(const znode * -+ node /* node to query */ ) -+{ -+ assert("vs-213", node != NULL); -+ assert("vs-214", znode_is_loaded(node)); -+ -+ return node->nplug; -+} -+ -+/* number of items in @node */ -+static inline pos_in_node_t node_num_items(const znode * node) -+{ -+ assert("nikita-2754", znode_is_loaded(node)); -+ assert("nikita-2468", -+ node_plugin_by_node(node)->num_of_items(node) == node->nr_items); -+ -+ return node->nr_items; -+} -+ -+/* Return the number of items at the present node. Asserts coord->node != -+ NULL. 
*/ -+static inline unsigned coord_num_items(const coord_t * coord) -+{ -+ assert("jmacd-9805", coord->node != NULL); -+ -+ return node_num_items(coord->node); -+} -+ -+/* true if @node is empty */ -+static inline int node_is_empty(const znode * node) -+{ -+ return node_num_items(node) == 0; -+} -+ -+typedef enum { -+ SHIFTED_SOMETHING = 0, -+ SHIFT_NO_SPACE = -E_NODE_FULL, -+ SHIFT_IO_ERROR = -EIO, -+ SHIFT_OOM = -ENOMEM, -+} shift_result; -+ -+extern node_plugin *node_plugin_by_coord(const coord_t * coord); -+extern int is_coord_in_node(const coord_t * coord); -+extern int key_in_node(const reiser4_key *, const coord_t *); -+extern void coord_item_move_to(coord_t * coord, int items); -+extern void coord_unit_move_to(coord_t * coord, int units); -+ -+/* there are two types of repetitive accesses (ra): intra-syscall -+ (local) and inter-syscall (global). Local ra is used when -+ during single syscall we add/delete several items and units in the -+ same place in a tree. Note that plan-A fragments local ra by -+ separating stat-data and file body in key-space. Global ra is -+ used when user does repetitive modifications in the same place in a -+ tree. -+ -+ Our ra implementation serves following purposes: -+ 1 it affects balancing decisions so that next operation in a row -+ can be performed faster; -+ 2 it affects lower-level read-ahead in page-cache; -+ 3 it allows to avoid unnecessary lookups by maintaining some state -+ across several operations (this is only for local ra); -+ 4 it leaves room for lazy-micro-balancing: when we start a sequence of -+ operations they are performed without actually doing any intra-node -+ shifts, until we finish sequence or scope of sequence leaves -+ current node, only then we really pack node (local ra only). -+*/ -+ -+/* another thing that can be useful is to keep per-tree and/or -+ per-process cache of recent lookups. 
This cache can be organised as a -+ list of block numbers of formatted nodes sorted by starting key in -+ this node. Balancings should invalidate appropriate parts of this -+ cache. -+*/ -+ -+lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key, -+ coord_t * coord, lock_handle * handle, -+ znode_lock_mode lock, lookup_bias bias, -+ tree_level lock_level, tree_level stop_level, -+ __u32 flags, ra_info_t *); -+ -+lookup_result reiser4_object_lookup(struct inode *object, -+ const reiser4_key * key, -+ coord_t * coord, -+ lock_handle * lh, -+ znode_lock_mode lock_mode, -+ lookup_bias bias, -+ tree_level lock_level, -+ tree_level stop_level, -+ __u32 flags, ra_info_t * info); -+ -+insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key, -+ reiser4_item_data * data, coord_t * coord, -+ lock_handle * lh, -+ tree_level stop_level, __u32 flags); -+insert_result insert_by_coord(coord_t * coord, -+ reiser4_item_data * data, const reiser4_key * key, -+ lock_handle * lh, __u32); -+insert_result insert_extent_by_coord(coord_t * coord, -+ reiser4_item_data * data, -+ const reiser4_key * key, lock_handle * lh); -+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed); -+int kill_node_content(coord_t * from, coord_t * to, -+ const reiser4_key * from_key, const reiser4_key * to_key, -+ reiser4_key * smallest_removed, -+ znode * locked_left_neighbor, struct inode *inode, -+ int truncate); -+ -+int reiser4_resize_item(coord_t * coord, reiser4_item_data * data, -+ reiser4_key * key, lock_handle * lh, cop_insert_flag); -+int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key, -+ reiser4_item_data * data, unsigned); -+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f); -+int find_new_child_ptr(znode * parent, znode * child, znode * left, -+ coord_t * result); -+ -+int shift_right_of_but_excluding_insert_coord(coord_t 
* insert_coord); -+int shift_left_of_and_including_insert_coord(coord_t * insert_coord); -+ -+void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int); -+ -+extern int cut_tree_worker_common(tap_t *, const reiser4_key *, -+ const reiser4_key *, reiser4_key *, -+ struct inode *, int, int *); -+extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *, -+ const reiser4_key *, reiser4_key *, -+ struct inode *, int, int *); -+extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from, -+ const reiser4_key * to, struct inode *, int); -+ -+extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int); -+extern int check_tree_pointer(const coord_t * pointer, const znode * child); -+extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG, -+ znode * left, coord_t * result); -+extern int find_child_ptr(znode * parent, znode * child, coord_t * result); -+extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent, -+ znode * child); -+extern znode *child_znode(const coord_t * in_parent, znode * parent, -+ int incore_p, int setup_dkeys_p); -+ -+extern int cbk_cache_init(cbk_cache * cache); -+extern void cbk_cache_done(cbk_cache * cache); -+extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree); -+ -+extern char *sprint_address(const reiser4_block_nr * block); -+ -+#if REISER4_DEBUG -+extern void print_coord_content(const char *prefix, coord_t * p); -+extern void reiser4_print_address(const char *prefix, -+ const reiser4_block_nr * block); -+extern void print_tree_rec(const char *prefix, reiser4_tree * tree, -+ __u32 flags); -+extern void check_dkeys(znode *node); -+#else -+#define print_coord_content(p, c) noop -+#define reiser4_print_address(p, b) noop -+#endif -+ -+extern void forget_znode(lock_handle * handle); -+extern int deallocate_znode(znode * node); -+ -+extern int is_disk_addr_unallocated(const reiser4_block_nr * addr); -+ -+/* struct used internally to pack 
all numerous arguments of tree lookup. -+ Used to avoid passing a lot of arguments to helper functions. */ -+typedef struct cbk_handle { -+ /* tree we are in */ -+ reiser4_tree *tree; -+ /* key we are going after */ -+ const reiser4_key *key; -+ /* coord we will store result in */ -+ coord_t *coord; -+ /* type of lock to take on target node */ -+ znode_lock_mode lock_mode; -+ /* lookup bias. See comments at the declaration of lookup_bias */ -+ lookup_bias bias; -+ /* lock level: level starting from which tree traversal starts taking -+ * write locks. */ -+ tree_level lock_level; -+ /* level where search will stop. Either item will be found between -+ lock_level and stop_level, or CBK_COORD_NOTFOUND will be -+ returned. -+ */ -+ tree_level stop_level; -+ /* level we are currently at */ -+ tree_level level; -+ /* block number of @active node. Tree traversal operates on two -+ nodes: active and parent. */ -+ reiser4_block_nr block; -+ /* put here error message to be printed by caller */ -+ const char *error; -+ /* result passed back to caller */ -+ lookup_result result; -+ /* lock handles for active and parent */ -+ lock_handle *parent_lh; -+ lock_handle *active_lh; -+ reiser4_key ld_key; -+ reiser4_key rd_key; -+ /* flags, passed to the cbk routine. Bits of this bitmask are defined -+ in tree.h:cbk_flags enum. 
*/ -+ __u32 flags; -+ ra_info_t *ra_info; -+ struct inode *object; -+} cbk_handle; -+ -+extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h); -+ -+/* eottl.c */ -+extern int handle_eottl(cbk_handle *h, int *outcome); -+ -+int lookup_multikey(cbk_handle * handle, int nr_keys); -+int lookup_couple(reiser4_tree * tree, -+ const reiser4_key * key1, const reiser4_key * key2, -+ coord_t * coord1, coord_t * coord2, -+ lock_handle * lh1, lock_handle * lh2, -+ znode_lock_mode lock_mode, lookup_bias bias, -+ tree_level lock_level, tree_level stop_level, __u32 flags, -+ int *result1, int *result2); -+ -+static inline void read_lock_tree(reiser4_tree *tree) -+{ -+ /* check that tree is not locked */ -+ assert("", (LOCK_CNT_NIL(rw_locked_tree) && -+ LOCK_CNT_NIL(read_locked_tree) && -+ LOCK_CNT_NIL(write_locked_tree))); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(spin_locked_stack))); -+ -+ read_lock(&(tree->tree_lock)); -+ -+ LOCK_CNT_INC(read_locked_tree); -+ LOCK_CNT_INC(rw_locked_tree); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void read_unlock_tree(reiser4_tree *tree) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree)); -+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(read_locked_tree); -+ LOCK_CNT_DEC(rw_locked_tree); -+ LOCK_CNT_DEC(spin_locked); -+ -+ read_unlock(&(tree->tree_lock)); -+} -+ -+static inline void write_lock_tree(reiser4_tree *tree) -+{ -+ /* check that tree is not locked */ -+ assert("", (LOCK_CNT_NIL(rw_locked_tree) && -+ LOCK_CNT_NIL(read_locked_tree) && -+ LOCK_CNT_NIL(write_locked_tree))); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(spin_locked_stack))); -+ -+ write_lock(&(tree->tree_lock)); -+ -+ 
LOCK_CNT_INC(write_locked_tree); -+ LOCK_CNT_INC(rw_locked_tree); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void write_unlock_tree(reiser4_tree *tree) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree)); -+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(write_locked_tree); -+ LOCK_CNT_DEC(rw_locked_tree); -+ LOCK_CNT_DEC(spin_locked); -+ -+ write_unlock(&(tree->tree_lock)); -+} -+ -+static inline void read_lock_dk(reiser4_tree *tree) -+{ -+ /* check that dk is not locked */ -+ assert("", (LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(read_locked_dk) && -+ LOCK_CNT_NIL(write_locked_dk))); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", LOCK_CNT_NIL(spin_locked_stack)); -+ -+ read_lock(&((tree)->dk_lock)); -+ -+ LOCK_CNT_INC(read_locked_dk); -+ LOCK_CNT_INC(rw_locked_dk); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void read_unlock_dk(reiser4_tree *tree) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk)); -+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(read_locked_dk); -+ LOCK_CNT_DEC(rw_locked_dk); -+ LOCK_CNT_DEC(spin_locked); -+ -+ read_unlock(&(tree->dk_lock)); -+} -+ -+static inline void write_lock_dk(reiser4_tree *tree) -+{ -+ /* check that dk is not locked */ -+ assert("", (LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(read_locked_dk) && -+ LOCK_CNT_NIL(write_locked_dk))); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", LOCK_CNT_NIL(spin_locked_stack)); -+ -+ write_lock(&((tree)->dk_lock)); -+ -+ LOCK_CNT_INC(write_locked_dk); -+ LOCK_CNT_INC(rw_locked_dk); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void write_unlock_dk(reiser4_tree *tree) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk)); -+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk)); -+ assert("nikita-1376", 
LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(write_locked_dk); -+ LOCK_CNT_DEC(rw_locked_dk); -+ LOCK_CNT_DEC(spin_locked); -+ -+ write_unlock(&(tree->dk_lock)); -+} -+ -+/* estimate api. Implementation is in estimate.c */ -+reiser4_block_nr estimate_one_insert_item(reiser4_tree *); -+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *); -+reiser4_block_nr estimate_insert_flow(tree_level); -+reiser4_block_nr estimate_one_item_removal(reiser4_tree *); -+reiser4_block_nr calc_estimate_one_insert(tree_level); -+reiser4_block_nr estimate_dirty_cluster(struct inode *); -+reiser4_block_nr estimate_insert_cluster(struct inode *); -+reiser4_block_nr estimate_update_cluster(struct inode *); -+ -+/* __REISER4_TREE_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/tree_mod.c linux-2.6.20/fs/reiser4/tree_mod.c ---- linux-2.6.20.orig/fs/reiser4/tree_mod.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/tree_mod.c 2007-05-06 14:50:43.887034467 +0400 -@@ -0,0 +1,386 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ * Functions to add/delete new nodes to/from the tree. -+ * -+ * Functions from this file are used by carry (see carry*) to handle: -+ * -+ * . insertion of new formatted node into tree -+ * -+ * . addition of new tree root, increasing tree height -+ * -+ * . 
removing tree root, decreasing tree height -+ * -+ */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/plugin.h" -+#include "jnode.h" -+#include "znode.h" -+#include "tree_mod.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "tree.h" -+#include "super.h" -+ -+#include -+ -+static int add_child_ptr(znode * parent, znode * child); -+/* warning only issued if error is not -E_REPEAT */ -+#define ewarning( error, ... ) \ -+ if( ( error ) != -E_REPEAT ) \ -+ warning( __VA_ARGS__ ) -+ -+/* allocate new node on the @level and immediately on the right of @brother. */ -+znode * reiser4_new_node(znode * brother /* existing left neighbor -+ * of new node */, -+ tree_level level /* tree level at which new node is to -+ * be allocated */) -+{ -+ znode *result; -+ int retcode; -+ reiser4_block_nr blocknr; -+ -+ assert("nikita-930", brother != NULL); -+ assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT); -+ -+ retcode = assign_fake_blocknr_formatted(&blocknr); -+ if (retcode == 0) { -+ result = -+ zget(znode_get_tree(brother), &blocknr, NULL, level, -+ reiser4_ctx_gfp_mask_get()); -+ if (IS_ERR(result)) { -+ ewarning(PTR_ERR(result), "nikita-929", -+ "Cannot allocate znode for carry: %li", -+ PTR_ERR(result)); -+ return result; -+ } -+ /* cheap test, can be executed even when debugging is off */ -+ if (!znode_just_created(result)) { -+ warning("nikita-2213", -+ "Allocated already existing block: %llu", -+ (unsigned long long)blocknr); -+ zput(result); -+ return ERR_PTR(RETERR(-EIO)); -+ } -+ -+ assert("nikita-931", result != NULL); -+ result->nplug = znode_get_tree(brother)->nplug; -+ assert("nikita-933", result->nplug != NULL); -+ -+ retcode = zinit_new(result, reiser4_ctx_gfp_mask_get()); -+ if (retcode == 0) { -+ ZF_SET(result, JNODE_CREATED); -+ zrelse(result); -+ } else { -+ zput(result); -+ result = ERR_PTR(retcode); -+ } -+ } else { -+ /* failure to allocate new node during 
balancing. -+ This should never happen. Ever. Returning -E_REPEAT -+ is not viable solution, because "out of disk space" -+ is not transient error that will go away by itself. -+ */ -+ ewarning(retcode, "nikita-928", -+ "Cannot allocate block for carry: %i", retcode); -+ result = ERR_PTR(retcode); -+ } -+ assert("nikita-1071", result != NULL); -+ return result; -+} -+ -+/* allocate new root and add it to the tree -+ -+ This helper function is called by add_new_root(). -+ -+*/ -+znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ , -+ znode * fake /* "fake" znode */ ) -+{ -+ reiser4_tree *tree = znode_get_tree(old_root); -+ znode *new_root = NULL; /* to shut gcc up */ -+ int result; -+ -+ assert("nikita-1069", old_root != NULL); -+ assert("umka-262", fake != NULL); -+ assert("umka-263", tree != NULL); -+ -+ /* "fake" znode---one always hanging just above current root. This -+ node is locked when new root is created or existing root is -+ deleted. Downward tree traversal takes lock on it before taking -+ lock on a root node. This avoids race conditions with root -+ manipulations. -+ -+ */ -+ assert("nikita-1348", znode_above_root(fake)); -+ assert("nikita-1211", znode_is_root(old_root)); -+ -+ result = 0; -+ if (tree->height >= REAL_MAX_ZTREE_HEIGHT) { -+ warning("nikita-1344", "Tree is too tall: %i", tree->height); -+ /* ext2 returns -ENOSPC when it runs out of free inodes with a -+ following comment (fs/ext2/ialloc.c:441): Is it really -+ ENOSPC? -+ -+ -EXFULL? -EINVAL? -+ */ -+ result = RETERR(-ENOSPC); -+ } else { -+ /* Allocate block for new root. It's not that -+ important where it will be allocated, as root is -+ almost always in memory. Moreover, allocate on -+ flush can be going here. 
-+ */ -+ assert("nikita-1448", znode_is_root(old_root)); -+ new_root = reiser4_new_node(fake, tree->height + 1); -+ if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) { -+ lock_handle rlh; -+ -+ init_lh(&rlh); -+ result = -+ longterm_lock_znode(&rlh, new_root, -+ ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_LOPRI); -+ if (result == 0) { -+ parent_coord_t *in_parent; -+ -+ znode_make_dirty(fake); -+ -+ /* new root is a child of "fake" node */ -+ write_lock_tree(tree); -+ -+ ++tree->height; -+ -+ /* recalculate max balance overhead */ -+ tree->estimate_one_insert = -+ estimate_one_insert_item(tree); -+ -+ tree->root_block = *znode_get_block(new_root); -+ in_parent = &new_root->in_parent; -+ init_parent_coord(in_parent, fake); -+ /* manually insert new root into sibling -+ * list. With this all nodes involved into -+ * balancing are connected after balancing is -+ * done---useful invariant to check. */ -+ sibling_list_insert_nolock(new_root, NULL); -+ write_unlock_tree(tree); -+ -+ /* insert into new root pointer to the -+ @old_root. */ -+ assert("nikita-1110", -+ WITH_DATA(new_root, -+ node_is_empty(new_root))); -+ write_lock_dk(tree); -+ znode_set_ld_key(new_root, reiser4_min_key()); -+ znode_set_rd_key(new_root, reiser4_max_key()); -+ write_unlock_dk(tree); -+ if (REISER4_DEBUG) { -+ ZF_CLR(old_root, JNODE_LEFT_CONNECTED); -+ ZF_CLR(old_root, JNODE_RIGHT_CONNECTED); -+ ZF_SET(old_root, JNODE_ORPHAN); -+ } -+ result = add_child_ptr(new_root, old_root); -+ done_lh(&rlh); -+ } -+ zrelse(new_root); -+ } -+ } -+ if (result != 0) -+ new_root = ERR_PTR(result); -+ return new_root; -+} -+ -+/* build &reiser4_item_data for inserting child pointer -+ -+ Build &reiser4_item_data that can be later used to insert pointer to @child -+ in its parent. 
-+ -+*/ -+void build_child_ptr_data(znode * child /* node pointer to which will be -+ * inserted */ , -+ reiser4_item_data * data /* where to store result */ ) -+{ -+ assert("nikita-1116", child != NULL); -+ assert("nikita-1117", data != NULL); -+ -+ /* -+ * NOTE: use address of child's blocknr as address of data to be -+ * inserted. As result of this data gets into on-disk structure in cpu -+ * byte order. internal's create_hook converts it to little endian byte -+ * order. -+ */ -+ data->data = (char *)znode_get_block(child); -+ /* data -> data is kernel space */ -+ data->user = 0; -+ data->length = sizeof(reiser4_block_nr); -+ /* FIXME-VS: hardcoded internal item? */ -+ -+ /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */ -+ data->iplug = item_plugin_by_id(NODE_POINTER_ID); -+} -+ -+/* add pointer to @child into empty @parent. -+ -+ This is used when pointer to old root is inserted into new root which is -+ empty. -+*/ -+static int add_child_ptr(znode * parent, znode * child) -+{ -+ coord_t coord; -+ reiser4_item_data data; -+ int result; -+ reiser4_key key; -+ -+ assert("nikita-1111", parent != NULL); -+ assert("nikita-1112", child != NULL); -+ assert("nikita-1115", -+ znode_get_level(parent) == znode_get_level(child) + 1); -+ -+ result = zload(parent); -+ if (result != 0) -+ return result; -+ assert("nikita-1113", node_is_empty(parent)); -+ coord_init_first_unit(&coord, parent); -+ -+ build_child_ptr_data(child, &data); -+ data.arg = NULL; -+ -+ read_lock_dk(znode_get_tree(parent)); -+ key = *znode_get_ld_key(child); -+ read_unlock_dk(znode_get_tree(parent)); -+ -+ result = node_plugin_by_node(parent)->create_item(&coord, &key, &data, -+ NULL); -+ znode_make_dirty(parent); -+ zrelse(parent); -+ return result; -+} -+ -+/* actually remove tree root */ -+static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is -+ * being removed */, -+ znode * old_root /* root node that is being -+ * removed */ , -+ znode * new_root /* new 
root---sole child of -+ * @old_root */, -+ const reiser4_block_nr * new_root_blk /* disk address of -+ * @new_root */) -+{ -+ znode *uber; -+ int result; -+ lock_handle handle_for_uber; -+ -+ assert("umka-265", tree != NULL); -+ assert("nikita-1198", new_root != NULL); -+ assert("nikita-1199", -+ znode_get_level(new_root) + 1 == znode_get_level(old_root)); -+ -+ assert("nikita-1201", znode_is_write_locked(old_root)); -+ -+ assert("nikita-1203", -+ disk_addr_eq(new_root_blk, znode_get_block(new_root))); -+ -+ init_lh(&handle_for_uber); -+ /* obtain and lock "fake" znode protecting changes in tree height. */ -+ result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI, -+ &handle_for_uber); -+ if (result == 0) { -+ uber = handle_for_uber.node; -+ -+ znode_make_dirty(uber); -+ -+ /* don't take long term lock a @new_root. Take spinlock. */ -+ -+ write_lock_tree(tree); -+ -+ tree->root_block = *new_root_blk; -+ --tree->height; -+ -+ /* recalculate max balance overhead */ -+ tree->estimate_one_insert = estimate_one_insert_item(tree); -+ -+ assert("nikita-1202", -+ tree->height == znode_get_level(new_root)); -+ -+ /* new root is child on "fake" node */ -+ init_parent_coord(&new_root->in_parent, uber); -+ ++uber->c_count; -+ -+ /* sibling_list_insert_nolock(new_root, NULL); */ -+ write_unlock_tree(tree); -+ -+ /* reinitialise old root. */ -+ result = node_plugin_by_node(old_root)->init(old_root); -+ znode_make_dirty(old_root); -+ if (result == 0) { -+ assert("nikita-1279", node_is_empty(old_root)); -+ ZF_SET(old_root, JNODE_HEARD_BANSHEE); -+ old_root->c_count = 0; -+ } -+ } -+ done_lh(&handle_for_uber); -+ -+ return result; -+} -+ -+/* remove tree root -+ -+ This function removes tree root, decreasing tree height by one. Tree root -+ and its only child (that is going to become new tree root) are write locked -+ at the entry. -+ -+ To remove tree root we need to take lock on special "fake" znode that -+ protects changes of tree height. 
See comments in reiser4_add_tree_root() for -+ more on this. -+ -+ Also parent pointers have to be updated in -+ old and new root. To simplify code, function is split into two parts: outer -+ reiser4_kill_tree_root() collects all necessary arguments and calls -+ reiser4_kill_root() to do the actual job. -+ -+*/ -+int reiser4_kill_tree_root(znode * old_root /* tree root that we are -+ removing*/) -+{ -+ int result; -+ coord_t down_link; -+ znode *new_root; -+ reiser4_tree *tree; -+ -+ assert("umka-266", current_tree != NULL); -+ assert("nikita-1194", old_root != NULL); -+ assert("nikita-1196", znode_is_root(old_root)); -+ assert("nikita-1200", node_num_items(old_root) == 1); -+ assert("nikita-1401", znode_is_write_locked(old_root)); -+ -+ coord_init_first_unit(&down_link, old_root); -+ -+ tree = znode_get_tree(old_root); -+ new_root = child_znode(&down_link, old_root, 0, 1); -+ if (!IS_ERR(new_root)) { -+ result = -+ reiser4_kill_root(tree, old_root, new_root, -+ znode_get_block(new_root)); -+ zput(new_root); -+ } else -+ result = PTR_ERR(new_root); -+ -+ return result; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/tree_mod.h linux-2.6.20/fs/reiser4/tree_mod.h ---- linux-2.6.20.orig/fs/reiser4/tree_mod.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/tree_mod.h 2007-05-06 14:50:43.887034467 +0400 -@@ -0,0 +1,29 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for -+ * comments. 
*/ -+ -+#if !defined( __REISER4_TREE_MOD_H__ ) -+#define __REISER4_TREE_MOD_H__ -+ -+#include "forward.h" -+ -+znode *reiser4_new_node(znode * brother, tree_level level); -+znode *reiser4_add_tree_root(znode * old_root, znode * fake); -+int reiser4_kill_tree_root(znode * old_root); -+void build_child_ptr_data(znode * child, reiser4_item_data * data); -+ -+/* __REISER4_TREE_MOD_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/tree_walk.c linux-2.6.20/fs/reiser4/tree_walk.c ---- linux-2.6.20.orig/fs/reiser4/tree_walk.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/tree_walk.c 2007-05-06 14:50:43.887034467 +0400 -@@ -0,0 +1,927 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Routines and macros to: -+ -+ get_left_neighbor() -+ -+ get_right_neighbor() -+ -+ get_parent() -+ -+ get_first_child() -+ -+ get_last_child() -+ -+ various routines to walk the whole tree and do things to it like -+ repack it, or move it to tertiary storage. Please make them as -+ generic as is reasonable. 
-+ -+*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "jnode.h" -+#include "znode.h" -+#include "tree_walk.h" -+#include "tree.h" -+#include "super.h" -+ -+/* These macros are used internally in tree_walk.c in attempt to make -+ lock_neighbor() code usable to build lock_parent(), lock_right_neighbor, -+ lock_left_neighbor */ -+#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off))) -+#define FIELD_OFFSET(name) offsetof(znode, name) -+#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node) -+#define LEFT_PTR_OFFSET FIELD_OFFSET(left) -+#define RIGHT_PTR_OFFSET FIELD_OFFSET(right) -+ -+/* This is the generic procedure to get and lock `generic' neighbor (left or -+ right neighbor or parent). It implements common algorithm for all cases of -+ getting lock on neighbor node, only znode structure field is different in -+ each case. This is parameterized by ptr_offset argument, which is byte -+ offset for the pointer to the desired neighbor within the current node's -+ znode structure. 
This function should be called with the tree lock held */ -+static int lock_neighbor( -+ /* resulting lock handle */ -+ lock_handle * result, -+ /* znode to lock */ -+ znode * node, -+ /* pointer to neighbor (or parent) znode field offset, in bytes from -+ the base address of znode structure */ -+ int ptr_offset, -+ /* lock mode for longterm_lock_znode call */ -+ znode_lock_mode mode, -+ /* lock request for longterm_lock_znode call */ -+ znode_lock_request req, -+ /* GN_* flags */ -+ int flags, int rlocked) -+{ -+ reiser4_tree *tree = znode_get_tree(node); -+ znode *neighbor; -+ int ret; -+ -+ assert("umka-236", node != NULL); -+ assert("umka-237", tree != NULL); -+ assert_rw_locked(&(tree->tree_lock)); -+ -+ if (flags & GN_TRY_LOCK) -+ req |= ZNODE_LOCK_NONBLOCK; -+ if (flags & GN_SAME_ATOM) -+ req |= ZNODE_LOCK_DONT_FUSE; -+ -+ /* get neighbor's address by using of sibling link, quit while loop -+ (and return) if link is not available. */ -+ while (1) { -+ neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset); -+ -+ /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if -+ * node pointed by it is not connected. -+ * -+ * However, GN_ALLOW_NOT_CONNECTED option masks "connected" -+ * check and allows passing reference to not connected znode to -+ * subsequent longterm_lock_znode() call. This kills possible -+ * busy loop if we are trying to get longterm lock on locked but -+ * not yet connected parent node. */ -+ if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED) -+ || znode_is_connected(neighbor))) { -+ return RETERR(-E_NO_NEIGHBOR); -+ } -+ -+ /* protect it from deletion. */ -+ zref(neighbor); -+ -+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree); -+ -+ ret = longterm_lock_znode(result, neighbor, mode, req); -+ -+ /* The lock handle obtains its own reference, release the one from above. */ -+ zput(neighbor); -+ -+ rlocked ? 
read_lock_tree(tree) : write_lock_tree(tree); -+ -+ /* restart if node we got reference to is being -+ invalidated. we should not get reference to this node -+ again. */ -+ if (ret == -EINVAL) -+ continue; -+ if (ret) -+ return ret; -+ -+ /* check if neighbor link still points to just locked znode; -+ the link could have been changed while the process slept. */ -+ if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset)) -+ return 0; -+ -+ /* znode was locked by mistake; unlock it and restart locking -+ process from beginning. */ -+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree); -+ longterm_unlock_znode(result); -+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree); -+ } -+} -+ -+/* get parent node with longterm lock, accepts GN* flags. */ -+int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ , -+ znode * node /* child node */ , -+ znode_lock_mode mode -+ /* type of lock: read or write */ , -+ int flags /* GN_* flags */ ) -+{ -+ int result; -+ -+ read_lock_tree(znode_get_tree(node)); -+ result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode, -+ ZNODE_LOCK_HIPRI, flags, 1); -+ read_unlock_tree(znode_get_tree(node)); -+ return result; -+} -+ -+/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT -+ bit in @flags parameter */ -+/* Audited by: umka (2002.06.14) */ -+static inline int -+lock_side_neighbor(lock_handle * result, -+ znode * node, znode_lock_mode mode, int flags, int rlocked) -+{ -+ int ret; -+ int ptr_offset; -+ znode_lock_request req; -+ -+ if (flags & GN_GO_LEFT) { -+ ptr_offset = LEFT_PTR_OFFSET; -+ req = ZNODE_LOCK_LOPRI; -+ } else { -+ ptr_offset = RIGHT_PTR_OFFSET; -+ req = ZNODE_LOCK_HIPRI; -+ } -+ -+ ret = -+ lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked); -+ -+ if (ret == -E_NO_NEIGHBOR) /* if we walk left or right -E_NO_NEIGHBOR does not -+ * guarantee that neighbor is absent in the -+ * tree; in this case we return -ENOENT -- -+ * means neighbor at least not found 
in -+ * cache */ -+ return RETERR(-ENOENT); -+ -+ return ret; -+} -+ -+#if REISER4_DEBUG -+ -+int check_sibling_list(znode * node) -+{ -+ znode *scan; -+ znode *next; -+ -+ assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree)); -+ -+ if (node == NULL) -+ return 1; -+ -+ if (ZF_ISSET(node, JNODE_RIP)) -+ return 1; -+ -+ assert("nikita-3270", node != NULL); -+ assert_rw_write_locked(&(znode_get_tree(node)->tree_lock)); -+ -+ for (scan = node; znode_is_left_connected(scan); scan = next) { -+ next = scan->left; -+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) { -+ assert("nikita-3271", znode_is_right_connected(next)); -+ assert("nikita-3272", next->right == scan); -+ } else -+ break; -+ } -+ for (scan = node; znode_is_right_connected(scan); scan = next) { -+ next = scan->right; -+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) { -+ assert("nikita-3273", znode_is_left_connected(next)); -+ assert("nikita-3274", next->left == scan); -+ } else -+ break; -+ } -+ return 1; -+} -+ -+#endif -+ -+/* Znode sibling pointers maintenence. */ -+ -+/* Znode sibling pointers are established between any neighbored nodes which are -+ in cache. There are two znode state bits (JNODE_LEFT_CONNECTED, -+ JNODE_RIGHT_CONNECTED), if left or right sibling pointer contains actual -+ value (even NULL), corresponded JNODE_*_CONNECTED bit is set. -+ -+ Reiser4 tree operations which may allocate new znodes (CBK, tree balancing) -+ take care about searching (hash table lookup may be required) of znode -+ neighbors, establishing sibling pointers between them and setting -+ JNODE_*_CONNECTED state bits. */ -+ -+/* adjusting of sibling pointers and `connected' states for two -+ neighbors; works if one neighbor is NULL (was not found). 
*/ -+ -+/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */ -+void link_left_and_right(znode * left, znode * right) -+{ -+ assert("nikita-3275", check_sibling_list(left)); -+ assert("nikita-3275", check_sibling_list(right)); -+ -+ if (left != NULL) { -+ if (left->right == NULL) { -+ left->right = right; -+ ZF_SET(left, JNODE_RIGHT_CONNECTED); -+ -+ ON_DEBUG(left->right_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ -+ } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE) -+ && left->right != right) { -+ -+ ON_DEBUG(left->right->left_version = -+ atomic_inc_return(&delim_key_version); -+ left->right_version = -+ atomic_inc_return(&delim_key_version);); -+ -+ left->right->left = NULL; -+ left->right = right; -+ ZF_SET(left, JNODE_RIGHT_CONNECTED); -+ } else -+ /* -+ * there is a race condition in renew_sibling_link() -+ * and assertions below check that it is only one -+ * there. Thread T1 calls renew_sibling_link() without -+ * GN_NO_ALLOC flag. zlook() doesn't find neighbor -+ * node, but before T1 gets to the -+ * link_left_and_right(), another thread T2 creates -+ * neighbor node and connects it. check for -+ * left->right == NULL above protects T1 from -+ * overwriting correct left->right pointer installed -+ * by T2. 
-+ */ -+ assert("nikita-3302", -+ right == NULL || left->right == right); -+ } -+ if (right != NULL) { -+ if (right->left == NULL) { -+ right->left = left; -+ ZF_SET(right, JNODE_LEFT_CONNECTED); -+ -+ ON_DEBUG(right->left_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ -+ } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE) -+ && right->left != left) { -+ -+ ON_DEBUG(right->left->right_version = -+ atomic_inc_return(&delim_key_version); -+ right->left_version = -+ atomic_inc_return(&delim_key_version);); -+ -+ right->left->right = NULL; -+ right->left = left; -+ ZF_SET(right, JNODE_LEFT_CONNECTED); -+ -+ } else -+ assert("nikita-3303", -+ left == NULL || right->left == left); -+ } -+ assert("nikita-3275", check_sibling_list(left)); -+ assert("nikita-3275", check_sibling_list(right)); -+} -+ -+/* Audited by: umka (2002.06.14) */ -+static void link_znodes(znode * first, znode * second, int to_left) -+{ -+ if (to_left) -+ link_left_and_right(second, first); -+ else -+ link_left_and_right(first, second); -+} -+ -+/* getting of next (to left or to right, depend on gn_to_left bit in flags) -+ coord's unit position in horizontal direction, even across node -+ boundary. Should be called under tree lock, it protects nonexistence of -+ sibling link on parent level, if lock_side_neighbor() fails with -+ -ENOENT. */ -+static int far_next_coord(coord_t * coord, lock_handle * handle, int flags) -+{ -+ int ret; -+ znode *node; -+ reiser4_tree *tree; -+ -+ assert("umka-243", coord != NULL); -+ assert("umka-244", handle != NULL); -+ assert("zam-1069", handle->node == NULL); -+ -+ ret = -+ (flags & GN_GO_LEFT) ? 
coord_prev_unit(coord) : -+ coord_next_unit(coord); -+ if (!ret) -+ return 0; -+ -+ ret = -+ lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0); -+ if (ret) -+ return ret; -+ -+ node = handle->node; -+ tree = znode_get_tree(node); -+ write_unlock_tree(tree); -+ -+ coord_init_zero(coord); -+ -+ /* We avoid synchronous read here if it is specified by flag. */ -+ if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) { -+ ret = jstartio(ZJNODE(handle->node)); -+ if (!ret) -+ ret = -E_REPEAT; -+ goto error_locked; -+ } -+ -+ /* corresponded zrelse() should be called by the clients of -+ far_next_coord(), in place when this node gets unlocked. */ -+ ret = zload(handle->node); -+ if (ret) -+ goto error_locked; -+ -+ if (flags & GN_GO_LEFT) -+ coord_init_last_unit(coord, node); -+ else -+ coord_init_first_unit(coord, node); -+ -+ if (0) { -+ error_locked: -+ longterm_unlock_znode(handle); -+ } -+ write_lock_tree(tree); -+ return ret; -+} -+ -+/* Very significant function which performs a step in horizontal direction -+ when sibling pointer is not available. Actually, it is only function which -+ does it. 
-+ Note: this function does not restore locking status at exit, -+ caller should does care about proper unlocking and zrelsing */ -+static int -+renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child, -+ tree_level level, int flags, int *nr_locked) -+{ -+ int ret; -+ int to_left = flags & GN_GO_LEFT; -+ reiser4_block_nr da; -+ /* parent of the neighbor node; we set it to parent until not sharing -+ of one parent between child and neighbor node is detected */ -+ znode *side_parent = coord->node; -+ reiser4_tree *tree = znode_get_tree(child); -+ znode *neighbor = NULL; -+ -+ assert("umka-245", coord != NULL); -+ assert("umka-246", handle != NULL); -+ assert("umka-247", child != NULL); -+ assert("umka-303", tree != NULL); -+ -+ init_lh(handle); -+ write_lock_tree(tree); -+ ret = far_next_coord(coord, handle, flags); -+ -+ if (ret) { -+ if (ret != -ENOENT) { -+ write_unlock_tree(tree); -+ return ret; -+ } -+ } else { -+ item_plugin *iplug; -+ -+ if (handle->node != NULL) { -+ (*nr_locked)++; -+ side_parent = handle->node; -+ } -+ -+ /* does coord object points to internal item? We do not -+ support sibling pointers between znode for formatted and -+ unformatted nodes and return -E_NO_NEIGHBOR in that case. 
*/ -+ iplug = item_plugin_by_coord(coord); -+ if (!item_is_internal(coord)) { -+ link_znodes(child, NULL, to_left); -+ write_unlock_tree(tree); -+ /* we know there can't be formatted neighbor */ -+ return RETERR(-E_NO_NEIGHBOR); -+ } -+ write_unlock_tree(tree); -+ -+ iplug->s.internal.down_link(coord, NULL, &da); -+ -+ if (flags & GN_NO_ALLOC) { -+ neighbor = zlook(tree, &da); -+ } else { -+ neighbor = -+ zget(tree, &da, side_parent, level, -+ reiser4_ctx_gfp_mask_get()); -+ } -+ -+ if (IS_ERR(neighbor)) { -+ ret = PTR_ERR(neighbor); -+ return ret; -+ } -+ -+ if (neighbor) -+ /* update delimiting keys */ -+ set_child_delimiting_keys(coord->node, coord, neighbor); -+ -+ write_lock_tree(tree); -+ } -+ -+ if (likely(neighbor == NULL || -+ (znode_get_level(child) == znode_get_level(neighbor) -+ && child != neighbor))) -+ link_znodes(child, neighbor, to_left); -+ else { -+ warning("nikita-3532", -+ "Sibling nodes on the different levels: %i != %i\n", -+ znode_get_level(child), znode_get_level(neighbor)); -+ ret = RETERR(-EIO); -+ } -+ -+ write_unlock_tree(tree); -+ -+ /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */ -+ if (neighbor != NULL && (flags & GN_NO_ALLOC)) -+ /* atomic_dec(&ZJNODE(neighbor)->x_count); */ -+ zput(neighbor); -+ -+ return ret; -+} -+ -+/* This function is for establishing of one side relation. 
*/ -+/* Audited by: umka (2002.06.14) */ -+static int connect_one_side(coord_t * coord, znode * node, int flags) -+{ -+ coord_t local; -+ lock_handle handle; -+ int nr_locked; -+ int ret; -+ -+ assert("umka-248", coord != NULL); -+ assert("umka-249", node != NULL); -+ -+ coord_dup_nocheck(&local, coord); -+ -+ init_lh(&handle); -+ -+ ret = -+ renew_sibling_link(&local, &handle, node, znode_get_level(node), -+ flags | GN_NO_ALLOC, &nr_locked); -+ -+ if (handle.node != NULL) { -+ /* complementary operations for zload() and lock() in far_next_coord() */ -+ zrelse(handle.node); -+ longterm_unlock_znode(&handle); -+ } -+ -+ /* we catch error codes which are not interesting for us because we -+ run renew_sibling_link() only for znode connection. */ -+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR) -+ return 0; -+ -+ return ret; -+} -+ -+/* if @child is not in `connected' state, performs hash searches for left and -+ right neighbor nodes and establishes horizontal sibling links */ -+/* Audited by: umka (2002.06.14), umka (2002.06.15) */ -+int connect_znode(coord_t * parent_coord, znode * child) -+{ -+ reiser4_tree *tree = znode_get_tree(child); -+ int ret = 0; -+ -+ assert("zam-330", parent_coord != NULL); -+ assert("zam-331", child != NULL); -+ assert("zam-332", parent_coord->node != NULL); -+ assert("umka-305", tree != NULL); -+ -+ /* it is trivial to `connect' root znode because it can't have -+ neighbors */ -+ if (znode_above_root(parent_coord->node)) { -+ child->left = NULL; -+ child->right = NULL; -+ ZF_SET(child, JNODE_LEFT_CONNECTED); -+ ZF_SET(child, JNODE_RIGHT_CONNECTED); -+ -+ ON_DEBUG(child->left_version = -+ atomic_inc_return(&delim_key_version); -+ child->right_version = -+ atomic_inc_return(&delim_key_version);); -+ -+ return 0; -+ } -+ -+ /* load parent node */ -+ coord_clear_iplug(parent_coord); -+ ret = zload(parent_coord->node); -+ -+ if (ret != 0) -+ return ret; -+ -+ /* protect `connected' state check by tree_lock */ -+ read_lock_tree(tree); -+ -+ if 
(!znode_is_right_connected(child)) { -+ read_unlock_tree(tree); -+ /* connect right (default is right) */ -+ ret = connect_one_side(parent_coord, child, GN_NO_ALLOC); -+ if (ret) -+ goto zrelse_and_ret; -+ -+ read_lock_tree(tree); -+ } -+ -+ ret = znode_is_left_connected(child); -+ -+ read_unlock_tree(tree); -+ -+ if (!ret) { -+ ret = -+ connect_one_side(parent_coord, child, -+ GN_NO_ALLOC | GN_GO_LEFT); -+ } else -+ ret = 0; -+ -+ zrelse_and_ret: -+ zrelse(parent_coord->node); -+ -+ return ret; -+} -+ -+/* this function is like renew_sibling_link() but allocates neighbor node if -+ it doesn't exist and `connects' it. It may require making two steps in -+ horizontal direction, first one for neighbor node finding/allocation, -+ second one is for finding neighbor of neighbor to connect freshly allocated -+ znode. */ -+/* Audited by: umka (2002.06.14), umka (2002.06.15) */ -+static int -+renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags) -+{ -+ coord_t local; -+ lock_handle empty[2]; -+ reiser4_tree *tree = znode_get_tree(node); -+ znode *neighbor = NULL; -+ int nr_locked = 0; -+ int ret; -+ -+ assert("umka-250", coord != NULL); -+ assert("umka-251", node != NULL); -+ assert("umka-307", tree != NULL); -+ assert("umka-308", level <= tree->height); -+ -+ /* umka (2002.06.14) -+ Here probably should be a check for given "level" validness. -+ Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT); -+ */ -+ -+ coord_dup(&local, coord); -+ -+ ret = -+ renew_sibling_link(&local, &empty[0], node, level, -+ flags & ~GN_NO_ALLOC, &nr_locked); -+ if (ret) -+ goto out; -+ -+ /* tree lock is not needed here because we keep parent node(s) locked -+ and reference to neighbor znode incremented */ -+ neighbor = (flags & GN_GO_LEFT) ? 
node->left : node->right; -+ -+ read_lock_tree(tree); -+ ret = znode_is_connected(neighbor); -+ read_unlock_tree(tree); -+ if (ret) { -+ ret = 0; -+ goto out; -+ } -+ -+ ret = -+ renew_sibling_link(&local, &empty[nr_locked], neighbor, level, -+ flags | GN_NO_ALLOC, &nr_locked); -+ /* second renew_sibling_link() call is used for znode connection only, -+ so we can live with these errors */ -+ if (-ENOENT == ret || -E_NO_NEIGHBOR == ret) -+ ret = 0; -+ -+ out: -+ -+ for (--nr_locked; nr_locked >= 0; --nr_locked) { -+ zrelse(empty[nr_locked].node); -+ longterm_unlock_znode(&empty[nr_locked]); -+ } -+ -+ if (neighbor != NULL) -+ /* decrement znode reference counter without actually -+ releasing it. */ -+ atomic_dec(&ZJNODE(neighbor)->x_count); -+ -+ return ret; -+} -+ -+/* -+ reiser4_get_neighbor() -- lock node's neighbor. -+ -+ reiser4_get_neighbor() locks node's neighbor (left or right one, depends on -+ given parameter) using sibling link to it. If sibling link is not available -+ (i.e. neighbor znode is not in cache) and flags allow read blocks, we go one -+ level up for information about neighbor's disk address. We lock node's -+ parent, if it is common parent for both 'node' and its neighbor, neighbor's -+ disk address is in next (to left or to right) down link from link that points -+ to original node. If not, we need to lock parent's neighbor, read its content -+ and take first(last) downlink with neighbor's disk address. That locking -+ could be done by using sibling link and lock_neighbor() function, if sibling -+ link exists. In another case we have to go level up again until we find -+ common parent or valid sibling link. Then go down -+ allocating/connecting/locking/reading nodes until neighbor of first one is -+ locked. -+ -+ @neighbor: result lock handle, -+ @node: a node which we lock neighbor of, -+ @lock_mode: lock mode {LM_READ, LM_WRITE}, -+ @flags: logical OR of {GN_*} (see description above) subset. 
-+ -+ @return: 0 if success, negative value if lock was impossible due to an error -+ or lack of neighbor node. -+*/ -+ -+/* Audited by: umka (2002.06.14), umka (2002.06.15) */ -+int -+reiser4_get_neighbor(lock_handle * neighbor, znode * node, -+ znode_lock_mode lock_mode, int flags) -+{ -+ reiser4_tree *tree = znode_get_tree(node); -+ lock_handle path[REAL_MAX_ZTREE_HEIGHT]; -+ -+ coord_t coord; -+ -+ tree_level base_level; -+ tree_level h = 0; -+ int ret; -+ -+ assert("umka-252", tree != NULL); -+ assert("umka-253", neighbor != NULL); -+ assert("umka-254", node != NULL); -+ -+ base_level = znode_get_level(node); -+ -+ assert("umka-310", base_level <= tree->height); -+ -+ coord_init_zero(&coord); -+ -+ again: -+ /* first, we try to use simple lock_neighbor() which requires sibling -+ link existence */ -+ read_lock_tree(tree); -+ ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1); -+ read_unlock_tree(tree); -+ if (!ret) { -+ /* load znode content if it was specified */ -+ if (flags & GN_LOAD_NEIGHBOR) { -+ ret = zload(node); -+ if (ret) -+ longterm_unlock_znode(neighbor); -+ } -+ return ret; -+ } -+ -+ /* only -ENOENT means we may look upward and try to connect -+ @node with its neighbor (if @flags allow us to do it) */ -+ if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS)) -+ return ret; -+ -+ /* before establishing of sibling link we lock parent node; it is -+ required by renew_neighbor() to work. */ -+ init_lh(&path[0]); -+ ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK); -+ if (ret) -+ return ret; -+ if (znode_above_root(path[0].node)) { -+ longterm_unlock_znode(&path[0]); -+ return RETERR(-E_NO_NEIGHBOR); -+ } -+ -+ while (1) { -+ znode *child = (h == 0) ? 
node : path[h - 1].node; -+ znode *parent = path[h].node; -+ -+ ret = zload(parent); -+ if (ret) -+ break; -+ -+ ret = find_child_ptr(parent, child, &coord); -+ -+ if (ret) { -+ zrelse(parent); -+ break; -+ } -+ -+ /* try to establish missing sibling link */ -+ ret = renew_neighbor(&coord, child, h + base_level, flags); -+ -+ zrelse(parent); -+ -+ switch (ret) { -+ case 0: -+ /* unlocking of parent znode prevents simple -+ deadlock situation */ -+ done_lh(&path[h]); -+ -+ /* depend on tree level we stay on we repeat first -+ locking attempt ... */ -+ if (h == 0) -+ goto again; -+ -+ /* ... or repeat establishing of sibling link at -+ one level below. */ -+ --h; -+ break; -+ -+ case -ENOENT: -+ /* sibling link is not available -- we go -+ upward. */ -+ init_lh(&path[h + 1]); -+ ret = -+ reiser4_get_parent(&path[h + 1], parent, -+ ZNODE_READ_LOCK); -+ if (ret) -+ goto fail; -+ ++h; -+ if (znode_above_root(path[h].node)) { -+ ret = RETERR(-E_NO_NEIGHBOR); -+ goto fail; -+ } -+ break; -+ -+ case -E_DEADLOCK: -+ /* there was lock request from hi-pri locker. if -+ it is possible we unlock last parent node and -+ re-lock it again. */ -+ for (; reiser4_check_deadlock(); h--) { -+ done_lh(&path[h]); -+ if (h == 0) -+ goto fail; -+ } -+ -+ break; -+ -+ default: /* other errors. 
*/ -+ goto fail; -+ } -+ } -+ fail: -+ ON_DEBUG(check_lock_node_data(node)); -+ ON_DEBUG(check_lock_data()); -+ -+ /* unlock path */ -+ do { -+ /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto -+ fail; path[0] is already done_lh-ed, therefore -+ longterm_unlock_znode(&path[h]); is not applicable */ -+ done_lh(&path[h]); -+ --h; -+ } while (h + 1 != 0); -+ -+ return ret; -+} -+ -+/* remove node from sibling list */ -+/* Audited by: umka (2002.06.14) */ -+void sibling_list_remove(znode * node) -+{ -+ reiser4_tree *tree; -+ -+ tree = znode_get_tree(node); -+ assert("umka-255", node != NULL); -+ assert_rw_write_locked(&(tree->tree_lock)); -+ assert("nikita-3275", check_sibling_list(node)); -+ -+ write_lock_dk(tree); -+ if (znode_is_right_connected(node) && node->right != NULL && -+ znode_is_left_connected(node) && node->left != NULL) { -+ assert("zam-32245", -+ keyeq(znode_get_rd_key(node), -+ znode_get_ld_key(node->right))); -+ znode_set_rd_key(node->left, znode_get_ld_key(node->right)); -+ } -+ write_unlock_dk(tree); -+ -+ if (znode_is_right_connected(node) && node->right != NULL) { -+ assert("zam-322", znode_is_left_connected(node->right)); -+ node->right->left = node->left; -+ ON_DEBUG(node->right->left_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ if (znode_is_left_connected(node) && node->left != NULL) { -+ assert("zam-323", znode_is_right_connected(node->left)); -+ node->left->right = node->right; -+ ON_DEBUG(node->left->right_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ -+ ZF_CLR(node, JNODE_LEFT_CONNECTED); -+ ZF_CLR(node, JNODE_RIGHT_CONNECTED); -+ ON_DEBUG(node->left = node->right = NULL; -+ node->left_version = atomic_inc_return(&delim_key_version); -+ node->right_version = atomic_inc_return(&delim_key_version);); -+ assert("nikita-3276", check_sibling_list(node)); -+} -+ -+/* disconnect node from sibling list */ -+void sibling_list_drop(znode * node) -+{ -+ znode *right; -+ znode *left; -+ -+ 
assert("nikita-2464", node != NULL); -+ assert("nikita-3277", check_sibling_list(node)); -+ -+ right = node->right; -+ if (right != NULL) { -+ assert("nikita-2465", znode_is_left_connected(right)); -+ right->left = NULL; -+ ON_DEBUG(right->left_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ left = node->left; -+ if (left != NULL) { -+ assert("zam-323", znode_is_right_connected(left)); -+ left->right = NULL; -+ ON_DEBUG(left->right_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ ZF_CLR(node, JNODE_LEFT_CONNECTED); -+ ZF_CLR(node, JNODE_RIGHT_CONNECTED); -+ ON_DEBUG(node->left = node->right = NULL; -+ node->left_version = atomic_inc_return(&delim_key_version); -+ node->right_version = atomic_inc_return(&delim_key_version);); -+} -+ -+/* Insert new node into sibling list. Regular balancing inserts new node -+ after (at right side) existing and locked node (@before), except one case -+ of adding new tree root node. @before should be NULL in that case. */ -+void sibling_list_insert_nolock(znode * new, znode * before) -+{ -+ assert("zam-334", new != NULL); -+ assert("nikita-3298", !znode_is_left_connected(new)); -+ assert("nikita-3299", !znode_is_right_connected(new)); -+ assert("nikita-3300", new->left == NULL); -+ assert("nikita-3301", new->right == NULL); -+ assert("nikita-3278", check_sibling_list(new)); -+ assert("nikita-3279", check_sibling_list(before)); -+ -+ if (before != NULL) { -+ assert("zam-333", znode_is_connected(before)); -+ new->right = before->right; -+ new->left = before; -+ ON_DEBUG(new->right_version = -+ atomic_inc_return(&delim_key_version); -+ new->left_version = -+ atomic_inc_return(&delim_key_version);); -+ if (before->right != NULL) { -+ before->right->left = new; -+ ON_DEBUG(before->right->left_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ before->right = new; -+ ON_DEBUG(before->right_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } else { -+ new->right = NULL; -+ 
new->left = NULL; -+ ON_DEBUG(new->right_version = -+ atomic_inc_return(&delim_key_version); -+ new->left_version = -+ atomic_inc_return(&delim_key_version);); -+ } -+ ZF_SET(new, JNODE_LEFT_CONNECTED); -+ ZF_SET(new, JNODE_RIGHT_CONNECTED); -+ assert("nikita-3280", check_sibling_list(new)); -+ assert("nikita-3281", check_sibling_list(before)); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/tree_walk.h linux-2.6.20/fs/reiser4/tree_walk.h ---- linux-2.6.20.orig/fs/reiser4/tree_walk.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/tree_walk.h 2007-05-06 14:50:43.887034467 +0400 -@@ -0,0 +1,125 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* definitions of reiser4 tree walk functions */ -+ -+#ifndef __FS_REISER4_TREE_WALK_H__ -+#define __FS_REISER4_TREE_WALK_H__ -+ -+#include "debug.h" -+#include "forward.h" -+ -+/* establishes horizontal links between cached znodes */ -+int connect_znode(coord_t * coord, znode * node); -+ -+/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor()) -+ have the following common arguments: -+ -+ return codes: -+ -+ @return : 0 - OK, -+ -+ZAM-FIXME-HANS: wrong return code name. Change them all. -+ -ENOENT - neighbor is not in cache, what is detected by sibling -+ link absence. -+ -+ -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be -+ found (because we are left-/right- most node of the -+ tree, for example). Also, this return code is for -+ reiser4_get_parent() when we see no parent link -- it -+ means that our node is root node. -+ -+ -E_DEADLOCK - deadlock detected (request from high-priority process -+ received), other error codes are conformed to -+ /usr/include/asm/errno.h . 
-+*/ -+/* Variant of reiser4_get_parent() that takes an explicit @flags mask built from the GN_* bits defined below. Only declared here; presumably it locks the parent of @node into @result in @mode -- confirm at the definition. */ -+int -+reiser4_get_parent_flags(lock_handle * result, znode * node, -+ znode_lock_mode mode, int flags); -+ -+/* Bit definitions for the `flags' argument of reiser4_get_neighbor(). The bits -+ are combined with bitwise OR; reiser4_get_parent_flags() is passed values -+ from the same set. */ -+typedef enum { -+ /* If sibling pointer is NULL, this flag allows get_neighbor() to try to -+ * find a not yet allocated, not connected neighbor by going through upper -+ * levels */ -+ GN_CAN_USE_UPPER_LEVELS = 0x1, -+ /* locking left neighbor instead of right one */ -+ GN_GO_LEFT = 0x2, -+ /* automatically load neighbor node content */ -+ GN_LOAD_NEIGHBOR = 0x4, -+ /* return -E_REPEAT if can't lock */ -+ GN_TRY_LOCK = 0x8, -+ /* used internally in tree_walk.c, causes renew_sibling to not -+ allocate neighbor znode, but only search for it in znode cache */ -+ GN_NO_ALLOC = 0x10, -+ /* do not go across atom boundaries */ -+ GN_SAME_ATOM = 0x20, -+ /* allow to lock not connected nodes */ -+ GN_ALLOW_NOT_CONNECTED = 0x40, -+ /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */ -+ GN_ASYNC = 0x80 -+} znode_get_neigbor_flags; -+ -+/* A commonly used wrapper for reiser4_get_parent_flags().
*/ -+static inline int reiser4_get_parent(lock_handle * result, znode * node, -+ znode_lock_mode mode) -+{ /* same call as reiser4_get_parent_flags() with GN_ALLOW_NOT_CONNECTED as the only flag */ -+ return reiser4_get_parent_flags(result, node, mode, -+ GN_ALLOW_NOT_CONNECTED); -+} -+/* Neighbor lookup driven by the GN_* bits in @flags (GN_GO_LEFT selects the left neighbor). Only declared here; presumably the neighbor is locked into @neighbor in @lock_mode -- confirm at the definition. */ -+int reiser4_get_neighbor(lock_handle * neighbor, znode * node, -+ znode_lock_mode lock_mode, int flags); -+ -+/* These are wrappers for the most common usages of reiser4_get_neighbor(): the -+ left variant forces GN_GO_LEFT on, the right variant forces it off. -+ NOTE(review): both take @lock_mode as plain int although -+ reiser4_get_neighbor() is declared with znode_lock_mode. */ -+static inline int -+reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode, -+ int flags) -+{ -+ return reiser4_get_neighbor(result, node, lock_mode, -+ flags | GN_GO_LEFT); -+} -+/* Right-hand wrapper; additionally calls check_lock_node_data() and check_lock_data() through ON_DEBUG before the lookup. */ -+static inline int -+reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode, -+ int flags) -+{ -+ ON_DEBUG(check_lock_node_data(node)); -+ ON_DEBUG(check_lock_data()); -+ return reiser4_get_neighbor(result, node, lock_mode, -+ flags & (~GN_GO_LEFT)); -+} -+ -+extern void sibling_list_remove(znode * node); -+extern void sibling_list_drop(znode * node); -+extern void sibling_list_insert_nolock(znode * new, znode * before); -+extern void link_left_and_right(znode * left, znode * right); -+ -+/* Functions called by tree_walk() when tree_walk() ... */ -+struct tree_walk_actor { -+ /* ... meets a formatted node, */ -+ int (*process_znode) (tap_t *, void *); -+ /* ... meets an extent, */ -+ int (*process_extent) (tap_t *, void *); -+ /* ... begins tree traversal or repeats it after -E_REPEAT was returned by -+ * node or extent processing functions.
*/ -+ int (*before) (void *); -+}; -+/* Sibling list consistency check: a real function only when REISER4_DEBUG is set; otherwise a macro that always evaluates to 1, so assert(check_sibling_list(x)) stays valid in every build. */ -+#if REISER4_DEBUG -+int check_sibling_list(znode * node); -+#else -+#define check_sibling_list(n) (1) -+#endif -+ -+#endif /* __FS_REISER4_TREE_WALK_H__ */ -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/txnmgr.c linux-2.6.20/fs/reiser4/txnmgr.c ---- linux-2.6.20.orig/fs/reiser4/txnmgr.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/txnmgr.c 2007-05-06 14:50:43.895036966 +0400 -@@ -0,0 +1,3164 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Joshua MacDonald wrote the first draft of this code. */ -+ -+/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a -+filesystem scales only as well as its worst locking design. You need to -+substantially restructure this code. Josh was not as experienced a programmer -+as you. Particularly review how the locking style differs from what you did -+for znodes using hi-lo priority locking, and present to me an opinion on -+whether the differences are well founded. */ -+ -+/* I cannot help but disagree with the sentiment above. Locking of the -+ * transaction manager is _not_ badly designed, and, at the very least, is not -+ * the scaling bottleneck. The scaling bottleneck is _exactly_ hi-lo priority -+ * locking on znodes, especially on the root node of the tree. --nikita, -+ * 2003.10.13 */ -+ -+/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The -+ txnmgr processes capture_block requests and manages the relationship between jnodes and -+ atoms through the various stages of a transcrash, and it also oversees the fusion and -+ capture-on-copy processes. The main difficulty with this task is maintaining a -+ deadlock-free lock ordering between atoms and jnodes/handles.
The reason for the -+ difficulty is that jnodes, handles, and atoms contain pointer cycles, and the cycle -+ must be broken. The main requirement is that atom-fusion be deadlock free, so once you -+ hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies -+ that any time you check the atom-pointer of a jnode or handle and then try to lock that -+ atom, you must use trylock() and possibly reverse the order. -+ -+ This code implements the design documented at: -+ -+ http://namesys.com/txn-doc.html -+ -+ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the -+above document and reference the new one. Be sure to provide some credit to Josh. I already have some writings on this -+topic in v4.html, but they are lacking the details present in the above. Cure that. Remember to write for the bright 12 -+year old --- define all technical terms used. -+ -+*/ -+ -+/* Thoughts on the external transaction interface: -+ -+ In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which -+ creates state that lasts for the duration of a system call and is called at the start -+ of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(), -+ occupying the scope of a single system call. We wish to give certain applications an -+ interface to begin and close (commit) transactions. Since our implementation of -+ transactions does not yet support isolation, allowing an application to open a -+ transaction implies trusting it to later close the transaction. Part of the -+ transaction interface will be aimed at enabling that trust, but the interface for -+ actually using transactions is fairly narrow. -+ -+ BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate -+ this identifier into a string that a shell-script could use, allowing you to start a -+ transaction by issuing a command.
Once open, the transcrash should be set in the task -+ structure, and there should be options (I suppose) to allow it to be carried across -+ fork/exec. A transcrash has several options: -+ -+ - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only -+ on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to -+ capture on reads as well, it should set READ_FUSING. -+ -+ - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must -+ eventually close (or else the machine must crash). If the application dies an -+ unexpected death with an open transcrash, for example, or if it hangs for a long -+ duration, one solution (to avoid crashing the machine) is to simply close it anyway. -+ This is a dangerous option, but it is one way to solve the problem until isolated -+ transcrashes are available for untrusted applications. -+ -+ It seems to be what databases do, though it is unclear how one avoids a DoS attack -+ creating a vulnerability based on resource starvation. Guaranteeing that some -+ minimum amount of computational resources is made available would seem more correct -+ than guaranteeing some amount of time. When we again have someone to code the work, -+ this issue should be considered carefully. -Hans -+ -+ RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how -+ many dirty blocks it expects. The reserve_blocks interface should be called at a point -+ where it is safe for the application to fail, because the system may not be able to -+ grant the allocation and the application must be able to back out. For this reason, -+ the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but -+ the application may also wish to extend the allocation after beginning its transcrash. -+ -+ CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making -+ modifications that require transaction protection.
When isolated transactions are -+ supported, the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a -+ RESERVE_BLOCKS call fails for the application, it should "abort" by calling -+ CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is -+ why, for safety, the application should call RESERVE_BLOCKS before making any changes). -+ -+ For actually implementing these out-of-system-call-scoped transcrashes, the -+ reiser4_context has a "txn_handle *trans" pointer that may be set to an open -+ transcrash. Currently there are no dynamically-allocated transcrashes, but there is a -+ "struct kmem_cache *_txnh_slab" created for that purpose in this file. -+*/ -+ -+/* Extending the other system call interfaces for future transaction features: -+ -+ Specialized applications may benefit from passing flags to the ordinary system call -+ interface such as read(), write(), or stat(). For example, the application specifies -+ WRITE_FUSING by default but wishes to add that a certain read() command should be -+ treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data -+ read, or the file-data read? These issues are straightforward, but there are a lot of -+ them and adding the necessary flags-passing code will be tedious. -+ -+ When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW) -+ flag, which specifies that although it is a read operation being requested, a -+ write-lock should be taken. The reason is that read-locks are shared while write-locks -+ are exclusive, so taking a read-lock when a later write is known in advance will often -+ lead to deadlock. If a reader knows it will write later, it should issue read -+ requests with the RMW flag set. -+*/ -+ -+/* -+ The znode/atom deadlock avoidance. -+ -+ FIXME(Zam): writing of this comment is in progress.
-+ -+ The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term -+ locking of atoms, which makes the reiser4 locking scheme more complex. It had -+ deadlocks until we implemented deadlock avoidance algorithms. Those deadlocks -+ looked like the following: one stopped thread waits for a long-term lock on a -+ znode, while the thread that owns that lock waits until fusion with another -+ atom is allowed. -+ -+ The source of the deadlocks is an optimization of not capturing index nodes -+ for read. Let's prove it. Suppose we have a dumb node capturing scheme which -+ unconditionally captures each block before locking it. -+ -+ That scheme has no deadlocks. Let's begin with the thread whose stage is -+ ASTAGE_CAPTURE_WAIT and which waits for a znode lock. The thread can't wait for -+ a capture because its stage allows fusion with any atom except those which are -+ being committed currently. A process of atom commit can't deadlock because -+ the atom commit procedure does not acquire locks and does not fuse with other -+ atoms. Reiser4 does capturing right before going to sleep inside the -+ longterm_lock_znode() function, which means the znode which we want to lock is -+ already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage. If we -+ continue the analysis we understand that no process in the sequence may -+ wait for atom fusion. Thereby there are no deadlocks of the described kind. -+ -+ The capturing optimization makes the deadlocks possible. A thread can wait for -+ a lock whose owner did not capture that node. The lock owner's current atom -+ is not fused with the first atom and it does not get an ASTAGE_CAPTURE_WAIT -+ state. A deadlock is possible when that atom meets another one which is in -+ ASTAGE_CAPTURE_WAIT already. -+ -+ The deadlock avoidance scheme includes two algorithms: -+ -+ The first algorithm is used when a thread captures a node which is locked but -+ not captured by another thread. Those nodes are marked MISSED_IN_CAPTURE at the -+ moment we skip their capturing.
If such a node (marked MISSED_IN_CAPTURE) is -+ being captured by a thread whose current atom is in ASTAGE_CAPTURE_WAIT, the -+ routine which forces all lock owners to join with the current atom is executed. -+ -+ The second algorithm does not allow skipping the capture of already captured -+ nodes. -+ -+ Both algorithms together prevent waiting for a longterm lock without atom fusion -+ with the atoms of all lock owners, which is the key precondition for atom/znode -+ locking deadlocks. -+*/ -+ -+/* -+ * Transactions and mmap(2). -+ * -+ * 1. Transactions are not supported for accesses through mmap(2), because -+ * this would effectively amount to user-level transactions whose duration -+ * is beyond control of the kernel. -+ * -+ * 2. That said, we still want to preserve some decency with regard to -+ * mmap(2). During a normal write(2) call, the following sequence of events -+ * happens: -+ * -+ * 1. page is created; -+ * -+ * 2. jnode is created, dirtied and captured into current atom. -+ * -+ * 3. extent is inserted and modified. -+ * -+ * Steps (2) and (3) take place under long term lock on the twig node. -+ * -+ * When file is accessed through mmap(2) page is always created during -+ * page fault. -+ * After this (in reiser4_readpage()->reiser4_readpage_extent()): -+ * -+ * 1. if access is made to a non-hole page, a new jnode is created (if -+ * necessary) -+ * -+ * 2. if access is made to the hole page, jnode is not created (XXX -+ * not clear why). -+ * -+ * Also, even if page is created by write page fault it is not marked -+ * dirty immediately by handle_mm_fault(). Probably this is to avoid races -+ * with page write-out. -+ * -+ * Dirty bit installed by hardware is only transferred to the struct page -+ * later, when page is unmapped (in zap_pte_range(), or -+ * try_to_unmap_one()). -+ * -+ * So, with mmap(2) we have to handle the following irksome situations: -+ * -+ * 1. there exists modified page (clean or dirty) without jnode -+ * -+ * 2.
there exists modified page (clean or dirty) with clean jnode -+ * -+ * 3. clean page which is a part of atom can be transparently modified -+ * at any moment through mapping without becoming dirty. -+ * -+ * (1) and (2) can lead to the out-of-memory situation: ->writepage() -+ * doesn't know what to do with such pages and ->sync_sb()/->writepages() -+ * don't see them, because these methods operate on atoms. -+ * -+ * (3) can lead to the loss of data: suppose we have dirty page with dirty -+ * captured jnode captured by some atom. As part of early flush (for -+ * example) page was written out. Dirty bit was cleared on both page and -+ * jnode. After this page is modified through mapping, but kernel doesn't -+ * notice and just discards page and jnode as part of commit. (XXX -+ * actually it doesn't, because to reclaim page ->releasepage() has to be -+ * called and before this dirty bit will be transferred to the struct -+ * page). -+ * -+ */ -+ -+#include "debug.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "wander.h" -+#include "ktxnmgrd.h" -+#include "super.h" -+#include "page_cache.h" -+#include "reiser4.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "flush.h" -+/* NOTE(review): the eight bare #include directives below have lost their header names (the angle-bracket part appears to have been stripped when this text was extracted); restore them from the original patch. */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include /* for totalram_pages */ -+/* Forward declarations of file-local (static) helpers that are used before their definitions. */ -+static void atom_free(txn_atom * atom); -+ -+static int commit_txnh(txn_handle * txnh); -+ -+static void wakeup_atom_waitfor_list(txn_atom * atom); -+static void wakeup_atom_waiting_list(txn_atom * atom); -+ -+static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh); -+ -+static void capture_assign_block_nolock(txn_atom * atom, jnode * node); -+ -+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node); -+ -+static int capture_init_fusion(jnode * node, txn_handle * txnh, -+ txn_capture mode); -+ -+static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *,
txn_capture); -+ -+static void capture_fuse_into(txn_atom * small, txn_atom * large); -+ -+void reiser4_invalidate_list(struct list_head *); -+ -+/* GENERIC STRUCTURES */ -+ -+typedef struct _txn_wait_links txn_wait_links; -+/* Wait-link record: a lock_stack pointer, two list links (presumably threaded -+ onto the fwaitfor_list and fwaiting_list of an atom -- confirm where they -+ are used) and two callbacks taking (atom, wlinks) whose contract is not -+ visible in this part of the file. */ -+struct _txn_wait_links { -+ lock_stack *_lock_stack; -+ struct list_head _fwaitfor_link; -+ struct list_head _fwaiting_link; -+ int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks); -+ int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks); -+}; -+ -+/* FIXME: In theory, we should be using the slab cache init & destructor -+ methods instead of, e.g., jnode_init, etc. */ -+static struct kmem_cache *_atom_slab = NULL; /* backs txn_atom objects: see atom_begin_and_assign_to_txnh() and atom_free() */ -+/* this is for user-visible, cross system-call transactions. */ -+static struct kmem_cache *_txnh_slab = NULL; -+ -+/** -+ * init_txnmgr_static - create transaction manager slab caches -+ * -+ * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module -+ * initialization. -+ * -+ * Returns 0 on success or -ENOMEM (through RETERR) when either cache cannot be -+ * created. If the txn_handle cache fails, the already created txn_atom cache -+ * is destroyed and its pointer reset, so both pointers are NULL again. -+ */ -+int init_txnmgr_static(void) -+{ -+ assert("jmacd-600", _atom_slab == NULL); -+ assert("jmacd-601", _txnh_slab == NULL); -+ -+ ON_DEBUG(atomic_set(&flush_cnt, 0)); -+ -+ _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, NULL, NULL); -+ if (_atom_slab == NULL) -+ return RETERR(-ENOMEM); -+ -+ _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0, -+ SLAB_HWCACHE_ALIGN, NULL, NULL); -+ if (_txnh_slab == NULL) { /* undo the first cache so the entry assertions hold on a later call */ -+ kmem_cache_destroy(_atom_slab); -+ _atom_slab = NULL; -+ return RETERR(-ENOMEM); -+ } -+ -+ return 0; -+} -+ -+/** -+ * done_txnmgr_static - delete txn_atom and txn_handle caches -+ * -+ * This is called on reiser4 module unloading or system shutdown.
-+ */ -+void done_txnmgr_static(void) -+{ /* destroy_reiser4_cache() is given the address of each cache pointer, presumably so it can reset the pointer as well -- confirm */ -+ destroy_reiser4_cache(&_atom_slab); -+ destroy_reiser4_cache(&_txnh_slab); -+} -+ -+/** -+ * reiser4_init_txnmgr - initialize a new transaction manager -+ * @mgr: pointer to transaction manager embedded in reiser4 super block -+ * -+ * This is called on mount. Resets the atom counter to 0, starts atom ids at 1 -+ * and initializes the atom list, the manager spinlock and the commit mutex. -+ */ -+void reiser4_init_txnmgr(txn_mgr *mgr) -+{ -+ assert("umka-169", mgr != NULL); -+ -+ mgr->atom_count = 0; -+ mgr->id_count = 1; -+ INIT_LIST_HEAD(&mgr->atoms_list); -+ spin_lock_init(&mgr->tmgr_lock); -+ mutex_init(&mgr->commit_mutex); -+} -+ -+/** -+ * reiser4_done_txnmgr - stop transaction manager -+ * @mgr: pointer to transaction manager embedded in reiser4 super block -+ * -+ * This is called on umount. Does sanity checks only: asserts that the atom -+ * list is empty and that atom_count is zero. Nothing is released here. -+ */ -+void reiser4_done_txnmgr(txn_mgr *mgr) -+{ -+ assert("umka-170", mgr != NULL); -+ assert("umka-1701", list_empty_careful(&mgr->atoms_list)); -+ assert("umka-1702", mgr->atom_count == 0); -+} -+ -+/* Initialize a transaction handle: record @mode, clear the atom pointer and -+ the flags, and initialize the handle spinlock and list link. Also calls -+ reiser4_ctx_gfp_mask_set(), whose effect is defined outside this file part. */ -+/* Audited by: umka (2002.06.13) */ -+static void txnh_init(txn_handle * txnh, txn_mode mode) -+{ -+ assert("umka-171", txnh != NULL); -+ -+ txnh->mode = mode; -+ txnh->atom = NULL; -+ reiser4_ctx_gfp_mask_set(); -+ txnh->flags = 0; -+ spin_lock_init(&txnh->hlock); -+ INIT_LIST_HEAD(&txnh->txnh_link); -+} -+ -+#if REISER4_DEBUG -+/* Check if a transaction handle is clean: no atom attached and -+ LOCK_CNT_NIL(spin_locked_txnh) holds (presumably: the current thread holds -+ no txnh spinlock). Debug builds only. */ -+static int txnh_isclean(txn_handle * txnh) -+{ -+ assert("umka-172", txnh != NULL); -+ return txnh->atom == NULL && -+ LOCK_CNT_NIL(spin_locked_txnh); -+} -+#endif -+ -+/* Initialize an atom.
*/ -+static void atom_init(txn_atom * atom) -+{ -+ int level; -+ -+ assert("umka-173", atom != NULL); -+ -+ memset(atom, 0, sizeof(txn_atom)); /* every field not set explicitly below starts out as zero */ -+ -+ atom->stage = ASTAGE_FREE; -+ atom->start_time = jiffies; /* reference point for the age test in atom_is_dotard() */ -+ -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) /* one dirty list per level, index 0 included */ -+ INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level)); -+ -+ INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom)); -+ INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom)); -+ INIT_LIST_HEAD(ATOM_WB_LIST(atom)); -+ INIT_LIST_HEAD(&atom->inodes); -+ spin_lock_init(&(atom->alock)); -+ /* list of transaction handles */ -+ INIT_LIST_HEAD(&atom->txnh_list); -+ /* link to transaction manager's list of atoms */ -+ INIT_LIST_HEAD(&atom->atom_link); -+ INIT_LIST_HEAD(&atom->fwaitfor_list); -+ INIT_LIST_HEAD(&atom->fwaiting_list); -+ blocknr_set_init(&atom->delete_set); -+ blocknr_set_init(&atom->wandered_map); -+ -+ init_atom_fq_parts(atom); -+} -+ -+#if REISER4_DEBUG -+/* Check if an atom is clean: stage is ASTAGE_FREE, the handle, capture and -+ reference counts are zero, atom_link is self-linked and every per-atom list -+ checked below is empty. Returns non-zero when clean. Debug builds only; -+ used inside assertions. */ -+static int atom_isclean(txn_atom * atom) -+{ -+ int level; -+ -+ assert("umka-174", atom != NULL); -+ -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { -+ if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) { -+ return 0; -+ } -+ } -+ -+ return atom->stage == ASTAGE_FREE && -+ atom->txnh_count == 0 && -+ atom->capture_count == 0 && -+ atomic_read(&atom->refcount) == 0 && -+ (&atom->atom_link == atom->atom_link.next && /* atom_link points at itself in both directions */ -+ &atom->atom_link == atom->atom_link.prev) && -+ list_empty_careful(&atom->txnh_list) && -+ list_empty_careful(ATOM_CLEAN_LIST(atom)) && -+ list_empty_careful(ATOM_OVRWR_LIST(atom)) && -+ list_empty_careful(ATOM_WB_LIST(atom)) && -+ list_empty_careful(&atom->fwaitfor_list) && -+ list_empty_careful(&atom->fwaiting_list) && -+ atom_fq_parts_are_clean(atom); -+} -+#endif -+ -+/* Begin a transaction in this context. Currently this uses the reiser4_context's -+ trans_in_ctx, which means that transaction handles are stack-allocated.
Eventually -+ this will be extended to allow transaction handles to span several contexts. */ -+/* Audited by: umka (2002.06.13) */ -+void reiser4_txn_begin(reiser4_context * context) -+{ -+ assert("jmacd-544", context->trans == NULL); -+ -+ context->trans = &context->trans_in_ctx; /* the handle is the one embedded in the context itself */ -+ -+ /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING -+ transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is -+ stack allocated right now, but we would like to allow for dynamically allocated -+ transcrashes that span multiple system calls. -+ */ -+ txnh_init(context->trans, TXN_WRITE_FUSING); -+} -+ -+/* Finish a transaction handle context. If the handle has an atom attached, -+ commit_txnh() is called and its result is returned; otherwise 0 is returned. -+ @context must be the current context and the current lock stack must be -+ clean (both asserted). context->trans is reset to NULL. -+ NOTE(review): @ret is declared long although both this function and the -+ commit_txnh() prototype use int. */ -+int reiser4_txn_end(reiser4_context * context) -+{ -+ long ret = 0; -+ txn_handle *txnh; -+ -+ assert("umka-283", context != NULL); -+ assert("nikita-3012", reiser4_schedulable()); -+ assert("vs-24", context == get_current_context()); -+ assert("nikita-2967", lock_stack_isclean(get_current_lock_stack())); -+ -+ txnh = context->trans; -+ if (txnh != NULL) { -+ if (txnh->atom != NULL) -+ ret = commit_txnh(txnh); -+ assert("jmacd-633", txnh_isclean(txnh)); -+ context->trans = NULL; -+ } -+ return ret; -+} -+/* End the transaction of @context, pass through a preemption point and begin a fresh transaction. The result of reiser4_txn_end() is discarded. */ -+void reiser4_txn_restart(reiser4_context * context) -+{ -+ reiser4_txn_end(context); -+ reiser4_preempt_point(); -+ reiser4_txn_begin(context); -+} -+/* reiser4_txn_restart() applied to the context returned by get_current_context(). */ -+void reiser4_txn_restart_current(void) -+{ -+ reiser4_txn_restart(get_current_context()); -+} -+ -+/* TXN_ATOM */ -+ -+/* Get the atom belonging to a txnh, which is not locked. Return txnh locked. Locks atom, if atom -+ is not NULL. This performs the necessary spin_trylock to break the lock-ordering cycle. May -+ return NULL.
*/ -+static txn_atom *txnh_get_atom(txn_handle * txnh) -+{ -+ txn_atom *atom; -+ -+ assert("umka-180", txnh != NULL); -+ assert_spin_not_locked(&(txnh->hlock)); -+ -+ while (1) { -+ spin_lock_txnh(txnh); -+ atom = txnh->atom; -+ -+ if (atom == NULL) /* no atom: return NULL with only the txnh locked */ -+ break; -+ -+ if (spin_trylock_atom(atom)) /* fast path: both locks are now held */ -+ break; -+ -+ atomic_inc(&atom->refcount); /* pin the atom while the txnh lock is dropped */ -+ -+ spin_unlock_txnh(txnh); -+ spin_lock_atom(atom); -+ spin_lock_txnh(txnh); /* retaken in atom, then txnh order */ -+ -+ if (txnh->atom == atom) { /* still the same atom: drop the pin, keep both locks */ -+ atomic_dec(&atom->refcount); -+ break; -+ } -+ -+ spin_unlock_txnh(txnh); -+ atom_dec_and_unlock(atom); /* the handle changed atoms meanwhile: release and retry */ -+ } -+ -+ return atom; -+} -+ -+/* Get the current atom and spinlock it if a current atom is present. May return -+ NULL. The transaction handle lock taken by txnh_get_atom() is released -+ before returning, so at most the atom is left locked. The current context -+ and its transaction handle must exist (asserted). */ -+txn_atom *get_current_atom_locked_nocheck(void) -+{ -+ reiser4_context *cx; -+ txn_atom *atom; -+ txn_handle *txnh; -+ -+ cx = get_current_context(); -+ assert("zam-437", cx != NULL); -+ -+ txnh = cx->trans; -+ assert("zam-435", txnh != NULL); -+ -+ atom = txnh_get_atom(txnh); -+ -+ spin_unlock_txnh(txnh); -+ return atom; -+} -+ -+/* Get the atom belonging to a jnode, which is initially locked. Return with -+ both jnode and atom locked. This performs the necessary spin_trylock to -+ break the lock-ordering cycle. Assumes the jnode is already locked, and -+ returns NULL if atom is not set. */ -+txn_atom *jnode_get_atom(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert("umka-181", node != NULL); -+ -+ while (1) { -+ assert_spin_locked(&(node->guard)); -+ -+ atom = node->atom; -+ /* node is not in any atom */ -+ if (atom == NULL) -+ break; -+ -+ /* If atom is not locked, grab the lock and return */ -+ if (spin_trylock_atom(atom)) -+ break; -+ -+ /* At least one jnode belongs to this atom, which guarantees that -+ * atom->refcount > 0, so we can safely increment refcount. */ -+ atomic_inc(&atom->refcount); -+ spin_unlock_jnode(node); -+ -+ /* re-acquire spin locks in the right order */ -+ spin_lock_atom(atom); -+ spin_lock_jnode(node); -+ -+ /* check if node still points to the same atom.
*/ -+ if (node->atom == atom) { -+ atomic_dec(&atom->refcount); -+ break; -+ } -+ -+ /* releasing of atom lock and reference requires not holding -+ * locks on jnodes. */ -+ spin_unlock_jnode(node); -+ -+ /* We are not sure that this atom has any references besides -+ * ours, so we must call the proper function, which may free the -+ * atom if the last reference is released. */ -+ atom_dec_and_unlock(atom); -+ -+ /* lock jnode again for getting valid node->atom pointer -+ * value. */ -+ spin_lock_jnode(node); -+ } -+ -+ return atom; -+} -+ -+/* Returns true if @check is dirty and captured by the same atom as @node. If -+ @check is a znode it must additionally be connected, and when @alloc_check -+ is non-zero jnode_is_flushprepped(@check) must equal @alloc_value. Used -+ by flush code to indicate whether the next node (in some direction) is suitable for -+ flushing. */ -+int -+same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value) -+{ -+ int compat; -+ txn_atom *atom; -+ -+ assert("umka-182", node != NULL); -+ assert("umka-183", check != NULL); -+ -+ /* Not sure what this function is supposed to do if supplied with @check that is -+ neither formatted nor unformatted (bitmap or so). */ -+ assert("nikita-2373", jnode_is_znode(check) -+ || jnode_is_unformatted(check)); -+ -+ /* Need a lock on CHECK to get its atom and to check various state bits. -+ Don't need a lock on NODE once we get the atom lock. */ -+ /* It is not enough to lock two nodes and check (node->atom == -+ check->atom) because atom could be locked and being fused at that -+ moment, jnodes of the atom of that state (being fused) can point to -+ different objects, but the atom is the same.
*/ -+ spin_lock_jnode(check); -+ -+ atom = jnode_get_atom(check); /* returns with the atom of @check locked, or NULL if it has none */ -+ -+ if (atom == NULL) { -+ compat = 0; -+ } else { -+ compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY)); -+ -+ if (compat && jnode_is_znode(check)) { /* NOTE(review): bitwise &= on a truth value is only right while znode_is_connected() returns exactly 0 or 1 -- confirm */ -+ compat &= znode_is_connected(JZNODE(check)); -+ } -+ -+ if (compat && alloc_check) { /* @alloc_value is compared for equality, so callers must pass exactly what jnode_is_flushprepped() returns */ -+ compat &= (alloc_value == jnode_is_flushprepped(check)); -+ } -+ -+ spin_unlock_atom(atom); -+ } -+ -+ spin_unlock_jnode(check); -+ -+ return compat; -+} -+ -+/* Decrement the atom's reference count and if it falls to zero, free it. -+ Called with the atom spinlock held (asserted); the atom lock is always -+ released by the time this returns, either here or inside atom_free(). -+ Freeing also needs the txnmgr lock: it is tried first, and if that fails -+ the atom lock is dropped and both locks are retaken in txnmgr, then atom -+ order while a temporary reference keeps the atom alive. */ -+void atom_dec_and_unlock(txn_atom * atom) -+{ -+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr; -+ -+ assert("umka-186", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ assert("zam-1039", atomic_read(&atom->refcount) > 0); -+ -+ if (atomic_dec_and_test(&atom->refcount)) { -+ /* take txnmgr lock and atom lock in proper order. */ -+ if (!spin_trylock_txnmgr(mgr)) { -+ /* This atom should exist after we re-acquire its -+ * spinlock, so we increment its reference counter. */ -+ atomic_inc(&atom->refcount); -+ spin_unlock_atom(atom); -+ spin_lock_txnmgr(mgr); -+ spin_lock_atom(atom); -+ -+ if (!atomic_dec_and_test(&atom->refcount)) { /* somebody took a new reference while the atom was unlocked: do not free */ -+ spin_unlock_atom(atom); -+ spin_unlock_txnmgr(mgr); -+ return; -+ } -+ } -+ assert_spin_locked(&(mgr->tmgr_lock)); -+ atom_free(atom); /* also releases the atom lock */ -+ spin_unlock_txnmgr(mgr); -+ } else -+ spin_unlock_atom(atom); -+} -+ -+/* Create new atom and connect it to given transaction handle. This adds the -+ atom to the transaction manager's list and sets its reference count to 1, an -+ artificial reference which is kept until it commits.
We play strange games -+ to avoid allocation under jnode & txnh spinlocks.*/ -+/* @atom_alloc is an in/out slot for a preallocated atom: it is filled here when empty, consumed (reset to NULL) once the atom is installed, and left filled when @txnh turns out to have an atom already. Returns -ENOMEM (through RETERR) if allocation fails and -E_REPEAT in every other case -- presumably so that the caller retries the capture; confirm in reiser4_try_capture(). */ -+static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh) -+{ -+ txn_atom *atom; -+ txn_mgr *mgr; -+ -+ if (REISER4_DEBUG && rofs_tree(current_tree)) { -+ warning("nikita-3366", "Creating atom on rofs"); -+ dump_stack(); -+ } -+ -+ if (*atom_alloc == NULL) { /* allocate before any spinlock is taken */ -+ (*atom_alloc) = kmem_cache_alloc(_atom_slab, -+ reiser4_ctx_gfp_mask_get()); -+ -+ if (*atom_alloc == NULL) -+ return RETERR(-ENOMEM); -+ } -+ -+ /* and, also, txnmgr spin lock should be taken before jnode and txnh -+ locks. */ -+ mgr = &get_super_private(reiser4_get_current_sb())->tmgr; -+ spin_lock_txnmgr(mgr); -+ spin_lock_txnh(txnh); -+ -+ /* Check whether a new atom is still needed */ -+ if (txnh->atom != NULL) { -+ /* NOTE-NIKITA probably it is rather better to free -+ * atom_alloc here than thread it up to reiser4_try_capture() */ -+ -+ spin_unlock_txnh(txnh); -+ spin_unlock_txnmgr(mgr); -+ -+ return -E_REPEAT; -+ } -+ -+ atom = *atom_alloc; -+ *atom_alloc = NULL; /* the preallocated atom is consumed from here on */ -+ -+ atom_init(atom); -+ -+ assert("jmacd-17", atom_isclean(atom)); -+ -+ /* -+ * lock ordering is broken here. It is ok, as long as @atom is new -+ * and inaccessible for others. We can't use spin_lock_atom or -+ * spin_lock(&atom->alock) because they care about locking -+ * dependencies. spin_trylock_atom doesn't. -+ */ -+ check_me("", spin_trylock_atom(atom)); -+ -+ /* add atom to the end of transaction manager's list of atoms */ -+ list_add_tail(&atom->atom_link, &mgr->atoms_list); -+ atom->atom_id = mgr->id_count++; -+ mgr->atom_count += 1; -+ -+ /* Release txnmgr lock */ -+ spin_unlock_txnmgr(mgr); -+ -+ /* One reference until it commits. */ -+ atomic_inc(&atom->refcount); -+ atom->stage = ASTAGE_CAPTURE_FUSE; -+ atom->super = reiser4_get_current_sb(); -+ capture_assign_txnh_nolock(atom, txnh); -+ -+ spin_unlock_atom(atom); -+ spin_unlock_txnh(txnh); -+ -+ return -E_REPEAT; -+} -+ -+/* Return true if an atom is currently "open".
*/ -+static int atom_isopen(const txn_atom * atom) -+{ -+ assert("umka-185", atom != NULL); -+ /* open means: stage strictly between 0 and ASTAGE_PRE_COMMIT */ -+ return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT; -+} -+ -+/* Return the number of pointers to this atom that must be updated during fusion. This -+ approximates the amount of work to be done. Fusion chooses the atom with fewer -+ pointers to fuse into the atom with more pointers. */ -+static int atom_pointer_count(const txn_atom * atom) -+{ -+ assert("umka-187", atom != NULL); -+ -+ /* This is a measure of the amount of work needed to fuse this atom -+ * into another. */ -+ return atom->txnh_count + atom->capture_count; -+} -+ -+/* Called holding the atom lock and the txnmgr lock (both asserted), this removes -+ the atom from the transaction manager list and frees it. The atom must be in -+ stage ASTAGE_INVALID or ASTAGE_DONE. The atom lock is released here just -+ before the memory goes back to the slab cache; the txnmgr lock is left held -+ for the caller to release. */ -+static void atom_free(txn_atom * atom) -+{ -+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr; -+ -+ assert("umka-188", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ /* Remove from the txn_mgr's atom list */ -+ assert_spin_locked(&(mgr->tmgr_lock)); -+ mgr->atom_count -= 1; -+ list_del_init(&atom->atom_link); -+ -+ /* Clean the atom */ -+ assert("jmacd-16", -+ (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE)); -+ atom->stage = ASTAGE_FREE; -+ -+ blocknr_set_destroy(&atom->delete_set); -+ blocknr_set_destroy(&atom->wandered_map); -+ /* NOTE(review): the label jmacd-16 below repeats the one used by the stage assertion above */ -+ assert("jmacd-16", atom_isclean(atom)); -+ -+ spin_unlock_atom(atom); -+ -+ kmem_cache_free(_atom_slab, atom); -+} -+/* True once jiffies has passed atom->start_time plus tmgr.atom_max_age, i.e. the atom is older than the configured maximum age. */ -+static int atom_is_dotard(const txn_atom * atom) -+{ -+ return time_after(jiffies, atom->start_time + -+ get_current_super_private()->tmgr.atom_max_age); -+} -+/* True when txnh_count exceeds nr_waiters by exactly one -- presumably meaning that the handle of the caller is the only one not waiting on the atom. Requires the atom lock (asserted). */ -+static int atom_can_be_committed(txn_atom * atom) -+{ -+ assert_spin_locked(&(atom->alock)); -+ assert("zam-885", atom->txnh_count > atom->nr_waiters); -+ return atom->txnh_count == atom->nr_waiters + 1; -+} -+ -+/* Return true if an atom should commit now. This is determined by aging, atom -+ size or atom flags.
*/ -+static int atom_should_commit(const txn_atom * atom) -+{ -+ assert("umka-189", atom != NULL); -+ return -+ (atom->flags & ATOM_FORCE_COMMIT) || -+ ((unsigned)atom_pointer_count(atom) > -+ get_current_super_private()->tmgr.atom_max_size) -+ || atom_is_dotard(atom); -+} -+ -+/* return 1 if current atom exists and requires commit. */ -+int current_atom_should_commit(void) -+{ -+ txn_atom *atom; -+ int result = 0; -+ -+ atom = get_current_atom_locked_nocheck(); -+ if (atom) { -+ result = atom_should_commit(atom); -+ spin_unlock_atom(atom); -+ } -+ return result; -+} -+ -+static int atom_should_commit_asap(const txn_atom * atom) -+{ -+ unsigned int captured; -+ unsigned int pinnedpages; -+ -+ assert("nikita-3309", atom != NULL); -+ -+ captured = (unsigned)atom->capture_count; -+ pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode); -+ -+ return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100); -+} -+ -+static jnode *find_first_dirty_in_list(struct list_head *head, int flags) -+{ -+ jnode *first_dirty; -+ -+ list_for_each_entry(first_dirty, head, capture_link) { -+ if (!(flags & JNODE_FLUSH_COMMIT)) { -+ /* -+ * skip jnodes which "heard banshee" or having active -+ * I/O -+ */ -+ if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) || -+ JF_ISSET(first_dirty, JNODE_WRITEBACK)) -+ continue; -+ } -+ return first_dirty; -+ } -+ return NULL; -+} -+ -+/* Get first dirty node from the atom's dirty_nodes[n] lists; return NULL if atom has no dirty -+ nodes on atom's lists */ -+jnode *find_first_dirty_jnode(txn_atom * atom, int flags) -+{ -+ jnode *first_dirty; -+ tree_level level; -+ -+ assert_spin_locked(&(atom->alock)); -+ -+ /* The flush starts from LEAF_LEVEL (=1). 
*/ -+ for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { -+ if (list_empty_careful(ATOM_DIRTY_LIST(atom, level))) -+ continue; -+ -+ first_dirty = -+ find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level), -+ flags); -+ if (first_dirty) -+ return first_dirty; -+ } -+ -+ /* znode-above-root is on the list #0. */ -+ return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags); -+} -+ -+static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq) -+{ -+ jnode *cur; -+ -+ assert("zam-905", atom_is_protected(atom)); -+ -+ cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link); -+ while (ATOM_WB_LIST(atom) != &cur->capture_link) { -+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link); -+ -+ spin_lock_jnode(cur); -+ if (!JF_ISSET(cur, JNODE_WRITEBACK)) { -+ if (JF_ISSET(cur, JNODE_DIRTY)) { -+ queue_jnode(fq, cur); -+ } else { -+ /* move jnode to atom's clean list */ -+ list_move_tail(&cur->capture_link, -+ ATOM_CLEAN_LIST(atom)); -+ } -+ } -+ spin_unlock_jnode(cur); -+ -+ cur = next; -+ } -+} -+ -+/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback -+ * jnodes to disk. */ -+static int submit_wb_list(void) -+{ -+ int ret; -+ flush_queue_t *fq; -+ -+ fq = get_fq_for_current_atom(); -+ if (IS_ERR(fq)) -+ return PTR_ERR(fq); -+ -+ dispatch_wb_list(fq->atom, fq); -+ spin_unlock_atom(fq->atom); -+ -+ ret = reiser4_write_fq(fq, NULL, 1); -+ reiser4_fq_put(fq); -+ -+ return ret; -+} -+ -+/* Wait completion of all writes, re-submit atom writeback list if needed. */ -+static int current_atom_complete_writes(void) -+{ -+ int ret; -+ -+ /* Each jnode from that list was modified and dirtied when it had i/o -+ * request running already. 
After i/o completion we have to resubmit -+ * them to disk again.*/ -+ ret = submit_wb_list(); -+ if (ret < 0) -+ return ret; -+ -+ /* Wait all i/o completion */ -+ ret = current_atom_finish_all_fq(); -+ if (ret) -+ return ret; -+ -+ /* Scan wb list again; all i/o should be completed, we re-submit dirty -+ * nodes to disk */ -+ ret = submit_wb_list(); -+ if (ret < 0) -+ return ret; -+ -+ /* Wait all nodes we just submitted */ -+ return current_atom_finish_all_fq(); -+} -+ -+#if REISER4_DEBUG -+ -+static void reiser4_info_atom(const char *prefix, const txn_atom * atom) -+{ -+ if (atom == NULL) { -+ printk("%s: no atom\n", prefix); -+ return; -+ } -+ -+ printk("%s: refcount: %i id: %i flags: %x txnh_count: %i" -+ " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix, -+ atomic_read(&atom->refcount), atom->atom_id, atom->flags, -+ atom->txnh_count, atom->capture_count, atom->stage, -+ atom->start_time, atom->flushed); -+} -+ -+#else /* REISER4_DEBUG */ -+ -+static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {} -+ -+#endif /* REISER4_DEBUG */ -+ -+#define TOOMANYFLUSHES (1 << 13) -+ -+/* Called with the atom locked and no open "active" transaction handlers except -+ ours, this function calls flush_current_atom() until all dirty nodes are -+ processed. Then it initiates commit processing. -+ -+ Called by the single remaining open "active" txnh, which is closing. Other -+ open txnhs belong to processes which wait atom commit in commit_txnh() -+ routine. They are counted as "waiters" in atom->nr_waiters. Therefore as -+ long as we hold the atom lock none of the jnodes can be captured and/or -+ locked. -+ -+ Return value is an error code if commit fails. -+*/ -+static int commit_current_atom(long *nr_submitted, txn_atom ** atom) -+{ -+ reiser4_super_info_data *sbinfo = get_current_super_private(); -+ long ret = 0; -+ /* how many times jnode_flush() was called as a part of attempt to -+ * commit this atom. 
*/ -+ int flushiters; -+ -+ assert("zam-888", atom != NULL && *atom != NULL); -+ assert_spin_locked(&((*atom)->alock)); -+ assert("zam-887", get_current_context()->trans->atom == *atom); -+ assert("jmacd-151", atom_isopen(*atom)); -+ -+ assert("nikita-3184", -+ get_current_super_private()->delete_mutex_owner != current); -+ -+ for (flushiters = 0;; ++flushiters) { -+ ret = -+ flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS | -+ JNODE_FLUSH_COMMIT, -+ LONG_MAX /* nr_to_write */ , -+ nr_submitted, atom, NULL); -+ if (ret != -E_REPEAT) -+ break; -+ -+ /* if atom's dirty list contains one znode which is -+ HEARD_BANSHEE and is locked we have to allow lock owner to -+ continue and uncapture that znode */ -+ reiser4_preempt_point(); -+ -+ *atom = get_current_atom_locked(); -+ if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) { -+ warning("nikita-3176", -+ "Flushing like mad: %i", flushiters); -+ reiser4_info_atom("atom", *atom); -+ DEBUGON(flushiters > (1 << 20)); -+ } -+ } -+ -+ if (ret) -+ return ret; -+ -+ assert_spin_locked(&((*atom)->alock)); -+ -+ if (!atom_can_be_committed(*atom)) { -+ spin_unlock_atom(*atom); -+ return RETERR(-E_REPEAT); -+ } -+ -+ if ((*atom)->capture_count == 0) -+ goto done; -+ -+ /* Up to this point we have been flushing and after flush is called we -+ return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT -+ at this point, commit should be successful. 
*/ -+ reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT); -+ ON_DEBUG(((*atom)->committer = current)); -+ spin_unlock_atom(*atom); -+ -+ ret = current_atom_complete_writes(); -+ if (ret) -+ return ret; -+ -+ assert("zam-906", list_empty(ATOM_WB_LIST(*atom))); -+ -+ /* isolate critical code path which should be executed by only one -+ * thread using tmgr mutex */ -+ mutex_lock(&sbinfo->tmgr.commit_mutex); -+ -+ ret = reiser4_write_logs(nr_submitted); -+ if (ret < 0) -+ reiser4_panic("zam-597", "write log failed (%ld)\n", ret); -+ -+ /* The atom->ovrwr_nodes list is processed under commit mutex held -+ because of bitmap nodes which are captured by special way in -+ reiser4_pre_commit_hook_bitmap(), that way does not include -+ capture_fuse_wait() as a capturing of other nodes does -- the commit -+ mutex is used for transaction isolation instead. */ -+ reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom)); -+ mutex_unlock(&sbinfo->tmgr.commit_mutex); -+ -+ reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom)); -+ reiser4_invalidate_list(ATOM_WB_LIST(*atom)); -+ assert("zam-927", list_empty(&(*atom)->inodes)); -+ -+ spin_lock_atom(*atom); -+ done: -+ reiser4_atom_set_stage(*atom, ASTAGE_DONE); -+ ON_DEBUG((*atom)->committer = NULL); -+ -+ /* Atom's state changes, so wake up everybody waiting for this -+ event. */ -+ wakeup_atom_waiting_list(*atom); -+ -+ /* Decrement the "until commit" reference, at least one txnh (the caller) is -+ still open. */ -+ atomic_dec(&(*atom)->refcount); -+ -+ assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0); -+ assert("jmacd-1062", (*atom)->capture_count == 0); -+ BUG_ON((*atom)->capture_count != 0); -+ assert_spin_locked(&((*atom)->alock)); -+ -+ return ret; -+} -+ -+/* TXN_TXNH */ -+ -+/** -+ * force_commit_atom - commit current atom and wait commit completion -+ * @txnh: -+ * -+ * Commits current atom and wait commit completion; current atom and @txnh have -+ * to be spinlocked before call, this function unlocks them on exit. 
-+ */ -+int force_commit_atom(txn_handle *txnh) -+{ -+ txn_atom *atom; -+ -+ assert("zam-837", txnh != NULL); -+ assert_spin_locked(&(txnh->hlock)); -+ assert("nikita-2966", lock_stack_isclean(get_current_lock_stack())); -+ -+ atom = txnh->atom; -+ -+ assert("zam-834", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ /* -+ * Set flags for atom and txnh: forcing atom commit and waiting for -+ * commit completion -+ */ -+ txnh->flags |= TXNH_WAIT_COMMIT; -+ atom->flags |= ATOM_FORCE_COMMIT; -+ -+ spin_unlock_txnh(txnh); -+ spin_unlock_atom(atom); -+ -+ /* commit is here */ -+ reiser4_txn_restart_current(); -+ return 0; -+} -+ -+/* Called to force commit of any outstanding atoms. @commit_all_atoms controls -+ * should we commit all atoms including new ones which are created after this -+ * functions is called. */ -+int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms) -+{ -+ int ret; -+ txn_atom *atom; -+ txn_mgr *mgr; -+ txn_handle *txnh; -+ unsigned long start_time = jiffies; -+ reiser4_context *ctx = get_current_context(); -+ -+ assert("nikita-2965", lock_stack_isclean(get_current_lock_stack())); -+ assert("nikita-3058", reiser4_commit_check_locks()); -+ -+ reiser4_txn_restart_current(); -+ -+ mgr = &get_super_private(super)->tmgr; -+ -+ txnh = ctx->trans; -+ -+ again: -+ -+ spin_lock_txnmgr(mgr); -+ -+ list_for_each_entry(atom, &mgr->atoms_list, atom_link) { -+ spin_lock_atom(atom); -+ -+ /* Commit any atom which can be committed. If @commit_new_atoms -+ * is not set we commit only atoms which were created before -+ * this call is started. 
*/ -+ if (commit_all_atoms -+ || time_before_eq(atom->start_time, start_time)) { -+ if (atom->stage <= ASTAGE_POST_COMMIT) { -+ spin_unlock_txnmgr(mgr); -+ -+ if (atom->stage < ASTAGE_PRE_COMMIT) { -+ spin_lock_txnh(txnh); -+ /* Add force-context txnh */ -+ capture_assign_txnh_nolock(atom, txnh); -+ ret = force_commit_atom(txnh); -+ if (ret) -+ return ret; -+ } else -+ /* wait atom commit */ -+ reiser4_atom_wait_event(atom); -+ -+ goto again; -+ } -+ } -+ -+ spin_unlock_atom(atom); -+ } -+ -+#if REISER4_DEBUG -+ if (commit_all_atoms) { -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ spin_lock_reiser4_super(sbinfo); -+ assert("zam-813", -+ sbinfo->blocks_fake_allocated_unformatted == 0); -+ assert("zam-812", sbinfo->blocks_fake_allocated == 0); -+ spin_unlock_reiser4_super(sbinfo); -+ } -+#endif -+ -+ spin_unlock_txnmgr(mgr); -+ -+ return 0; -+} -+ -+/* check whether commit_some_atoms() can commit @atom. Locking is up to the -+ * caller */ -+static int atom_is_committable(txn_atom * atom) -+{ -+ return -+ atom->stage < ASTAGE_PRE_COMMIT && -+ atom->txnh_count == atom->nr_waiters && atom_should_commit(atom); -+} -+ -+/* called periodically from ktxnmgrd to commit old atoms. 
Releases ktxnmgrd spin -+ * lock at exit */ -+int commit_some_atoms(txn_mgr * mgr) -+{ -+ int ret = 0; -+ txn_atom *atom; -+ txn_handle *txnh; -+ reiser4_context *ctx; -+ struct list_head *pos, *tmp; -+ -+ ctx = get_current_context(); -+ assert("nikita-2444", ctx != NULL); -+ -+ txnh = ctx->trans; -+ spin_lock_txnmgr(mgr); -+ -+ /* -+ * this is to avoid gcc complain that atom might be used -+ * uninitialized -+ */ -+ atom = NULL; -+ -+ /* look for atom to commit */ -+ list_for_each_safe(pos, tmp, &mgr->atoms_list) { -+ atom = list_entry(pos, txn_atom, atom_link); -+ /* -+ * first test without taking atom spin lock, whether it is -+ * eligible for committing at all -+ */ -+ if (atom_is_committable(atom)) { -+ /* now, take spin lock and re-check */ -+ spin_lock_atom(atom); -+ if (atom_is_committable(atom)) -+ break; -+ spin_unlock_atom(atom); -+ } -+ } -+ -+ ret = (&mgr->atoms_list == pos); -+ spin_unlock_txnmgr(mgr); -+ -+ if (ret) { -+ /* nothing found */ -+ spin_unlock(&mgr->daemon->guard); -+ return 0; -+ } -+ -+ spin_lock_txnh(txnh); -+ -+ BUG_ON(atom == NULL); -+ /* Set the atom to force committing */ -+ atom->flags |= ATOM_FORCE_COMMIT; -+ -+ /* Add force-context txnh */ -+ capture_assign_txnh_nolock(atom, txnh); -+ -+ spin_unlock_txnh(txnh); -+ spin_unlock_atom(atom); -+ -+ /* we are about to release daemon spin lock, notify daemon it -+ has to rescan atoms */ -+ mgr->daemon->rescan = 1; -+ spin_unlock(&mgr->daemon->guard); -+ reiser4_txn_restart_current(); -+ return 0; -+} -+ -+static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom) -+{ -+ int atom_stage; -+ txn_atom *atom_2; -+ int repeat; -+ -+ assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT); -+ -+ atom_stage = atom->stage; -+ repeat = 0; -+ -+ if (!spin_trylock_txnmgr(tmgr)) { -+ atomic_inc(&atom->refcount); -+ spin_unlock_atom(atom); -+ spin_lock_txnmgr(tmgr); -+ spin_lock_atom(atom); -+ repeat = 1; -+ if (atom->stage != atom_stage) { -+ spin_unlock_txnmgr(tmgr); -+ 
atom_dec_and_unlock(atom); -+ return -E_REPEAT; -+ } -+ atomic_dec(&atom->refcount); -+ } -+ -+ list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) { -+ if (atom == atom_2) -+ continue; -+ /* -+ * if trylock does not succeed we just do not fuse with that -+ * atom. -+ */ -+ if (spin_trylock_atom(atom_2)) { -+ if (atom_2->stage < ASTAGE_PRE_COMMIT) { -+ spin_unlock_txnmgr(tmgr); -+ capture_fuse_into(atom_2, atom); -+ /* all locks are lost we can only repeat here */ -+ return -E_REPEAT; -+ } -+ spin_unlock_atom(atom_2); -+ } -+ } -+ atom->flags |= ATOM_CANCEL_FUSION; -+ spin_unlock_txnmgr(tmgr); -+ if (repeat) { -+ spin_unlock_atom(atom); -+ return -E_REPEAT; -+ } -+ return 0; -+} -+ -+/* Calls jnode_flush for current atom if it exists; if not, just take another -+ atom and call jnode_flush() for him. If current transaction handle has -+ already assigned atom (current atom) we have to close current transaction -+ prior to switch to another atom or do something with current atom. This -+ code tries to flush current atom. -+ -+ flush_some_atom() is called as part of memory clearing process. It is -+ invoked from balance_dirty_pages(), pdflushd, and entd. -+ -+ If we can flush no nodes, atom is committed, because this frees memory. -+ -+ If atom is too large or too old it is committed also. 
-+*/ -+int -+flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc, -+ int flags) -+{ -+ reiser4_context *ctx = get_current_context(); -+ txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr; -+ txn_handle *txnh = ctx->trans; -+ txn_atom *atom; -+ int ret; -+ -+ BUG_ON(wbc->nr_to_write == 0); -+ BUG_ON(*nr_submitted != 0); -+ assert("zam-1042", txnh != NULL); -+ repeat: -+ if (txnh->atom == NULL) { -+ /* current atom is not available, take first from txnmgr */ -+ spin_lock_txnmgr(tmgr); -+ -+ /* traverse the list of all atoms */ -+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { -+ /* lock atom before checking its state */ -+ spin_lock_atom(atom); -+ -+ /* -+ * we need an atom which is not being committed and -+ * which has no flushers (jnode_flush() add one flusher -+ * at the beginning and subtract one at the end). -+ */ -+ if (atom->stage < ASTAGE_PRE_COMMIT && -+ atom->nr_flushers == 0) { -+ spin_lock_txnh(txnh); -+ capture_assign_txnh_nolock(atom, txnh); -+ spin_unlock_txnh(txnh); -+ -+ goto found; -+ } -+ -+ spin_unlock_atom(atom); -+ } -+ -+ /* -+ * Write throttling is case of no one atom can be -+ * flushed/committed. -+ */ -+ if (!current_is_pdflush() && !wbc->nonblocking) { -+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { -+ spin_lock_atom(atom); -+ /* Repeat the check from the above. 
*/ -+ if (atom->stage < ASTAGE_PRE_COMMIT -+ && atom->nr_flushers == 0) { -+ spin_lock_txnh(txnh); -+ capture_assign_txnh_nolock(atom, txnh); -+ spin_unlock_txnh(txnh); -+ -+ goto found; -+ } -+ if (atom->stage <= ASTAGE_POST_COMMIT) { -+ spin_unlock_txnmgr(tmgr); -+ /* -+ * we just wait until atom's flusher -+ * makes a progress in flushing or -+ * committing the atom -+ */ -+ reiser4_atom_wait_event(atom); -+ goto repeat; -+ } -+ spin_unlock_atom(atom); -+ } -+ } -+ spin_unlock_txnmgr(tmgr); -+ return 0; -+ found: -+ spin_unlock_txnmgr(tmgr); -+ } else -+ atom = get_current_atom_locked(); -+ -+ BUG_ON(atom->super != ctx->super); -+ assert("vs-35", atom->super == ctx->super); -+ if (start) { -+ spin_lock_jnode(start); -+ ret = (atom == start->atom) ? 1 : 0; -+ spin_unlock_jnode(start); -+ if (ret == 0) -+ start = NULL; -+ } -+ ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start); -+ if (ret == 0) { -+ /* flush_current_atom returns 0 only if it submitted for write -+ nothing */ -+ BUG_ON(*nr_submitted != 0); -+ if (*nr_submitted == 0 || atom_should_commit_asap(atom)) { -+ if (atom->capture_count < tmgr->atom_min_size && -+ !(atom->flags & ATOM_CANCEL_FUSION)) { -+ ret = txn_try_to_fuse_small_atom(tmgr, atom); -+ if (ret == -E_REPEAT) { -+ reiser4_preempt_point(); -+ goto repeat; -+ } -+ } -+ /* if early flushing could not make more nodes clean, -+ * or atom is too old/large, -+ * we force current atom to commit */ -+ /* wait for commit completion but only if this -+ * wouldn't stall pdflushd and ent thread. 
*/ -+ if (!wbc->nonblocking && !ctx->entd) -+ txnh->flags |= TXNH_WAIT_COMMIT; -+ atom->flags |= ATOM_FORCE_COMMIT; -+ } -+ spin_unlock_atom(atom); -+ } else if (ret == -E_REPEAT) { -+ if (*nr_submitted == 0) { -+ /* let others who hampers flushing (hold longterm locks, -+ for instance) to free the way for flush */ -+ reiser4_preempt_point(); -+ goto repeat; -+ } -+ ret = 0; -+ } -+/* -+ if (*nr_submitted > wbc->nr_to_write) -+ warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted); -+*/ -+ reiser4_txn_restart(ctx); -+ -+ return ret; -+} -+ -+/* Remove processed nodes from atom's clean list (thereby remove them from transaction). */ -+void reiser4_invalidate_list(struct list_head *head) -+{ -+ while (!list_empty(head)) { -+ jnode *node; -+ -+ node = list_entry(head->next, jnode, capture_link); -+ spin_lock_jnode(node); -+ reiser4_uncapture_block(node); -+ jput(node); -+ } -+} -+ -+static void init_wlinks(txn_wait_links * wlinks) -+{ -+ wlinks->_lock_stack = get_current_lock_stack(); -+ INIT_LIST_HEAD(&wlinks->_fwaitfor_link); -+ INIT_LIST_HEAD(&wlinks->_fwaiting_link); -+ wlinks->waitfor_cb = NULL; -+ wlinks->waiting_cb = NULL; -+} -+ -+/* Add atom to the atom's waitfor list and wait for somebody to wake us up; */ -+void reiser4_atom_wait_event(txn_atom * atom) -+{ -+ txn_wait_links _wlinks; -+ -+ assert_spin_locked(&(atom->alock)); -+ assert("nikita-3156", -+ lock_stack_isclean(get_current_lock_stack()) || -+ atom->nr_running_queues > 0); -+ -+ init_wlinks(&_wlinks); -+ list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list); -+ atomic_inc(&atom->refcount); -+ spin_unlock_atom(atom); -+ -+ reiser4_prepare_to_sleep(_wlinks._lock_stack); -+ reiser4_go_to_sleep(_wlinks._lock_stack); -+ -+ spin_lock_atom(atom); -+ list_del(&_wlinks._fwaitfor_link); -+ atom_dec_and_unlock(atom); -+} -+ -+void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage) -+{ -+ assert("nikita-3535", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ 
assert("nikita-3536", stage <= ASTAGE_INVALID); -+ /* Excelsior! */ -+ assert("nikita-3537", stage >= atom->stage); -+ if (atom->stage != stage) { -+ atom->stage = stage; -+ reiser4_atom_send_event(atom); -+ } -+} -+ -+/* wake all threads which wait for an event */ -+void reiser4_atom_send_event(txn_atom * atom) -+{ -+ assert_spin_locked(&(atom->alock)); -+ wakeup_atom_waitfor_list(atom); -+} -+ -+/* Informs txn manager code that owner of this txn_handle should wait atom commit completion (for -+ example, because it does fsync(2)) */ -+static int should_wait_commit(txn_handle * h) -+{ -+ return h->flags & TXNH_WAIT_COMMIT; -+} -+ -+typedef struct commit_data { -+ txn_atom *atom; -+ txn_handle *txnh; -+ long nr_written; -+ /* as an optimization we start committing atom by first trying to -+ * flush it few times without switching into ASTAGE_CAPTURE_WAIT. This -+ * allows to reduce stalls due to other threads waiting for atom in -+ * ASTAGE_CAPTURE_WAIT stage. ->preflush is counter of these -+ * preliminary flushes. */ -+ int preflush; -+ /* have we waited on atom. */ -+ int wait; -+ int failed; -+ int wake_ktxnmgrd_up; -+} commit_data; -+ -+/* -+ * Called from commit_txnh() repeatedly, until either error happens, or atom -+ * commits successfully. -+ */ -+static int try_commit_txnh(commit_data * cd) -+{ -+ int result; -+ -+ assert("nikita-2968", lock_stack_isclean(get_current_lock_stack())); -+ -+ /* Get the atom and txnh locked. */ -+ cd->atom = txnh_get_atom(cd->txnh); -+ assert("jmacd-309", cd->atom != NULL); -+ spin_unlock_txnh(cd->txnh); -+ -+ if (cd->wait) { -+ cd->atom->nr_waiters--; -+ cd->wait = 0; -+ } -+ -+ if (cd->atom->stage == ASTAGE_DONE) -+ return 0; -+ -+ if (cd->failed) -+ return 0; -+ -+ if (atom_should_commit(cd->atom)) { -+ /* if atom is _very_ large schedule it for commit as soon as -+ * possible. 
*/ -+ if (atom_should_commit_asap(cd->atom)) { -+ /* -+ * When atom is in PRE_COMMIT or later stage following -+ * invariant (encoded in atom_can_be_committed()) -+ * holds: there is exactly one non-waiter transaction -+ * handle opened on this atom. When thread wants to -+ * wait until atom commits (for example sync()) it -+ * waits on atom event after increasing -+ * atom->nr_waiters (see blow in this function). It -+ * cannot be guaranteed that atom is already committed -+ * after receiving event, so loop has to be -+ * re-started. But if atom switched into PRE_COMMIT -+ * stage and became too large, we cannot change its -+ * state back to CAPTURE_WAIT (atom stage can only -+ * increase monotonically), hence this check. -+ */ -+ if (cd->atom->stage < ASTAGE_CAPTURE_WAIT) -+ reiser4_atom_set_stage(cd->atom, -+ ASTAGE_CAPTURE_WAIT); -+ cd->atom->flags |= ATOM_FORCE_COMMIT; -+ } -+ if (cd->txnh->flags & TXNH_DONT_COMMIT) { -+ /* -+ * this thread (transaction handle that is) doesn't -+ * want to commit atom. Notify waiters that handle is -+ * closed. This can happen, for example, when we are -+ * under VFS directory lock and don't want to commit -+ * atom right now to avoid stalling other threads -+ * working in the same directory. -+ */ -+ -+ /* Wake the ktxnmgrd up if the ktxnmgrd is needed to -+ * commit this atom: no atom waiters and only one -+ * (our) open transaction handle. */ -+ cd->wake_ktxnmgrd_up = -+ cd->atom->txnh_count == 1 && -+ cd->atom->nr_waiters == 0; -+ reiser4_atom_send_event(cd->atom); -+ result = 0; -+ } else if (!atom_can_be_committed(cd->atom)) { -+ if (should_wait_commit(cd->txnh)) { -+ /* sync(): wait for commit */ -+ cd->atom->nr_waiters++; -+ cd->wait = 1; -+ reiser4_atom_wait_event(cd->atom); -+ result = RETERR(-E_REPEAT); -+ } else { -+ result = 0; -+ } -+ } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) { -+ /* -+ * optimization: flush atom without switching it into -+ * ASTAGE_CAPTURE_WAIT. 
-+ * -+ * But don't do this for ktxnmgrd, because ktxnmgrd -+ * should never block on atom fusion. -+ */ -+ result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS, -+ LONG_MAX, &cd->nr_written, -+ &cd->atom, NULL); -+ if (result == 0) { -+ spin_unlock_atom(cd->atom); -+ cd->preflush = 0; -+ result = RETERR(-E_REPEAT); -+ } else /* Atoms wasn't flushed -+ * completely. Rinse. Repeat. */ -+ --cd->preflush; -+ } else { -+ /* We change atom state to ASTAGE_CAPTURE_WAIT to -+ prevent atom fusion and count ourself as an active -+ flusher */ -+ reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT); -+ cd->atom->flags |= ATOM_FORCE_COMMIT; -+ -+ result = -+ commit_current_atom(&cd->nr_written, &cd->atom); -+ if (result != 0 && result != -E_REPEAT) -+ cd->failed = 1; -+ } -+ } else -+ result = 0; -+ -+#if REISER4_DEBUG -+ if (result == 0) -+ assert_spin_locked(&(cd->atom->alock)); -+#endif -+ -+ /* perfectly valid assertion, except that when atom/txnh is not locked -+ * fusion can take place, and cd->atom points nowhere. */ -+ /* -+ assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom))); -+ */ -+ return result; -+} -+ -+/* Called to commit a transaction handle. This decrements the atom's number of open -+ handles and if it is the last handle to commit and the atom should commit, initiates -+ atom commit. 
if commit does not fail, return number of written blocks */ -+static int commit_txnh(txn_handle * txnh) -+{ -+ commit_data cd; -+ assert("umka-192", txnh != NULL); -+ -+ memset(&cd, 0, sizeof cd); -+ cd.txnh = txnh; -+ cd.preflush = 10; -+ -+ /* calls try_commit_txnh() until either atom commits, or error -+ * happens */ -+ while (try_commit_txnh(&cd) != 0) -+ reiser4_preempt_point(); -+ -+ spin_lock_txnh(txnh); -+ -+ cd.atom->txnh_count -= 1; -+ txnh->atom = NULL; -+ /* remove transaction handle from atom's list of transaction handles */ -+ list_del_init(&txnh->txnh_link); -+ -+ spin_unlock_txnh(txnh); -+ atom_dec_and_unlock(cd.atom); -+ /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably -+ * because it takes time) by current thread, we do that work -+ * asynchronously by ktxnmgrd daemon. */ -+ if (cd.wake_ktxnmgrd_up) -+ ktxnmgrd_kick(&get_current_super_private()->tmgr); -+ -+ return 0; -+} -+ -+/* TRY_CAPTURE */ -+ -+/* This routine attempts a single block-capture request. It may return -E_REPEAT if some -+ condition indicates that the request should be retried, and it may block if the -+ txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag. -+ -+ This routine encodes the basic logic of block capturing described by: -+ -+ http://namesys.com/v4/v4.html -+ -+ Our goal here is to ensure that any two blocks that contain dependent modifications -+ should commit at the same time. This function enforces this discipline by initiating -+ fusion whenever a transaction handle belonging to one atom requests to read or write a -+ block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC). -+ -+ In addition, this routine handles the initial assignment of atoms to blocks and -+ transaction handles. These are possible outcomes of this function: -+ -+ 1. The block and handle are already part of the same atom: return immediate success -+ -+ 2. 
The block is assigned but the handle is not: call capture_assign_txnh to assign -+ the handle to the block's atom. -+ -+ 3. The handle is assigned but the block is not: call capture_assign_block to assign -+ the block to the handle's atom. -+ -+ 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion -+ to fuse atoms. -+ -+ 5. Neither block nor handle are assigned: create a new atom and assign them both. -+ -+ 6. A read request for a non-captured block: return immediate success. -+ -+ This function acquires and releases the handle's spinlock. This function is called -+ under the jnode lock and if the return value is 0, it returns with the jnode lock still -+ held. If the return is -E_REPEAT or some other error condition, the jnode lock is -+ released. The external interface (reiser4_try_capture) manages re-aquiring the jnode -+ lock in the failure case. -+*/ -+static int try_capture_block( -+ txn_handle * txnh, jnode * node, txn_capture mode, -+ txn_atom ** atom_alloc) -+{ -+ txn_atom *block_atom; -+ txn_atom *txnh_atom; -+ -+ /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */ -+ assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM); -+ -+ /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree == -+ * node->tree somewhere. */ -+ assert("umka-194", txnh != NULL); -+ assert("umka-195", node != NULL); -+ -+ /* The jnode is already locked! Being called from reiser4_try_capture(). */ -+ assert_spin_locked(&(node->guard)); -+ block_atom = node->atom; -+ -+ /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't -+ let us touch the atoms themselves. */ -+ spin_lock_txnh(txnh); -+ txnh_atom = txnh->atom; -+ /* Process of capturing continues into one of four branches depends on -+ which atoms from (block atom (node->atom), current atom (txnh->atom)) -+ exist. 
*/ -+ if (txnh_atom == NULL) { -+ if (block_atom == NULL) { -+ spin_unlock_txnh(txnh); -+ spin_unlock_jnode(node); -+ /* assign empty atom to the txnh and repeat */ -+ return atom_begin_and_assign_to_txnh(atom_alloc, txnh); -+ } else { -+ atomic_inc(&block_atom->refcount); -+ /* node spin-lock isn't needed anymore */ -+ spin_unlock_jnode(node); -+ if (!spin_trylock_atom(block_atom)) { -+ spin_unlock_txnh(txnh); -+ spin_lock_atom(block_atom); -+ spin_lock_txnh(txnh); -+ } -+ /* re-check state after getting txnh and the node -+ * atom spin-locked */ -+ if (node->atom != block_atom || txnh->atom != NULL) { -+ spin_unlock_txnh(txnh); -+ atom_dec_and_unlock(block_atom); -+ return RETERR(-E_REPEAT); -+ } -+ atomic_dec(&block_atom->refcount); -+ if (block_atom->stage > ASTAGE_CAPTURE_WAIT || -+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && -+ block_atom->txnh_count != 0)) -+ return capture_fuse_wait(txnh, block_atom, NULL, mode); -+ capture_assign_txnh_nolock(block_atom, txnh); -+ spin_unlock_txnh(txnh); -+ spin_unlock_atom(block_atom); -+ return RETERR(-E_REPEAT); -+ } -+ } else { -+ /* It is time to perform deadlock prevention check over the -+ node we want to capture. It is possible this node was locked -+ for read without capturing it. The optimization which allows -+ to do it helps us in keeping atoms independent as long as -+ possible but it may cause lock/fuse deadlock problems. -+ -+ A number of similar deadlock situations with locked but not -+ captured nodes were found. In each situation there are two -+ or more threads: one of them does flushing while another one -+ does routine balancing or tree lookup. The flushing thread -+ (F) sleeps in long term locking request for node (N), another -+ thread (A) sleeps in trying to capture some node already -+ belonging the atom F, F has a state which prevents -+ immediately fusion . -+ -+ Deadlocks of this kind cannot happen if node N was properly -+ captured by thread A. 
The F thread fuse atoms before locking -+ therefore current atom of thread F and current atom of thread -+ A became the same atom and thread A may proceed. This does -+ not work if node N was not captured because the fusion of -+ atom does not happens. -+ -+ The following scheme solves the deadlock: If -+ longterm_lock_znode locks and does not capture a znode, that -+ znode is marked as MISSED_IN_CAPTURE. A node marked this way -+ is processed by the code below which restores the missed -+ capture and fuses current atoms of all the node lock owners -+ by calling the fuse_not_fused_lock_owners() function. */ -+ if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) { -+ JF_CLR(node, JNODE_MISSED_IN_CAPTURE); -+ if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) { -+ spin_unlock_txnh(txnh); -+ spin_unlock_jnode(node); -+ fuse_not_fused_lock_owners(txnh, JZNODE(node)); -+ return RETERR(-E_REPEAT); -+ } -+ } -+ if (block_atom == NULL) { -+ atomic_inc(&txnh_atom->refcount); -+ spin_unlock_txnh(txnh); -+ if (!spin_trylock_atom(txnh_atom)) { -+ spin_unlock_jnode(node); -+ spin_lock_atom(txnh_atom); -+ spin_lock_jnode(node); -+ } -+ if (txnh->atom != txnh_atom || node->atom != NULL -+ || JF_ISSET(node, JNODE_IS_DYING)) { -+ spin_unlock_jnode(node); -+ atom_dec_and_unlock(txnh_atom); -+ return RETERR(-E_REPEAT); -+ } -+ atomic_dec(&txnh_atom->refcount); -+ capture_assign_block_nolock(txnh_atom, node); -+ spin_unlock_atom(txnh_atom); -+ } else { -+ if (txnh_atom != block_atom) { -+ if (mode & TXN_CAPTURE_DONT_FUSE) { -+ spin_unlock_txnh(txnh); -+ spin_unlock_jnode(node); -+ /* we are in a "no-fusion" mode and @node is -+ * already part of transaction. 
*/ -+ return RETERR(-E_NO_NEIGHBOR); -+ } -+ return capture_init_fusion(node, txnh, mode); -+ } -+ spin_unlock_txnh(txnh); -+ } -+ } -+ return 0; -+} -+ -+static txn_capture -+build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags) -+{ -+ txn_capture cap_mode; -+ -+ assert_spin_locked(&(node->guard)); -+ -+ /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */ -+ -+ if (lock_mode == ZNODE_WRITE_LOCK) { -+ cap_mode = TXN_CAPTURE_WRITE; -+ } else if (node->atom != NULL) { -+ cap_mode = TXN_CAPTURE_WRITE; -+ } else if (0 && /* txnh->mode == TXN_READ_FUSING && */ -+ jnode_get_level(node) == LEAF_LEVEL) { -+ /* NOTE-NIKITA TXN_READ_FUSING is not currently used */ -+ /* We only need a READ_FUSING capture at the leaf level. This -+ is because the internal levels of the tree (twigs included) -+ are redundant from the point of the user that asked for a -+ read-fusing transcrash. The user only wants to read-fuse -+ atoms due to reading uncommitted data that another user has -+ written. It is the file system that reads/writes the -+ internal tree levels, the user only reads/writes leaves. */ -+ cap_mode = TXN_CAPTURE_READ_ATOMIC; -+ } else { -+ /* In this case (read lock at a non-leaf) there's no reason to -+ * capture. */ -+ /* cap_mode = TXN_CAPTURE_READ_NONCOM; */ -+ return 0; -+ } -+ -+ cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE)); -+ assert("nikita-3186", cap_mode != 0); -+ return cap_mode; -+} -+ -+/* This is an external interface to try_capture_block(), it calls -+ try_capture_block() repeatedly as long as -E_REPEAT is returned. -+ -+ @node: node to capture, -+ @lock_mode: read or write lock is used in capture mode calculation, -+ @flags: see txn_capture flags enumeration, -+ @can_coc : can copy-on-capture -+ -+ @return: 0 - node was successfully captured, -E_REPEAT - capture request -+ cannot be processed immediately as it was requested in flags, -+ < 0 - other errors. 
-+*/ -+int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode, -+ txn_capture flags) -+{ -+ txn_atom *atom_alloc = NULL; -+ txn_capture cap_mode; -+ txn_handle *txnh = get_current_context()->trans; -+ int ret; -+ -+ assert_spin_locked(&(node->guard)); -+ -+ repeat: -+ if (JF_ISSET(node, JNODE_IS_DYING)) -+ return RETERR(-EINVAL); -+ if (node->atom != NULL && txnh->atom == node->atom) -+ return 0; -+ cap_mode = build_capture_mode(node, lock_mode, flags); -+ if (cap_mode == 0 || -+ (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) { -+ /* Mark this node as "MISSED". It helps in further deadlock -+ * analysis */ -+ if (jnode_is_znode(node)) -+ JF_SET(node, JNODE_MISSED_IN_CAPTURE); -+ return 0; -+ } -+ /* Repeat try_capture as long as -E_REPEAT is returned. */ -+ ret = try_capture_block(txnh, node, cap_mode, &atom_alloc); -+ /* Regardless of non_blocking: -+ -+ If ret == 0 then jnode is still locked. -+ If ret != 0 then jnode is unlocked. -+ */ -+#if REISER4_DEBUG -+ if (ret == 0) -+ assert_spin_locked(&(node->guard)); -+ else -+ assert_spin_not_locked(&(node->guard)); -+#endif -+ assert_spin_not_locked(&(txnh->guard)); -+ -+ if (ret == -E_REPEAT) { -+ /* E_REPEAT implies all locks were released, therefore we need -+ to take the jnode's lock again. */ -+ spin_lock_jnode(node); -+ -+ /* Although this may appear to be a busy loop, it is not. -+ There are several conditions that cause E_REPEAT to be -+ returned by the call to try_capture_block, all cases -+ indicating some kind of state change that means you should -+ retry the request and will get a different result. In some -+ cases this could be avoided with some extra code, but -+ generally it is done because the necessary locks were -+ released as a result of the operation and repeating is the -+ simplest thing to do (less bug potential). 
The cases are: -+ atom fusion returns E_REPEAT after it completes (jnode and -+ txnh were unlocked); race conditions in assign_block, -+ assign_txnh, and init_fusion return E_REPEAT (trylock -+ failure); after going to sleep in capture_fuse_wait -+ (request was blocked but may now succeed). I'm not quite -+ sure how capture_copy works yet, but it may also return -+ E_REPEAT. When the request is legitimately blocked, the -+ requestor goes to sleep in fuse_wait, so this is not a busy -+ loop. */ -+ /* NOTE-NIKITA: still don't understand: -+ -+ try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT -+ -+ looks like busy loop? -+ */ -+ goto repeat; -+ } -+ -+ /* free extra atom object that was possibly allocated by -+ try_capture_block(). -+ -+ Do this before acquiring jnode spin lock to -+ minimize time spent under lock. --nikita */ -+ if (atom_alloc != NULL) { -+ kmem_cache_free(_atom_slab, atom_alloc); -+ } -+ -+ if (ret != 0) { -+ if (ret == -E_BLOCK) { -+ assert("nikita-3360", -+ cap_mode & TXN_CAPTURE_NONBLOCKING); -+ ret = -E_REPEAT; -+ } -+ -+ /* Failure means jnode is not locked. FIXME_LATER_JMACD May -+ want to fix the above code to avoid releasing the lock and -+ re-acquiring it, but there are cases were failure occurs -+ when the lock is not held, and those cases would need to be -+ modified to re-take the lock. */ -+ spin_lock_jnode(node); -+ } -+ -+ /* Jnode is still locked. */ -+ assert_spin_locked(&(node->guard)); -+ return ret; -+} -+ -+static void release_two_atoms(txn_atom *one, txn_atom *two) -+{ -+ spin_unlock_atom(one); -+ atom_dec_and_unlock(two); -+ spin_lock_atom(one); -+ atom_dec_and_unlock(one); -+} -+ -+/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is -+ returned by that routine. The txn_capture request mode is computed here depending on -+ the transaction handle's type and the lock request. 
This is called from the depths of -+ the lock manager with the jnode lock held and it always returns with the jnode lock -+ held. -+*/ -+ -+/* fuse all 'active' atoms of lock owners of given node. */ -+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node) -+{ -+ lock_handle *lh; -+ int repeat; -+ txn_atom *atomh, *atomf; -+ reiser4_context *me = get_current_context(); -+ reiser4_context *ctx = NULL; -+ -+ assert_spin_not_locked(&(ZJNODE(node)->guard)); -+ assert_spin_not_locked(&(txnh->hlock)); -+ -+ repeat: -+ repeat = 0; -+ atomh = txnh_get_atom(txnh); -+ spin_unlock_txnh(txnh); -+ assert("zam-692", atomh != NULL); -+ -+ spin_lock_zlock(&node->lock); -+ /* inspect list of lock owners */ -+ list_for_each_entry(lh, &node->lock.owners, owners_link) { -+ ctx = get_context_by_lock_stack(lh->owner); -+ if (ctx == me) -+ continue; -+ /* below we use two assumptions to avoid addition spin-locks -+ for checking the condition : -+ -+ 1) if the lock stack has lock, the transaction should be -+ opened, i.e. ctx->trans != NULL; -+ -+ 2) reading of well-aligned ctx->trans->atom is atomic, if it -+ equals to the address of spin-locked atomh, we take that -+ the atoms are the same, nothing has to be captured. */ -+ if (atomh != ctx->trans->atom) { -+ reiser4_wake_up(lh->owner); -+ repeat = 1; -+ break; -+ } -+ } -+ if (repeat) { -+ if (!spin_trylock_txnh(ctx->trans)) { -+ spin_unlock_zlock(&node->lock); -+ spin_unlock_atom(atomh); -+ goto repeat; -+ } -+ atomf = ctx->trans->atom; -+ if (atomf == NULL) { -+ capture_assign_txnh_nolock(atomh, ctx->trans); -+ /* release zlock lock _after_ assigning the atom to the -+ * transaction handle, otherwise the lock owner thread -+ * may unlock all znodes, exit kernel context and here -+ * we would access an invalid transaction handle. 
*/ -+ spin_unlock_zlock(&node->lock); -+ spin_unlock_atom(atomh); -+ spin_unlock_txnh(ctx->trans); -+ goto repeat; -+ } -+ assert("zam-1059", atomf != atomh); -+ spin_unlock_zlock(&node->lock); -+ atomic_inc(&atomh->refcount); -+ atomic_inc(&atomf->refcount); -+ spin_unlock_txnh(ctx->trans); -+ if (atomf > atomh) { -+ spin_lock_atom_nested(atomf); -+ } else { -+ spin_unlock_atom(atomh); -+ spin_lock_atom(atomf); -+ spin_lock_atom_nested(atomh); -+ } -+ if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) { -+ release_two_atoms(atomf, atomh); -+ goto repeat; -+ } -+ atomic_dec(&atomh->refcount); -+ atomic_dec(&atomf->refcount); -+ capture_fuse_into(atomf, atomh); -+ goto repeat; -+ } -+ spin_unlock_zlock(&node->lock); -+ spin_unlock_atom(atomh); -+} -+ -+/* This is the interface to capture unformatted nodes via their struct page -+ reference. Currently it is only used in reiser4_invalidatepage */ -+int try_capture_page_to_invalidate(struct page *pg) -+{ -+ int ret; -+ jnode *node; -+ -+ assert("umka-292", pg != NULL); -+ assert("nikita-2597", PageLocked(pg)); -+ -+ if (IS_ERR(node = jnode_of_page(pg))) { -+ return PTR_ERR(node); -+ } -+ -+ spin_lock_jnode(node); -+ unlock_page(pg); -+ -+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ spin_unlock_jnode(node); -+ jput(node); -+ lock_page(pg); -+ return ret; -+} -+ -+/* This informs the transaction manager when a node is deleted. Add the block to the -+ atom's delete set and uncapture the block. -+ -+VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for -+explanations. find all the functions that use it, and unless there is some very -+good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....), -+move the loop to inside the function. -+ -+VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times? 
-+ */ -+void reiser4_uncapture_page(struct page *pg) -+{ -+ jnode *node; -+ txn_atom *atom; -+ -+ assert("umka-199", pg != NULL); -+ assert("nikita-3155", PageLocked(pg)); -+ -+ clear_page_dirty_for_io(pg); -+ -+ reiser4_wait_page_writeback(pg); -+ -+ node = jprivate(pg); -+ BUG_ON(node == NULL); -+ -+ spin_lock_jnode(node); -+ -+ atom = jnode_get_atom(node); -+ if (atom == NULL) { -+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); -+ spin_unlock_jnode(node); -+ return; -+ } -+ -+ /* We can remove jnode from transaction even if it is on flush queue -+ * prepped list, we only need to be sure that flush queue is not being -+ * written by reiser4_write_fq(). reiser4_write_fq() does not use atom -+ * spin lock for protection of the prepped nodes list, instead -+ * write_fq() increments atom's nr_running_queues counters for the time -+ * when prepped list is not protected by spin lock. Here we check this -+ * counter if we want to remove jnode from flush queue and, if the -+ * counter is not zero, wait all reiser4_write_fq() for this atom to -+ * complete. This is not significant overhead. */ -+ while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) { -+ spin_unlock_jnode(node); -+ /* -+ * at this moment we want to wait for "atom event", viz. wait -+ * until @node can be removed from flush queue. But -+ * reiser4_atom_wait_event() cannot be called with page locked, -+ * because it deadlocks with jnode_extent_write(). Unlock page, -+ * after making sure (through page_cache_get()) that it cannot -+ * be released from memory. -+ */ -+ page_cache_get(pg); -+ unlock_page(pg); -+ reiser4_atom_wait_event(atom); -+ lock_page(pg); -+ /* -+ * page may has been detached by ->writepage()->releasepage(). 
-+ */ -+ reiser4_wait_page_writeback(pg); -+ spin_lock_jnode(node); -+ page_cache_release(pg); -+ atom = jnode_get_atom(node); -+/* VS-FIXME-HANS: improve the commenting in this function */ -+ if (atom == NULL) { -+ spin_unlock_jnode(node); -+ return; -+ } -+ } -+ reiser4_uncapture_block(node); -+ spin_unlock_atom(atom); -+ jput(node); -+} -+ -+/* this is used in extent's kill hook to uncapture and unhash jnodes attached to -+ * inode's tree of jnodes */ -+void reiser4_uncapture_jnode(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert_spin_locked(&(node->guard)); -+ assert("", node->pg == 0); -+ -+ atom = jnode_get_atom(node); -+ if (atom == NULL) { -+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); -+ spin_unlock_jnode(node); -+ return; -+ } -+ -+ reiser4_uncapture_block(node); -+ spin_unlock_atom(atom); -+ jput(node); -+} -+ -+/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer, -+ increases atom refcount and txnh_count, adds to txnh_list. */ -+static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh) -+{ -+ assert("umka-200", atom != NULL); -+ assert("umka-201", txnh != NULL); -+ -+ assert_spin_locked(&(txnh->hlock)); -+ assert_spin_locked(&(atom->alock)); -+ assert("jmacd-824", txnh->atom == NULL); -+ assert("nikita-3540", atom_isopen(atom)); -+ BUG_ON(txnh->atom != NULL); -+ -+ atomic_inc(&atom->refcount); -+ txnh->atom = atom; -+ reiser4_ctx_gfp_mask_set(); -+ list_add_tail(&txnh->txnh_link, &atom->txnh_list); -+ atom->txnh_count += 1; -+} -+ -+/* No-locking version of assign_block. Sets the block's atom pointer, references the -+ block, adds it to the clean or dirty capture_jnode list, increments capture_count. 
*/ -+static void capture_assign_block_nolock(txn_atom *atom, jnode *node) -+{ -+ assert("umka-202", atom != NULL); -+ assert("umka-203", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ assert_spin_locked(&(atom->alock)); -+ assert("jmacd-323", node->atom == NULL); -+ BUG_ON(!list_empty_careful(&node->capture_link)); -+ assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY)); -+ -+ /* Pointer from jnode to atom is not counted in atom->refcount. */ -+ node->atom = atom; -+ -+ list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom)); -+ atom->capture_count += 1; -+ /* reference to jnode is acquired by atom. */ -+ jref(node); -+ -+ ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1)); -+ -+ LOCK_CNT_INC(t_refs); -+} -+ -+/* common code for dirtying both unformatted jnodes and formatted znodes. */ -+static void do_jnode_make_dirty(jnode * node, txn_atom * atom) -+{ -+ assert_spin_locked(&(node->guard)); -+ assert_spin_locked(&(atom->alock)); -+ assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY)); -+ -+ JF_SET(node, JNODE_DIRTY); -+ -+ get_current_context()->nr_marked_dirty++; -+ -+ /* We grab2flush_reserve one additional block only if node was -+ not CREATED and jnode_flush did not sort it into neither -+ relocate set nor overwrite one. If node is in overwrite or -+ relocate set we assume that atom's flush reserved counter was -+ already adjusted. */ -+ if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC) -+ && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node) -+ && !jnode_is_cluster_page(node)) { -+ assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr)); -+ assert("vs-1506", *jnode_get_block(node) != 0); -+ grabbed2flush_reserved_nolock(atom, (__u64) 1); -+ JF_SET(node, JNODE_FLUSH_RESERVED); -+ } -+ -+ if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) { -+ /* If the atom is not set yet, it will be added to the appropriate list in -+ capture_assign_block_nolock. 
*/ -+ /* Sometimes a node is set dirty before being captured -- the case for new -+ jnodes. In that case the jnode will be added to the appropriate list -+ in capture_assign_block_nolock. Another reason not to re-link jnode is -+ that jnode is on a flush queue (see flush.c for details) */ -+ -+ int level = jnode_get_level(node); -+ -+ assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR)); -+ assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT); -+ assert("nikita-2607", 0 <= level); -+ assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT); -+ -+ /* move node to atom's dirty list */ -+ list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level)); -+ ON_DEBUG(count_jnode -+ (atom, node, NODE_LIST(node), DIRTY_LIST, 1)); -+ } -+} -+ -+/* Set the dirty status for this (spin locked) jnode. */ -+void jnode_make_dirty_locked(jnode * node) -+{ -+ assert("umka-204", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ -+ if (REISER4_DEBUG && rofs_jnode(node)) { -+ warning("nikita-3365", "Dirtying jnode on rofs"); -+ dump_stack(); -+ } -+ -+ /* Fast check for already dirty node */ -+ if (!JF_ISSET(node, JNODE_DIRTY)) { -+ txn_atom *atom; -+ -+ atom = jnode_get_atom(node); -+ assert("vs-1094", atom); -+ /* Check jnode dirty status again because node spin lock might -+ * be released inside jnode_get_atom(). */ -+ if (likely(!JF_ISSET(node, JNODE_DIRTY))) -+ do_jnode_make_dirty(node, atom); -+ spin_unlock_atom(atom); -+ } -+} -+ -+/* Set the dirty status for this znode. */ -+void znode_make_dirty(znode * z) -+{ -+ jnode *node; -+ struct page *page; -+ -+ assert("umka-204", z != NULL); -+ assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z)); -+ assert("nikita-3560", znode_is_write_locked(z)); -+ -+ node = ZJNODE(z); -+ /* znode is longterm locked, we can check dirty bit without spinlock */ -+ if (JF_ISSET(node, JNODE_DIRTY)) { -+ /* znode is dirty already. 
All we have to do is to change znode version */ -+ z->version = znode_build_version(jnode_get_tree(node)); -+ return; -+ } -+ -+ spin_lock_jnode(node); -+ jnode_make_dirty_locked(node); -+ page = jnode_page(node); -+ if (page != NULL) { -+ /* this is useful assertion (allows one to check that no -+ * modifications are lost due to update of in-flight page), -+ * but it requires locking on page to check PG_writeback -+ * bit. */ -+ /* assert("nikita-3292", -+ !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */ -+ page_cache_get(page); -+ -+ /* jnode lock is not needed for the rest of -+ * znode_set_dirty(). */ -+ spin_unlock_jnode(node); -+ /* reiser4 file write code calls set_page_dirty for -+ * unformatted nodes, for formatted nodes we do it here. */ -+ reiser4_set_page_dirty_internal(page); -+ page_cache_release(page); -+ /* bump version counter in znode */ -+ z->version = znode_build_version(jnode_get_tree(node)); -+ } else { -+ assert("zam-596", znode_above_root(JZNODE(node))); -+ spin_unlock_jnode(node); -+ } -+ -+ assert("nikita-1900", znode_is_write_locked(z)); -+ assert("jmacd-9777", node->atom != NULL); -+} -+ -+int reiser4_sync_atom(txn_atom * atom) -+{ -+ int result; -+ txn_handle *txnh; -+ -+ txnh = get_current_context()->trans; -+ -+ result = 0; -+ if (atom != NULL) { -+ if (atom->stage < ASTAGE_PRE_COMMIT) { -+ spin_lock_txnh(txnh); -+ capture_assign_txnh_nolock(atom, txnh); -+ result = force_commit_atom(txnh); -+ } else if (atom->stage < ASTAGE_POST_COMMIT) { -+ /* wait atom commit */ -+ reiser4_atom_wait_event(atom); -+ /* try once more */ -+ result = RETERR(-E_REPEAT); -+ } else -+ spin_unlock_atom(atom); -+ } -+ return result; -+} -+ -+#if REISER4_DEBUG -+ -+/* move jnode form one list to another -+ call this after atom->capture_count is updated */ -+void -+count_jnode(txn_atom * atom, jnode * node, atom_list old_list, -+ atom_list new_list, int check_lists) -+{ -+ struct list_head *pos; -+ -+ assert("zam-1018", atom_is_protected(atom)); -+ 
assert_spin_locked(&(node->guard)); -+ assert("", NODE_LIST(node) == old_list); -+ -+ switch (NODE_LIST(node)) { -+ case NOT_CAPTURED: -+ break; -+ case DIRTY_LIST: -+ assert("", atom->dirty > 0); -+ atom->dirty--; -+ break; -+ case CLEAN_LIST: -+ assert("", atom->clean > 0); -+ atom->clean--; -+ break; -+ case FQ_LIST: -+ assert("", atom->fq > 0); -+ atom->fq--; -+ break; -+ case WB_LIST: -+ assert("", atom->wb > 0); -+ atom->wb--; -+ break; -+ case OVRWR_LIST: -+ assert("", atom->ovrwr > 0); -+ atom->ovrwr--; -+ break; -+ default: -+ impossible("", ""); -+ } -+ -+ switch (new_list) { -+ case NOT_CAPTURED: -+ break; -+ case DIRTY_LIST: -+ atom->dirty++; -+ break; -+ case CLEAN_LIST: -+ atom->clean++; -+ break; -+ case FQ_LIST: -+ atom->fq++; -+ break; -+ case WB_LIST: -+ atom->wb++; -+ break; -+ case OVRWR_LIST: -+ atom->ovrwr++; -+ break; -+ default: -+ impossible("", ""); -+ } -+ ASSIGN_NODE_LIST(node, new_list); -+ if (0 && check_lists) { -+ int count; -+ tree_level level; -+ -+ count = 0; -+ -+ /* flush queue list */ -+ /* reiser4_check_fq(atom); */ -+ -+ /* dirty list */ -+ count = 0; -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { -+ list_for_each(pos, ATOM_DIRTY_LIST(atom, level)) -+ count++; -+ } -+ if (count != atom->dirty) -+ warning("", "dirty counter %d, real %d\n", atom->dirty, -+ count); -+ -+ /* clean list */ -+ count = 0; -+ list_for_each(pos, ATOM_CLEAN_LIST(atom)) -+ count++; -+ if (count != atom->clean) -+ warning("", "clean counter %d, real %d\n", atom->clean, -+ count); -+ -+ /* wb list */ -+ count = 0; -+ list_for_each(pos, ATOM_WB_LIST(atom)) -+ count++; -+ if (count != atom->wb) -+ warning("", "wb counter %d, real %d\n", atom->wb, -+ count); -+ -+ /* overwrite list */ -+ count = 0; -+ list_for_each(pos, ATOM_OVRWR_LIST(atom)) -+ count++; -+ -+ if (count != atom->ovrwr) -+ warning("", "ovrwr counter %d, real %d\n", atom->ovrwr, -+ count); -+ } -+ assert("vs-1624", atom->num_queued == atom->fq); -+ if (atom->capture_count 
!= -+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) { -+ printk -+ ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n", -+ atom->capture_count, atom->dirty, atom->clean, atom->ovrwr, -+ atom->wb, atom->fq); -+ assert("vs-1622", -+ atom->capture_count == -+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + -+ atom->fq); -+ } -+} -+ -+#endif -+ -+/* Make node OVRWR and put it on atom->overwrite_nodes list, atom lock and jnode -+ * lock should be taken before calling this function. */ -+void jnode_make_wander_nolock(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert("nikita-2431", node != NULL); -+ assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC)); -+ assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY)); -+ assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); -+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node))); -+ -+ atom = node->atom; -+ -+ assert("zam-895", atom != NULL); -+ assert("zam-894", atom_is_protected(atom)); -+ -+ JF_SET(node, JNODE_OVRWR); -+ /* move node to atom's overwrite list */ -+ list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom)); -+ ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1)); -+} -+ -+/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside -+ * this function. 
*/ -+void jnode_make_wander(jnode * node) -+{ -+ txn_atom *atom; -+ -+ spin_lock_jnode(node); -+ atom = jnode_get_atom(node); -+ assert("zam-913", atom != NULL); -+ assert("zam-914", !JF_ISSET(node, JNODE_RELOC)); -+ -+ jnode_make_wander_nolock(node); -+ spin_unlock_atom(atom); -+ spin_unlock_jnode(node); -+} -+ -+/* this just sets RELOC bit */ -+static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node) -+{ -+ assert_spin_locked(&(node->guard)); -+ assert("zam-916", JF_ISSET(node, JNODE_DIRTY)); -+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC)); -+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR)); -+ assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); -+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node))); -+ jnode_set_reloc(node); -+} -+ -+/* Make znode RELOC and put it on flush queue */ -+void znode_make_reloc(znode * z, flush_queue_t * fq) -+{ -+ jnode *node; -+ txn_atom *atom; -+ -+ node = ZJNODE(z); -+ spin_lock_jnode(node); -+ -+ atom = jnode_get_atom(node); -+ assert("zam-919", atom != NULL); -+ -+ jnode_make_reloc_nolock(fq, node); -+ queue_jnode(fq, node); -+ -+ spin_unlock_atom(atom); -+ spin_unlock_jnode(node); -+ -+} -+ -+/* Make unformatted node RELOC and put it on flush queue */ -+void unformatted_make_reloc(jnode *node, flush_queue_t *fq) -+{ -+ assert("vs-1479", jnode_is_unformatted(node)); -+ -+ jnode_make_reloc_nolock(fq, node); -+ queue_jnode(fq, node); -+} -+ -+int reiser4_capture_super_block(struct super_block *s) -+{ -+ int result; -+ znode *uber; -+ lock_handle lh; -+ -+ init_lh(&lh); -+ result = get_uber_znode(reiser4_get_tree(s), -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh); -+ if (result) -+ return result; -+ -+ uber = lh.node; -+ /* Grabbing one block for superblock */ -+ result = reiser4_grab_space_force((__u64) 1, BA_RESERVED); -+ if (result != 0) -+ return result; -+ -+ znode_make_dirty(uber); -+ -+ done_lh(&lh); -+ return 0; -+} -+ -+/* Wakeup every handle on the atom's WAITFOR list */ -+static void 
wakeup_atom_waitfor_list(txn_atom * atom) -+{ -+ txn_wait_links *wlinks; -+ -+ assert("umka-210", atom != NULL); -+ -+ /* atom is locked */ -+ list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) { -+ if (wlinks->waitfor_cb == NULL || -+ wlinks->waitfor_cb(atom, wlinks)) -+ /* Wake up. */ -+ reiser4_wake_up(wlinks->_lock_stack); -+ } -+} -+ -+/* Wakeup every handle on the atom's WAITING list */ -+static void wakeup_atom_waiting_list(txn_atom * atom) -+{ -+ txn_wait_links *wlinks; -+ -+ assert("umka-211", atom != NULL); -+ -+ /* atom is locked */ -+ list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) { -+ if (wlinks->waiting_cb == NULL || -+ wlinks->waiting_cb(atom, wlinks)) -+ /* Wake up. */ -+ reiser4_wake_up(wlinks->_lock_stack); -+ } -+} -+ -+/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */ -+static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks) -+{ -+ assert("nikita-3330", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ /* atom->txnh_count == 1 is for waking waiters up if we are releasing -+ * last transaction handle. */ -+ return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1; -+} -+ -+/* The general purpose of this function is to wait on the first of two possible events. -+ The situation is that a handle (and its atom atomh) is blocked trying to capture a -+ block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The -+ handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with -+ another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it -+ needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will -+ proceed and fuse the two atoms in the CAPTURE_WAIT state. -+ -+ In other words, if either atomh or atomf change state, the handle will be awakened, -+ thus there are two lists per atom: WAITING and WAITFOR. 
-+ -+ This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to -+ close but it is not assigned to an atom of its own. -+ -+ Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK, -+ BOTH_ATOM_LOCKS. Result: all four locks are released. -+*/ -+static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf, -+ txn_atom * atomh, txn_capture mode) -+{ -+ int ret; -+ txn_wait_links wlinks; -+ -+ assert("umka-213", txnh != NULL); -+ assert("umka-214", atomf != NULL); -+ -+ if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) { -+ spin_unlock_txnh(txnh); -+ spin_unlock_atom(atomf); -+ -+ if (atomh) { -+ spin_unlock_atom(atomh); -+ } -+ -+ return RETERR(-E_BLOCK); -+ } -+ -+ /* Initialize the waiting list links. */ -+ init_wlinks(&wlinks); -+ -+ /* Add txnh to atomf's waitfor list, unlock atomf. */ -+ list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list); -+ wlinks.waitfor_cb = wait_for_fusion; -+ atomic_inc(&atomf->refcount); -+ spin_unlock_atom(atomf); -+ -+ if (atomh) { -+ /* Add txnh to atomh's waiting list, unlock atomh. */ -+ list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list); -+ atomic_inc(&atomh->refcount); -+ spin_unlock_atom(atomh); -+ } -+ -+ /* Go to sleep. */ -+ spin_unlock_txnh(txnh); -+ -+ ret = reiser4_prepare_to_sleep(wlinks._lock_stack); -+ if (ret == 0) { -+ reiser4_go_to_sleep(wlinks._lock_stack); -+ ret = RETERR(-E_REPEAT); -+ } -+ -+ /* Remove from the waitfor list. */ -+ spin_lock_atom(atomf); -+ -+ list_del(&wlinks._fwaitfor_link); -+ atom_dec_and_unlock(atomf); -+ -+ if (atomh) { -+ /* Remove from the waiting list. 
*/ -+ spin_lock_atom(atomh); -+ list_del(&wlinks._fwaiting_link); -+ atom_dec_and_unlock(atomh); -+ } -+ return ret; -+} -+ -+static void lock_two_atoms(txn_atom * one, txn_atom * two) -+{ -+ assert("zam-1067", one != two); -+ -+ /* lock the atom with lesser address first */ -+ if (one < two) { -+ spin_lock_atom(one); -+ spin_lock_atom_nested(two); -+ } else { -+ spin_lock_atom(two); -+ spin_lock_atom_nested(one); -+ } -+} -+ -+/* Perform the necessary work to prepare for fusing two atoms, which involves -+ * acquiring two atom locks in the proper order. If one of the node's atom is -+ * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's -+ * atom is not then the handle's request is put to sleep. If the node's atom -+ * is committing, then the node can be copy-on-captured. Otherwise, pick the -+ * atom with fewer pointers to be fused into the atom with more pointer and -+ * call capture_fuse_into. -+ */ -+static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode) -+{ -+ txn_atom * txnh_atom = txnh->atom; -+ txn_atom * block_atom = node->atom; -+ -+ atomic_inc(&txnh_atom->refcount); -+ atomic_inc(&block_atom->refcount); -+ -+ spin_unlock_txnh(txnh); -+ spin_unlock_jnode(node); -+ -+ lock_two_atoms(txnh_atom, block_atom); -+ -+ if (txnh->atom != txnh_atom || node->atom != block_atom ) { -+ release_two_atoms(txnh_atom, block_atom); -+ return RETERR(-E_REPEAT); -+ } -+ -+ atomic_dec(&txnh_atom->refcount); -+ atomic_dec(&block_atom->refcount); -+ -+ assert ("zam-1066", atom_isopen(txnh_atom)); -+ -+ if (txnh_atom->stage >= block_atom->stage || -+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) { -+ capture_fuse_into(txnh_atom, block_atom); -+ return RETERR(-E_REPEAT); -+ } -+ spin_lock_txnh(txnh); -+ return capture_fuse_wait(txnh, block_atom, txnh_atom, mode); -+} -+ -+/* This function splices together two jnode lists (small and large) and sets all jnodes in -+ the small list to point to the large 
atom. Returns the length of the list. */ -+static int -+capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head, -+ struct list_head *small_head) -+{ -+ int count = 0; -+ jnode *node; -+ -+ assert("umka-218", large != NULL); -+ assert("umka-219", large_head != NULL); -+ assert("umka-220", small_head != NULL); -+ /* small atom should be locked also. */ -+ assert_spin_locked(&(large->alock)); -+ -+ /* For every jnode on small's capture list... */ -+ list_for_each_entry(node, small_head, capture_link) { -+ count += 1; -+ -+ /* With the jnode lock held, update atom pointer. */ -+ spin_lock_jnode(node); -+ node->atom = large; -+ spin_unlock_jnode(node); -+ } -+ -+ /* Splice the lists. */ -+ list_splice_init(small_head, large_head->prev); -+ -+ return count; -+} -+ -+/* This function splices together two txnh lists (small and large) and sets all txn handles in -+ the small list to point to the large atom. Returns the length of the list. */ -+static int -+capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head, -+ struct list_head *small_head) -+{ -+ int count = 0; -+ txn_handle *txnh; -+ -+ assert("umka-221", large != NULL); -+ assert("umka-222", large_head != NULL); -+ assert("umka-223", small_head != NULL); -+ -+ /* Adjust every txnh to the new atom. */ -+ list_for_each_entry(txnh, small_head, txnh_link) { -+ count += 1; -+ -+ /* With the txnh lock held, update atom pointer. */ -+ spin_lock_txnh(txnh); -+ txnh->atom = large; -+ spin_unlock_txnh(txnh); -+ } -+ -+ /* Splice the txn_handle list. */ -+ list_splice_init(small_head, large_head->prev); -+ -+ return count; -+} -+ -+/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are -+ added to LARGE and their ->atom pointers are all updated. The associated counts are -+ updated as well, and any waiting handles belonging to either are awakened. Finally the -+ smaller atom's refcount is decremented. 
-+*/ -+static void capture_fuse_into(txn_atom * small, txn_atom * large) -+{ -+ int level; -+ unsigned zcount = 0; -+ unsigned tcount = 0; -+ -+ assert("umka-224", small != NULL); -+ assert("umka-225", small != NULL); -+ -+ assert_spin_locked(&(large->alock)); -+ assert_spin_locked(&(small->alock)); -+ -+ assert("jmacd-201", atom_isopen(small)); -+ assert("jmacd-202", atom_isopen(large)); -+ -+ /* Splice and update the per-level dirty jnode lists */ -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { -+ zcount += -+ capture_fuse_jnode_lists(large, -+ ATOM_DIRTY_LIST(large, level), -+ ATOM_DIRTY_LIST(small, level)); -+ } -+ -+ /* Splice and update the [clean,dirty] jnode and txnh lists */ -+ zcount += -+ capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large), -+ ATOM_CLEAN_LIST(small)); -+ zcount += -+ capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large), -+ ATOM_OVRWR_LIST(small)); -+ zcount += -+ capture_fuse_jnode_lists(large, ATOM_WB_LIST(large), -+ ATOM_WB_LIST(small)); -+ zcount += -+ capture_fuse_jnode_lists(large, &large->inodes, &small->inodes); -+ tcount += -+ capture_fuse_txnh_lists(large, &large->txnh_list, -+ &small->txnh_list); -+ -+ /* Check our accounting. 
*/ -+ assert("jmacd-1063", -+ zcount + small->num_queued == small->capture_count); -+ assert("jmacd-1065", tcount == small->txnh_count); -+ -+ /* sum numbers of waiters threads */ -+ large->nr_waiters += small->nr_waiters; -+ small->nr_waiters = 0; -+ -+ /* splice flush queues */ -+ reiser4_fuse_fq(large, small); -+ -+ /* update counter of jnode on every atom' list */ -+ ON_DEBUG(large->dirty += small->dirty; -+ small->dirty = 0; -+ large->clean += small->clean; -+ small->clean = 0; -+ large->ovrwr += small->ovrwr; -+ small->ovrwr = 0; -+ large->wb += small->wb; -+ small->wb = 0; -+ large->fq += small->fq; -+ small->fq = 0;); -+ -+ /* count flushers in result atom */ -+ large->nr_flushers += small->nr_flushers; -+ small->nr_flushers = 0; -+ -+ /* update counts of flushed nodes */ -+ large->flushed += small->flushed; -+ small->flushed = 0; -+ -+ /* Transfer list counts to large. */ -+ large->txnh_count += small->txnh_count; -+ large->capture_count += small->capture_count; -+ -+ /* Add all txnh references to large. */ -+ atomic_add(small->txnh_count, &large->refcount); -+ atomic_sub(small->txnh_count, &small->refcount); -+ -+ /* Reset small counts */ -+ small->txnh_count = 0; -+ small->capture_count = 0; -+ -+ /* Assign the oldest start_time, merge flags. */ -+ large->start_time = min(large->start_time, small->start_time); -+ large->flags |= small->flags; -+ -+ /* Merge blocknr sets. 
*/ -+ blocknr_set_merge(&small->delete_set, &large->delete_set); -+ blocknr_set_merge(&small->wandered_map, &large->wandered_map); -+ -+ /* Merge allocated/deleted file counts */ -+ large->nr_objects_deleted += small->nr_objects_deleted; -+ large->nr_objects_created += small->nr_objects_created; -+ -+ small->nr_objects_deleted = 0; -+ small->nr_objects_created = 0; -+ -+ /* Merge allocated blocks counts */ -+ large->nr_blocks_allocated += small->nr_blocks_allocated; -+ -+ large->nr_running_queues += small->nr_running_queues; -+ small->nr_running_queues = 0; -+ -+ /* Merge blocks reserved for overwrite set. */ -+ large->flush_reserved += small->flush_reserved; -+ small->flush_reserved = 0; -+ -+ if (large->stage < small->stage) { -+ /* Large only needs to notify if it has changed state. */ -+ reiser4_atom_set_stage(large, small->stage); -+ wakeup_atom_waiting_list(large); -+ } -+ -+ reiser4_atom_set_stage(small, ASTAGE_INVALID); -+ -+ /* Notify any waiters--small needs to unload its wait lists. Waiters -+ actually remove themselves from the list before returning from the -+ fuse_wait function. */ -+ wakeup_atom_waiting_list(small); -+ -+ /* Unlock atoms */ -+ spin_unlock_atom(large); -+ atom_dec_and_unlock(small); -+} -+ -+/* TXNMGR STUFF */ -+ -+/* Release a block from the atom, reversing the effects of being captured, -+ do not release atom's reference to jnode due to holding spin-locks. -+ Currently this is only called when the atom commits. -+ -+ NOTE: this function does not release a (journal) reference to jnode -+ due to locking optimizations, you should call jput() somewhere after -+ calling reiser4_uncapture_block(). 
*/ -+void reiser4_uncapture_block(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert("umka-226", node != NULL); -+ atom = node->atom; -+ assert("umka-228", atom != NULL); -+ -+ assert("jmacd-1021", node->atom == atom); -+ assert_spin_locked(&(node->guard)); -+ assert("jmacd-1023", atom_is_protected(atom)); -+ -+ JF_CLR(node, JNODE_DIRTY); -+ JF_CLR(node, JNODE_RELOC); -+ JF_CLR(node, JNODE_OVRWR); -+ JF_CLR(node, JNODE_CREATED); -+ JF_CLR(node, JNODE_WRITEBACK); -+ JF_CLR(node, JNODE_REPACK); -+ -+ list_del_init(&node->capture_link); -+ if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) { -+ assert("zam-925", atom_isopen(atom)); -+ assert("vs-1623", NODE_LIST(node) == FQ_LIST); -+ ON_DEBUG(atom->num_queued--); -+ JF_CLR(node, JNODE_FLUSH_QUEUED); -+ } -+ atom->capture_count -= 1; -+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1)); -+ node->atom = NULL; -+ -+ spin_unlock_jnode(node); -+ LOCK_CNT_DEC(t_refs); -+} -+ -+/* Unconditional insert of jnode into atom's overwrite list. Currently used in -+ bitmap-based allocator code for adding modified bitmap blocks the -+ transaction. 
@atom and @node are spin locked */ -+void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node) -+{ -+ assert("zam-538", atom_is_protected(atom)); -+ assert_spin_locked(&(node->guard)); -+ assert("zam-899", JF_ISSET(node, JNODE_OVRWR)); -+ assert("zam-543", node->atom == NULL); -+ assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node)); -+ -+ list_add(&node->capture_link, ATOM_OVRWR_LIST(atom)); -+ jref(node); -+ node->atom = atom; -+ atom->capture_count++; -+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1)); -+} -+ -+static int count_deleted_blocks_actor(txn_atom * atom, -+ const reiser4_block_nr * a, -+ const reiser4_block_nr * b, void *data) -+{ -+ reiser4_block_nr *counter = data; -+ -+ assert("zam-995", data != NULL); -+ assert("zam-996", a != NULL); -+ if (b == NULL) -+ *counter += 1; -+ else -+ *counter += *b; -+ return 0; -+} -+ -+reiser4_block_nr txnmgr_count_deleted_blocks(void) -+{ -+ reiser4_block_nr result; -+ txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr; -+ txn_atom *atom; -+ -+ result = 0; -+ -+ spin_lock_txnmgr(tmgr); -+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { -+ spin_lock_atom(atom); -+ if (atom_isopen(atom)) -+ blocknr_set_iterator( -+ atom, &atom->delete_set, -+ count_deleted_blocks_actor, &result, 0); -+ spin_unlock_atom(atom); -+ } -+ spin_unlock_txnmgr(tmgr); -+ -+ return result; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.20.orig/fs/reiser4/txnmgr.h linux-2.6.20/fs/reiser4/txnmgr.h ---- linux-2.6.20.orig/fs/reiser4/txnmgr.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/txnmgr.h 2007-05-06 14:50:43.899038216 +0400 -@@ -0,0 +1,708 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* data-types and function declarations for transaction manager. 
See txnmgr.c -+ * for details. */ -+ -+#ifndef __REISER4_TXNMGR_H__ -+#define __REISER4_TXNMGR_H__ -+ -+#include "forward.h" -+#include "dformat.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* TYPE DECLARATIONS */ -+ -+/* This enumeration describes the possible types of a capture request (reiser4_try_capture). -+ A capture request dynamically assigns a block to the calling thread's transaction -+ handle. */ -+typedef enum { -+ /* A READ_ATOMIC request indicates that a block will be read and that the caller's -+ atom should fuse in order to ensure that the block commits atomically with the -+ caller. */ -+ TXN_CAPTURE_READ_ATOMIC = (1 << 0), -+ -+ /* A READ_NONCOM request indicates that a block will be read and that the caller is -+ willing to read a non-committed block without causing atoms to fuse. */ -+ TXN_CAPTURE_READ_NONCOM = (1 << 1), -+ -+ /* A READ_MODIFY request indicates that a block will be read but that the caller -+ wishes for the block to be captured as it will be written. This capture request -+ mode is not currently used, but eventually it will be useful for preventing -+ deadlock in read-modify-write cycles. */ -+ TXN_CAPTURE_READ_MODIFY = (1 << 2), -+ -+ /* A WRITE capture request indicates that a block will be modified and that atoms -+ should fuse to make the commit atomic. */ -+ TXN_CAPTURE_WRITE = (1 << 3), -+ -+ /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the -+ exclusive type designation from extra bits that may be supplied -- see -+ below. */ -+ TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC | -+ TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY | -+ TXN_CAPTURE_WRITE), -+ -+ /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that -+ indicate modification will occur. 
*/ -+ TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE), -+ -+ /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would -+ prefer not to sleep waiting for an aging atom to commit. */ -+ TXN_CAPTURE_NONBLOCKING = (1 << 4), -+ -+ /* An option to reiser4_try_capture to prevent atom fusion, just simple -+ capturing is allowed */ -+ TXN_CAPTURE_DONT_FUSE = (1 << 5) -+ -+ /* This macro selects only the exclusive capture request types, stripping out any -+ options that were supplied (i.e., NONBLOCKING). */ -+#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES) -+} txn_capture; -+ -+/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only -+ difference is in the handling of read requests. A WRITE_FUSING transaction handle -+ defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSIONG -+ transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */ -+typedef enum { -+ TXN_WRITE_FUSING = (1 << 0), -+ TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */ -+} txn_mode; -+ -+/* Every atom has a stage, which is one of these exclusive values: */ -+typedef enum { -+ /* Initially an atom is free. */ -+ ASTAGE_FREE = 0, -+ -+ /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture -+ blocks and fuse with other atoms. */ -+ ASTAGE_CAPTURE_FUSE = 1, -+ -+ /* We need to have a ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk where X > 1. */ -+ -+ /* When an atom reaches a certain age it must do all it can to commit. An atom in -+ the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from -+ atoms in the CAPTURE_FUSE stage. */ -+ ASTAGE_CAPTURE_WAIT = 2, -+ -+ /* Waiting for I/O before commit. Copy-on-capture (see -+ http://namesys.com/v4/v4.html). */ -+ ASTAGE_PRE_COMMIT = 3, -+ -+ /* Post-commit overwrite I/O. Steal-on-capture. 
*/ -+ ASTAGE_POST_COMMIT = 4, -+ -+ /* Atom which waits for the removal of the last reference to (it? ) to -+ * be deleted from memory */ -+ ASTAGE_DONE = 5, -+ -+ /* invalid atom. */ -+ ASTAGE_INVALID = 6, -+ -+} txn_stage; -+ -+/* Certain flags may be set in the txn_atom->flags field. */ -+typedef enum { -+ /* Indicates that the atom should commit as soon as possible. */ -+ ATOM_FORCE_COMMIT = (1 << 0), -+ /* to avoid endless loop, mark the atom (which was considered as too -+ * small) after failed attempt to fuse it. */ -+ ATOM_CANCEL_FUSION = (1 << 1) -+} txn_flags; -+ -+/* Flags for controlling commit_txnh */ -+typedef enum { -+ /* Wait commit atom completion in commit_txnh */ -+ TXNH_WAIT_COMMIT = 0x2, -+ /* Don't commit atom when this handle is closed */ -+ TXNH_DONT_COMMIT = 0x4 -+} txn_handle_flags_t; -+ -+/* TYPE DEFINITIONS */ -+ -+/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom -+ fields, so typically an operation on the atom through either of these objects must (1) -+ lock the object, (2) read the atom pointer, (3) lock the atom. -+ -+ During atom fusion, the process holds locks on both atoms at once. Then, it iterates -+ through the list of handles and pages held by the smaller of the two atoms. For each -+ handle and page referencing the smaller atom, the fusing process must: (1) lock the -+ object, and (2) update the atom pointer. -+ -+ You can see that there is a conflict of lock ordering here, so the more-complex -+ procedure should have priority, i.e., the fusing process has priority so that it is -+ guaranteed to make progress and to avoid restarts. -+ -+ This decision, however, means additional complexity for aquiring the atom lock in the -+ first place. -+ -+ The general original procedure followed in the code was: -+ -+ TXN_OBJECT *obj = ...; -+ TXN_ATOM *atom; -+ -+ spin_lock (& obj->_lock); -+ -+ atom = obj->_atom; -+ -+ if (! 
spin_trylock_atom (atom)) -+ { -+ spin_unlock (& obj->_lock); -+ RESTART OPERATION, THERE WAS A RACE; -+ } -+ -+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED -+ -+ It has however been found that this wastes CPU a lot in a manner that is -+ hard to profile. So, proper refcounting was added to atoms, and new -+ standard locking sequence is like following: -+ -+ TXN_OBJECT *obj = ...; -+ TXN_ATOM *atom; -+ -+ spin_lock (& obj->_lock); -+ -+ atom = obj->_atom; -+ -+ if (! spin_trylock_atom (atom)) -+ { -+ atomic_inc (& atom->refcount); -+ spin_unlock (& obj->_lock); -+ spin_lock (&atom->_lock); -+ atomic_dec (& atom->refcount); -+ // HERE atom is locked -+ spin_unlock (&atom->_lock); -+ RESTART OPERATION, THERE WAS A RACE; -+ } -+ -+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED -+ -+ (core of this is implemented in trylock_throttle() function) -+ -+ See the jnode_get_atom() function for a common case. -+ -+ As an additional (and important) optimization allowing to avoid restarts, -+ it is possible to re-check required pre-conditions at the HERE point in -+ code above and proceed without restarting if they are still satisfied. -+*/ -+ -+/* An atomic transaction: this is the underlying system representation -+ of a transaction, not the one seen by clients. -+ -+ Invariants involving this data-type: -+ -+ [sb-fake-allocated] -+*/ -+struct txn_atom { -+ /* The spinlock protecting the atom, held during fusion and various other state -+ changes. */ -+ spinlock_t alock; -+ -+ /* The atom's reference counter, increasing (in case of a duplication -+ of an existing reference or when we are sure that some other -+ reference exists) may be done without taking spinlock, decrementing -+ of the ref. counter requires a spinlock to be held. -+ -+ Each transaction handle counts in ->refcount. All jnodes count as -+ one reference acquired in atom_begin_andlock(), released in -+ commit_current_atom(). 
-+ */ -+ atomic_t refcount; -+ -+ /* The atom_id identifies the atom in persistent records such as the log. */ -+ __u32 atom_id; -+ -+ /* Flags holding any of the txn_flags enumerated values (e.g., -+ ATOM_FORCE_COMMIT). */ -+ __u32 flags; -+ -+ /* Number of open handles. */ -+ __u32 txnh_count; -+ -+ /* The number of znodes captured by this atom. Equal to the sum of lengths of the -+ dirty_nodes[level] and clean_nodes lists. */ -+ __u32 capture_count; -+ -+#if REISER4_DEBUG -+ int clean; -+ int dirty; -+ int ovrwr; -+ int wb; -+ int fq; -+#endif -+ -+ __u32 flushed; -+ -+ /* Current transaction stage. */ -+ txn_stage stage; -+ -+ /* Start time. */ -+ unsigned long start_time; -+ -+ /* The atom's delete set. It collects block numbers of the nodes -+ which were deleted during the transaction. */ -+ struct list_head delete_set; -+ -+ /* The atom's wandered_block mapping. */ -+ struct list_head wandered_map; -+ -+ /* The transaction's list of dirty captured nodes--per level. Index -+ by (level). dirty_nodes[0] is for znode-above-root */ -+ struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1]; -+ -+ /* The transaction's list of clean captured nodes. */ -+ struct list_head clean_nodes; -+ -+ /* The atom's overwrite set */ -+ struct list_head ovrwr_nodes; -+ -+ /* nodes which are being written to disk */ -+ struct list_head writeback_nodes; -+ -+ /* list of inodes */ -+ struct list_head inodes; -+ -+ /* List of handles associated with this atom. */ -+ struct list_head txnh_list; -+ -+ /* Transaction list link: list of atoms in the transaction manager. */ -+ struct list_head atom_link; -+ -+ /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */ -+ struct list_head fwaitfor_list; -+ -+ /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */ -+ struct list_head fwaiting_list; -+ -+ /* Numbers of objects which were deleted/created in this transaction -+ thereby numbers of objects IDs which were released/deallocated. 
*/ -+ int nr_objects_deleted; -+ int nr_objects_created; -+ /* number of blocks allocated during the transaction */ -+ __u64 nr_blocks_allocated; -+ /* All atom's flush queue objects are on this list */ -+ struct list_head flush_queues; -+#if REISER4_DEBUG -+ /* number of flush queues for this atom. */ -+ int nr_flush_queues; -+ /* Number of jnodes which were removed from atom's lists and put -+ on flush_queue */ -+ int num_queued; -+#endif -+ /* number of threads who wait for this atom to complete commit */ -+ int nr_waiters; -+ /* number of threads which do jnode_flush() over this atom */ -+ int nr_flushers; -+ /* number of flush queues which are IN_USE and jnodes from fq->prepped -+ are submitted to disk by the reiser4_write_fq() routine. */ -+ int nr_running_queues; -+ /* A counter of grabbed unformatted nodes, see a description of the -+ * reiser4 space reservation scheme at block_alloc.c */ -+ reiser4_block_nr flush_reserved; -+#if REISER4_DEBUG -+ void *committer; -+#endif -+ struct super_block *super; -+}; -+ -+#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level]) -+#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes) -+#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes) -+#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes) -+#define ATOM_FQ_LIST(fq) (&(fq)->prepped) -+ -+#define NODE_LIST(node) (node)->list -+#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list) -+ON_DEBUG(void -+ count_jnode(txn_atom *, jnode *, atom_list old_list, -+ atom_list new_list, int check_lists)); -+ -+typedef struct protected_jnodes { -+ struct list_head inatom; /* link to atom's list these structures */ -+ struct list_head nodes; /* head of list of protected nodes */ -+} protected_jnodes; -+ -+/* A transaction handle: the client obtains and commits this handle which is assigned by -+ the system to a txn_atom. 
*/ -+struct txn_handle { -+ /* Spinlock protecting ->atom pointer */ -+ spinlock_t hlock; -+ -+ /* Flags for controlling commit_txnh() behavior */ -+ /* from txn_handle_flags_t */ -+ txn_handle_flags_t flags; -+ -+ /* Whether it is READ_FUSING or WRITE_FUSING. */ -+ txn_mode mode; -+ -+ /* If assigned, the atom it is part of. */ -+ txn_atom *atom; -+ -+ /* Transaction list link. Head is in txn_atom. */ -+ struct list_head txnh_link; -+}; -+ -+/* The transaction manager: one is contained in the reiser4_super_info_data */ -+struct txn_mgr { -+ /* A spinlock protecting the atom list, id_count, flush_control */ -+ spinlock_t tmgr_lock; -+ -+ /* List of atoms. */ -+ struct list_head atoms_list; -+ -+ /* Number of atoms. */ -+ int atom_count; -+ -+ /* A counter used to assign atom->atom_id values. */ -+ __u32 id_count; -+ -+ /* a mutex object for commit serialization */ -+ struct mutex commit_mutex; -+ -+ /* a list of all txnmrgs served by particular daemon. */ -+ struct list_head linkage; -+ -+ /* description of daemon for this txnmgr */ -+ ktxnmgrd_context *daemon; -+ -+ /* parameters. Adjustable through mount options. */ -+ unsigned int atom_max_size; -+ unsigned int atom_max_age; -+ unsigned int atom_min_size; -+ /* max number of concurrent flushers for one atom, 0 - unlimited. */ -+ unsigned int atom_max_flushers; -+ struct dentry *debugfs_atom_count; -+ struct dentry *debugfs_id_count; -+}; -+ -+/* FUNCTION DECLARATIONS */ -+ -+/* These are the externally (within Reiser4) visible transaction functions, therefore they -+ are prefixed with "txn_". For comments, see txnmgr.c. 
*/ -+ -+extern int init_txnmgr_static(void); -+extern void done_txnmgr_static(void); -+ -+extern void reiser4_init_txnmgr(txn_mgr *); -+extern void reiser4_done_txnmgr(txn_mgr *); -+ -+extern int reiser4_txn_reserve(int reserved); -+ -+extern void reiser4_txn_begin(reiser4_context * context); -+extern int reiser4_txn_end(reiser4_context * context); -+ -+extern void reiser4_txn_restart(reiser4_context * context); -+extern void reiser4_txn_restart_current(void); -+ -+extern int txnmgr_force_commit_all(struct super_block *, int); -+extern int current_atom_should_commit(void); -+ -+extern jnode *find_first_dirty_jnode(txn_atom *, int); -+ -+extern int commit_some_atoms(txn_mgr *); -+extern int force_commit_atom(txn_handle *); -+extern int flush_current_atom(int, long, long *, txn_atom **, jnode *); -+ -+extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int); -+ -+extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage); -+ -+extern int same_slum_check(jnode * base, jnode * check, int alloc_check, -+ int alloc_value); -+extern void atom_dec_and_unlock(txn_atom * atom); -+ -+extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags); -+extern int try_capture_page_to_invalidate(struct page *pg); -+ -+extern void reiser4_uncapture_page(struct page *pg); -+extern void reiser4_uncapture_block(jnode *); -+extern void reiser4_uncapture_jnode(jnode *); -+ -+extern int reiser4_capture_inode(struct inode *); -+extern int reiser4_uncapture_inode(struct inode *); -+ -+extern txn_atom *get_current_atom_locked_nocheck(void); -+ -+#if REISER4_DEBUG -+ -+/** -+ * atom_is_protected - make sure that nobody but us can do anything with atom -+ * @atom: atom to be checked -+ * -+ * This is used to assert that atom either entered commit stages or is spin -+ * locked. 
-+ */ -+static inline int atom_is_protected(txn_atom *atom) -+{ -+ if (atom->stage >= ASTAGE_PRE_COMMIT) -+ return 1; -+ assert_spin_locked(&(atom->alock)); -+ return 1; -+} -+ -+#endif -+ -+/* Get the current atom and spinlock it if current atom present. May not return NULL */ -+static inline txn_atom *get_current_atom_locked(void) -+{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked_nocheck(); -+ assert("zam-761", atom != NULL); -+ -+ return atom; -+} -+ -+extern txn_atom *jnode_get_atom(jnode *); -+ -+extern void reiser4_atom_wait_event(txn_atom *); -+extern void reiser4_atom_send_event(txn_atom *); -+ -+extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node); -+extern int reiser4_capture_super_block(struct super_block *s); -+int capture_bulk(jnode **, int count); -+ -+/* See the comment on the function blocknrset.c:blocknr_set_add for the -+ calling convention of these three routines. */ -+extern void blocknr_set_init(struct list_head * bset); -+extern void blocknr_set_destroy(struct list_head * bset); -+extern void blocknr_set_merge(struct list_head * from, struct list_head * into); -+extern int blocknr_set_add_extent(txn_atom * atom, -+ struct list_head * bset, -+ blocknr_set_entry ** new_bsep, -+ const reiser4_block_nr * start, -+ const reiser4_block_nr * len); -+extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset, -+ blocknr_set_entry ** new_bsep, -+ const reiser4_block_nr * a, -+ const reiser4_block_nr * b); -+ -+typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *, -+ const reiser4_block_nr *, void *); -+ -+extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset, -+ blocknr_set_actor_f actor, void *data, -+ int delete); -+ -+/* flush code takes care about how to fuse flush queues */ -+extern void flush_init_atom(txn_atom * atom); -+extern void flush_fuse_queues(txn_atom * large, txn_atom * small); -+ -+static inline void spin_lock_atom(txn_atom *atom) -+{ -+ /* check that 
spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(spin_locked_atom) && -+ LOCK_CNT_NIL(spin_locked_jnode) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(rw_locked_tree))); -+ -+ spin_lock(&(atom->alock)); -+ -+ LOCK_CNT_INC(spin_locked_atom); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void spin_lock_atom_nested(txn_atom *atom) -+{ -+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(spin_locked_jnode) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(rw_locked_tree))); -+ -+ spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING); -+ -+ LOCK_CNT_INC(spin_locked_atom); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline int spin_trylock_atom(txn_atom *atom) -+{ -+ if (spin_trylock(&(atom->alock))) { -+ LOCK_CNT_INC(spin_locked_atom); -+ LOCK_CNT_INC(spin_locked); -+ return 1; -+ } -+ return 0; -+} -+ -+static inline void spin_unlock_atom(txn_atom *atom) -+{ -+ assert_spin_locked(&(atom->alock)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_atom); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&(atom->alock)); -+} -+ -+static inline void spin_lock_txnh(txn_handle *txnh) -+{ -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_tree))); -+ -+ spin_lock(&(txnh->hlock)); -+ -+ LOCK_CNT_INC(spin_locked_txnh); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline int spin_trylock_txnh(txn_handle *txnh) -+{ -+ if (spin_trylock(&(txnh->hlock))) { -+ LOCK_CNT_INC(spin_locked_txnh); -+ LOCK_CNT_INC(spin_locked); -+ return 1; -+ } -+ return 0; -+} -+ -+static inline void spin_unlock_txnh(txn_handle *txnh) -+{ -+ assert_spin_locked(&(txnh->hlock)); -+ assert("nikita-1375", 
LOCK_CNT_GTZ(spin_locked_txnh)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_txnh); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&(txnh->hlock)); -+} -+ -+#define spin_ordering_pred_txnmgr(tmgr) \ -+ ( LOCK_CNT_NIL(spin_locked_atom) && \ -+ LOCK_CNT_NIL(spin_locked_txnh) && \ -+ LOCK_CNT_NIL(spin_locked_jnode) && \ -+ LOCK_CNT_NIL(rw_locked_zlock) && \ -+ LOCK_CNT_NIL(rw_locked_dk) && \ -+ LOCK_CNT_NIL(rw_locked_tree) ) -+ -+static inline void spin_lock_txnmgr(txn_mgr *mgr) -+{ -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(spin_locked_atom) && -+ LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(spin_locked_jnode) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(rw_locked_tree))); -+ -+ spin_lock(&(mgr->tmgr_lock)); -+ -+ LOCK_CNT_INC(spin_locked_txnmgr); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline int spin_trylock_txnmgr(txn_mgr *mgr) -+{ -+ if (spin_trylock(&(mgr->tmgr_lock))) { -+ LOCK_CNT_INC(spin_locked_txnmgr); -+ LOCK_CNT_INC(spin_locked); -+ return 1; -+ } -+ return 0; -+} -+ -+static inline void spin_unlock_txnmgr(txn_mgr *mgr) -+{ -+ assert_spin_locked(&(mgr->tmgr_lock)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_txnmgr); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&(mgr->tmgr_lock)); -+} -+ -+typedef enum { -+ FQ_IN_USE = 0x1 -+} flush_queue_state_t; -+ -+typedef struct flush_queue flush_queue_t; -+ -+/* This is an accumulator for jnodes prepared for writing to disk. A flush queue -+ is filled by the jnode_flush() routine, and written to disk under memory -+ pressure or at atom commit time. */ -+/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued -+ field and fq->prepped list can be modified if atom is spin-locked and fq -+ object is "in-use" state. 
For read-only traversal of the fq->prepped list -+ and reading of the fq->nr_queued field it is enough to keep fq "in-use" or -+ only have atom spin-locked. */ -+struct flush_queue { -+ /* linkage element is the first in this structure to make debugging -+ easier. See field in atom struct for description of list. */ -+ struct list_head alink; -+ /* A spinlock to protect changes of fq state and fq->atom pointer */ -+ spinlock_t guard; -+ /* flush_queue state: [in_use | ready] */ -+ flush_queue_state_t state; -+ /* A list which contains queued nodes, queued nodes are removed from any -+ * atom's list and put on this ->prepped one. */ -+ struct list_head prepped; -+ /* number of submitted i/o requests */ -+ atomic_t nr_submitted; -+ /* number of i/o errors */ -+ atomic_t nr_errors; -+ /* An atom this flush queue is attached to */ -+ txn_atom *atom; -+ /* A wait queue head to wait on i/o completion */ -+ wait_queue_head_t wait; -+#if REISER4_DEBUG -+ /* A thread which took this fq in exclusive use, NULL if fq is free, -+ * used for debugging. 
*/ -+ struct task_struct *owner; -+#endif -+}; -+ -+extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **); -+extern void reiser4_fq_put_nolock(flush_queue_t *); -+extern void reiser4_fq_put(flush_queue_t *); -+extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from); -+extern void queue_jnode(flush_queue_t *, jnode *); -+ -+extern int reiser4_write_fq(flush_queue_t *, long *, int); -+extern int current_atom_finish_all_fq(void); -+extern void init_atom_fq_parts(txn_atom *); -+ -+extern reiser4_block_nr txnmgr_count_deleted_blocks(void); -+ -+extern void znode_make_dirty(znode * node); -+extern void jnode_make_dirty_locked(jnode * node); -+ -+extern int reiser4_sync_atom(txn_atom * atom); -+ -+#if REISER4_DEBUG -+extern int atom_fq_parts_are_clean(txn_atom *); -+#endif -+ -+extern void add_fq_to_bio(flush_queue_t *, struct bio *); -+extern flush_queue_t *get_fq_for_current_atom(void); -+ -+void protected_jnodes_init(protected_jnodes * list); -+void protected_jnodes_done(protected_jnodes * list); -+void reiser4_invalidate_list(struct list_head * head); -+ -+# endif /* __REISER4_TXNMGR_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/type_safe_hash.h linux-2.6.20/fs/reiser4/type_safe_hash.h ---- linux-2.6.20.orig/fs/reiser4/type_safe_hash.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/type_safe_hash.h 2007-05-06 14:50:43.899038216 +0400 -@@ -0,0 +1,320 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* A hash table class that uses hash chains (singly-linked) and is -+ parametrized to provide type safety. 
*/ -+ -+#ifndef __REISER4_TYPE_SAFE_HASH_H__ -+#define __REISER4_TYPE_SAFE_HASH_H__ -+ -+#include "debug.h" -+ -+#include -+/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects -+ based on the object type. You need to declare the item type before -+ this definition, define it after this definition. */ -+#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \ -+ \ -+typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \ -+typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \ -+ \ -+struct PREFIX##_hash_table_ \ -+{ \ -+ ITEM_TYPE **_table; \ -+ __u32 _buckets; \ -+}; \ -+ \ -+struct PREFIX##_hash_link_ \ -+{ \ -+ ITEM_TYPE *_next; \ -+} -+ -+/* Step 2: Define the object type of the hash: give it field of type -+ PREFIX_hash_link. */ -+ -+/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using -+ the type and field name used in step 3. The arguments are: -+ -+ ITEM_TYPE The item type being hashed -+ KEY_TYPE The type of key being hashed -+ KEY_NAME The name of the key field within the item -+ LINK_NAME The name of the link field within the item, which you must make type PREFIX_hash_link) -+ HASH_FUNC The name of the hash function (or macro, takes const pointer to key) -+ EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys) -+ -+ It implements these functions: -+ -+ prefix_hash_init Initialize the table given its size. -+ prefix_hash_insert Insert an item -+ prefix_hash_insert_index Insert an item w/ precomputed hash_index -+ prefix_hash_find Find an item by key -+ prefix_hash_find_index Find an item w/ precomputed hash_index -+ prefix_hash_remove Remove an item, returns 1 if found, 0 if not found -+ prefix_hash_remove_index Remove an item w/ precomputed hash_index -+ -+ If you'd like something to be done differently, feel free to ask me -+ for modifications. 
Additional features that could be added but -+ have not been: -+ -+ prefix_hash_remove_key Find and remove an item by key -+ prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index -+ -+ The hash_function currently receives only the key as an argument, -+ meaning it must somehow know the number of buckets. If this is a -+ problem let me know. -+ -+ This hash table uses a single-linked hash chain. This means -+ insertion is fast but deletion requires searching the chain. -+ -+ There is also the doubly-linked hash chain approach, under which -+ deletion requires no search but the code is longer and it takes two -+ pointers per item. -+ -+ The circularly-linked approach has the shortest code but requires -+ two pointers per bucket, doubling the size of the bucket array (in -+ addition to two pointers per item). -+*/ -+#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \ -+ \ -+static __inline__ void \ -+PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \ -+ __u32 hash UNUSED_ARG) \ -+{ \ -+ assert("nikita-2780", hash < table->_buckets); \ -+} \ -+ \ -+static __inline__ int \ -+PREFIX##_hash_init (PREFIX##_hash_table *hash, \ -+ __u32 buckets) \ -+{ \ -+ hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \ -+ hash->_buckets = buckets; \ -+ if (hash->_table == NULL) \ -+ { \ -+ return RETERR(-ENOMEM); \ -+ } \ -+ memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \ -+ ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \ -+ return 0; \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_done (PREFIX##_hash_table *hash) \ -+{ \ -+ if (REISER4_DEBUG && hash->_table != NULL) { \ -+ __u32 i; \ -+ for (i = 0 ; i < hash->_buckets ; ++ i) \ -+ assert("nikita-2905", hash->_table[i] == NULL); \ -+ } \ -+ if (hash->_table != NULL) \ -+ KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \ -+ hash->_table = NULL; \ -+} \ -+ \ -+static __inline__ void \ 
-+PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \ -+{ \ -+ prefetch(item->LINK_NAME._next); \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \ -+ __u32 index) \ -+{ \ -+ prefetch(hash->_table[index]); \ -+} \ -+ \ -+static __inline__ ITEM_TYPE* \ -+PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ KEY_TYPE const *find_key) \ -+{ \ -+ ITEM_TYPE *item; \ -+ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ for (item = hash->_table[hash_index]; \ -+ item != NULL; \ -+ item = item->LINK_NAME._next) \ -+ { \ -+ prefetch(item->LINK_NAME._next); \ -+ prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \ -+ if (EQ_FUNC (& item->KEY_NAME, find_key)) \ -+ { \ -+ return item; \ -+ } \ -+ } \ -+ \ -+ return NULL; \ -+} \ -+ \ -+static __inline__ ITEM_TYPE* \ -+PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ KEY_TYPE const *find_key) \ -+{ \ -+ ITEM_TYPE ** item = &hash->_table[hash_index]; \ -+ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ while (*item != NULL) { \ -+ prefetch(&(*item)->LINK_NAME._next); \ -+ if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \ -+ ITEM_TYPE *found; \ -+ \ -+ found = *item; \ -+ *item = found->LINK_NAME._next; \ -+ found->LINK_NAME._next = hash->_table[hash_index]; \ -+ hash->_table[hash_index] = found; \ -+ return found; \ -+ } \ -+ item = &(*item)->LINK_NAME._next; \ -+ } \ -+ return NULL; \ -+} \ -+ \ -+static __inline__ int \ -+PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ ITEM_TYPE *del_item) \ -+{ \ -+ ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \ -+ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ while (*hash_item_p != NULL) { \ -+ prefetch(&(*hash_item_p)->LINK_NAME._next); \ -+ if (*hash_item_p == del_item) { \ -+ *hash_item_p = (*hash_item_p)->LINK_NAME._next; \ -+ return 1; \ -+ } \ -+ hash_item_p = &(*hash_item_p)->LINK_NAME._next; \ -+ } \ -+ 
return 0; \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ ITEM_TYPE *ins_item) \ -+{ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \ -+ hash->_table[hash_index] = ins_item; \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ ITEM_TYPE *ins_item) \ -+{ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \ -+ smp_wmb(); \ -+ hash->_table[hash_index] = ins_item; \ -+} \ -+ \ -+static __inline__ ITEM_TYPE* \ -+PREFIX##_hash_find (PREFIX##_hash_table *hash, \ -+ KEY_TYPE const *find_key) \ -+{ \ -+ return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \ -+} \ -+ \ -+static __inline__ ITEM_TYPE* \ -+PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \ -+ KEY_TYPE const *find_key) \ -+{ \ -+ return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \ -+} \ -+ \ -+static __inline__ int \ -+PREFIX##_hash_remove (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *del_item) \ -+{ \ -+ return PREFIX##_hash_remove_index (hash, \ -+ HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \ -+} \ -+ \ -+static __inline__ int \ -+PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *del_item) \ -+{ \ -+ return PREFIX##_hash_remove (hash, del_item); \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_insert (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *ins_item) \ -+{ \ -+ return PREFIX##_hash_insert_index (hash, \ -+ HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *ins_item) \ -+{ \ -+ return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \ -+ ins_item); \ -+} \ -+ \ -+static __inline__ ITEM_TYPE * \ -+PREFIX##_hash_first 
(PREFIX##_hash_table *hash, __u32 ind) \ -+{ \ -+ ITEM_TYPE *first; \ -+ \ -+ for (first = NULL; ind < hash->_buckets; ++ ind) { \ -+ first = hash->_table[ind]; \ -+ if (first != NULL) \ -+ break; \ -+ } \ -+ return first; \ -+} \ -+ \ -+static __inline__ ITEM_TYPE * \ -+PREFIX##_hash_next (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *item) \ -+{ \ -+ ITEM_TYPE *next; \ -+ \ -+ if (item == NULL) \ -+ return NULL; \ -+ next = item->LINK_NAME._next; \ -+ if (next == NULL) \ -+ next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \ -+ return next; \ -+} \ -+ \ -+typedef struct {} PREFIX##_hash_dummy -+ -+#define for_all_ht_buckets(table, head) \ -+for ((head) = &(table) -> _table[ 0 ] ; \ -+ (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head)) -+ -+#define for_all_in_bucket(bucket, item, next, field) \ -+for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \ -+ (item) != NULL ; \ -+ (item) = (next), (next) = (item) ? (item) -> field._next : NULL ) -+ -+#define for_all_in_htable(table, prefix, item, next) \ -+for ((item) = prefix ## _hash_first ((table), 0), \ -+ (next) = prefix ## _hash_next ((table), (item)) ; \ -+ (item) != NULL ; \ -+ (item) = (next), \ -+ (next) = prefix ## _hash_next ((table), (item))) -+ -+/* __REISER4_TYPE_SAFE_HASH_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/vfs_ops.c linux-2.6.20/fs/reiser4/vfs_ops.c ---- linux-2.6.20.orig/fs/reiser4/vfs_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/vfs_ops.c 2007-05-06 14:50:43.899038216 +0400 -@@ -0,0 +1,259 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined -+ here. 
*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/file/file.h" -+#include "plugin/security/perm.h" -+#include "plugin/disk_format/disk_format.h" -+#include "plugin/plugin.h" -+#include "plugin/plugin_set.h" -+#include "plugin/object.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "page_cache.h" -+#include "ktxnmgrd.h" -+#include "super.h" -+#include "reiser4.h" -+#include "entd.h" -+#include "status_flags.h" -+#include "flush.h" -+#include "dscale.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* update inode stat-data by calling plugin */ -+int reiser4_update_sd(struct inode *object) -+{ -+ file_plugin *fplug; -+ -+ assert("nikita-2338", object != NULL); -+ /* check for read-only file system. */ -+ if (IS_RDONLY(object)) -+ return 0; -+ -+ fplug = inode_file_plugin(object); -+ assert("nikita-2339", fplug != NULL); -+ return fplug->write_sd_by_inode(object); -+} -+ -+/* helper function: increase inode nlink count and call plugin method to save -+ updated stat-data. 
-+ -+ Used by link/create and during creation of dot and dotdot in mkdir -+*/ -+int reiser4_add_nlink(struct inode *object /* object to which link is added */ , -+ struct inode *parent /* parent where new entry will be */ -+ , -+ int write_sd_p /* true if stat-data has to be -+ * updated */ ) -+{ -+ file_plugin *fplug; -+ int result; -+ -+ assert("nikita-1351", object != NULL); -+ -+ fplug = inode_file_plugin(object); -+ assert("nikita-1445", fplug != NULL); -+ -+ /* ask plugin whether it can add yet another link to this -+ object */ -+ if (!fplug->can_add_link(object)) -+ return RETERR(-EMLINK); -+ -+ assert("nikita-2211", fplug->add_link != NULL); -+ /* call plugin to do actual addition of link */ -+ result = fplug->add_link(object, parent); -+ -+ /* optionally update stat data */ -+ if (result == 0 && write_sd_p) -+ result = fplug->write_sd_by_inode(object); -+ return result; -+} -+ -+/* helper function: decrease inode nlink count and call plugin method to save -+ updated stat-data. -+ -+ Used by unlink/create -+*/ -+int reiser4_del_nlink(struct inode *object /* object from which link is -+ * removed */ , -+ struct inode *parent /* parent where entry was */ , -+ int write_sd_p /* true is stat-data has to be -+ * updated */ ) -+{ -+ file_plugin *fplug; -+ int result; -+ -+ assert("nikita-1349", object != NULL); -+ -+ fplug = inode_file_plugin(object); -+ assert("nikita-1350", fplug != NULL); -+ assert("nikita-1446", object->i_nlink > 0); -+ assert("nikita-2210", fplug->rem_link != NULL); -+ -+ /* call plugin to do actual deletion of link */ -+ result = fplug->rem_link(object, parent); -+ -+ /* optionally update stat data */ -+ if (result == 0 && write_sd_p) -+ result = fplug->write_sd_by_inode(object); -+ return result; -+} -+ -+/* Release reiser4 dentry. This is d_op->d_release() method. 
*/ -+static void reiser4_d_release(struct dentry *dentry /* dentry released */ ) -+{ -+ reiser4_free_dentry_fsdata(dentry); -+} -+ -+/* -+ * Called by reiser4_sync_inodes(), during speculative write-back (through -+ * pdflush, or balance_dirty_pages()). -+ */ -+void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc) -+{ -+ long written = 0; -+ int repeats = 0; -+ int result; -+ struct address_space *mapping; -+ -+ /* -+ * Performs early flushing, trying to free some memory. If there is -+ * nothing to flush, commits some atoms. -+ */ -+ -+ /* Commit all atoms if reiser4_writepages() is called from sys_sync() or -+ sys_fsync(). */ -+ if (wbc->sync_mode != WB_SYNC_NONE) { -+ txnmgr_force_commit_all(sb, 0); -+ return; -+ } -+ -+ BUG_ON(reiser4_get_super_fake(sb) == NULL); -+ mapping = reiser4_get_super_fake(sb)->i_mapping; -+ do { -+ long nr_submitted = 0; -+ jnode *node = NULL; -+ -+ /* do not put more requests to overload write queue */ -+ if (wbc->nonblocking && -+ bdi_write_congested(mapping->backing_dev_info)) { -+ blk_run_address_space(mapping); -+ wbc->encountered_congestion = 1; -+ break; -+ } -+ repeats++; -+ BUG_ON(wbc->nr_to_write <= 0); -+ -+ if (get_current_context()->entd) { -+ entd_context *ent = get_entd_context(sb); -+ -+ if (ent->cur_request->node) -+ /* -+ * this is ent thread and it managed to capture -+ * requested page itself - start flush from -+ * that page -+ */ -+ node = jref(ent->cur_request->node); -+ } -+ -+ result = flush_some_atom(node, &nr_submitted, wbc, -+ JNODE_FLUSH_WRITE_BLOCKS); -+ if (result != 0) -+ warning("nikita-31001", "Flush failed: %i", result); -+ if (node) -+ jput(node); -+ if (!nr_submitted) -+ break; -+ -+ wbc->nr_to_write -= nr_submitted; -+ written += nr_submitted; -+ } while (wbc->nr_to_write > 0); -+} -+ -+void reiser4_throttle_write(struct inode *inode) -+{ -+ reiser4_txn_restart_current(); -+ balance_dirty_pages_ratelimited(inode->i_mapping); -+} -+ -+const char *REISER4_SUPER_MAGIC_STRING = 
"ReIsEr4"; -+const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the -+ * beginning of device */ -+ -+/* -+ * Reiser4 initialization/shutdown. -+ * -+ * Code below performs global reiser4 initialization that is done either as -+ * part of kernel initialization (when reiser4 is statically built-in), or -+ * during reiser4 module load (when compiled as module). -+ */ -+ -+void reiser4_handle_error(void) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ -+ if (!sb) -+ return; -+ reiser4_status_write(REISER4_STATUS_DAMAGED, 0, -+ "Filesystem error occured"); -+ switch (get_super_private(sb)->onerror) { -+ case 0: -+ reiser4_panic("foobar-42", "Filesystem error occured\n"); -+ case 1: -+ default: -+ if (sb->s_flags & MS_RDONLY) -+ return; -+ sb->s_flags |= MS_RDONLY; -+ break; -+ } -+} -+ -+struct dentry_operations reiser4_dentry_operations = { -+ .d_revalidate = NULL, -+ .d_hash = NULL, -+ .d_compare = NULL, -+ .d_delete = NULL, -+ .d_release = reiser4_d_release, -+ .d_iput = NULL, -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/vfs_ops.h linux-2.6.20/fs/reiser4/vfs_ops.h ---- linux-2.6.20.orig/fs/reiser4/vfs_ops.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/vfs_ops.h 2007-05-06 14:50:43.899038216 +0400 -@@ -0,0 +1,53 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* vfs_ops.c's exported symbols */ -+ -+#if !defined( __FS_REISER4_VFS_OPS_H__ ) -+#define __FS_REISER4_VFS_OPS_H__ -+ -+#include "forward.h" -+#include "coord.h" -+#include "seal.h" -+#include "plugin/file/file.h" -+#include "super.h" -+#include "readahead.h" -+ -+#include /* for loff_t */ -+#include /* for struct address_space */ -+#include /* for struct dentry */ -+#include -+#include -+ -+/* address space operations */ -+int reiser4_writepage(struct page *, struct writeback_control *); -+int reiser4_set_page_dirty(struct page *); -+void reiser4_invalidatepage(struct page *, unsigned long offset); -+int reiser4_releasepage(struct page *, gfp_t); -+ -+extern int reiser4_update_sd(struct inode *); -+extern int reiser4_add_nlink(struct inode *, struct inode *, int); -+extern int reiser4_del_nlink(struct inode *, struct inode *, int); -+ -+extern int reiser4_start_up_io(struct page *page); -+extern void reiser4_throttle_write(struct inode *); -+extern int jnode_is_releasable(jnode *); -+ -+#define CAPTURE_APAGE_BURST (1024l) -+void reiser4_writeout(struct super_block *, struct writeback_control *); -+ -+extern void reiser4_handle_error(void); -+ -+/* __FS_REISER4_VFS_OPS_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/wander.c linux-2.6.20/fs/reiser4/wander.c ---- linux-2.6.20.orig/fs/reiser4/wander.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/wander.c 2007-05-06 14:50:43.903039466 +0400 -@@ -0,0 +1,1797 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Reiser4 Wandering Log */ -+ -+/* You should read http://www.namesys.com/txn-doc.html -+ -+ That describes how filesystem operations are performed as atomic -+ transactions, and how we try to arrange it so that we can write most of the -+ data only once while performing the operation atomically. -+ -+ For the purposes of this code, it is enough for it to understand that it -+ has been told a given block should be written either once, or twice (if -+ twice then once to the wandered location and once to the real location). -+ -+ This code guarantees that those blocks that are defined to be part of an -+ atom either all take effect or none of them take effect. -+ -+ Relocate set nodes are submitted to write by the jnode_flush() routine, and -+ the overwrite set is submitted by reiser4_write_log(). This is because with -+ the overwrite set we seek to optimize writes, and with the relocate set we -+ seek to cause disk order to correlate with the parent first pre-order. -+ -+ reiser4_write_log() allocates and writes wandered blocks and maintains -+ additional on-disk structures of the atom as wander records (each wander -+ record occupies one block) for storing of the "wandered map" (a table which -+ contains a relation between wandered and real block numbers) and other -+ information which might be needed at transaction recovery time. 
-+ -+ The wander records are unidirectionally linked into a circle: each wander -+ record contains a block number of the next wander record, the last wander -+ record points to the first one. -+ -+ One wander record (named "tx head" in this file) has a format which is -+ different from the other wander records. The "tx head" has a reference to the -+ "tx head" block of the previously committed atom. Also, "tx head" contains -+ fs information (the free blocks counter, and the oid allocator state) which -+ is logged in a special way . -+ -+ There are two journal control blocks, named journal header and journal -+ footer which have fixed on-disk locations. The journal header has a -+ reference to the "tx head" block of the last committed atom. The journal -+ footer points to the "tx head" of the last flushed atom. The atom is -+ "played" when all blocks from its overwrite set are written to disk the -+ second time (i.e. written to their real locations). -+ -+ NOTE: People who know reiserfs internals and its journal structure might be -+ confused with these terms journal footer and journal header. There is a table -+ with terms of similar semantics in reiserfs (reiser3) and reiser4: -+ -+ REISER3 TERM | REISER4 TERM | DESCRIPTION -+ --------------------+-----------------------+---------------------------- -+ commit record | journal header | atomic write of this record -+ | | ends transaction commit -+ --------------------+-----------------------+---------------------------- -+ journal header | journal footer | atomic write of this record -+ | | ends post-commit writes. -+ | | After successful -+ | | writing of this journal -+ | | blocks (in reiser3) or -+ | | wandered blocks/records are -+ | | free for re-use. -+ --------------------+-----------------------+---------------------------- -+ -+ The atom commit process is the following: -+ -+ 1. The overwrite set is taken from atom's clean list, and its size is -+ counted. -+ -+ 2. 
The number of necessary wander records (including tx head) is calculated, -+ and the wander record blocks are allocated. -+ -+ 3. Allocate wandered blocks and populate wander records by wandered map. -+ -+ 4. submit write requests for wander records and wandered blocks. -+ -+ 5. wait until submitted write requests complete. -+ -+ 6. update journal header: change the pointer to the block number of just -+ written tx head, submit an i/o for modified journal header block and wait -+ for i/o completion. -+ -+ NOTE: The special logging for bitmap blocks and some reiser4 super block -+ fields makes processes of atom commit, flush and recovering a bit more -+ complex (see comments in the source code for details). -+ -+ The atom playing process is the following: -+ -+ 1. Write atom's overwrite set in-place. -+ -+ 2. Wait on i/o. -+ -+ 3. Update journal footer: change the pointer to block number of tx head -+ block of the atom we currently flushing, submit an i/o, wait on i/o -+ completion. -+ -+ 4. Free disk space which was used for wandered blocks and wander records. -+ -+ After the freeing of wandered blocks and wander records we have that journal -+ footer points to the on-disk structure which might be overwritten soon. -+ Neither the log writer nor the journal recovery procedure use that pointer -+ for accessing the data. When the journal recovery procedure finds the oldest -+ transaction it compares the journal footer pointer value with the "prev_tx" -+ pointer value in tx head, if values are equal the oldest not flushed -+ transaction is found. -+ -+ NOTE on disk space leakage: the information about of what blocks and how many -+ blocks are allocated for wandered blocks, wandered records is not written to -+ the disk because of special logging for bitmaps and some super blocks -+ counters. After a system crash we the reiser4 does not remember those -+ objects allocation, thus we have no such a kind of disk space leakage. 
-+*/ -+ -+/* Special logging of reiser4 super block fields. */ -+ -+/* There are some reiser4 super block fields (free block count and OID allocator -+ state (number of files and next free OID) which are logged separately from -+ super block to avoid unnecessary atom fusion. -+ -+ So, the reiser4 super block can be not captured by a transaction with -+ allocates/deallocates disk blocks or create/delete file objects. Moreover, -+ the reiser4 on-disk super block is not touched when such a transaction is -+ committed and flushed. Those "counters logged specially" are logged in "tx -+ head" blocks and in the journal footer block. -+ -+ A step-by-step description of special logging: -+ -+ 0. The per-atom information about deleted or created files and allocated or -+ freed blocks is collected during the transaction. The atom's -+ ->nr_objects_created and ->nr_objects_deleted are for object -+ deletion/creation tracking, the numbers of allocated and freed blocks are -+ calculated using atom's delete set and atom's capture list -- all new and -+ relocated nodes should be on atom's clean list and should have JNODE_RELOC -+ bit set. -+ -+ 1. The "logged specially" reiser4 super block fields have their "committed" -+ versions in the reiser4 in-memory super block. They get modified only at -+ atom commit time. The atom's commit thread has an exclusive access to those -+ "committed" fields because the log writer implementation supports only one -+ atom commit a time (there is a per-fs "commit" mutex). At -+ that time "committed" counters are modified using per-atom information -+ collected during the transaction. These counters are stored on disk as a -+ part of tx head block when atom is committed. -+ -+ 2. When the atom is flushed the value of the free block counter and the OID -+ allocator state get written to the journal footer block. 
A special journal -+ procedure (journal_recover_sb_data()) takes those values from the journal -+ footer and updates the reiser4 in-memory super block. -+ -+ NOTE: That means free block count and OID allocator state are logged -+ separately from the reiser4 super block regardless of the fact that the -+ reiser4 super block has fields to store both the free block counter and the -+ OID allocator. -+ -+ Writing the whole super block at commit time requires knowing true values of -+ all its fields without changes made by not yet committed transactions. It is -+ possible by having their "committed" version of the super block like the -+ reiser4 bitmap blocks have "committed" and "working" versions. However, -+ another scheme was implemented which stores special logged values in the -+ unused free space inside transaction head block. In my opinion it has an -+ advantage of not writing whole super block when only part of it was -+ modified. */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "page_cache.h" -+#include "wander.h" -+#include "reiser4.h" -+#include "super.h" -+#include "vfs_ops.h" -+#include "writeout.h" -+#include "inode.h" -+#include "entd.h" -+ -+#include -+#include /* for struct super_block */ -+#include /* for struct page */ -+#include -+#include /* for struct bio */ -+#include -+ -+static int write_jnodes_to_disk_extent( -+ jnode *, int, const reiser4_block_nr *, flush_queue_t *, int); -+ -+/* The commit_handle is a container for objects needed at atom commit time */ -+struct commit_handle { -+ /* A pointer to atom's list of OVRWR nodes */ -+ struct list_head *overwrite_set; -+ /* atom's overwrite set size */ -+ int overwrite_set_size; -+ /* jnodes for wander record blocks */ -+ struct list_head tx_list; -+ /* number of wander records */ -+ __u32 tx_size; -+ /* 'committed' sb counters are saved here until atom is completely -+ flushed */ -+ __u64 
free_blocks; -+ __u64 nr_files; -+ __u64 next_oid; -+ /* A pointer to the atom which is being committed */ -+ txn_atom *atom; -+ /* A pointer to current super block */ -+ struct super_block *super; -+ /* The counter of modified bitmaps */ -+ reiser4_block_nr nr_bitmap; -+}; -+ -+static void init_commit_handle(struct commit_handle *ch, txn_atom *atom) -+{ -+ memset(ch, 0, sizeof(struct commit_handle)); -+ INIT_LIST_HEAD(&ch->tx_list); -+ -+ ch->atom = atom; -+ ch->super = reiser4_get_current_sb(); -+} -+ -+static void done_commit_handle(struct commit_handle *ch) -+{ -+ assert("zam-690", list_empty(&ch->tx_list)); -+} -+ -+static inline int reiser4_use_write_barrier(struct super_block * s) -+{ -+ return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER); -+} -+ -+static void disable_write_barrier(struct super_block * s) -+{ -+ notice("zam-1055", "%s does not support write barriers," -+ " using synchronous write instead.", s->s_id); -+ set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags); -+} -+ -+/* fill journal header block data */ -+static void format_journal_header(struct commit_handle *ch) -+{ -+ struct reiser4_super_info_data *sbinfo; -+ struct journal_header *header; -+ jnode *txhead; -+ -+ sbinfo = get_super_private(ch->super); -+ assert("zam-479", sbinfo != NULL); -+ assert("zam-480", sbinfo->journal_header != NULL); -+ -+ txhead = list_entry(ch->tx_list.next, jnode, capture_link); -+ -+ jload(sbinfo->journal_header); -+ -+ header = (struct journal_header *)jdata(sbinfo->journal_header); -+ assert("zam-484", header != NULL); -+ -+ put_unaligned(cpu_to_le64(*jnode_get_block(txhead)), -+ &header->last_committed_tx); -+ -+ jrelse(sbinfo->journal_header); -+} -+ -+/* fill journal footer block data */ -+static void format_journal_footer(struct commit_handle *ch) -+{ -+ struct reiser4_super_info_data *sbinfo; -+ struct journal_footer *footer; -+ jnode *tx_head; -+ -+ sbinfo = get_super_private(ch->super); -+ -+ tx_head = list_entry(ch->tx_list.next, 
jnode, capture_link); -+ -+ assert("zam-493", sbinfo != NULL); -+ assert("zam-494", sbinfo->journal_header != NULL); -+ -+ check_me("zam-691", jload(sbinfo->journal_footer) == 0); -+ -+ footer = (struct journal_footer *)jdata(sbinfo->journal_footer); -+ assert("zam-495", footer != NULL); -+ -+ put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)), -+ &footer->last_flushed_tx); -+ put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks); -+ -+ put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files); -+ put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid); -+ -+ jrelse(sbinfo->journal_footer); -+} -+ -+/* wander record capacity depends on current block size */ -+static int wander_record_capacity(const struct super_block *super) -+{ -+ return (super->s_blocksize - -+ sizeof(struct wander_record_header)) / -+ sizeof(struct wander_entry); -+} -+ -+/* Fill first wander record (tx head) in accordance with supplied given data */ -+static void format_tx_head(struct commit_handle *ch) -+{ -+ jnode *tx_head; -+ jnode *next; -+ struct tx_header *header; -+ -+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link); -+ assert("zam-692", &ch->tx_list != &tx_head->capture_link); -+ -+ next = list_entry(tx_head->capture_link.next, jnode, capture_link); -+ if (&ch->tx_list == &next->capture_link) -+ next = tx_head; -+ -+ header = (struct tx_header *)jdata(tx_head); -+ -+ assert("zam-460", header != NULL); -+ assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header)); -+ -+ memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize); -+ memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE); -+ -+ put_unaligned(cpu_to_le32(ch->tx_size), &header->total); -+ put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx), -+ &header->prev_tx); -+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block); -+ put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks); -+ put_unaligned(cpu_to_le64(ch->nr_files), 
&header->nr_files); -+ put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid); -+} -+ -+/* prepare ordinary wander record block (fill all service fields) */ -+static void -+format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial) -+{ -+ struct wander_record_header *LRH; -+ jnode *next; -+ -+ assert("zam-464", node != NULL); -+ -+ LRH = (struct wander_record_header *)jdata(node); -+ next = list_entry(node->capture_link.next, jnode, capture_link); -+ -+ if (&ch->tx_list == &next->capture_link) -+ next = list_entry(ch->tx_list.next, jnode, capture_link); -+ -+ assert("zam-465", LRH != NULL); -+ assert("zam-463", -+ ch->super->s_blocksize > sizeof(struct wander_record_header)); -+ -+ memset(jdata(node), 0, (size_t) ch->super->s_blocksize); -+ memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE); -+ -+ put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total); -+ put_unaligned(cpu_to_le32(serial), &LRH->serial); -+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block); -+} -+ -+/* add one wandered map entry to formatted wander record */ -+static void -+store_entry(jnode * node, int index, const reiser4_block_nr * a, -+ const reiser4_block_nr * b) -+{ -+ char *data; -+ struct wander_entry *pairs; -+ -+ data = jdata(node); -+ assert("zam-451", data != NULL); -+ -+ pairs = -+ (struct wander_entry *)(data + sizeof(struct wander_record_header)); -+ -+ put_unaligned(cpu_to_le64(*a), &pairs[index].original); -+ put_unaligned(cpu_to_le64(*b), &pairs[index].wandered); -+} -+ -+/* currently, wander records contains contain only wandered map, which depend on -+ overwrite set size */ -+static void get_tx_size(struct commit_handle *ch) -+{ -+ assert("zam-440", ch->overwrite_set_size != 0); -+ assert("zam-695", ch->tx_size == 0); -+ -+ /* count all ordinary wander records -+ ( - 1) / + 1 and add one -+ for tx head block */ -+ ch->tx_size = -+ (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) + -+ 2; -+} -+ -+/* A special 
structure for using in store_wmap_actor() for saving its state -+ between calls */ -+struct store_wmap_params { -+ jnode *cur; /* jnode of current wander record to fill */ -+ int idx; /* free element index in wander record */ -+ int capacity; /* capacity */ -+ -+#if REISER4_DEBUG -+ struct list_head *tx_list; -+#endif -+}; -+ -+/* an actor for use in blocknr_set_iterator routine which populates the list -+ of pre-formatted wander records by wandered map info */ -+static int -+store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, -+ const reiser4_block_nr * b, void *data) -+{ -+ struct store_wmap_params *params = data; -+ -+ if (params->idx >= params->capacity) { -+ /* a new wander record should be taken from the tx_list */ -+ params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link); -+ assert("zam-454", -+ params->tx_list != ¶ms->cur->capture_link); -+ -+ params->idx = 0; -+ } -+ -+ store_entry(params->cur, params->idx, a, b); -+ params->idx++; -+ -+ return 0; -+} -+ -+/* This function is called after Relocate set gets written to disk, Overwrite -+ set is written to wandered locations and all wander records are written -+ also. Updated journal header blocks contains a pointer (block number) to -+ first wander record of the just written transaction */ -+static int update_journal_header(struct commit_handle *ch, int use_barrier) -+{ -+ struct reiser4_super_info_data *sbinfo = get_super_private(ch->super); -+ jnode *jh = sbinfo->journal_header; -+ jnode *head = list_entry(ch->tx_list.next, jnode, capture_link); -+ int ret; -+ -+ format_journal_header(ch); -+ -+ ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL, -+ use_barrier ? 
WRITEOUT_BARRIER : 0); -+ if (ret) -+ return ret; -+ -+ // blk_run_address_space(sbinfo->fake->i_mapping); -+ /*blk_run_queues(); */ -+ -+ ret = jwait_io(jh, WRITE); -+ -+ if (ret) -+ return ret; -+ -+ sbinfo->last_committed_tx = *jnode_get_block(head); -+ -+ return 0; -+} -+ -+/* This function is called after write-back is finished. We update journal -+ footer block and free blocks which were occupied by wandered blocks and -+ transaction wander records */ -+static int update_journal_footer(struct commit_handle *ch, int use_barrier) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(ch->super); -+ -+ jnode *jf = sbinfo->journal_footer; -+ -+ int ret; -+ -+ format_journal_footer(ch); -+ -+ ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL, -+ use_barrier ? WRITEOUT_BARRIER : 0); -+ if (ret) -+ return ret; -+ -+ // blk_run_address_space(sbinfo->fake->i_mapping); -+ /*blk_run_queue(); */ -+ -+ ret = jwait_io(jf, WRITE); -+ if (ret) -+ return ret; -+ -+ return 0; -+} -+ -+/* free block numbers of wander records of already written in place transaction */ -+static void dealloc_tx_list(struct commit_handle *ch) -+{ -+ while (!list_empty(&ch->tx_list)) { -+ jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link); -+ list_del(&cur->capture_link); -+ ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link)); -+ reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED, -+ BA_FORMATTED); -+ -+ unpin_jnode_data(cur); -+ reiser4_drop_io_head(cur); -+ } -+} -+ -+/* An actor for use in block_nr_iterator() routine which frees wandered blocks -+ from atom's overwrite set. 
*/ -+static int -+dealloc_wmap_actor(txn_atom * atom UNUSED_ARG, -+ const reiser4_block_nr * a UNUSED_ARG, -+ const reiser4_block_nr * b, void *data UNUSED_ARG) -+{ -+ -+ assert("zam-499", b != NULL); -+ assert("zam-500", *b != 0); -+ assert("zam-501", !reiser4_blocknr_is_fake(b)); -+ -+ reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED); -+ return 0; -+} -+ -+/* free wandered block locations of already written in place transaction */ -+static void dealloc_wmap(struct commit_handle *ch) -+{ -+ assert("zam-696", ch->atom != NULL); -+ -+ blocknr_set_iterator(ch->atom, &ch->atom->wandered_map, -+ dealloc_wmap_actor, NULL, 1); -+} -+ -+/* helper function for alloc wandered blocks, which refill set of block -+ numbers needed for wandered blocks */ -+static int -+get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len) -+{ -+ reiser4_blocknr_hint hint; -+ int ret; -+ -+ reiser4_block_nr wide_len = count; -+ -+ /* FIXME-ZAM: A special policy needed for allocation of wandered blocks -+ ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed -+ reserved allocation area so as to get the best qualities of fixed -+ journals? */ -+ reiser4_blocknr_hint_init(&hint); -+ hint.block_stage = BLOCK_GRABBED; -+ -+ ret = reiser4_alloc_blocks(&hint, start, &wide_len, -+ BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START); -+ *len = (int)wide_len; -+ -+ return ret; -+} -+ -+/* -+ * roll back changes made before issuing BIO in the case of IO error. 
-+ */ -+static void undo_bio(struct bio *bio) -+{ -+ int i; -+ -+ for (i = 0; i < bio->bi_vcnt; ++i) { -+ struct page *pg; -+ jnode *node; -+ -+ pg = bio->bi_io_vec[i].bv_page; -+ ClearPageWriteback(pg); -+ node = jprivate(pg); -+ spin_lock_jnode(node); -+ JF_CLR(node, JNODE_WRITEBACK); -+ JF_SET(node, JNODE_DIRTY); -+ spin_unlock_jnode(node); -+ } -+ bio_put(bio); -+} -+ -+/* put overwrite set back to atom's clean list */ -+static void put_overwrite_set(struct commit_handle *ch) -+{ -+ jnode *cur; -+ -+ list_for_each_entry(cur, ch->overwrite_set, capture_link) -+ jrelse_tail(cur); -+} -+ -+/* Count overwrite set size, grab disk space for wandered blocks allocation. -+ Since we have a separate list for atom's overwrite set we just scan the list, -+ count bitmap and other not leaf nodes which wandered blocks allocation we -+ have to grab space for. */ -+static int get_overwrite_set(struct commit_handle *ch) -+{ -+ int ret; -+ jnode *cur; -+ __u64 nr_not_leaves = 0; -+#if REISER4_DEBUG -+ __u64 nr_formatted_leaves = 0; -+ __u64 nr_unformatted_leaves = 0; -+#endif -+ -+ assert("zam-697", ch->overwrite_set_size == 0); -+ -+ ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom); -+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link); -+ -+ while (ch->overwrite_set != &cur->capture_link) { -+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link); -+ -+ /* Count bitmap locks for getting correct statistics what number -+ * of blocks were cleared by the transaction commit. */ -+ if (jnode_get_type(cur) == JNODE_BITMAP) -+ ch->nr_bitmap++; -+ -+ assert("zam-939", JF_ISSET(cur, JNODE_OVRWR) -+ || jnode_get_type(cur) == JNODE_BITMAP); -+ -+ if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) { -+ /* we replace fake znode by another (real) -+ znode which is suggested by disk_layout -+ plugin */ -+ -+ /* FIXME: it looks like fake znode should be -+ replaced by jnode supplied by -+ disk_layout. 
*/ -+ -+ struct super_block *s = reiser4_get_current_sb(); -+ reiser4_super_info_data *sbinfo = -+ get_current_super_private(); -+ -+ if (sbinfo->df_plug->log_super) { -+ jnode *sj = sbinfo->df_plug->log_super(s); -+ -+ assert("zam-593", sj != NULL); -+ -+ if (IS_ERR(sj)) -+ return PTR_ERR(sj); -+ -+ spin_lock_jnode(sj); -+ JF_SET(sj, JNODE_OVRWR); -+ insert_into_atom_ovrwr_list(ch->atom, sj); -+ spin_unlock_jnode(sj); -+ -+ /* jload it as the rest of overwrite set */ -+ jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0); -+ -+ ch->overwrite_set_size++; -+ } -+ spin_lock_jnode(cur); -+ reiser4_uncapture_block(cur); -+ jput(cur); -+ -+ } else { -+ int ret; -+ ch->overwrite_set_size++; -+ ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0); -+ if (ret) -+ reiser4_panic("zam-783", -+ "cannot load e-flushed jnode back (ret = %d)\n", -+ ret); -+ } -+ -+ /* Count not leaves here because we have to grab disk space -+ * for wandered blocks. They were not counted as "flush -+ * reserved". Counting should be done _after_ nodes are pinned -+ * into memory by jload(). */ -+ if (!jnode_is_leaf(cur)) -+ nr_not_leaves++; -+ else { -+#if REISER4_DEBUG -+ /* at this point @cur either has JNODE_FLUSH_RESERVED -+ * or is eflushed. Locking is not strong enough to -+ * write an assertion checking for this. */ -+ if (jnode_is_znode(cur)) -+ nr_formatted_leaves++; -+ else -+ nr_unformatted_leaves++; -+#endif -+ JF_CLR(cur, JNODE_FLUSH_RESERVED); -+ } -+ -+ cur = next; -+ } -+ -+ /* Grab space for writing (wandered blocks) of not leaves found in -+ * overwrite set. */ -+ ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED); -+ if (ret) -+ return ret; -+ -+ /* Disk space for allocation of wandered blocks of leaf nodes already -+ * reserved as "flush reserved", move it to grabbed space counter. 
*/ -+ spin_lock_atom(ch->atom); -+ assert("zam-940", -+ nr_formatted_leaves + nr_unformatted_leaves <= -+ ch->atom->flush_reserved); -+ flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved); -+ spin_unlock_atom(ch->atom); -+ -+ return ch->overwrite_set_size; -+} -+ -+/** -+ * write_jnodes_to_disk_extent - submit write request -+ * @head: -+ * @first: first jnode of the list -+ * @nr: number of jnodes on the list -+ * @block_p: -+ * @fq: -+ * @flags: used to decide whether page is to get PG_reclaim flag -+ * -+ * Submits a write request for @nr jnodes beginning from the @first, other -+ * jnodes are after the @first on the double-linked "capture" list. All jnodes -+ * will be written to the disk region of @nr blocks starting with @block_p block -+ * number. If @fq is not NULL it means that waiting for i/o completion will be -+ * done more efficiently by using flush_queue_t objects. -+ * This function is the one which writes list of jnodes in batch mode. It does -+ * all low-level things as bio construction and page states manipulation. -+ * -+ * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are -+ * aggregated in this function instead of being left to the layers below -+ * -+ * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that? -+ * Why that layer needed? Why BIOs cannot be constructed here? -+ */ -+static int write_jnodes_to_disk_extent( -+ jnode *first, int nr, const reiser4_block_nr *block_p, -+ flush_queue_t *fq, int flags) -+{ -+ struct super_block *super = reiser4_get_current_sb(); -+ int write_op = ( flags & WRITEOUT_BARRIER ) ? 
WRITE_BARRIER : WRITE; -+ int max_blocks; -+ jnode *cur = first; -+ reiser4_block_nr block; -+ -+ assert("zam-571", first != NULL); -+ assert("zam-572", block_p != NULL); -+ assert("zam-570", nr > 0); -+ -+ block = *block_p; -+ max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES); -+ -+ while (nr > 0) { -+ struct bio *bio; -+ int nr_blocks = min(nr, max_blocks); -+ int i; -+ int nr_used; -+ -+ bio = bio_alloc(GFP_NOIO, nr_blocks); -+ if (!bio) -+ return RETERR(-ENOMEM); -+ -+ bio->bi_bdev = super->s_bdev; -+ bio->bi_sector = block * (super->s_blocksize >> 9); -+ for (nr_used = 0, i = 0; i < nr_blocks; i++) { -+ struct page *pg; -+ -+ pg = jnode_page(cur); -+ assert("zam-573", pg != NULL); -+ -+ page_cache_get(pg); -+ -+ lock_and_wait_page_writeback(pg); -+ -+ if (!bio_add_page(bio, pg, super->s_blocksize, 0)) { -+ /* -+ * underlying device is satiated. Stop adding -+ * pages to the bio. -+ */ -+ unlock_page(pg); -+ page_cache_release(pg); -+ break; -+ } -+ -+ spin_lock_jnode(cur); -+ assert("nikita-3166", -+ pg->mapping == jnode_get_mapping(cur)); -+ assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK)); -+#if REISER4_DEBUG -+ spin_lock(&cur->load); -+ assert("nikita-3165", !jnode_is_releasable(cur)); -+ spin_unlock(&cur->load); -+#endif -+ JF_SET(cur, JNODE_WRITEBACK); -+ JF_CLR(cur, JNODE_DIRTY); -+ ON_DEBUG(cur->written++); -+ spin_unlock_jnode(cur); -+ -+ ClearPageError(pg); -+ set_page_writeback(pg); -+ -+ if (get_current_context()->entd) { -+ /* this is ent thread */ -+ entd_context *ent = get_entd_context(super); -+ struct wbq *rq, *next; -+ -+ spin_lock(&ent->guard); -+ -+ if (pg == ent->cur_request->page) { -+ /* -+ * entd is called for this page. This -+ * request is not in th etodo list -+ */ -+ ent->cur_request->written = 1; -+ } else { -+ /* -+ * if we have written a page for which writepage -+ * is called for - move request to another list. 
-+ */ -+ list_for_each_entry_safe(rq, next, &ent->todo_list, link) { -+ assert("", rq->magic == WBQ_MAGIC); -+ if (pg == rq->page) { -+ /* -+ * remove request from -+ * entd's queue, but do -+ * not wake up a thread -+ * which put this -+ * request -+ */ -+ list_del_init(&rq->link); -+ ent->nr_todo_reqs --; -+ list_add_tail(&rq->link, &ent->done_list); -+ ent->nr_done_reqs ++; -+ rq->written = 1; -+ break; -+ } -+ } -+ } -+ spin_unlock(&ent->guard); -+ } -+ -+ clear_page_dirty_for_io(pg); -+ -+ unlock_page(pg); -+ -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ nr_used++; -+ } -+ if (nr_used > 0) { -+ assert("nikita-3453", -+ bio->bi_size == super->s_blocksize * nr_used); -+ assert("nikita-3454", bio->bi_vcnt == nr_used); -+ -+ /* Check if we are allowed to write at all */ -+ if (super->s_flags & MS_RDONLY) -+ undo_bio(bio); -+ else { -+ int not_supported; -+ -+ add_fq_to_bio(fq, bio); -+ bio_get(bio); -+ reiser4_submit_bio(write_op, bio); -+ not_supported = bio_flagged(bio, BIO_EOPNOTSUPP); -+ bio_put(bio); -+ if (not_supported) -+ return -EOPNOTSUPP; -+ } -+ -+ block += nr_used - 1; -+ update_blocknr_hint_default(super, &block); -+ block += 1; -+ } else { -+ bio_put(bio); -+ } -+ nr -= nr_used; -+ } -+ -+ return 0; -+} -+ -+/* This is a procedure which recovers a contiguous sequences of disk block -+ numbers in the given list of j-nodes and submits write requests on this -+ per-sequence basis */ -+int -+write_jnode_list(struct list_head *head, flush_queue_t *fq, -+ long *nr_submitted, int flags) -+{ -+ int ret; -+ jnode *beg = list_entry(head->next, jnode, capture_link); -+ -+ while (head != &beg->capture_link) { -+ int nr = 1; -+ jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link); -+ -+ while (head != &cur->capture_link) { -+ if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr) -+ break; -+ ++nr; -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ } -+ -+ ret = write_jnodes_to_disk_extent( -+ beg, nr, 
jnode_get_block(beg), fq, flags); -+ if (ret) -+ return ret; -+ -+ if (nr_submitted) -+ *nr_submitted += nr; -+ -+ beg = cur; -+ } -+ -+ return 0; -+} -+ -+/* add given wandered mapping to atom's wandered map */ -+static int -+add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p) -+{ -+ int ret; -+ blocknr_set_entry *new_bsep = NULL; -+ reiser4_block_nr block; -+ -+ txn_atom *atom; -+ -+ assert("zam-568", block_p != NULL); -+ block = *block_p; -+ assert("zam-569", len > 0); -+ -+ while ((len--) > 0) { -+ do { -+ atom = get_current_atom_locked(); -+ assert("zam-536", -+ !reiser4_blocknr_is_fake(jnode_get_block(cur))); -+ ret = -+ blocknr_set_add_pair(atom, &atom->wandered_map, -+ &new_bsep, -+ jnode_get_block(cur), &block); -+ } while (ret == -E_REPEAT); -+ -+ if (ret) { -+ /* deallocate blocks which were not added to wandered -+ map */ -+ reiser4_block_nr wide_len = len; -+ -+ reiser4_dealloc_blocks(&block, &wide_len, -+ BLOCK_NOT_COUNTED, -+ BA_FORMATTED -+ /* formatted, without defer */ ); -+ -+ return ret; -+ } -+ -+ spin_unlock_atom(atom); -+ -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ ++block; -+ } -+ -+ return 0; -+} -+ -+/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately -+ submit IO for allocated blocks. We assume that current atom is in a stage -+ when any atom fusion is impossible and atom is unlocked and it is safe. 
*/ -+static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq) -+{ -+ reiser4_block_nr block; -+ -+ int rest; -+ int len; -+ int ret; -+ -+ jnode *cur; -+ -+ assert("zam-534", ch->overwrite_set_size > 0); -+ -+ rest = ch->overwrite_set_size; -+ -+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link); -+ while (ch->overwrite_set != &cur->capture_link) { -+ assert("zam-567", JF_ISSET(cur, JNODE_OVRWR)); -+ -+ ret = get_more_wandered_blocks(rest, &block, &len); -+ if (ret) -+ return ret; -+ -+ rest -= len; -+ -+ ret = add_region_to_wmap(cur, len, &block); -+ if (ret) -+ return ret; -+ -+ ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0); -+ if (ret) -+ return ret; -+ -+ while ((len--) > 0) { -+ assert("zam-604", -+ ch->overwrite_set != &cur->capture_link); -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ } -+ } -+ -+ return 0; -+} -+ -+/* allocate given number of nodes over the journal area and link them into a -+ list, return pointer to the first jnode in the list */ -+static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq) -+{ -+ reiser4_blocknr_hint hint; -+ reiser4_block_nr allocated = 0; -+ reiser4_block_nr first, len; -+ jnode *cur; -+ jnode *txhead; -+ int ret; -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("zam-698", ch->tx_size > 0); -+ assert("zam-699", list_empty_careful(&ch->tx_list)); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ while (allocated < (unsigned)ch->tx_size) { -+ len = (ch->tx_size - allocated); -+ -+ reiser4_blocknr_hint_init(&hint); -+ -+ hint.block_stage = BLOCK_GRABBED; -+ -+ /* FIXME: there should be some block allocation policy for -+ nodes which contain wander records */ -+ -+ /* We assume that disk space for wandered record blocks can be -+ * taken from reserved area. 
*/ -+ ret = reiser4_alloc_blocks(&hint, &first, &len, -+ BA_FORMATTED | BA_RESERVED | -+ BA_USE_DEFAULT_SEARCH_START); -+ reiser4_blocknr_hint_done(&hint); -+ -+ if (ret) -+ return ret; -+ -+ allocated += len; -+ -+ /* create jnodes for all wander records */ -+ while (len--) { -+ cur = reiser4_alloc_io_head(&first); -+ -+ if (cur == NULL) { -+ ret = RETERR(-ENOMEM); -+ goto free_not_assigned; -+ } -+ -+ ret = jinit_new(cur, reiser4_ctx_gfp_mask_get()); -+ -+ if (ret != 0) { -+ jfree(cur); -+ goto free_not_assigned; -+ } -+ -+ pin_jnode_data(cur); -+ -+ list_add_tail(&cur->capture_link, &ch->tx_list); -+ -+ first++; -+ } -+ } -+ -+ { /* format a on-disk linked list of wander records */ -+ int serial = 1; -+ -+ txhead = list_entry(ch->tx_list.next, jnode, capture_link); -+ format_tx_head(ch); -+ -+ cur = list_entry(txhead->capture_link.next, jnode, capture_link); -+ while (&ch->tx_list != &cur->capture_link) { -+ format_wander_record(ch, cur, serial++); -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ } -+ } -+ -+ { /* Fill wander records with Wandered Set */ -+ struct store_wmap_params params; -+ txn_atom *atom; -+ -+ params.cur = list_entry(txhead->capture_link.next, jnode, capture_link); -+ -+ params.idx = 0; -+ params.capacity = -+ wander_record_capacity(reiser4_get_current_sb()); -+ -+ atom = get_current_atom_locked(); -+ blocknr_set_iterator(atom, &atom->wandered_map, -+ &store_wmap_actor, ¶ms, 0); -+ spin_unlock_atom(atom); -+ } -+ -+ { /* relse all jnodes from tx_list */ -+ cur = list_entry(ch->tx_list.next, jnode, capture_link); -+ while (&ch->tx_list != &cur->capture_link) { -+ jrelse(cur); -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ } -+ } -+ -+ ret = write_jnode_list(&ch->tx_list, fq, NULL, 0); -+ -+ return ret; -+ -+ free_not_assigned: -+ /* We deallocate blocks not yet assigned to jnodes on tx_list. 
The -+ caller takes care about invalidating of tx list */ -+ reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED); -+ -+ return ret; -+} -+ -+static int commit_tx(struct commit_handle *ch) -+{ -+ flush_queue_t *fq; -+ int barrier; -+ int ret; -+ -+ /* Grab more space for wandered records. */ -+ ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED); -+ if (ret) -+ return ret; -+ -+ fq = get_fq_for_current_atom(); -+ if (IS_ERR(fq)) -+ return PTR_ERR(fq); -+ -+ spin_unlock_atom(fq->atom); -+ do { -+ ret = alloc_wandered_blocks(ch, fq); -+ if (ret) -+ break; -+ ret = alloc_tx(ch, fq); -+ if (ret) -+ break; -+ } while (0); -+ -+ reiser4_fq_put(fq); -+ if (ret) -+ return ret; -+ repeat_wo_barrier: -+ barrier = reiser4_use_write_barrier(ch->super); -+ if (!barrier) { -+ ret = current_atom_finish_all_fq(); -+ if (ret) -+ return ret; -+ } -+ ret = update_journal_header(ch, barrier); -+ if (barrier) { -+ if (ret) { -+ if (ret == -EOPNOTSUPP) { -+ disable_write_barrier(ch->super); -+ goto repeat_wo_barrier; -+ } -+ return ret; -+ } -+ ret = current_atom_finish_all_fq(); -+ } -+ return ret; -+} -+ -+static int write_tx_back(struct commit_handle * ch) -+{ -+ flush_queue_t *fq; -+ int ret; -+ int barrier; -+ -+ reiser4_post_commit_hook(); -+ fq = get_fq_for_current_atom(); -+ if (IS_ERR(fq)) -+ return PTR_ERR(fq); -+ spin_unlock_atom(fq->atom); -+ ret = write_jnode_list( -+ ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM); -+ reiser4_fq_put(fq); -+ if (ret) -+ return ret; -+ repeat_wo_barrier: -+ barrier = reiser4_use_write_barrier(ch->super); -+ if (!barrier) { -+ ret = current_atom_finish_all_fq(); -+ if (ret) -+ return ret; -+ } -+ ret = update_journal_footer(ch, barrier); -+ if (barrier) { -+ if (ret) { -+ if (ret == -EOPNOTSUPP) { -+ disable_write_barrier(ch->super); -+ goto repeat_wo_barrier; -+ } -+ return ret; -+ } -+ ret = current_atom_finish_all_fq(); -+ } -+ if (ret) -+ return ret; -+ reiser4_post_write_back_hook(); -+ return 0; 
-+} -+ -+/* We assume that at this moment all captured blocks are marked as RELOC or -+ WANDER (belong to Relocate o Overwrite set), all nodes from Relocate set -+ are submitted to write. -+*/ -+ -+int reiser4_write_logs(long *nr_submitted) -+{ -+ txn_atom *atom; -+ struct super_block *super = reiser4_get_current_sb(); -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ struct commit_handle ch; -+ int ret; -+ -+ writeout_mode_enable(); -+ -+ /* block allocator may add j-nodes to the clean_list */ -+ ret = reiser4_pre_commit_hook(); -+ if (ret) -+ return ret; -+ -+ /* No locks are required if we take atom which stage >= -+ * ASTAGE_PRE_COMMIT */ -+ atom = get_current_context()->trans->atom; -+ assert("zam-965", atom != NULL); -+ -+ /* relocate set is on the atom->clean_nodes list after -+ * current_atom_complete_writes() finishes. It can be safely -+ * uncaptured after commit_mutex is locked, because any atom that -+ * captures these nodes is guaranteed to commit after current one. -+ * -+ * This can only be done after reiser4_pre_commit_hook(), because it is where -+ * early flushed jnodes with CREATED bit are transferred to the -+ * overwrite list. */ -+ reiser4_invalidate_list(ATOM_CLEAN_LIST(atom)); -+ spin_lock_atom(atom); -+ /* There might be waiters for the relocate nodes which we have -+ * released, wake them up. */ -+ reiser4_atom_send_event(atom); -+ spin_unlock_atom(atom); -+ -+ if (REISER4_DEBUG) { -+ int level; -+ -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level) -+ assert("nikita-3352", -+ list_empty_careful(ATOM_DIRTY_LIST(atom, level))); -+ } -+ -+ sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created; -+ sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted; -+ -+ init_commit_handle(&ch, atom); -+ -+ ch.free_blocks = sbinfo->blocks_free_committed; -+ ch.nr_files = sbinfo->nr_files_committed; -+ /* ZAM-FIXME-HANS: email me what the contention level is for the super -+ * lock. 
*/ -+ ch.next_oid = oid_next(super); -+ -+ /* count overwrite set and place it in a separate list */ -+ ret = get_overwrite_set(&ch); -+ -+ if (ret <= 0) { -+ /* It is possible that overwrite set is empty here, it means -+ all captured nodes are clean */ -+ goto up_and_ret; -+ } -+ -+ /* Inform the caller about what number of dirty pages will be -+ * submitted to disk. */ -+ *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap; -+ -+ /* count all records needed for storing of the wandered set */ -+ get_tx_size(&ch); -+ -+ ret = commit_tx(&ch); -+ if (ret) -+ goto up_and_ret; -+ -+ spin_lock_atom(atom); -+ reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT); -+ spin_unlock_atom(atom); -+ -+ ret = write_tx_back(&ch); -+ reiser4_post_write_back_hook(); -+ -+ up_and_ret: -+ if (ret) { -+ /* there could be fq attached to current atom; the only way to -+ remove them is: */ -+ current_atom_finish_all_fq(); -+ } -+ -+ /* free blocks of flushed transaction */ -+ dealloc_tx_list(&ch); -+ dealloc_wmap(&ch); -+ -+ put_overwrite_set(&ch); -+ -+ done_commit_handle(&ch); -+ -+ writeout_mode_disable(); -+ -+ return ret; -+} -+ -+/* consistency checks for journal data/control blocks: header, footer, log -+ records, transactions head blocks. All functions return zero on success. */ -+ -+static int check_journal_header(const jnode * node UNUSED_ARG) -+{ -+ /* FIXME: journal header has no magic field yet. */ -+ return 0; -+} -+ -+/* wait for write completion for all jnodes from given list */ -+static int wait_on_jnode_list(struct list_head *head) -+{ -+ jnode *scan; -+ int ret = 0; -+ -+ list_for_each_entry(scan, head, capture_link) { -+ struct page *pg = jnode_page(scan); -+ -+ if (pg) { -+ if (PageWriteback(pg)) -+ wait_on_page_writeback(pg); -+ -+ if (PageError(pg)) -+ ret++; -+ } -+ } -+ -+ return ret; -+} -+ -+static int check_journal_footer(const jnode * node UNUSED_ARG) -+{ -+ /* FIXME: journal footer has no magic field yet. 
*/ -+ return 0; -+} -+ -+static int check_tx_head(const jnode * node) -+{ -+ struct tx_header *header = (struct tx_header *)jdata(node); -+ -+ if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) { -+ warning("zam-627", "tx head at block %s corrupted\n", -+ sprint_address(jnode_get_block(node))); -+ return RETERR(-EIO); -+ } -+ -+ return 0; -+} -+ -+static int check_wander_record(const jnode * node) -+{ -+ struct wander_record_header *RH = -+ (struct wander_record_header *)jdata(node); -+ -+ if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) != -+ 0) { -+ warning("zam-628", "wander record at block %s corrupted\n", -+ sprint_address(jnode_get_block(node))); -+ return RETERR(-EIO); -+ } -+ -+ return 0; -+} -+ -+/* fill commit_handler structure by everything what is needed for update_journal_footer */ -+static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head) -+{ -+ struct tx_header *TXH; -+ int ret; -+ -+ ret = jload(tx_head); -+ if (ret) -+ return ret; -+ -+ TXH = (struct tx_header *)jdata(tx_head); -+ -+ ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks)); -+ ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files)); -+ ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid)); -+ -+ jrelse(tx_head); -+ -+ list_add(&tx_head->capture_link, &ch->tx_list); -+ -+ return 0; -+} -+ -+/* replay one transaction: restore and write overwrite set in place */ -+static int replay_transaction(const struct super_block *s, -+ jnode * tx_head, -+ const reiser4_block_nr * log_rec_block_p, -+ const reiser4_block_nr * end_block, -+ unsigned int nr_wander_records) -+{ -+ reiser4_block_nr log_rec_block = *log_rec_block_p; -+ struct commit_handle ch; -+ LIST_HEAD(overwrite_set); -+ jnode *log; -+ int ret; -+ -+ init_commit_handle(&ch, NULL); -+ ch.overwrite_set = &overwrite_set; -+ -+ restore_commit_handle(&ch, tx_head); -+ -+ while (log_rec_block != *end_block) { -+ struct wander_record_header *header; -+ struct 
wander_entry *entry; -+ -+ int i; -+ -+ if (nr_wander_records == 0) { -+ warning("zam-631", -+ "number of wander records in the linked list" -+ " greater than number stored in tx head.\n"); -+ ret = RETERR(-EIO); -+ goto free_ow_set; -+ } -+ -+ log = reiser4_alloc_io_head(&log_rec_block); -+ if (log == NULL) -+ return RETERR(-ENOMEM); -+ -+ ret = jload(log); -+ if (ret < 0) { -+ reiser4_drop_io_head(log); -+ return ret; -+ } -+ -+ ret = check_wander_record(log); -+ if (ret) { -+ jrelse(log); -+ reiser4_drop_io_head(log); -+ return ret; -+ } -+ -+ header = (struct wander_record_header *)jdata(log); -+ log_rec_block = le64_to_cpu(get_unaligned(&header->next_block)); -+ -+ entry = (struct wander_entry *)(header + 1); -+ -+ /* restore overwrite set from wander record content */ -+ for (i = 0; i < wander_record_capacity(s); i++) { -+ reiser4_block_nr block; -+ jnode *node; -+ -+ block = le64_to_cpu(get_unaligned(&entry->wandered)); -+ if (block == 0) -+ break; -+ -+ node = reiser4_alloc_io_head(&block); -+ if (node == NULL) { -+ ret = RETERR(-ENOMEM); -+ /* -+ * FIXME-VS:??? -+ */ -+ jrelse(log); -+ reiser4_drop_io_head(log); -+ goto free_ow_set; -+ } -+ -+ ret = jload(node); -+ -+ if (ret < 0) { -+ reiser4_drop_io_head(node); -+ /* -+ * FIXME-VS:??? 
-+ */ -+ jrelse(log); -+ reiser4_drop_io_head(log); -+ goto free_ow_set; -+ } -+ -+ block = le64_to_cpu(get_unaligned(&entry->original)); -+ -+ assert("zam-603", block != 0); -+ -+ jnode_set_block(node, &block); -+ -+ list_add_tail(&node->capture_link, ch.overwrite_set); -+ -+ ++entry; -+ } -+ -+ jrelse(log); -+ reiser4_drop_io_head(log); -+ -+ --nr_wander_records; -+ } -+ -+ if (nr_wander_records != 0) { -+ warning("zam-632", "number of wander records in the linked list" -+ " less than number stored in tx head.\n"); -+ ret = RETERR(-EIO); -+ goto free_ow_set; -+ } -+ -+ { /* write wandered set in place */ -+ write_jnode_list(ch.overwrite_set, NULL, NULL, 0); -+ ret = wait_on_jnode_list(ch.overwrite_set); -+ -+ if (ret) { -+ ret = RETERR(-EIO); -+ goto free_ow_set; -+ } -+ } -+ -+ ret = update_journal_footer(&ch, 0); -+ -+ free_ow_set: -+ -+ while (!list_empty(ch.overwrite_set)) { -+ jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link); -+ list_del_init(&cur->capture_link); -+ jrelse(cur); -+ reiser4_drop_io_head(cur); -+ } -+ -+ list_del_init(&tx_head->capture_link); -+ -+ done_commit_handle(&ch); -+ -+ return ret; -+} -+ -+/* find oldest committed and not played transaction and play it. The transaction -+ * was committed and journal header block was updated but the blocks from the -+ * process of writing the atom's overwrite set in-place and updating of journal -+ * footer block were not completed. This function completes the process by -+ * recovering the atom's overwrite set from their wandered locations and writes -+ * them in-place and updating the journal footer. 
*/
static int replay_oldest_transaction(struct super_block *s)
{
	reiser4_super_info_data *sbinfo = get_super_private(s);
	jnode *footer_node = sbinfo->journal_footer;
	struct journal_footer *footer;
	struct tx_header *txh;
	reiser4_block_nr flushed;	/* last tx recorded in the footer */
	reiser4_block_nr tx_block;	/* tx head block being examined */
	reiser4_block_nr first_record = 0;
	unsigned int nr_blocks;
	jnode *tx_head;
	int ret;

	/* read the last flushed transaction from the journal footer */
	ret = jload(footer_node);
	if (ret < 0)
		return ret;

	footer = (struct journal_footer *)jdata(footer_node);
	flushed = le64_to_cpu(get_unaligned(&footer->last_flushed_tx));
	jrelse(footer_node);

	tx_block = sbinfo->last_committed_tx;
	if (tx_block == flushed)
		return 0;	/* all transactions are replayed */

	/* walk back through the prev_tx links, starting at the last committed
	   transaction, until the one whose predecessor is the last flushed
	   transaction: that is the oldest not yet flushed one */
	for (;;) {
		tx_head = reiser4_alloc_io_head(&tx_block);
		if (tx_head == NULL)
			return RETERR(-ENOMEM);

		ret = jload(tx_head);
		if (ret < 0)
			goto drop_head;

		ret = check_tx_head(tx_head);
		if (ret) {
			jrelse(tx_head);
			goto drop_head;
		}

		txh = (struct tx_header *)jdata(tx_head);
		tx_block = le64_to_cpu(get_unaligned(&txh->prev_tx));
		if (tx_block == flushed)
			break;

		jrelse(tx_head);
		reiser4_drop_io_head(tx_head);
	}

	/* tx_head is still loaded here, so txh is valid */
	nr_blocks = le32_to_cpu(get_unaligned(&txh->total));
	first_record = le64_to_cpu(get_unaligned(&txh->next_block));

	pin_jnode_data(tx_head);
	jrelse(tx_head);

	/* the tx head is one of the nr_blocks records, hence "- 1" */
	ret = replay_transaction(s, tx_head, &first_record,
				 jnode_get_block(tx_head), nr_blocks - 1);

	unpin_jnode_data(tx_head);
	reiser4_drop_io_head(tx_head);

	/* -E_REPEAT tells the caller to look for a further transaction */
	return ret ? ret : -E_REPEAT;

      drop_head:
	reiser4_drop_io_head(tx_head);
	return ret;
}

/* The reiser4 journal current implementation was optimized to not to capture
   super block if certain super blocks fields are modified. Currently, the set
   is (<free block count>, <OID allocator>).
These fields are logged by -+ special way which includes storing them in each transaction head block at -+ atom commit time and writing that information to journal footer block at -+ atom flush time. For getting info from journal footer block to the -+ in-memory super block there is a special function -+ reiser4_journal_recover_sb_data() which should be called after disk format -+ plugin re-reads super block after journal replaying. -+*/ -+ -+/* get the information from journal footer in-memory super block */ -+int reiser4_journal_recover_sb_data(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ struct journal_footer *jf; -+ int ret; -+ -+ assert("zam-673", sbinfo->journal_footer != NULL); -+ -+ ret = jload(sbinfo->journal_footer); -+ if (ret != 0) -+ return ret; -+ -+ ret = check_journal_footer(sbinfo->journal_footer); -+ if (ret != 0) -+ goto out; -+ -+ jf = (struct journal_footer *)jdata(sbinfo->journal_footer); -+ -+ /* was there at least one flushed transaction? */ -+ if (jf->last_flushed_tx) { -+ -+ /* restore free block counter logged in this transaction */ -+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks))); -+ -+ /* restore oid allocator state */ -+ oid_init_allocator(s, -+ le64_to_cpu(get_unaligned(&jf->nr_files)), -+ le64_to_cpu(get_unaligned(&jf->next_oid))); -+ } -+ out: -+ jrelse(sbinfo->journal_footer); -+ return ret; -+} -+ -+/* reiser4 replay journal procedure */ -+int reiser4_journal_replay(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ jnode *jh, *jf; -+ struct journal_header *header; -+ int nr_tx_replayed = 0; -+ int ret; -+ -+ assert("zam-582", sbinfo != NULL); -+ -+ jh = sbinfo->journal_header; -+ jf = sbinfo->journal_footer; -+ -+ if (!jh || !jf) { -+ /* it is possible that disk layout does not support journal -+ structures, we just warn about this */ -+ warning("zam-583", -+ "journal control blocks were not loaded by disk layout plugin. 
" -+ "journal replaying is not possible.\n"); -+ return 0; -+ } -+ -+ /* Take free block count from journal footer block. The free block -+ counter value corresponds the last flushed transaction state */ -+ ret = jload(jf); -+ if (ret < 0) -+ return ret; -+ -+ ret = check_journal_footer(jf); -+ if (ret) { -+ jrelse(jf); -+ return ret; -+ } -+ -+ jrelse(jf); -+ -+ /* store last committed transaction info in reiser4 in-memory super -+ block */ -+ ret = jload(jh); -+ if (ret < 0) -+ return ret; -+ -+ ret = check_journal_header(jh); -+ if (ret) { -+ jrelse(jh); -+ return ret; -+ } -+ -+ header = (struct journal_header *)jdata(jh); -+ sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx)); -+ -+ jrelse(jh); -+ -+ /* replay committed transactions */ -+ while ((ret = replay_oldest_transaction(s)) == -E_REPEAT) -+ nr_tx_replayed++; -+ -+ return ret; -+} -+ -+/* load journal control block (either journal header or journal footer block) */ -+static int -+load_journal_control_block(jnode ** node, const reiser4_block_nr * block) -+{ -+ int ret; -+ -+ *node = reiser4_alloc_io_head(block); -+ if (!(*node)) -+ return RETERR(-ENOMEM); -+ -+ ret = jload(*node); -+ -+ if (ret) { -+ reiser4_drop_io_head(*node); -+ *node = NULL; -+ return ret; -+ } -+ -+ pin_jnode_data(*node); -+ jrelse(*node); -+ -+ return 0; -+} -+ -+/* unload journal header or footer and free jnode */ -+static void unload_journal_control_block(jnode ** node) -+{ -+ if (*node) { -+ unpin_jnode_data(*node); -+ reiser4_drop_io_head(*node); -+ *node = NULL; -+ } -+} -+ -+/* release journal control blocks */ -+void reiser4_done_journal_info(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ -+ assert("zam-476", sbinfo != NULL); -+ -+ unload_journal_control_block(&sbinfo->journal_header); -+ unload_journal_control_block(&sbinfo->journal_footer); -+ rcu_barrier(); -+} -+ -+/* load journal control blocks */ -+int reiser4_init_journal_info(struct super_block 
*s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ journal_location *loc; -+ int ret; -+ -+ loc = &sbinfo->jloc; -+ -+ assert("zam-651", loc != NULL); -+ assert("zam-652", loc->header != 0); -+ assert("zam-653", loc->footer != 0); -+ -+ ret = load_journal_control_block(&sbinfo->journal_header, &loc->header); -+ -+ if (ret) -+ return ret; -+ -+ ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer); -+ -+ if (ret) { -+ unload_journal_control_block(&sbinfo->journal_header); -+ } -+ -+ return ret; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/wander.h linux-2.6.20/fs/reiser4/wander.h ---- linux-2.6.20.orig/fs/reiser4/wander.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/wander.h 2007-05-06 14:50:43.903039466 +0400 -@@ -0,0 +1,135 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined (__FS_REISER4_WANDER_H__) -+#define __FS_REISER4_WANDER_H__ -+ -+#include "dformat.h" -+ -+#include /* for struct super_block */ -+ -+/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */ -+ -+#define TX_HEADER_MAGIC "TxMagic4" -+#define WANDER_RECORD_MAGIC "LogMagc4" -+ -+#define TX_HEADER_MAGIC_SIZE (8) -+#define WANDER_RECORD_MAGIC_SIZE (8) -+ -+/* journal header block format */ -+struct journal_header { -+ /* last written transaction head location */ -+ d64 last_committed_tx; -+}; -+ -+typedef struct journal_location { -+ reiser4_block_nr footer; -+ reiser4_block_nr header; -+} journal_location; -+ -+/* The wander.c head comment describes usage and semantic of all these structures */ -+/* journal footer block format */ -+struct journal_footer { -+ /* last flushed transaction location. 
*/ -+ /* This block number is no more valid after the transaction it points -+ to gets flushed, this number is used only at journal replaying time -+ for detection of the end of on-disk list of committed transactions -+ which were not flushed completely */ -+ d64 last_flushed_tx; -+ -+ /* free block counter is written in journal footer at transaction -+ flushing , not in super block because free blocks counter is logged -+ by another way than super block fields (root pointer, for -+ example). */ -+ d64 free_blocks; -+ -+ /* number of used OIDs and maximal used OID are logged separately from -+ super block */ -+ d64 nr_files; -+ d64 next_oid; -+}; -+ -+/* Each wander record (except the first one) has unified format with wander -+ record header followed by an array of log entries */ -+struct wander_record_header { -+ /* when there is no predefined location for wander records, this magic -+ string should help reiser4fsck. */ -+ char magic[WANDER_RECORD_MAGIC_SIZE]; -+ -+ /* transaction id */ -+ d64 id; -+ -+ /* total number of wander records in current transaction */ -+ d32 total; -+ -+ /* this block number in transaction */ -+ d32 serial; -+ -+ /* number of previous block in commit */ -+ d64 next_block; -+}; -+ -+/* The first wander record (transaction head) of written transaction has the -+ special format */ -+struct tx_header { -+ /* magic string makes first block in transaction different from other -+ logged blocks, it should help fsck. 
*/ -+ char magic[TX_HEADER_MAGIC_SIZE]; -+ -+ /* transaction id */ -+ d64 id; -+ -+ /* total number of records (including this first tx head) in the -+ transaction */ -+ d32 total; -+ -+ /* align next field to 8-byte boundary; this field always is zero */ -+ d32 padding; -+ -+ /* block number of previous transaction head */ -+ d64 prev_tx; -+ -+ /* next wander record location */ -+ d64 next_block; -+ -+ /* committed versions of free blocks counter */ -+ d64 free_blocks; -+ -+ /* number of used OIDs (nr_files) and maximal used OID are logged -+ separately from super block */ -+ d64 nr_files; -+ d64 next_oid; -+}; -+ -+/* A transaction gets written to disk as a set of wander records (each wander -+ record size is fs block) */ -+ -+/* As was said above, a wander record begins with a wander record header. The rest of the wander record is filled by these log entries; unused space is filled -+ by zeroes */ -+struct wander_entry { -+ d64 original; /* block original location */ -+ d64 wandered; /* block wandered location */ -+}; -+ -+/* REISER4 JOURNAL WRITER FUNCTIONS */ -+ -+extern int reiser4_write_logs(long *); -+extern int reiser4_journal_replay(struct super_block *); -+extern int reiser4_journal_recover_sb_data(struct super_block *); -+ -+extern int reiser4_init_journal_info(struct super_block *); -+extern void reiser4_done_journal_info(struct super_block *); -+ -+extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int); -+ -+#endif /* __FS_REISER4_WANDER_H__ */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/writeout.h linux-2.6.20/fs/reiser4/writeout.h ---- linux-2.6.20.orig/fs/reiser4/writeout.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/writeout.h 2007-05-06 14:50:43.907040716 +0400 -@@ -0,0 +1,21 @@ -+/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined (__FS_REISER4_WRITEOUT_H__) -+ -+#define WRITEOUT_SINGLE_STREAM (0x1) -+#define WRITEOUT_FOR_PAGE_RECLAIM (0x2) -+#define WRITEOUT_BARRIER (0x4) -+ -+extern int reiser4_get_writeout_flags(void); -+ -+#endif /* __FS_REISER4_WRITEOUT_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/znode.c linux-2.6.20/fs/reiser4/znode.c ---- linux-2.6.20.orig/fs/reiser4/znode.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/znode.c 2007-05-06 14:50:43.907040716 +0400 -@@ -0,0 +1,1029 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+/* Znode manipulation functions. */ -+/* Znode is the in-memory header for a tree node. It is stored -+ separately from the node itself so that it does not get written to -+ disk. In this respect znode is like buffer head or page head. We -+ also use znodes for additional reiser4 specific purposes: -+ -+ . they are organized into tree structure which is a part of whole -+ reiser4 tree. -+ . they are used to implement node grained locking -+ . they are used to keep additional state associated with a -+ node -+ . they contain links to lists used by the transaction manager -+ -+ Znode is attached to some variable "block number" which is instance of -+ fs/reiser4/tree.h:reiser4_block_nr type. 
Znode can exist without -+ appropriate node being actually loaded in memory. Existence of znode itself -+ is regulated by reference count (->x_count) in it. Each time thread -+ acquires reference to znode through call to zget(), ->x_count is -+ incremented and decremented on call to zput(). Data (content of node) are -+ brought in memory through call to zload(), which also increments ->d_count -+ reference counter. zload can block waiting on IO. Call to zrelse() -+ decreases this counter. Also, ->c_count keeps track of number of child -+ znodes and prevents parent znode from being recycled until all of its -+ children are. ->c_count is decremented whenever child goes out of existence -+ (being actually recycled in zdestroy()) which can be some time after last -+ reference to this child dies if we support some form of LRU cache for -+ znodes. -+ -+*/ -+/* EVERY ZNODE'S STORY -+ -+ 1. His infancy. -+ -+ Once upon a time, the znode was born deep inside of zget() by call to -+ zalloc(). At the return from zget() znode had: -+ -+ . reference counter (x_count) of 1 -+ . assigned block number, marked as used in bitmap -+ . pointer to parent znode. Root znode parent pointer points -+ to its father: "fake" znode. This, in turn, has NULL parent pointer. -+ . hash table linkage -+ . no data loaded from disk -+ . no node plugin -+ . no sibling linkage -+ -+ 2. His childhood -+ -+ Each node is either brought into memory as a result of tree traversal, or -+ created afresh, creation of the root being a special case of the latter. In -+ either case it's inserted into sibling list. This will typically require -+ some ancillary tree traversing, but ultimately both sibling pointers will -+ exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in -+ zjnode.state. -+ -+ 3. His youth. -+ -+ If znode is bound to already existing node in a tree, its content is read -+ from the disk by call to zload(). 
At that moment, JNODE_LOADED bit is set -+ in zjnode.state and zdata() function starts to return non null for this -+ znode. zload() further calls zparse() that determines which node layout -+ this node is rendered in, and sets ->nplug on success. -+ -+ If znode is for new node just created, memory for it is allocated and -+ zinit_new() function is called to initialise data, according to selected -+ node layout. -+ -+ 4. His maturity. -+ -+ After this point, znode lingers in memory for some time. Threads can -+ acquire references to znode either by blocknr through call to zget(), or by -+ following a pointer to unallocated znode from internal item. Each time -+ reference to znode is obtained, x_count is increased. Thread can read/write -+ lock znode. Znode data can be loaded through calls to zload(), d_count will -+ be increased appropriately. If all references to znode are released -+ (x_count drops to 0), znode is not recycled immediately. Rather, it is -+ still cached in the hash table in the hope that it will be accessed -+ shortly. -+ -+ There are two ways in which znode existence can be terminated: -+ -+ . sudden death: node bound to this znode is removed from the tree -+ . overpopulation: znode is purged out of memory due to memory pressure -+ -+ 5. His death. -+ -+ Death is a complex process. -+ -+ When we irrevocably commit ourselves to decision to remove node from the -+ tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding -+ znode. This is done either in ->kill_hook() of internal item or in -+ reiser4_kill_root() function when tree root is removed. -+ -+ At this moment znode still has: -+ -+ . locks held on it, necessarily write ones -+ . references to it -+ . disk block assigned to it -+ . data loaded from the disk -+ . pending requests for lock -+ -+ But once JNODE_HEARD_BANSHEE bit is set, last call to unlock_znode() does node -+ deletion. Node deletion includes two phases. 
First all ways to get -+ references to that znode (sibling and parent links and hash lookup using -+ block number stored in parent node) should be deleted -- it is done through -+ sibling_list_remove(), also we assume that nobody uses down link from -+ parent node due to its nonexistence or proper parent node locking and -+ nobody uses parent pointers from children due to absence of them. Second we -+ invalidate all pending lock requests which still are on znode's lock -+ request queue, this is done by reiser4_invalidate_lock(). Another -+ JNODE_IS_DYING znode status bit is used to invalidate pending lock requests. -+ Once it is set, all requesters are forced to return -EINVAL from -+ longterm_lock_znode(). Future locking attempts are not possible because all -+ ways to get references to that znode are removed already. Last, node is -+ uncaptured from transaction. -+ -+ When last reference to the dying znode is just about to be released, -+ block number for this znode is released and znode is removed from the -+ hash table. -+ -+ Now znode can be recycled. -+ -+ [it's possible to free bitmap block and remove znode from the hash -+ table when last lock is released. This will result in having -+ referenced but completely orphaned znode] -+ -+ 6. Limbo -+ -+ As has been mentioned above, znodes with reference counter 0 are -+ still cached in a hash table. Once memory pressure increases they are -+ purged out of there [this requires something like LRU list for -+ efficient implementation. LRU list would also greatly simplify -+ implementation of coord cache that would in this case morph to just -+ scanning some initial segment of LRU list]. Data loaded into -+ unreferenced znode are flushed back to the durable storage if -+ necessary and memory is freed. Znodes themselves can be recycled at -+ this point too. 
-+ -+*/ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/plugin_header.h" -+#include "plugin/node/node.h" -+#include "plugin/plugin.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "tree_walk.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include -+#include -+#include -+#include -+ -+static z_hash_table *get_htable(reiser4_tree *, -+ const reiser4_block_nr * const blocknr); -+static z_hash_table *znode_get_htable(const znode *); -+static void zdrop(znode *); -+ -+/* hash table support */ -+ -+/* compare two block numbers for equality. Used by hash-table macros */ -+static inline int -+blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2) -+{ -+ assert("nikita-534", b1 != NULL); -+ assert("nikita-535", b2 != NULL); -+ -+ return *b1 == *b2; -+} -+ -+/* Hash znode by block number. Used by hash-table macros */ -+/* Audited by: umka (2002.06.11) */ -+static inline __u32 -+blknrhashfn(z_hash_table * table, const reiser4_block_nr * b) -+{ -+ assert("nikita-536", b != NULL); -+ -+ return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1); -+} -+ -+/* The hash table definition */ -+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get()) -+#define KFREE(ptr, size) kfree(ptr) -+TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z, -+ blknrhashfn, blknreq); -+#undef KFREE -+#undef KMALLOC -+ -+/* slab for znodes */ -+static struct kmem_cache *znode_cache; -+ -+int znode_shift_order; -+ -+/** -+ * init_znodes - create znode cache -+ * -+ * Initializes slab cache of znodes. It is part of reiser4 module initialization. 
-+ */ -+int init_znodes(void) -+{ -+ znode_cache = kmem_cache_create("znode", sizeof(znode), 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, NULL, NULL); -+ if (znode_cache == NULL) -+ return RETERR(-ENOMEM); -+ -+ for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode); -+ ++znode_shift_order); -+ --znode_shift_order; -+ return 0; -+} -+ -+/** -+ * done_znodes - delete znode cache -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void done_znodes(void) -+{ -+ destroy_reiser4_cache(&znode_cache); -+} -+ -+/* call this to initialise tree of znodes */ -+int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ ) -+{ -+ int result; -+ assert("umka-050", tree != NULL); -+ -+ rwlock_init(&tree->dk_lock); -+ -+ result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE); -+ if (result != 0) -+ return result; -+ result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE); -+ return result; -+} -+ -+/* free this znode */ -+void zfree(znode * node /* znode to free */ ) -+{ -+ assert("nikita-465", node != NULL); -+ assert("nikita-2120", znode_page(node) == NULL); -+ assert("nikita-2301", list_empty_careful(&node->lock.owners)); -+ assert("nikita-2302", list_empty_careful(&node->lock.requestors)); -+ assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) && -+ NODE_LIST(ZJNODE(node)) == NOT_CAPTURED)); -+ assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes)); -+ assert("nikita-3293", !znode_is_right_connected(node)); -+ assert("nikita-3294", !znode_is_left_connected(node)); -+ assert("nikita-3295", node->left == NULL); -+ assert("nikita-3296", node->right == NULL); -+ -+ /* not yet phash_jnode_destroy(ZJNODE(node)); */ -+ -+ kmem_cache_free(znode_cache, node); -+} -+ -+/* call this to free tree of znodes */ -+void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ ) -+{ -+ znode *node; -+ znode *next; -+ z_hash_table *ztable; -+ -+ /* scan znode 
hash-tables and kill all znodes, then free hash tables -+ * themselves. */ -+ -+ assert("nikita-795", tree != NULL); -+ -+ ztable = &tree->zhash_table; -+ -+ if (ztable->_table != NULL) { -+ for_all_in_htable(ztable, z, node, next) { -+ node->c_count = 0; -+ node->in_parent.node = NULL; -+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0); -+ zdrop(node); -+ } -+ -+ z_hash_done(&tree->zhash_table); -+ } -+ -+ ztable = &tree->zfake_table; -+ -+ if (ztable->_table != NULL) { -+ for_all_in_htable(ztable, z, node, next) { -+ node->c_count = 0; -+ node->in_parent.node = NULL; -+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0); -+ zdrop(node); -+ } -+ -+ z_hash_done(&tree->zfake_table); -+ } -+} -+ -+/* ZNODE STRUCTURES */ -+ -+/* allocate fresh znode */ -+znode *zalloc(gfp_t gfp_flag /* allocation flag */ ) -+{ -+ znode *node; -+ -+ node = kmem_cache_alloc(znode_cache, gfp_flag); -+ return node; -+} -+ -+/* Initialize fields of znode -+ @node: znode to initialize; -+ @parent: parent znode; -+ @tree: tree we are in. */ -+void zinit(znode * node, const znode * parent, reiser4_tree * tree) -+{ -+ assert("nikita-466", node != NULL); -+ assert("umka-268", current_tree != NULL); -+ -+ memset(node, 0, sizeof *node); -+ -+ assert("umka-051", tree != NULL); -+ -+ jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK); -+ reiser4_init_lock(&node->lock); -+ init_parent_coord(&node->in_parent, parent); -+} -+ -+/* -+ * remove znode from indices. This is called jput() when last reference on -+ * znode is released. 
-+ */ -+void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree) -+{ -+ assert("nikita-2108", node != NULL); -+ assert("nikita-470", node->c_count == 0); -+ assert_rw_write_locked(&(tree->tree_lock)); -+ -+ /* remove reference to this znode from cbk cache */ -+ cbk_cache_invalidate(node, tree); -+ -+ /* update c_count of parent */ -+ if (znode_parent(node) != NULL) { -+ assert("nikita-472", znode_parent(node)->c_count > 0); -+ /* father, onto your hands I forward my spirit... */ -+ znode_parent(node)->c_count--; -+ node->in_parent.node = NULL; -+ } else { -+ /* orphaned znode?! Root? */ -+ } -+ -+ /* remove znode from hash-table */ -+ z_hash_remove_rcu(znode_get_htable(node), node); -+} -+ -+/* zdrop() -- Remove znode from the tree. -+ -+ This is called when znode is removed from the memory. */ -+static void zdrop(znode * node /* znode to finish with */ ) -+{ -+ jdrop(ZJNODE(node)); -+} -+ -+/* -+ * put znode into right place in the hash table. This is called by relocate -+ * code. 
-+ */ -+int znode_rehash(znode * node /* node to rehash */ , -+ const reiser4_block_nr * new_block_nr /* new block number */ ) -+{ -+ z_hash_table *oldtable; -+ z_hash_table *newtable; -+ reiser4_tree *tree; -+ -+ assert("nikita-2018", node != NULL); -+ -+ tree = znode_get_tree(node); -+ oldtable = znode_get_htable(node); -+ newtable = get_htable(tree, new_block_nr); -+ -+ write_lock_tree(tree); -+ /* remove znode from hash-table */ -+ z_hash_remove_rcu(oldtable, node); -+ -+ /* assertion no longer valid due to RCU */ -+ /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */ -+ -+ /* update blocknr */ -+ znode_set_block(node, new_block_nr); -+ node->zjnode.key.z = *new_block_nr; -+ -+ /* insert it into hash */ -+ z_hash_insert_rcu(newtable, node); -+ write_unlock_tree(tree); -+ return 0; -+} -+ -+/* ZNODE LOOKUP, GET, PUT */ -+ -+/* zlook() - get znode with given block_nr in a hash table or return NULL -+ -+ If result is non-NULL then the znode's x_count is incremented. Internal version -+ accepts pre-computed hash index. The hash table is accessed under caller's -+ tree->hash_lock. 
-+*/ -+znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr) -+{ -+ znode *result; -+ __u32 hash; -+ z_hash_table *htable; -+ -+ assert("jmacd-506", tree != NULL); -+ assert("jmacd-507", blocknr != NULL); -+ -+ htable = get_htable(tree, blocknr); -+ hash = blknrhashfn(htable, blocknr); -+ -+ rcu_read_lock(); -+ result = z_hash_find_index(htable, hash, blocknr); -+ -+ if (result != NULL) { -+ add_x_ref(ZJNODE(result)); -+ result = znode_rip_check(tree, result); -+ } -+ rcu_read_unlock(); -+ -+ return result; -+} -+ -+/* return hash table where znode with block @blocknr is (or should be) -+ * stored */ -+static z_hash_table *get_htable(reiser4_tree * tree, -+ const reiser4_block_nr * const blocknr) -+{ -+ z_hash_table *table; -+ if (is_disk_addr_unallocated(blocknr)) -+ table = &tree->zfake_table; -+ else -+ table = &tree->zhash_table; -+ return table; -+} -+ -+/* return hash table where znode @node is (or should be) stored */ -+static z_hash_table *znode_get_htable(const znode * node) -+{ -+ return get_htable(znode_get_tree(node), znode_get_block(node)); -+} -+ -+/* zget() - get znode from hash table, allocating it if necessary. -+ -+ First a call to zlook, locating a x-referenced znode if one -+ exists. If znode is not found, allocate new one and return. Result -+ is returned with x_count reference increased. -+ -+ LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK -+ LOCK ORDERING: NONE -+*/ -+znode *zget(reiser4_tree * tree, -+ const reiser4_block_nr * const blocknr, -+ znode * parent, tree_level level, gfp_t gfp_flag) -+{ -+ znode *result; -+ __u32 hashi; -+ -+ z_hash_table *zth; -+ -+ assert("jmacd-512", tree != NULL); -+ assert("jmacd-513", blocknr != NULL); -+ assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT); -+ -+ zth = get_htable(tree, blocknr); -+ hashi = blknrhashfn(zth, blocknr); -+ -+ /* NOTE-NIKITA address-as-unallocated-blocknr still is not -+ implemented. 
*/ -+ -+ z_hash_prefetch_bucket(zth, hashi); -+ -+ rcu_read_lock(); -+ /* Find a matching BLOCKNR in the hash table. If the znode is found, -+ we obtain an reference (x_count) but the znode remains unlocked. -+ Have to worry about race conditions later. */ -+ result = z_hash_find_index(zth, hashi, blocknr); -+ /* According to the current design, the hash table lock protects new -+ znode references. */ -+ if (result != NULL) { -+ add_x_ref(ZJNODE(result)); -+ /* NOTE-NIKITA it should be so, but special case during -+ creation of new root makes such assertion highly -+ complicated. */ -+ assert("nikita-2131", 1 || znode_parent(result) == parent || -+ (ZF_ISSET(result, JNODE_ORPHAN) -+ && (znode_parent(result) == NULL))); -+ result = znode_rip_check(tree, result); -+ } -+ -+ rcu_read_unlock(); -+ -+ if (!result) { -+ znode *shadow; -+ -+ result = zalloc(gfp_flag); -+ if (!result) { -+ return ERR_PTR(RETERR(-ENOMEM)); -+ } -+ -+ zinit(result, parent, tree); -+ ZJNODE(result)->blocknr = *blocknr; -+ ZJNODE(result)->key.z = *blocknr; -+ result->level = level; -+ -+ write_lock_tree(tree); -+ -+ shadow = z_hash_find_index(zth, hashi, blocknr); -+ if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) { -+ jnode_list_remove(ZJNODE(result)); -+ zfree(result); -+ result = shadow; -+ } else { -+ result->version = znode_build_version(tree); -+ z_hash_insert_index_rcu(zth, hashi, result); -+ -+ if (parent != NULL) -+ ++parent->c_count; -+ } -+ -+ add_x_ref(ZJNODE(result)); -+ -+ write_unlock_tree(tree); -+ } -+#if REISER4_DEBUG -+ if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0) -+ reiser4_check_block(blocknr, 1); -+#endif -+ /* Check for invalid tree level, return -EIO */ -+ if (unlikely(znode_get_level(result) != level)) { -+ warning("jmacd-504", -+ "Wrong level for cached block %llu: %i expecting %i", -+ (unsigned long long)(*blocknr), znode_get_level(result), -+ level); -+ zput(result); -+ return ERR_PTR(RETERR(-EIO)); -+ } -+ -+ assert("nikita-1227", 
znode_invariant(result)); -+ -+ return result; -+} -+ -+/* ZNODE PLUGINS/DATA */ -+ -+/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is -+ stored at the fixed offset from the beginning of the node. */ -+static node_plugin *znode_guess_plugin(const znode * node /* znode to guess -+ * plugin of */ ) -+{ -+ reiser4_tree *tree; -+ -+ assert("nikita-1053", node != NULL); -+ assert("nikita-1055", zdata(node) != NULL); -+ -+ tree = znode_get_tree(node); -+ assert("umka-053", tree != NULL); -+ -+ if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) { -+ return tree->nplug; -+ } else { -+ return node_plugin_by_disk_id -+ (tree, &((common_node_header *) zdata(node))->plugin_id); -+#ifdef GUESS_EXISTS -+ reiser4_plugin *plugin; -+ -+ /* NOTE-NIKITA add locking here when dynamic plugins will be -+ * implemented */ -+ for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) { -+ if ((plugin->u.node.guess != NULL) -+ && plugin->u.node.guess(node)) -+ return plugin; -+ } -+ warning("nikita-1057", "Cannot guess node plugin"); -+ print_znode("node", node); -+ return NULL; -+#endif -+ } -+} -+ -+/* parse node header and install ->node_plugin */ -+int zparse(znode * node /* znode to parse */ ) -+{ -+ int result; -+ -+ assert("nikita-1233", node != NULL); -+ assert("nikita-2370", zdata(node) != NULL); -+ -+ if (node->nplug == NULL) { -+ node_plugin *nplug; -+ -+ nplug = znode_guess_plugin(node); -+ if (likely(nplug != NULL)) { -+ result = nplug->parse(node); -+ if (likely(result == 0)) -+ node->nplug = nplug; -+ } else { -+ result = RETERR(-EIO); -+ } -+ } else -+ result = 0; -+ return result; -+} -+ -+/* zload with readahead */ -+int zload_ra(znode * node /* znode to load */ , ra_info_t * info) -+{ -+ int result; -+ -+ assert("nikita-484", node != NULL); -+ assert("nikita-1377", znode_invariant(node)); -+ assert("jmacd-7771", !znode_above_root(node)); -+ assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0); -+ assert("nikita-3016", 
reiser4_schedulable()); -+ -+ if (info) -+ formatted_readahead(node, info); -+ -+ result = jload(ZJNODE(node)); -+ assert("nikita-1378", znode_invariant(node)); -+ return result; -+} -+ -+/* load content of node into memory */ -+int zload(znode * node) -+{ -+ return zload_ra(node, NULL); -+} -+ -+/* call node plugin to initialise newly allocated node. */ -+int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags) -+{ -+ return jinit_new(ZJNODE(node), gfp_flags); -+} -+ -+/* drop reference to node data. When last reference is dropped, data are -+ unloaded. */ -+void zrelse(znode * node /* znode to release references to */ ) -+{ -+ assert("nikita-1381", znode_invariant(node)); -+ -+ jrelse(ZJNODE(node)); -+} -+ -+/* returns free space in node */ -+unsigned znode_free_space(znode * node /* znode to query */ ) -+{ -+ assert("nikita-852", node != NULL); -+ return node_plugin_by_node(node)->free_space(node); -+} -+ -+/* right delimiting key of znode */ -+reiser4_key *znode_get_rd_key(znode * node /* znode to query */ ) -+{ -+ assert("nikita-958", node != NULL); -+ assert_rw_locked(&(znode_get_tree(node)->dk_lock)); -+ assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk)); -+ assert("nikita-30671", node->rd_key_version != 0); -+ return &node->rd_key; -+} -+ -+/* left delimiting key of znode */ -+reiser4_key *znode_get_ld_key(znode * node /* znode to query */ ) -+{ -+ assert("nikita-974", node != NULL); -+ assert_rw_locked(&(znode_get_tree(node)->dk_lock)); -+ assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk)); -+ assert("nikita-30681", node->ld_key_version != 0); -+ return &node->ld_key; -+} -+ -+ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0); -+ ) -+ -+/* update right-delimiting key of @node */ -+reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key) -+{ -+ assert("nikita-2937", node != NULL); -+ assert("nikita-2939", key != NULL); -+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); -+ assert("nikita-3069", 
LOCK_CNT_GTZ(write_locked_dk)); -+ assert("nikita-2944", -+ znode_is_any_locked(node) || -+ znode_get_level(node) != LEAF_LEVEL || -+ keyge(key, &node->rd_key) || -+ keyeq(&node->rd_key, reiser4_min_key()) || -+ ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ -+ node->rd_key = *key; -+ ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version)); -+ return &node->rd_key; -+} -+ -+/* update left-delimiting key of @node */ -+reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key) -+{ -+ assert("nikita-2940", node != NULL); -+ assert("nikita-2941", key != NULL); -+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); -+ assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk)); -+ assert("nikita-2943", -+ znode_is_any_locked(node) || keyeq(&node->ld_key, -+ reiser4_min_key())); -+ -+ node->ld_key = *key; -+ ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version)); -+ return &node->ld_key; -+} -+ -+/* true if @key is inside key range for @node */ -+int znode_contains_key(znode * node /* znode to look in */ , -+ const reiser4_key * key /* key to look for */ ) -+{ -+ assert("nikita-1237", node != NULL); -+ assert("nikita-1238", key != NULL); -+ -+ /* left_delimiting_key <= key <= right_delimiting_key */ -+ return keyle(znode_get_ld_key(node), key) -+ && keyle(key, znode_get_rd_key(node)); -+} -+ -+/* same as znode_contains_key(), but lock dk lock */ -+int znode_contains_key_lock(znode * node /* znode to look in */ , -+ const reiser4_key * key /* key to look for */ ) -+{ -+ int result; -+ -+ assert("umka-056", node != NULL); -+ assert("umka-057", key != NULL); -+ -+ read_lock_dk(znode_get_tree(node)); -+ result = znode_contains_key(node, key); -+ read_unlock_dk(znode_get_tree(node)); -+ return result; -+} -+ -+/* get parent pointer, assuming tree is not locked */ -+znode *znode_parent_nolock(const znode * node /* child znode */ ) -+{ -+ assert("nikita-1444", node != NULL); -+ return node->in_parent.node; -+} -+ -+/* get parent pointer of 
znode */ -+znode *znode_parent(const znode * node /* child znode */ ) -+{ -+ assert("nikita-1226", node != NULL); -+ assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree)); -+ return znode_parent_nolock(node); -+} -+ -+/* detect uber znode used to protect in-superblock tree root pointer */ -+int znode_above_root(const znode * node /* znode to query */ ) -+{ -+ assert("umka-059", node != NULL); -+ -+ return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR); -+} -+ -+/* check that @node is root---that its block number is recorded in the tree as -+ that of root node */ -+#if REISER4_DEBUG -+static int znode_is_true_root(const znode * node /* znode to query */ ) -+{ -+ assert("umka-060", node != NULL); -+ assert("umka-061", current_tree != NULL); -+ -+ return disk_addr_eq(znode_get_block(node), -+ &znode_get_tree(node)->root_block); -+} -+#endif -+ -+/* check that @node is root */ -+int znode_is_root(const znode * node /* znode to query */ ) -+{ -+ assert("nikita-1206", node != NULL); -+ -+ return znode_get_level(node) == znode_get_tree(node)->height; -+} -+ -+/* Returns true if @node was just created by zget() and wasn't ever loaded -+ into memory. */ -+/* NIKITA-HANS: yes */ -+int znode_just_created(const znode * node) -+{ -+ assert("nikita-2188", node != NULL); -+ return (znode_page(node) == NULL); -+} -+ -+/* obtain updated ->znode_epoch. See seal.c for description. 
*/ -+__u64 znode_build_version(reiser4_tree * tree) -+{ -+ __u64 result; -+ -+ spin_lock(&tree->epoch_lock); -+ result = ++tree->znode_epoch; -+ spin_unlock(&tree->epoch_lock); -+ return result; -+} -+ -+void init_load_count(load_count * dh) -+{ -+ assert("nikita-2105", dh != NULL); -+ memset(dh, 0, sizeof *dh); -+} -+ -+void done_load_count(load_count * dh) -+{ -+ assert("nikita-2106", dh != NULL); -+ if (dh->node != NULL) { -+ for (; dh->d_ref > 0; --dh->d_ref) -+ zrelse(dh->node); -+ dh->node = NULL; -+ } -+} -+ -+static int incr_load_count(load_count * dh) -+{ -+ int result; -+ -+ assert("nikita-2110", dh != NULL); -+ assert("nikita-2111", dh->node != NULL); -+ -+ result = zload(dh->node); -+ if (result == 0) -+ ++dh->d_ref; -+ return result; -+} -+ -+int incr_load_count_znode(load_count * dh, znode * node) -+{ -+ assert("nikita-2107", dh != NULL); -+ assert("nikita-2158", node != NULL); -+ assert("nikita-2109", -+ ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0))); -+ -+ dh->node = node; -+ return incr_load_count(dh); -+} -+ -+int incr_load_count_jnode(load_count * dh, jnode * node) -+{ -+ if (jnode_is_znode(node)) { -+ return incr_load_count_znode(dh, JZNODE(node)); -+ } -+ return 0; -+} -+ -+void copy_load_count(load_count * new, load_count * old) -+{ -+ int ret = 0; -+ done_load_count(new); -+ new->node = old->node; -+ new->d_ref = 0; -+ -+ while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) { -+ } -+ -+ assert("jmacd-87589", ret == 0); -+} -+ -+void move_load_count(load_count * new, load_count * old) -+{ -+ done_load_count(new); -+ new->node = old->node; -+ new->d_ref = old->d_ref; -+ old->node = NULL; -+ old->d_ref = 0; -+} -+ -+/* convert parent pointer into coord */ -+void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord) -+{ -+ assert("nikita-3204", pcoord != NULL); -+ assert("nikita-3205", coord != NULL); -+ -+ coord_init_first_unit_nocheck(coord, pcoord->node); -+ coord_set_item_pos(coord, 
pcoord->item_pos); -+ coord->between = AT_UNIT; -+} -+ -+/* pack coord into parent_coord_t */ -+void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord) -+{ -+ assert("nikita-3206", pcoord != NULL); -+ assert("nikita-3207", coord != NULL); -+ -+ pcoord->node = coord->node; -+ pcoord->item_pos = coord->item_pos; -+} -+ -+/* Initialize a parent hint pointer. (parent hint pointer is a field in znode, -+ look for comments there) */ -+void init_parent_coord(parent_coord_t * pcoord, const znode * node) -+{ -+ pcoord->node = (znode *) node; -+ pcoord->item_pos = (unsigned short)~0; -+} -+ -+#if REISER4_DEBUG -+ -+/* debugging aid: znode invariant */ -+static int znode_invariant_f(const znode * node /* znode to check */ , -+ char const **msg /* where to store error -+ * message, if any */ ) -+{ -+#define _ergo(ant, con) \ -+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con))) -+ -+#define _equi(e1, e2) \ -+ ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2))) -+ -+#define _check(exp) ((*msg) = #exp, (exp)) -+ -+ return jnode_invariant_f(ZJNODE(node), msg) && -+ /* [znode-fake] invariant */ -+ /* fake znode doesn't have a parent, and */ -+ _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) && -+ /* there is another way to express this very check, and */ -+ _ergo(znode_above_root(node), znode_parent(node) == NULL) && -+ /* it has special block number, and */ -+ _ergo(znode_get_level(node) == 0, -+ disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) && -+ /* it is the only znode with such block number, and */ -+ _ergo(!znode_above_root(node) && znode_is_loaded(node), -+ !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) && -+ /* it is parent of the tree root node */ -+ _ergo(znode_is_true_root(node), -+ znode_above_root(znode_parent(node))) && -+ /* [znode-level] invariant */ -+ /* level of parent znode is one larger than that of child, -+ except for the fake znode, and */ -+ _ergo(znode_parent(node) && 
!znode_above_root(znode_parent(node)), -+ znode_get_level(znode_parent(node)) == -+ znode_get_level(node) + 1) && -+ /* left neighbor is at the same level, and */ -+ _ergo(znode_is_left_connected(node) && node->left != NULL, -+ znode_get_level(node) == znode_get_level(node->left)) && -+ /* right neighbor is at the same level */ -+ _ergo(znode_is_right_connected(node) && node->right != NULL, -+ znode_get_level(node) == znode_get_level(node->right)) && -+ /* [znode-connected] invariant */ -+ _ergo(node->left != NULL, znode_is_left_connected(node)) && -+ _ergo(node->right != NULL, znode_is_right_connected(node)) && -+ _ergo(!znode_is_root(node) && node->left != NULL, -+ znode_is_right_connected(node->left) && -+ node->left->right == node) && -+ _ergo(!znode_is_root(node) && node->right != NULL, -+ znode_is_left_connected(node->right) && -+ node->right->left == node) && -+ /* [znode-c_count] invariant */ -+ /* for any znode, c_count of its parent is greater than 0 */ -+ _ergo(znode_parent(node) != NULL && -+ !znode_above_root(znode_parent(node)), -+ znode_parent(node)->c_count > 0) && -+ /* leaves don't have children */ -+ _ergo(znode_get_level(node) == LEAF_LEVEL, -+ node->c_count == 0) && -+ _check(node->zjnode.jnodes.prev != NULL) && -+ _check(node->zjnode.jnodes.next != NULL) && -+ /* orphan doesn't have a parent */ -+ _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) && -+ /* [znode-modify] invariant */ -+ /* if znode is not write-locked, its checksum remains -+ * invariant */ -+ /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we -+ * cannot check this. 
*/ -+ /* [znode-refs] invariant */ -+ /* only referenced znode can be long-term locked */ -+ _ergo(znode_is_locked(node), -+ atomic_read(&ZJNODE(node)->x_count) != 0); -+} -+ -+/* debugging aid: check znode invariant and panic if it doesn't hold */ -+int znode_invariant(znode * node /* znode to check */ ) -+{ -+ char const *failed_msg; -+ int result; -+ -+ assert("umka-063", node != NULL); -+ assert("umka-064", current_tree != NULL); -+ -+ spin_lock_znode(node); -+ read_lock_tree(znode_get_tree(node)); -+ result = znode_invariant_f(node, &failed_msg); -+ if (!result) { -+ /* print_znode("corrupted node", node); */ -+ warning("jmacd-555", "Condition %s failed", failed_msg); -+ } -+ read_unlock_tree(znode_get_tree(node)); -+ spin_unlock_znode(node); -+ return result; -+} -+ -+/* return non-0 iff data are loaded into znode */ -+int znode_is_loaded(const znode * node /* znode to query */ ) -+{ -+ assert("nikita-497", node != NULL); -+ return jnode_is_loaded(ZJNODE(node)); -+} -+ -+unsigned long znode_times_locked(const znode * z) -+{ -+ return z->times_locked; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/fs/reiser4/znode.h linux-2.6.20/fs/reiser4/znode.h ---- linux-2.6.20.orig/fs/reiser4/znode.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.20/fs/reiser4/znode.h 2007-05-06 14:50:43.907040716 +0400 -@@ -0,0 +1,434 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Declaration of znode (Zam's node). See znode.c for more details. 
*/ -+ -+#ifndef __ZNODE_H__ -+#define __ZNODE_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/node/node.h" -+#include "jnode.h" -+#include "lock.h" -+#include "readahead.h" -+ -+#include -+#include -+#include /* for PAGE_CACHE_SIZE */ -+#include -+#include -+ -+/* znode tracks its position within parent (internal item in a parent node, -+ * that contains znode's block number). */ -+typedef struct parent_coord { -+ znode *node; -+ pos_in_node_t item_pos; -+} parent_coord_t; -+ -+/* &znode - node in a reiser4 tree. -+ -+ NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce -+ cacheline pressure. -+ -+ Locking: -+ -+ Long term: data in a disk node attached to this znode are protected -+ by long term, deadlock aware lock ->lock; -+ -+ Spin lock: the following fields are protected by the spin lock: -+ -+ ->lock -+ -+ Following fields are protected by the global tree lock: -+ -+ ->left -+ ->right -+ ->in_parent -+ ->c_count -+ -+ Following fields are protected by the global delimiting key lock (dk_lock): -+ -+ ->ld_key (to update ->ld_key long-term lock on the node is also required) -+ ->rd_key -+ -+ Following fields are protected by the long term lock: -+ -+ ->nr_items -+ -+ ->node_plugin is never changed once set. This means that after code made -+ itself sure that field is valid it can be accessed without any additional -+ locking. -+ -+ ->level is immutable. -+ -+ Invariants involving this data-type: -+ -+ [znode-fake] -+ [znode-level] -+ [znode-connected] -+ [znode-c_count] -+ [znode-refs] -+ [jnode-refs] -+ [jnode-queued] -+ [znode-modify] -+ -+ For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks. -+ Suggestions for how to do that are desired.*/ -+struct znode { -+ /* Embedded jnode. */ -+ jnode zjnode; -+ -+ /* contains three subfields, node, pos_in_node, and pos_in_unit. 
-+ -+ pos_in_node and pos_in_unit are only hints that are cached to -+ speed up lookups during balancing. They are not required to be up to -+ date. Synched in find_child_ptr(). -+ -+ This value allows us to avoid expensive binary searches. -+ -+ in_parent->node points to the parent of this node, and is NOT a -+ hint. -+ */ -+ parent_coord_t in_parent; -+ -+ /* -+ * sibling list pointers -+ */ -+ -+ /* left-neighbor */ -+ znode *left; -+ /* right-neighbor */ -+ znode *right; -+ -+ /* long term lock on node content. This lock supports deadlock -+ detection. See lock.c -+ */ -+ zlock lock; -+ -+ /* You cannot remove from memory a node that has children in -+ memory. This is because we rely on the fact that parent of given -+ node can always be reached without blocking for io. When reading a -+ node into memory you must increase the c_count of its parent, when -+ removing it from memory you must decrease the c_count. This makes -+ the code simpler, and the cases where it is suboptimal are truly -+ obscure. -+ */ -+ int c_count; -+ -+ /* plugin of node attached to this znode. NULL if znode is not -+ loaded. */ -+ node_plugin *nplug; -+ -+ /* version of znode data. This is increased on each modification. This -+ * is necessary to implement seals (see seal.[ch]) efficiently. */ -+ __u64 version; -+ -+ /* left delimiting key. Necessary to efficiently perform -+ balancing with node-level locking. Kept in memory only. */ -+ reiser4_key ld_key; -+ /* right delimiting key. */ -+ reiser4_key rd_key; -+ -+ /* znode's tree level */ -+ __u16 level; -+ /* number of items in this node. This field is modified by node -+ * plugin. 
*/ -+ __u16 nr_items; -+ -+#if REISER4_DEBUG -+ void *creator; -+ reiser4_key first_key; -+ unsigned long times_locked; -+ int left_version; /* when node->left was updated */ -+ int right_version; /* when node->right was updated */ -+ int ld_key_version; /* when node->ld_key was updated */ -+ int rd_key_version; /* when node->rd_key was updated */ -+#endif -+ -+} __attribute__ ((aligned(16))); -+ -+ON_DEBUG(extern atomic_t delim_key_version; -+ ) -+ -+/* In general I think these macros should not be exposed. */ -+#define znode_is_locked(node) (lock_is_locked(&node->lock)) -+#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock)) -+#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock)) -+#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock)) -+#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock)) -+#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode)) -+/* Macros for accessing the znode state. */ -+#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f)) -+#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f)) -+#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f)) -+extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block, -+ znode * parent, tree_level level, gfp_t gfp_flag); -+extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block); -+extern int zload(znode * node); -+extern int zload_ra(znode * node, ra_info_t * info); -+extern int zinit_new(znode * node, gfp_t gfp_flags); -+extern void zrelse(znode * node); -+extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block); -+ -+/* size of data in znode */ -+static inline unsigned -+znode_size(const znode * node UNUSED_ARG /* znode to query */ ) -+{ -+ assert("nikita-1416", node != NULL); -+ return PAGE_CACHE_SIZE; -+} -+ -+extern void parent_coord_to_coord(const parent_coord_t * pcoord, -+ coord_t * coord); -+extern void coord_to_parent_coord(const coord_t * coord, -+ parent_coord_t * pcoord); -+extern void 
init_parent_coord(parent_coord_t * pcoord, const znode * node); -+ -+extern unsigned znode_free_space(znode * node); -+ -+extern reiser4_key *znode_get_rd_key(znode * node); -+extern reiser4_key *znode_get_ld_key(znode * node); -+ -+extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key); -+extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key); -+ -+/* `connected' state checks */ -+static inline int znode_is_right_connected(const znode * node) -+{ -+ return ZF_ISSET(node, JNODE_RIGHT_CONNECTED); -+} -+ -+static inline int znode_is_left_connected(const znode * node) -+{ -+ return ZF_ISSET(node, JNODE_LEFT_CONNECTED); -+} -+ -+static inline int znode_is_connected(const znode * node) -+{ -+ return znode_is_right_connected(node) && znode_is_left_connected(node); -+} -+ -+extern int znode_shift_order; -+extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr); -+extern void znode_remove(znode *, reiser4_tree *); -+extern znode *znode_parent(const znode * node); -+extern znode *znode_parent_nolock(const znode * node); -+extern int znode_above_root(const znode * node); -+extern int init_znodes(void); -+extern void done_znodes(void); -+extern int znodes_tree_init(reiser4_tree * ztree); -+extern void znodes_tree_done(reiser4_tree * ztree); -+extern int znode_contains_key(znode * node, const reiser4_key * key); -+extern int znode_contains_key_lock(znode * node, const reiser4_key * key); -+extern unsigned znode_save_free_space(znode * node); -+extern unsigned znode_recover_free_space(znode * node); -+extern znode *zalloc(gfp_t gfp_flag); -+extern void zinit(znode *, const znode * parent, reiser4_tree *); -+extern int zparse(znode * node); -+ -+extern int znode_just_created(const znode * node); -+ -+extern void zfree(znode * node); -+ -+#if REISER4_DEBUG -+extern void print_znode(const char *prefix, const znode * node); -+#else -+#define print_znode( p, n ) noop -+#endif -+ -+/* Make it look like various znode 
functions exist instead of treating znodes as -+ jnodes in znode-specific code. */ -+#define znode_page(x) jnode_page ( ZJNODE(x) ) -+#define zdata(x) jdata ( ZJNODE(x) ) -+#define znode_get_block(x) jnode_get_block ( ZJNODE(x) ) -+#define znode_created(x) jnode_created ( ZJNODE(x) ) -+#define znode_set_created(x) jnode_set_created ( ZJNODE(x) ) -+#define znode_convertible(x) jnode_convertible (ZJNODE(x)) -+#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x)) -+ -+#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) ) -+#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) ) -+#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) ) -+#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) ) -+ -+#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) ) -+#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) ) -+#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) ) -+#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) ) -+#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) ) -+ -+#if REISER4_DEBUG -+extern int znode_x_count_is_protected(const znode * node); -+extern int znode_invariant(znode * node); -+#endif -+ -+/* acquire reference to @node */ -+static inline znode *zref(znode * node) -+{ -+ /* change of x_count from 0 to 1 is protected by tree spin-lock */ -+ return JZNODE(jref(ZJNODE(node))); -+} -+ -+/* release reference to @node */ -+static inline void zput(znode * node) -+{ -+ assert("nikita-3564", znode_invariant(node)); -+ jput(ZJNODE(node)); -+} -+ -+/* get the level field for a znode */ -+static inline tree_level znode_get_level(const znode * node) -+{ -+ return node->level; -+} -+ -+/* get the level field for a jnode */ -+static inline tree_level jnode_get_level(const jnode * node) -+{ -+ if (jnode_is_znode(node)) -+ return znode_get_level(JZNODE(node)); -+ else -+ /* unformatted nodes are all at the LEAF_LEVEL and for -+ "semi-formatted" nodes like bitmaps, level doesn't 
matter. */ -+ return LEAF_LEVEL; -+} -+ -+/* true if jnode is on leaf level */ -+static inline int jnode_is_leaf(const jnode * node) -+{ -+ if (jnode_is_znode(node)) -+ return (znode_get_level(JZNODE(node)) == LEAF_LEVEL); -+ if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK) -+ return 1; -+ return 0; -+} -+ -+/* return znode's tree */ -+static inline reiser4_tree *znode_get_tree(const znode * node) -+{ -+ assert("nikita-2692", node != NULL); -+ return jnode_get_tree(ZJNODE(node)); -+} -+ -+/* resolve race with zput */ -+static inline znode *znode_rip_check(reiser4_tree * tree, znode * node) -+{ -+ jnode *j; -+ -+ j = jnode_rip_sync(tree, ZJNODE(node)); -+ if (likely(j != NULL)) -+ node = JZNODE(j); -+ else -+ node = NULL; -+ return node; -+} -+ -+#if defined(REISER4_DEBUG) -+int znode_is_loaded(const znode * node /* znode to query */ ); -+#endif -+ -+extern __u64 znode_build_version(reiser4_tree * tree); -+ -+/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We -+ must load the data for a node in many places. We could do this by simply calling -+ zload() everywhere, the difficulty arises when we must release the loaded data by -+ calling zrelse. In a function with many possible error/return paths, it requires extra -+ work to figure out which exit paths must call zrelse and those which do not. The data -+ handle automatically calls zrelse for every zload that it is responsible for. In that -+ sense, it acts much like a lock_handle. -+*/ -+typedef struct load_count { -+ znode *node; -+ int d_ref; -+} load_count; -+ -+extern void init_load_count(load_count * lc); /* Initialize a load_count set the current node to NULL. */ -+extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */ -+extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). 
*/ -+extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as -+ * incr_load_count_znode, otherwise do nothing (unformatted nodes -+ * don't require zload/zrelse treatment). */ -+extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */ -+extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */ -+ -+/* Variable initializers for load_count. */ -+#define INIT_LOAD_COUNT ( load_count * ){ .node = NULL, .d_ref = 0 } -+#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 } -+/* A convenience macro for use in assertions or debug-only code, where loaded -+ data is only required to perform the debugging check. This macro -+ encapsulates an expression inside a pair of calls to zload()/zrelse(). */ -+#define WITH_DATA( node, exp ) \ -+({ \ -+ long __with_dh_result; \ -+ znode *__with_dh_node; \ -+ \ -+ __with_dh_node = ( node ); \ -+ __with_dh_result = zload( __with_dh_node ); \ -+ if( __with_dh_result == 0 ) { \ -+ __with_dh_result = ( long )( exp ); \ -+ zrelse( __with_dh_node ); \ -+ } \ -+ __with_dh_result; \ -+}) -+ -+/* Same as above, but accepts a return value in case zload fails. 
*/ -+#define WITH_DATA_RET( node, ret, exp ) \ -+({ \ -+ int __with_dh_result; \ -+ znode *__with_dh_node; \ -+ \ -+ __with_dh_node = ( node ); \ -+ __with_dh_result = zload( __with_dh_node ); \ -+ if( __with_dh_result == 0 ) { \ -+ __with_dh_result = ( int )( exp ); \ -+ zrelse( __with_dh_node ); \ -+ } else \ -+ __with_dh_result = ( ret ); \ -+ __with_dh_result; \ -+}) -+ -+#define WITH_COORD(coord, exp) \ -+({ \ -+ coord_t *__coord; \ -+ \ -+ __coord = (coord); \ -+ coord_clear_iplug(__coord); \ -+ WITH_DATA(__coord->node, exp); \ -+}) -+ -+#if REISER4_DEBUG -+#define STORE_COUNTERS \ -+ reiser4_lock_counters_info __entry_counters = \ -+ *reiser4_lock_counters() -+#define CHECK_COUNTERS \ -+ON_DEBUG_CONTEXT( \ -+({ \ -+ __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \ -+ __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \ -+ __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \ -+ assert("nikita-2159", \ -+ !memcmp(&__entry_counters, reiser4_lock_counters(), \ -+ sizeof __entry_counters)); \ -+}) ) -+ -+#else -+#define STORE_COUNTERS -+#define CHECK_COUNTERS noop -+#endif -+ -+/* __ZNODE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.20.orig/include/linux/fs.h linux-2.6.20/include/linux/fs.h ---- linux-2.6.20.orig/include/linux/fs.h 2007-05-06 15:04:41.352625543 +0400 -+++ linux-2.6.20/include/linux/fs.h 2007-05-06 14:50:43.911041966 +0400 -@@ -1165,6 +1165,8 @@ - void (*clear_inode) (struct inode *); - void (*umount_begin) (struct vfsmount *, int); - -+ void (*sync_inodes) (struct super_block *sb, -+ struct writeback_control *wbc); - int (*show_options)(struct seq_file *, struct vfsmount *); - int (*show_stats)(struct seq_file *, struct vfsmount *); - #ifdef CONFIG_QUOTA -@@ -1583,6 +1585,7 @@ - extern int invalidate_inode_pages2_range(struct address_space *mapping, - pgoff_t start, pgoff_t end); - extern int write_inode_now(struct inode *, int); -+extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *); - extern int filemap_fdatawrite(struct address_space *); - extern int filemap_flush(struct address_space *); - extern int filemap_fdatawait(struct address_space *); -diff -urN linux-2.6.20.orig/lib/radix-tree.c linux-2.6.20/lib/radix-tree.c ---- linux-2.6.20.orig/lib/radix-tree.c 2007-05-06 15:04:42.096858012 +0400 -+++ linux-2.6.20/lib/radix-tree.c 2007-05-06 14:50:43.915043216 +0400 -@@ -151,6 +151,7 @@ - out: - return ret; - } -+EXPORT_SYMBOL(radix_tree_preload); - - static inline void tag_set(struct radix_tree_node *node, unsigned int tag, - int offset) -diff -urN linux-2.6.20.orig/mm/filemap.c linux-2.6.20/mm/filemap.c ---- linux-2.6.20.orig/mm/filemap.c 2007-05-06 15:04:42.108861762 +0400 -+++ linux-2.6.20/mm/filemap.c 2007-05-06 14:50:43.919044465 +0400 -@@ -121,6 +121,7 @@ - mapping->nrpages--; - __dec_zone_page_state(page, NR_FILE_PAGES); - } -+EXPORT_SYMBOL(__remove_from_page_cache); - - void remove_from_page_cache(struct page *page) - { -@@ -132,6 +133,7 @@ - __remove_from_page_cache(page); - 
write_unlock_irq(&mapping->tree_lock); - } -+EXPORT_SYMBOL(remove_from_page_cache); - - static int sync_page(void *word) - { -@@ -738,6 +740,7 @@ - read_unlock_irq(&mapping->tree_lock); - return ret; - } -+EXPORT_SYMBOL(add_to_page_cache_lru); - - /** - * find_get_pages_contig - gang contiguous pagecache lookup -@@ -798,6 +801,7 @@ - read_unlock_irq(&mapping->tree_lock); - return ret; - } -+EXPORT_SYMBOL(find_get_pages); - - /** - * grab_cache_page_nowait - returns locked page at given index in given cache -@@ -855,6 +859,7 @@ - - ra->ra_pages /= 4; - } -+EXPORT_SYMBOL(find_get_pages_tag); - - /** - * do_generic_mapping_read - generic file read routine -diff -urN linux-2.6.20.orig/mm/readahead.c linux-2.6.20/mm/readahead.c ---- linux-2.6.20.orig/mm/readahead.c 2007-05-06 15:04:42.144873010 +0400 -+++ linux-2.6.20/mm/readahead.c 2007-05-06 14:50:43.919044465 +0400 -@@ -568,6 +568,7 @@ - ra->flags &= ~RA_FLAG_INCACHE; - ra->cache_hit = 0; - } -+EXPORT_SYMBOL_GPL(handle_ra_miss); - - /* - * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a -Files linux-2.6.20.orig/scripts/kconfig/mconf and linux-2.6.20/scripts/kconfig/mconf differ diff --git a/src/patches/reiser4-for-2.6.24.patch b/src/patches/reiser4-for-2.6.24.patch deleted file mode 100644 index 25363c0af1..0000000000 --- a/src/patches/reiser4-for-2.6.24.patch +++ /dev/null @@ -1,78231 +0,0 @@ -diff -urN linux-2.6.24.orig/arch/x86/lib/usercopy_32.c linux-2.6.24/arch/x86/lib/usercopy_32.c ---- linux-2.6.24.orig/arch/x86/lib/usercopy_32.c 2008-01-25 14:24:08.234127530 +0300 -+++ linux-2.6.24/arch/x86/lib/usercopy_32.c 2008-01-25 11:39:06.872191202 +0300 -@@ -817,6 +817,7 @@ - #endif - return n; - } -+EXPORT_SYMBOL(__copy_from_user_ll_nocache); - - unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, - unsigned long n) -@@ -831,6 +832,7 @@ - #endif - return n; - } -+EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); - - /** - * copy_to_user: - Copy a block of data into 
user space. -diff -urN linux-2.6.24.orig/Documentation/Changes linux-2.6.24/Documentation/Changes ---- linux-2.6.24.orig/Documentation/Changes 2007-10-10 00:31:38.000000000 +0400 -+++ linux-2.6.24/Documentation/Changes 2008-01-25 11:39:06.876192233 +0300 -@@ -36,6 +36,7 @@ - o e2fsprogs 1.29 # tune2fs - o jfsutils 1.1.3 # fsck.jfs -V - o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs -+o reiser4progs 1.0.0 # fsck.reiser4 -V - o xfsprogs 2.6.0 # xfs_db -V - o pcmciautils 004 # pccardctl -V - o quota-tools 3.09 # quota -V -@@ -145,6 +146,13 @@ - versions of mkreiserfs, resize_reiserfs, debugreiserfs and - reiserfsck. These utils work on both i386 and alpha platforms. - -+Reiser4progs -+------------ -+ -+The reiser4progs package contains utilities for the reiser4 file system. -+Detailed instructions are provided in the README file located at: -+. -+ - Xfsprogs - -------- - -@@ -323,6 +331,10 @@ - ------------- - o - -+Reiser4progs -+------------ -+o -+ - Xfsprogs - -------- - o -diff -urN linux-2.6.24.orig/Documentation/filesystems/reiser4.txt linux-2.6.24/Documentation/filesystems/reiser4.txt ---- linux-2.6.24.orig/Documentation/filesystems/reiser4.txt 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/Documentation/filesystems/reiser4.txt 2008-01-25 11:39:06.876192233 +0300 -@@ -0,0 +1,75 @@ -+Reiser4 filesystem -+================== -+Reiser4 is a file system based on dancing tree algorithms, and is -+described at http://www.namesys.com -+ -+ -+References -+========== -+web page http://namesys.com/v4/v4.html -+source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/ -+userland tools ftp://ftp.namesys.com/pub/reiser4progs/ -+install page http://www.namesys.com/install_v4.html -+ -+Compile options -+=============== -+Enable reiser4 debug mode -+ This checks everything imaginable while reiser4 -+ runs -+ -+Mount options -+============= -+tmgr.atom_max_size=N -+ Atoms containing more than N blocks will be forced to commit. -+ N is decimal. 
-+ Default is nr_free_pagecache_pages() / 2 at mount time. -+ -+tmgr.atom_max_age=N -+ Atoms older than N seconds will be forced to commit. N is decimal. -+ Default is 600. -+ -+tmgr.atom_max_flushers=N -+ Limit of concurrent flushers for one atom. 0 means no limit. -+ Default is 0. -+ -+tree.cbk_cache.nr_slots=N -+ Number of slots in the cbk cache. -+ -+flush.relocate_threshold=N -+ If flush finds more than N adjacent dirty leaf-level blocks it -+ will force them to be relocated. -+ Default is 64. -+ -+flush.relocate_distance=N -+ If flush finds can find a block allocation closer than at most -+ N from the preceder it will relocate to that position. -+ Default is 64. -+ -+flush.scan_maxnodes=N -+ The maximum number of nodes to scan left on a level during -+ flush. -+ Default is 10000. -+ -+optimal_io_size=N -+ Preferred IO size. This value is used to set st_blksize of -+ struct stat. -+ Default is 65536. -+ -+bsdgroups -+ Turn on BSD-style gid assignment. -+ -+32bittimes -+ By default file in reiser4 have 64 bit timestamps. Files -+ created when filesystem is mounted with 32bittimes mount -+ option will get 32 bit timestamps. -+ -+mtflush -+ Turn off concurrent flushing. -+ -+nopseudo -+ Disable pseudo files support. See -+ http://namesys.com/v4/pseudo.html for more about pseudo files. -+ -+dont_load_bitmap -+ Don't load all bitmap blocks at mount time, it is useful for -+ machines with tiny RAM and large disks. -diff -urN linux-2.6.24.orig/fs/fs-writeback.c linux-2.6.24/fs/fs-writeback.c ---- linux-2.6.24.orig/fs/fs-writeback.c 2008-01-25 14:24:18.344724018 +0300 -+++ linux-2.6.24/fs/fs-writeback.c 2008-01-25 11:39:06.876192233 +0300 -@@ -386,8 +386,6 @@ - * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so - * that it can be located for waiting on in __writeback_single_inode(). - * -- * Called under inode_lock. -- * - * If `bdi' is non-zero then we're being asked to writeback a specific queue. 
- * This function assumes that the blockdev superblock's inodes are backed by - * a variety of queues, so all inodes are searched. For other superblocks, -@@ -403,11 +401,13 @@ - * on the writer throttling path, and we get decent balancing between many - * throttled threads: we don't want them all piling up on inode_sync_wait. - */ --static void --sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) -+void -+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) - { - const unsigned long start = jiffies; /* livelock avoidance */ - -+ spin_lock(&inode_lock); -+ - if (!wbc->for_kupdate || list_empty(&sb->s_io)) - queue_io(sb, wbc->older_than_this); - -@@ -482,8 +482,19 @@ - if (wbc->nr_to_write <= 0) - break; - } -+ spin_unlock(&inode_lock); - return; /* Leave any unwritten inodes on s_io */ - } -+EXPORT_SYMBOL(generic_sync_sb_inodes); -+ -+static void -+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) -+{ -+ if (sb->s_op->sync_inodes) -+ sb->s_op->sync_inodes(sb, wbc); -+ else -+ generic_sync_sb_inodes(sb, wbc); -+} - - /* - * Start writeback of dirty pagecache data against all unlocked inodes. -@@ -524,11 +535,8 @@ - * be unmounted by the time it is released. 
- */ - if (down_read_trylock(&sb->s_umount)) { -- if (sb->s_root) { -- spin_lock(&inode_lock); -+ if (sb->s_root) - sync_sb_inodes(sb, wbc); -- spin_unlock(&inode_lock); -- } - up_read(&sb->s_umount); - } - spin_lock(&sb_lock); -@@ -566,9 +574,7 @@ - (inodes_stat.nr_inodes - inodes_stat.nr_unused) + - nr_dirty + nr_unstable; - wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ -- spin_lock(&inode_lock); - sync_sb_inodes(sb, &wbc); -- spin_unlock(&inode_lock); - } - - /* -diff -urN linux-2.6.24.orig/fs/Kconfig linux-2.6.24/fs/Kconfig ---- linux-2.6.24.orig/fs/Kconfig 2008-01-25 14:24:17.976629488 +0300 -+++ linux-2.6.24/fs/Kconfig 2008-01-25 11:39:06.880193263 +0300 -@@ -273,6 +273,8 @@ - default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y - default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m - -+source "fs/reiser4/Kconfig" -+ - config REISERFS_FS - tristate "Reiserfs support" - help -diff -urN linux-2.6.24.orig/fs/Makefile linux-2.6.24/fs/Makefile ---- linux-2.6.24.orig/fs/Makefile 2008-01-25 14:24:17.980630515 +0300 -+++ linux-2.6.24/fs/Makefile 2008-01-25 11:39:06.884194294 +0300 -@@ -66,6 +66,7 @@ - - # Do not add any filesystems before this line - obj-$(CONFIG_REISERFS_FS) += reiserfs/ -+obj-$(CONFIG_REISER4_FS) += reiser4/ - obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 - obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev - obj-$(CONFIG_JBD) += jbd/ -diff -urN linux-2.6.24.orig/fs/reiser4/as_ops.c linux-2.6.24/fs/reiser4/as_ops.c ---- linux-2.6.24.orig/fs/reiser4/as_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/as_ops.c 2008-01-25 11:39:06.884194294 +0300 -@@ -0,0 +1,377 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Interface to VFS. Reiser4 address_space_operations are defined here. 
*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/file/file.h" -+#include "plugin/security/perm.h" -+#include "plugin/disk_format/disk_format.h" -+#include "plugin/plugin.h" -+#include "plugin/plugin_set.h" -+#include "plugin/object.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "page_cache.h" -+#include "ktxnmgrd.h" -+#include "super.h" -+#include "reiser4.h" -+#include "entd.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* address space operations */ -+ -+/** -+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting -+ * @page: page to be dirtied -+ * -+ * Operation of struct address_space_operations. This implementation is used by -+ * unix and cryptcompress file plugins. -+ * -+ * This is called when reiser4 page gets dirtied outside of reiser4, for -+ * example, when dirty bit is moved from pte to physical page. -+ * -+ * Tags page in the mapping's page tree with special tag so that it is possible -+ * to do all the reiser4 specific work wrt dirty pages (jnode creation, -+ * capturing by an atom) later because it can not be done in the contexts where -+ * set_page_dirty is called. 
-+ */ -+int reiser4_set_page_dirty(struct page *page) -+{ -+ /* this page can be unformatted only */ -+ assert("vs-1734", (page->mapping && -+ page->mapping->host && -+ reiser4_get_super_fake(page->mapping->host->i_sb) != -+ page->mapping->host -+ && reiser4_get_cc_fake(page->mapping->host->i_sb) != -+ page->mapping->host -+ && reiser4_get_bitmap_fake(page->mapping->host->i_sb) != -+ page->mapping->host)); -+ -+ if (!TestSetPageDirty(page)) { -+ struct address_space *mapping = page->mapping; -+ -+ if (mapping) { -+ write_lock_irq(&mapping->tree_lock); -+ -+ /* check for race with truncate */ -+ if (page->mapping) { -+ assert("vs-1652", page->mapping == mapping); -+ if (mapping_cap_account_dirty(mapping)) -+ inc_zone_page_state(page, -+ NR_FILE_DIRTY); -+ radix_tree_tag_set(&mapping->page_tree, -+ page->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ } -+ write_unlock_irq(&mapping->tree_lock); -+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); -+ } -+ } -+ return 0; -+} -+ -+/* ->invalidatepage method for reiser4 */ -+ -+/* -+ * this is called for each truncated page from -+ * truncate_inode_pages()->truncate_{complete,partial}_page(). -+ * -+ * At the moment of call, page is under lock, and outstanding io (if any) has -+ * completed. -+ */ -+ -+/** -+ * reiser4_invalidatepage -+ * @page: page to invalidate -+ * @offset: starting offset for partial invalidation -+ * -+ */ -+void reiser4_invalidatepage(struct page *page, unsigned long offset) -+{ -+ int ret = 0; -+ reiser4_context *ctx; -+ struct inode *inode; -+ jnode *node; -+ -+ /* -+ * This is called to truncate file's page. -+ * -+ * Originally, reiser4 implemented truncate in a standard way -+ * (vmtruncate() calls ->invalidatepage() on all truncated pages -+ * first, then file system ->truncate() call-back is invoked). -+ * -+ * This lead to the problem when ->invalidatepage() was called on a -+ * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT -+ * process. 
That is, truncate was bypassing transactions. To avoid -+ * this, try_capture_page_to_invalidate() call was added here. -+ * -+ * After many troubles with vmtruncate() based truncate (including -+ * races with flush, tail conversion, etc.) it was re-written in the -+ * top-to-bottom style: items are killed in reiser4_cut_tree_object() -+ * and pages belonging to extent are invalidated in kill_hook_extent(). -+ * So probably now additional call to capture is not needed here. -+ */ -+ -+ assert("nikita-3137", PageLocked(page)); -+ assert("nikita-3138", !PageWriteback(page)); -+ inode = page->mapping->host; -+ -+ /* -+ * ->invalidatepage() should only be called for the unformatted -+ * jnodes. Destruction of all other types of jnodes is performed -+ * separately. But, during some corner cases (like handling errors -+ * during mount) it is simpler to let ->invalidatepage to be called on -+ * them. Check for this, and do nothing. -+ */ -+ if (reiser4_get_super_fake(inode->i_sb) == inode) -+ return; -+ if (reiser4_get_cc_fake(inode->i_sb) == inode) -+ return; -+ if (reiser4_get_bitmap_fake(inode->i_sb) == inode) -+ return; -+ assert("vs-1426", PagePrivate(page)); -+ assert("vs-1427", -+ page->mapping == jnode_get_mapping(jnode_by_page(page))); -+ assert("", jprivate(page) != NULL); -+ assert("", ergo(inode_file_plugin(inode) != -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID), -+ offset == 0)); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return; -+ -+ node = jprivate(page); -+ spin_lock_jnode(node); -+ if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) | -+ (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) { -+ /* there is not need to capture */ -+ jref(node); -+ JF_SET(node, JNODE_HEARD_BANSHEE); -+ page_clear_jnode(page, node); -+ reiser4_uncapture_jnode(node); -+ unhash_unformatted_jnode(node); -+ jput(node); -+ reiser4_exit_context(ctx); -+ return; -+ } -+ spin_unlock_jnode(node); -+ -+ /* capture page being truncated. 
*/ -+ ret = try_capture_page_to_invalidate(page); -+ if (ret != 0) -+ warning("nikita-3141", "Cannot capture: %i", ret); -+ -+ if (offset == 0) { -+ /* remove jnode from transaction and detach it from page. */ -+ jref(node); -+ JF_SET(node, JNODE_HEARD_BANSHEE); -+ /* page cannot be detached from jnode concurrently, because it -+ * is locked */ -+ reiser4_uncapture_page(page); -+ -+ /* this detaches page from jnode, so that jdelete will not try -+ * to lock page which is already locked */ -+ spin_lock_jnode(node); -+ page_clear_jnode(page, node); -+ spin_unlock_jnode(node); -+ unhash_unformatted_jnode(node); -+ -+ jput(node); -+ } -+ -+ reiser4_exit_context(ctx); -+} -+ -+/* help function called from reiser4_releasepage(). It returns true if jnode -+ * can be detached from its page and page released. */ -+int jnode_is_releasable(jnode * node /* node to check */ ) -+{ -+ assert("nikita-2781", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ assert_spin_locked(&(node->load)); -+ -+ /* is some thread is currently using jnode page, later cannot be -+ * detached */ -+ if (atomic_read(&node->d_count) != 0) { -+ return 0; -+ } -+ -+ assert("vs-1214", !jnode_is_loaded(node)); -+ -+ /* -+ * can only release page if real block number is assigned to it. Simple -+ * check for ->atom wouldn't do, because it is possible for node to be -+ * clean, not it atom yet, and still having fake block number. For -+ * example, node just created in jinit_new(). -+ */ -+ if (reiser4_blocknr_is_fake(jnode_get_block(node))) -+ return 0; -+ -+ /* -+ * pages prepared for write can not be released anyway, so avoid -+ * detaching jnode from the page -+ */ -+ if (JF_ISSET(node, JNODE_WRITE_PREPARED)) -+ return 0; -+ -+ /* -+ * dirty jnode cannot be released. It can however be submitted to disk -+ * as part of early flushing, but only after getting flush-prepped. -+ */ -+ if (JF_ISSET(node, JNODE_DIRTY)) -+ return 0; -+ -+ /* overwrite set is only written by log writer. 
*/ -+ if (JF_ISSET(node, JNODE_OVRWR)) -+ return 0; -+ -+ /* jnode is already under writeback */ -+ if (JF_ISSET(node, JNODE_WRITEBACK)) -+ return 0; -+ -+ /* don't flush bitmaps or journal records */ -+ if (!jnode_is_znode(node) && !jnode_is_unformatted(node)) -+ return 0; -+ -+ return 1; -+} -+ -+/* -+ * ->releasepage method for reiser4 -+ * -+ * This is called by VM scanner when it comes across clean page. What we have -+ * to do here is to check whether page can really be released (freed that is) -+ * and if so, detach jnode from it and remove page from the page cache. -+ * -+ * Check for releasability is done by releasable() function. -+ */ -+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG) -+{ -+ jnode *node; -+ -+ assert("nikita-2257", PagePrivate(page)); -+ assert("nikita-2259", PageLocked(page)); -+ assert("nikita-2892", !PageWriteback(page)); -+ assert("nikita-3019", reiser4_schedulable()); -+ -+ /* NOTE-NIKITA: this can be called in the context of reiser4 call. It -+ is not clear what to do in this case. A lot of deadlocks seems be -+ possible. */ -+ -+ node = jnode_by_page(page); -+ assert("nikita-2258", node != NULL); -+ assert("reiser4-4", page->mapping != NULL); -+ assert("reiser4-5", page->mapping->host != NULL); -+ -+ if (PageDirty(page)) -+ return 0; -+ -+ /* extra page reference is used by reiser4 to protect -+ * jnode<->page link from this ->releasepage(). */ -+ if (page_count(page) > 3) -+ return 0; -+ -+ /* releasable() needs jnode lock, because it looks at the jnode fields -+ * and we need jload_lock here to avoid races with jload(). */ -+ spin_lock_jnode(node); -+ spin_lock(&(node->load)); -+ if (jnode_is_releasable(node)) { -+ struct address_space *mapping; -+ -+ mapping = page->mapping; -+ jref(node); -+ /* there is no need to synchronize against -+ * jnode_extent_write() here, because pages seen by -+ * jnode_extent_write() are !releasable(). 
*/ -+ page_clear_jnode(page, node); -+ spin_unlock(&(node->load)); -+ spin_unlock_jnode(node); -+ -+ /* we are under memory pressure so release jnode also. */ -+ jput(node); -+ -+ return 1; -+ } else { -+ spin_unlock(&(node->load)); -+ spin_unlock_jnode(node); -+ assert("nikita-3020", reiser4_schedulable()); -+ return 0; -+ } -+} -+ -+int reiser4_readpage(struct file *file, struct page *page) -+{ -+ assert("edward-1533", PageLocked(page)); -+ assert("edward-1534", !PageUptodate(page)); -+ assert("edward-1535", page->mapping && page->mapping->host); -+ -+ return inode_file_plugin(page->mapping->host)->readpage(file, page); -+} -+ -+int reiser4_readpages(struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ return inode_file_plugin(mapping->host)->readpages(file, mapping, -+ pages, nr_pages); -+} -+ -+int reiser4_writepages(struct address_space *mapping, -+ struct writeback_control *wbc) -+{ -+ return inode_file_plugin(mapping->host)->writepages(mapping, wbc); -+} -+ -+int reiser4_prepare_write(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ return inode_file_plugin(file->f_dentry->d_inode)->prepare_write(file, -+ page, -+ from, -+ to); -+} -+ -+int reiser4_commit_write(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ return inode_file_plugin(file->f_dentry->d_inode)->commit_write(file, -+ page, -+ from, -+ to); -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/block_alloc.c linux-2.6.24/fs/reiser4/block_alloc.c ---- linux-2.6.24.orig/fs/reiser4/block_alloc.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/block_alloc.c 2008-01-25 11:39:06.888195324 +0300 -@@ -0,0 +1,1137 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "plugin/plugin.h" -+#include "txnmgr.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "super.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block */ -+#include -+ -+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */ -+ -+/* We need to be able to reserve enough disk space to ensure that an atomic -+ operation will have enough disk space to flush (see flush.c and -+ http://namesys.com/v4/v4.html) and commit it once it is started. -+ -+ In our design a call for reserving disk space may fail but not an actual -+ block allocation. -+ -+ All free blocks, already allocated blocks, and all kinds of reserved blocks -+ are counted in different per-fs block counters. -+ -+ A reiser4 super block's set of block counters currently is: -+ -+ free -- free blocks, -+ used -- already allocated blocks, -+ -+ grabbed -- initially reserved for performing an fs operation, those blocks -+ are taken from free blocks, then grabbed disk space leaks from grabbed -+ blocks counter to other counters like "fake allocated", "flush -+ reserved", "used", the rest of not used grabbed space is returned to -+ free space at the end of fs operation; -+ -+ fake allocated -- counts all nodes without real disk block numbers assigned, -+ we have separate accounting for formatted and unformatted -+ nodes (for easier debugging); -+ -+ flush reserved -- disk space needed for flushing and committing an atom. 
-+ Each dirty already allocated block could be written as a -+ part of atom's overwrite set or as a part of atom's -+ relocate set. In both case one additional block is needed, -+ it is used as a wandered block if we do overwrite or as a -+ new location for a relocated block. -+ -+ In addition, blocks in some states are counted on per-thread and per-atom -+ basis. A reiser4 context has a counter of blocks grabbed by this transaction -+ and the sb's grabbed blocks counter is a sum of grabbed blocks counter values -+ of each reiser4 context. Each reiser4 atom has a counter of "flush reserved" -+ blocks, which are reserved for flush processing and atom commit. */ -+ -+/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate -+ number of blocks to grab for most expensive case of balancing when the leaf -+ node we insert new item to gets split and new leaf node is allocated. -+ -+ So, we need to grab blocks for -+ -+ 1) one block for possible dirtying the node we insert an item to. That block -+ would be used for node relocation at flush time or for allocating of a -+ wandered one, it depends what will be a result (what set, relocate or -+ overwrite the node gets assigned to) of the node processing by the flush -+ algorithm. -+ -+ 2) one block for either allocating a new node, or dirtying of right or left -+ clean neighbor, only one case may happen. -+ -+ VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current -+ node, and creation of new node. have I forgotten something? email me. -+ -+ These grabbed blocks are counted in both reiser4 context "grabbed blocks" -+ counter and in the fs-wide one (both ctx->grabbed_blocks and -+ sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is -+ decremented by 2. 
-+ -+ Suppose both two blocks were spent for dirtying of an already allocated clean -+ node (one block went from "grabbed" to "flush reserved") and for new block -+ allocating (one block went from "grabbed" to "fake allocated formatted"). -+ -+ Inserting of a child pointer to the parent node caused parent node to be -+ split, the balancing code takes care about this grabbing necessary space -+ immediately by calling reiser4_grab with BA_RESERVED flag set which means -+ "can use the 5% reserved disk space". -+ -+ At this moment insertion completes and grabbed blocks (if they were not used) -+ should be returned to the free space counter. -+ -+ However the atom life-cycle is not completed. The atom had one "flush -+ reserved" block added by our insertion and the new fake allocated node is -+ counted as a "fake allocated formatted" one. The atom has to be fully -+ processed by flush before commit. Suppose that the flush moved the first, -+ already allocated node to the atom's overwrite list, the new fake allocated -+ node, obviously, went into the atom relocate set. The reiser4 flush -+ allocates the new node using one unit from "fake allocated formatted" -+ counter, the log writer uses one from "flush reserved" for wandered block -+ allocation. -+ -+ And, it is not the end. When the wandered block is deallocated after the -+ atom gets fully played (see wander.c for term description), the disk space -+ occupied for it is returned to free blocks. */ -+ -+/* BLOCK NUMBERS */ -+ -+/* Any reiser4 node has a block number assigned to it. We use these numbers for -+ indexing in hash tables, so if a block has not yet been assigned a location -+ on disk we need to give it a temporary fake block number. -+ -+ Current implementation of reiser4 uses 64-bit integers for block numbers. We -+ use highest bit in 64-bit block number to distinguish fake and real block -+ numbers. So, only 63 bits may be used to addressing of real device -+ blocks. 
That "fake" block numbers space is divided into subspaces of fake -+ block numbers for data blocks and for shadow (working) bitmap blocks. -+ -+ Fake block numbers for data blocks are generated by a cyclic counter, which -+ gets incremented after each real block allocation. We assume that it is -+ impossible to overload this counter during one transaction life. */ -+ -+/* Initialize a blocknr hint. */ -+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint) -+{ -+ memset(hint, 0, sizeof(reiser4_blocknr_hint)); -+} -+ -+/* Release any resources of a blocknr hint. */ -+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG) -+{ -+ /* No resources should be freed in current blocknr_hint implementation. */ -+} -+ -+/* see above for explanation of fake block number. */ -+/* Audited by: green(2002.06.11) */ -+int reiser4_blocknr_is_fake(const reiser4_block_nr * da) -+{ -+ /* The reason for not simply returning result of '&' operation is that -+ while return value is (possibly 32bit) int, the reiser4_block_nr is -+ at least 64 bits long, and high bit (which is the only possible -+ non zero bit after the masking) would be stripped off */ -+ return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0; -+} -+ -+/* Static functions for / block counters -+ arithmetic. Mostly, they are isolated to not to code same assertions in -+ several places. */ -+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count) -+{ -+ BUG_ON(ctx->grabbed_blocks < count); -+ assert("zam-527", ctx->grabbed_blocks >= count); -+ ctx->grabbed_blocks -= count; -+} -+ -+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count) -+{ -+ ctx->grabbed_blocks += count; -+} -+ -+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ assert("zam-525", sbinfo->blocks_grabbed >= count); -+ sbinfo->blocks_grabbed -= count; -+} -+ -+/* Decrease the counter of block reserved for flush in super block. 
*/ -+static void -+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ assert("vpf-291", sbinfo->blocks_flush_reserved >= count); -+ sbinfo->blocks_flush_reserved -= count; -+} -+ -+static void -+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count, -+ reiser4_ba_flags_t flags) -+{ -+ if (flags & BA_FORMATTED) { -+ assert("zam-806", sbinfo->blocks_fake_allocated >= count); -+ sbinfo->blocks_fake_allocated -= count; -+ } else { -+ assert("zam-528", -+ sbinfo->blocks_fake_allocated_unformatted >= count); -+ sbinfo->blocks_fake_allocated_unformatted -= count; -+ } -+} -+ -+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ assert("zam-530", -+ sbinfo->blocks_used >= count + sbinfo->min_blocks_used); -+ sbinfo->blocks_used -= count; -+} -+ -+static void -+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ assert("edward-501", sbinfo->blocks_clustered >= count); -+ sbinfo->blocks_clustered -= count; -+} -+ -+/* Increase the counter of block reserved for flush in atom. */ -+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count) -+{ -+ assert("zam-772", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ atom->flush_reserved += count; -+} -+ -+/* Decrease the counter of block reserved for flush in atom. */ -+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count) -+{ -+ assert("zam-774", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ assert("nikita-2790", atom->flush_reserved >= count); -+ atom->flush_reserved -= count; -+} -+ -+/* super block has 6 counters: free, used, grabbed, fake allocated -+ (formatted and unformatted) and flush reserved. Their sum must be -+ number of blocks on a device. 
This function checks this */ -+int reiser4_check_block_counters(const struct super_block *super) -+{ -+ __u64 sum; -+ -+ sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) + -+ reiser4_data_blocks(super) + reiser4_fake_allocated(super) + -+ reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) + -+ reiser4_clustered_blocks(super); -+ if (reiser4_block_count(super) != sum) { -+ printk("super block counters: " -+ "used %llu, free %llu, " -+ "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), " -+ "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n", -+ (unsigned long long)reiser4_data_blocks(super), -+ (unsigned long long)reiser4_free_blocks(super), -+ (unsigned long long)reiser4_grabbed_blocks(super), -+ (unsigned long long)reiser4_fake_allocated(super), -+ (unsigned long long) -+ reiser4_fake_allocated_unformatted(super), -+ (unsigned long long)reiser4_flush_reserved(super), -+ (unsigned long long)reiser4_clustered_blocks(super), -+ (unsigned long long)sum, -+ (unsigned long long)reiser4_block_count(super)); -+ return 0; -+ } -+ return 1; -+} -+ -+/* Adjust "working" free blocks counter for number of blocks we are going to -+ allocate. Record number of grabbed blocks in fs-wide and per-thread -+ counters. This function should be called before bitmap scanning or -+ allocating fake block numbers -+ -+ @super -- pointer to reiser4 super block; -+ @count -- number of blocks we reserve; -+ -+ @return -- 0 if success, -ENOSPC, if all -+ free blocks are preserved or already allocated. -+*/ -+ -+static int -+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags) -+{ -+ __u64 free_blocks; -+ int ret = 0, use_reserved = flags & BA_RESERVED; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("vs-1276", ctx == get_current_context()); -+ -+ /* Do not grab anything on ro-mounted fs. 
*/ -+ if (rofs_super(ctx->super)) { -+ ctx->grab_enabled = 0; -+ return 0; -+ } -+ -+ sbinfo = get_super_private(ctx->super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ free_blocks = sbinfo->blocks_free; -+ -+ if ((use_reserved && free_blocks < count) || -+ (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) { -+ ret = RETERR(-ENOSPC); -+ goto unlock_and_ret; -+ } -+ -+ add_to_ctx_grabbed(ctx, count); -+ -+ sbinfo->blocks_grabbed += count; -+ sbinfo->blocks_free -= count; -+ -+#if REISER4_DEBUG -+ if (ctx->grabbed_initially == 0) -+ ctx->grabbed_initially = count; -+#endif -+ -+ assert("nikita-2986", reiser4_check_block_counters(ctx->super)); -+ -+ /* disable grab space in current context */ -+ ctx->grab_enabled = 0; -+ -+ unlock_and_ret: -+ spin_unlock_reiser4_super(sbinfo); -+ -+ return ret; -+} -+ -+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags) -+{ -+ int ret; -+ reiser4_context *ctx; -+ -+ assert("nikita-2964", ergo(flags & BA_CAN_COMMIT, -+ lock_stack_isclean(get_current_lock_stack -+ ()))); -+ ctx = get_current_context(); -+ if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) { -+ return 0; -+ } -+ -+ ret = reiser4_grab(ctx, count, flags); -+ if (ret == -ENOSPC) { -+ -+ /* Trying to commit the all transactions if BA_CAN_COMMIT flag present */ -+ if (flags & BA_CAN_COMMIT) { -+ txnmgr_force_commit_all(ctx->super, 0); -+ ctx->grab_enabled = 1; -+ ret = reiser4_grab(ctx, count, flags); -+ } -+ } -+ /* -+ * allocation from reserved pool cannot fail. This is severe error. -+ */ -+ assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0)); -+ return ret; -+} -+ -+/* -+ * SPACE RESERVED FOR UNLINK/TRUNCATE -+ * -+ * Unlink and truncate require space in transaction (to update stat data, at -+ * least). But we don't want rm(1) to fail with "No space on device" error. -+ * -+ * Solution is to reserve 5% of disk space for truncates and -+ * unlinks. Specifically, normal space grabbing requests don't grab space from -+ * reserved area. 
Only requests with BA_RESERVED bit in flags are allowed to -+ * drain it. Per super block delete mutex is used to allow only one -+ * thread at a time to grab from reserved area. -+ * -+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT -+ * flag. -+ * -+ */ -+ -+int reiser4_grab_reserved(struct super_block *super, -+ __u64 count, reiser4_ba_flags_t flags) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ -+ assert("nikita-3175", flags & BA_CAN_COMMIT); -+ -+ /* Check the delete mutex already taken by us, we assume that -+ * reading of machine word is atomic. */ -+ if (sbinfo->delete_mutex_owner == current) { -+ if (reiser4_grab_space -+ (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) { -+ warning("zam-1003", -+ "nested call of grab_reserved fails count=(%llu)", -+ (unsigned long long)count); -+ reiser4_release_reserved(super); -+ return RETERR(-ENOSPC); -+ } -+ return 0; -+ } -+ -+ if (reiser4_grab_space(count, flags)) { -+ mutex_lock(&sbinfo->delete_mutex); -+ assert("nikita-2929", sbinfo->delete_mutex_owner == NULL); -+ sbinfo->delete_mutex_owner = current; -+ -+ if (reiser4_grab_space(count, flags | BA_RESERVED)) { -+ warning("zam-833", -+ "reserved space is not enough (%llu)", -+ (unsigned long long)count); -+ reiser4_release_reserved(super); -+ return RETERR(-ENOSPC); -+ } -+ } -+ return 0; -+} -+ -+void reiser4_release_reserved(struct super_block *super) -+{ -+ reiser4_super_info_data *info; -+ -+ info = get_super_private(super); -+ if (info->delete_mutex_owner == current) { -+ info->delete_mutex_owner = NULL; -+ mutex_unlock(&info->delete_mutex); -+ } -+} -+ -+static reiser4_super_info_data *grabbed2fake_allocated_head(int count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sub_from_ctx_grabbed(ctx, count); -+ -+ sbinfo = get_super_private(ctx->super); -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_grabbed(sbinfo, count); -+ /* return sbinfo 
locked */ -+ return sbinfo; -+} -+ -+/* is called after @count fake block numbers are allocated and pointer to -+ those blocks are inserted into tree. */ -+static void grabbed2fake_allocated_formatted(void) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = grabbed2fake_allocated_head(1); -+ sbinfo->blocks_fake_allocated++; -+ -+ assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/** -+ * grabbed2fake_allocated_unformatted -+ * @count: -+ * -+ */ -+static void grabbed2fake_allocated_unformatted(int count) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = grabbed2fake_allocated_head(count); -+ sbinfo->blocks_fake_allocated_unformatted += count; -+ -+ assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void grabbed2cluster_reserved(int count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sub_from_ctx_grabbed(ctx, count); -+ -+ sbinfo = get_super_private(ctx->super); -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_grabbed(sbinfo, count); -+ sbinfo->blocks_clustered += count; -+ -+ assert("edward-504", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void cluster_reserved2grabbed(int count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ -+ sbinfo = get_super_private(ctx->super); -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_cluster_reserved(sbinfo, count); -+ sbinfo->blocks_grabbed += count; -+ -+ assert("edward-505", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+ add_to_ctx_grabbed(ctx, count); -+} -+ -+void cluster_reserved2free(int count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ 
cluster_reserved2grabbed(count); -+ grabbed2free(ctx, sbinfo, count); -+} -+ -+static DEFINE_SPINLOCK(fake_lock); -+static reiser4_block_nr fake_gen = 0; -+ -+/** -+ * assign_fake_blocknr -+ * @blocknr: -+ * @count: -+ * -+ * Obtain a fake block number for new node which will be used to refer to -+ * this newly allocated node until real allocation is done. -+ */ -+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count) -+{ -+ spin_lock(&fake_lock); -+ *blocknr = fake_gen; -+ fake_gen += count; -+ spin_unlock(&fake_lock); -+ -+ BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK); -+ /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/ -+ *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE; -+ assert("zam-394", zlook(current_tree, blocknr) == NULL); -+} -+ -+int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr) -+{ -+ assign_fake_blocknr(blocknr, 1); -+ grabbed2fake_allocated_formatted(); -+ return 0; -+} -+ -+/** -+ * fake_blocknrs_unformatted -+ * @count: number of fake numbers to get -+ * -+ * Allocates @count fake block numbers which will be assigned to jnodes -+ */ -+reiser4_block_nr fake_blocknr_unformatted(int count) -+{ -+ reiser4_block_nr blocknr; -+ -+ assign_fake_blocknr(&blocknr, count); -+ grabbed2fake_allocated_unformatted(count); -+ -+ return blocknr; -+} -+ -+/* adjust sb block counters, if real (on-disk) block allocation immediately -+ follows grabbing of free disk space. 
*/ -+static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo, -+ __u64 count) -+{ -+ sub_from_ctx_grabbed(ctx, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_grabbed(sbinfo, count); -+ sbinfo->blocks_used += count; -+ -+ assert("nikita-2679", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* adjust sb block counters when @count unallocated blocks get mapped to disk */ -+static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count, -+ reiser4_ba_flags_t flags) -+{ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_fake_allocated(sbinfo, count, flags); -+ sbinfo->blocks_used += count; -+ -+ assert("nikita-2680", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+static void flush_reserved2used(txn_atom * atom, __u64 count) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ assert("zam-787", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count); -+ -+ sbinfo = get_current_super_private(); -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_flush_reserved(sbinfo, count); -+ sbinfo->blocks_used += count; -+ -+ assert("zam-789", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* update the per fs blocknr hint default value. 
*/ -+void -+update_blocknr_hint_default(const struct super_block *s, -+ const reiser4_block_nr * block) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ -+ assert("nikita-3342", !reiser4_blocknr_is_fake(block)); -+ -+ spin_lock_reiser4_super(sbinfo); -+ if (*block < sbinfo->block_count) { -+ sbinfo->blocknr_hint_default = *block; -+ } else { -+ warning("zam-676", -+ "block number %llu is too large to be used in a blocknr hint\n", -+ (unsigned long long)*block); -+ dump_stack(); -+ DEBUGON(1); -+ } -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* get current value of the default blocknr hint. */ -+void get_blocknr_hint_default(reiser4_block_nr * result) -+{ -+ reiser4_super_info_data *sbinfo = get_current_super_private(); -+ -+ spin_lock_reiser4_super(sbinfo); -+ *result = sbinfo->blocknr_hint_default; -+ assert("zam-677", *result < sbinfo->block_count); -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* Allocate "real" disk blocks by calling a proper space allocation plugin -+ * method. Blocks are allocated in one contiguous disk region. The plugin -+ * independent part accounts blocks by subtracting allocated amount from grabbed -+ * or fake block counter and add the same amount to the counter of allocated -+ * blocks. -+ * -+ * @hint -- a reiser4 blocknr hint object which contains further block -+ * allocation hints and parameters (search start, a stage of block -+ * which will be mapped to disk, etc.), -+ * @blk -- an out parameter for the beginning of the allocated region, -+ * @len -- in/out parameter, it should contain the maximum number of allocated -+ * blocks, after block allocation completes, it contains the length of -+ * allocated disk region. -+ * @flags -- see reiser4_ba_flags_t description. -+ * -+ * @return -- 0 if success, error code otherwise. 
-+ */ -+int -+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk, -+ reiser4_block_nr * len, reiser4_ba_flags_t flags) -+{ -+ __u64 needed = *len; -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ int ret; -+ -+ assert("zam-986", hint != NULL); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ /* For write-optimized data we use default search start value, which is -+ * close to last write location. */ -+ if (flags & BA_USE_DEFAULT_SEARCH_START) { -+ get_blocknr_hint_default(&hint->blk); -+ } -+ -+ /* VITALY: allocator should grab this for internal/tx-lists/similar only. */ -+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */ -+ if (hint->block_stage == BLOCK_NOT_COUNTED) { -+ ret = reiser4_grab_space_force(*len, flags); -+ if (ret != 0) -+ return ret; -+ } -+ -+ ret = -+ sa_alloc_blocks(reiser4_get_space_allocator(ctx->super), -+ hint, (int)needed, blk, len); -+ -+ if (!ret) { -+ assert("zam-680", *blk < reiser4_block_count(ctx->super)); -+ assert("zam-681", -+ *blk + *len <= reiser4_block_count(ctx->super)); -+ -+ if (flags & BA_PERMANENT) { -+ /* we assume that current atom exists at this moment */ -+ txn_atom *atom = get_current_atom_locked(); -+ atom->nr_blocks_allocated += *len; -+ spin_unlock_atom(atom); -+ } -+ -+ switch (hint->block_stage) { -+ case BLOCK_NOT_COUNTED: -+ case BLOCK_GRABBED: -+ grabbed2used(ctx, sbinfo, *len); -+ break; -+ case BLOCK_UNALLOCATED: -+ fake_allocated2used(sbinfo, *len, flags); -+ break; -+ case BLOCK_FLUSH_RESERVED: -+ { -+ txn_atom *atom = get_current_atom_locked(); -+ flush_reserved2used(atom, *len); -+ spin_unlock_atom(atom); -+ } -+ break; -+ default: -+ impossible("zam-531", "wrong block stage"); -+ } -+ } else { -+ assert("zam-821", -+ ergo(hint->max_dist == 0 -+ && !hint->backward, ret != -ENOSPC)); -+ if (hint->block_stage == BLOCK_NOT_COUNTED) -+ grabbed2free(ctx, sbinfo, needed); -+ } -+ -+ return ret; -+} -+ -+/* used 
-> fake_allocated -> grabbed -> free */ -+ -+/* adjust sb block counters when @count unallocated blocks get unmapped from -+ disk */ -+static void -+used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count, -+ int formatted) -+{ -+ spin_lock_reiser4_super(sbinfo); -+ -+ if (formatted) -+ sbinfo->blocks_fake_allocated += count; -+ else -+ sbinfo->blocks_fake_allocated_unformatted += count; -+ -+ sub_from_sb_used(sbinfo, count); -+ -+ assert("nikita-2681", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+static void -+used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom, -+ __u64 count, reiser4_ba_flags_t flags UNUSED_ARG) -+{ -+ assert("nikita-2791", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ add_to_atom_flush_reserved_nolock(atom, (__u32) count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_flush_reserved += count; -+ /*add_to_sb_flush_reserved(sbinfo, count); */ -+ sub_from_sb_used(sbinfo, count); -+ -+ assert("nikita-2681", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* disk space, virtually used by fake block numbers is counted as "grabbed" again. 
*/ -+static void -+fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo, -+ __u64 count, reiser4_ba_flags_t flags) -+{ -+ add_to_ctx_grabbed(ctx, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ assert("nikita-2682", reiser4_check_block_counters(ctx->super)); -+ -+ sbinfo->blocks_grabbed += count; -+ sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED); -+ -+ assert("nikita-2683", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ fake_allocated2grabbed(ctx, sbinfo, count, flags); -+ grabbed2free(ctx, sbinfo, count); -+} -+ -+void grabbed2free_mark(__u64 mark) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ assert("nikita-3007", (__s64) mark >= 0); -+ assert("nikita-3006", ctx->grabbed_blocks >= mark); -+ grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark); -+} -+ -+/** -+ * grabbed2free - adjust grabbed and free block counters -+ * @ctx: context to update grabbed block counter of -+ * @sbinfo: super block to update grabbed and free block counters of -+ * @count: number of blocks to adjust counters by -+ * -+ * Decreases context's and per filesystem's counters of grabbed -+ * blocks. Increases per filesystem's counter of free blocks. 
-+ */ -+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo, -+ __u64 count) -+{ -+ sub_from_ctx_grabbed(ctx, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sub_from_sb_grabbed(sbinfo, count); -+ sbinfo->blocks_free += count; -+ assert("nikita-2684", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("vs-1095", atom); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ sub_from_ctx_grabbed(ctx, count); -+ -+ add_to_atom_flush_reserved_nolock(atom, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_flush_reserved += count; -+ sub_from_sb_grabbed(sbinfo, count); -+ -+ assert("vpf-292", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+void grabbed2flush_reserved(__u64 count) -+{ -+ txn_atom *atom = get_current_atom_locked(); -+ -+ grabbed2flush_reserved_nolock(atom, count); -+ -+ spin_unlock_atom(atom); -+} -+ -+void flush_reserved2grabbed(txn_atom * atom, __u64 count) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("nikita-2788", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ add_to_ctx_grabbed(ctx, count); -+ -+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_grabbed += count; -+ sub_from_sb_flush_reserved(sbinfo, count); -+ -+ assert("vpf-292", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/** -+ * all_grabbed2free - releases all blocks grabbed in context -+ * -+ * Decreases context's and super block's grabbed block counters by number of -+ * blocks grabbed by current context and increases super block's free block -+ * counter 
correspondingly. -+ */ -+void all_grabbed2free(void) -+{ -+ reiser4_context *ctx = get_current_context(); -+ -+ grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks); -+} -+ -+/* adjust sb block counters if real (on-disk) blocks do not become unallocated -+ after freeing, @count blocks become "grabbed". */ -+static void -+used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo, -+ __u64 count) -+{ -+ add_to_ctx_grabbed(ctx, count); -+ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_grabbed += count; -+ sub_from_sb_used(sbinfo, count); -+ -+ assert("nikita-2685", reiser4_check_block_counters(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+/* this used to be done through used2grabbed and grabbed2free*/ -+static void used2free(reiser4_super_info_data * sbinfo, __u64 count) -+{ -+ spin_lock_reiser4_super(sbinfo); -+ -+ sbinfo->blocks_free += count; -+ sub_from_sb_used(sbinfo, count); -+ -+ assert("nikita-2685", -+ reiser4_check_block_counters(reiser4_get_current_sb())); -+ -+ spin_unlock_reiser4_super(sbinfo); -+} -+ -+#if REISER4_DEBUG -+ -+/* check "allocated" state of given block range */ -+static void -+reiser4_check_blocks(const reiser4_block_nr * start, -+ const reiser4_block_nr * len, int desired) -+{ -+ sa_check_blocks(start, len, desired); -+} -+ -+/* check "allocated" state of given block */ -+void reiser4_check_block(const reiser4_block_nr * block, int desired) -+{ -+ const reiser4_block_nr one = 1; -+ -+ reiser4_check_blocks(block, &one, desired); -+} -+ -+#endif -+ -+/* Blocks deallocation function may do an actual deallocation through space -+ plugin allocation or store deleted block numbers in atom's delete_set data -+ structure depend on @defer parameter. */ -+ -+/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which -+ will be deleted from WORKING bitmap. 
They might be just unmapped from disk, or -+ freed but disk space is still grabbed by current thread, or these blocks must -+ not be counted in any reiser4 sb block counters, see block_stage_t comment */ -+ -+/* BA_FORMATTED bit is only used when BA_DEFER in not present: it is used to -+ distinguish blocks allocated for unformatted and formatted nodes */ -+ -+int -+reiser4_dealloc_blocks(const reiser4_block_nr * start, -+ const reiser4_block_nr * len, -+ block_stage_t target_stage, reiser4_ba_flags_t flags) -+{ -+ txn_atom *atom = NULL; -+ int ret; -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ if (REISER4_DEBUG) { -+ assert("zam-431", *len != 0); -+ assert("zam-432", *start != 0); -+ assert("zam-558", !reiser4_blocknr_is_fake(start)); -+ -+ spin_lock_reiser4_super(sbinfo); -+ assert("zam-562", *start < sbinfo->block_count); -+ spin_unlock_reiser4_super(sbinfo); -+ } -+ -+ if (flags & BA_DEFER) { -+ blocknr_set_entry *bsep = NULL; -+ -+ /* storing deleted block numbers in a blocknr set -+ datastructure for further actual deletion */ -+ do { -+ atom = get_current_atom_locked(); -+ assert("zam-430", atom != NULL); -+ -+ ret = -+ blocknr_set_add_extent(atom, &atom->delete_set, -+ &bsep, start, len); -+ -+ if (ret == -ENOMEM) -+ return ret; -+ -+ /* This loop might spin at most two times */ -+ } while (ret == -E_REPEAT); -+ -+ assert("zam-477", ret == 0); -+ assert("zam-433", atom != NULL); -+ -+ spin_unlock_atom(atom); -+ -+ } else { -+ assert("zam-425", get_current_super_private() != NULL); -+ sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super), -+ *start, *len); -+ -+ if (flags & BA_PERMANENT) { -+ /* These blocks were counted as allocated, we have to revert it -+ * back if allocation is discarded. 
*/ -+ txn_atom *atom = get_current_atom_locked(); -+ atom->nr_blocks_allocated -= *len; -+ spin_unlock_atom(atom); -+ } -+ -+ switch (target_stage) { -+ case BLOCK_NOT_COUNTED: -+ assert("vs-960", flags & BA_FORMATTED); -+ /* VITALY: This is what was grabbed for internal/tx-lists/similar only */ -+ used2free(sbinfo, *len); -+ break; -+ -+ case BLOCK_GRABBED: -+ used2grabbed(ctx, sbinfo, *len); -+ break; -+ -+ case BLOCK_UNALLOCATED: -+ used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED); -+ break; -+ -+ case BLOCK_FLUSH_RESERVED:{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked(); -+ used2flush_reserved(sbinfo, atom, *len, -+ flags & BA_FORMATTED); -+ spin_unlock_atom(atom); -+ break; -+ } -+ default: -+ impossible("zam-532", "wrong block stage"); -+ } -+ } -+ -+ return 0; -+} -+ -+/* wrappers for block allocator plugin methods */ -+int reiser4_pre_commit_hook(void) -+{ -+ assert("zam-502", get_current_super_private() != NULL); -+ sa_pre_commit_hook(); -+ return 0; -+} -+ -+/* an actor which applies delete set to block allocator data */ -+static int -+apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, -+ const reiser4_block_nr * b, void *data UNUSED_ARG) -+{ -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ __u64 len = 1; -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT); -+ assert("zam-552", sbinfo != NULL); -+ -+ if (b != NULL) -+ len = *b; -+ -+ if (REISER4_DEBUG) { -+ spin_lock_reiser4_super(sbinfo); -+ -+ assert("zam-554", *a < reiser4_block_count(ctx->super)); -+ assert("zam-555", *a + len <= reiser4_block_count(ctx->super)); -+ -+ spin_unlock_reiser4_super(sbinfo); -+ } -+ -+ sa_dealloc_blocks(&sbinfo->space_allocator, *a, len); -+ /* adjust sb block counters */ -+ used2free(sbinfo, len); -+ return 0; -+} -+ -+void reiser4_post_commit_hook(void) -+{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked(); -+ assert("zam-452", 
atom->stage == ASTAGE_POST_COMMIT); -+ spin_unlock_atom(atom); -+ -+ /* do the block deallocation which was deferred -+ until commit is done */ -+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1); -+ -+ assert("zam-504", get_current_super_private() != NULL); -+ sa_post_commit_hook(); -+} -+ -+void reiser4_post_write_back_hook(void) -+{ -+ assert("zam-504", get_current_super_private() != NULL); -+ -+ sa_post_commit_hook(); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/block_alloc.h linux-2.6.24/fs/reiser4/block_alloc.h ---- linux-2.6.24.orig/fs/reiser4/block_alloc.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/block_alloc.h 2008-01-25 11:39:06.888195324 +0300 -@@ -0,0 +1,175 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined (__FS_REISER4_BLOCK_ALLOC_H__) -+#define __FS_REISER4_BLOCK_ALLOC_H__ -+ -+#include "dformat.h" -+#include "forward.h" -+ -+#include /* for __u?? 
*/ -+#include -+ -+/* Mask when is applied to given block number shows is that block number is a fake one */ -+#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL -+/* Mask which isolates a type of object this fake block number was assigned to */ -+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL -+ -+/*result after applying the REISER4_BLOCKNR_STATUS_BIT_MASK should be compared -+ against these two values to understand is the object unallocated or bitmap -+ shadow object (WORKING BITMAP block, look at the plugin/space/bitmap.c) */ -+#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL -+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL -+ -+/* specification how block allocation was counted in sb block counters */ -+typedef enum { -+ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */ -+ BLOCK_GRABBED = 1, /* free space grabbed for further allocation -+ of this block */ -+ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */ -+ BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object -+ ( unallocated formatted or unformatted -+ node) */ -+ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block -+ number assigned */ -+} block_stage_t; -+ -+/* a hint for block allocator */ -+struct reiser4_blocknr_hint { -+ /* FIXME: I think we want to add a longterm lock on the bitmap block here. This -+ is to prevent jnode_flush() calls from interleaving allocations on the same -+ bitmap, once a hint is established. */ -+ -+ /* search start hint */ -+ reiser4_block_nr blk; -+ /* if not zero, it is a region size we search for free blocks in */ -+ reiser4_block_nr max_dist; -+ /* level for allocation, may be useful have branch-level and higher -+ write-optimized. 
*/ -+ tree_level level; -+ /* block allocator assumes that blocks, which will be mapped to disk, -+ are in this specified block_stage */ -+ block_stage_t block_stage; -+ /* If direction = 1 allocate blocks in backward direction from the end -+ * of disk to the beginning of disk. */ -+ unsigned int backward:1; -+ -+}; -+ -+/* These flags control block allocation/deallocation behavior */ -+enum reiser4_ba_flags { -+ /* do allocatations from reserved (5%) area */ -+ BA_RESERVED = (1 << 0), -+ -+ /* block allocator can do commit trying to recover free space */ -+ BA_CAN_COMMIT = (1 << 1), -+ -+ /* if operation will be applied to formatted block */ -+ BA_FORMATTED = (1 << 2), -+ -+ /* defer actual block freeing until transaction commit */ -+ BA_DEFER = (1 << 3), -+ -+ /* allocate blocks for permanent fs objects (formatted or unformatted), not -+ wandered of log blocks */ -+ BA_PERMANENT = (1 << 4), -+ -+ /* grab space even it was disabled */ -+ BA_FORCE = (1 << 5), -+ -+ /* use default start value for free blocks search. 
*/ -+ BA_USE_DEFAULT_SEARCH_START = (1 << 6) -+}; -+ -+typedef enum reiser4_ba_flags reiser4_ba_flags_t; -+ -+extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint); -+extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint); -+extern void update_blocknr_hint_default(const struct super_block *, -+ const reiser4_block_nr *); -+extern void get_blocknr_hint_default(reiser4_block_nr *); -+ -+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super); -+ -+int assign_fake_blocknr_formatted(reiser4_block_nr *); -+reiser4_block_nr fake_blocknr_unformatted(int); -+ -+/* free -> grabbed -> fake_allocated -> used */ -+ -+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags); -+void all_grabbed2free(void); -+void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count); -+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags); -+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count); -+void grabbed2flush_reserved(__u64 count); -+int reiser4_alloc_blocks(reiser4_blocknr_hint * hint, -+ reiser4_block_nr * start, -+ reiser4_block_nr * len, reiser4_ba_flags_t flags); -+int reiser4_dealloc_blocks(const reiser4_block_nr *, -+ const reiser4_block_nr *, -+ block_stage_t, reiser4_ba_flags_t flags); -+ -+static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint, -+ reiser4_block_nr * start, -+ reiser4_ba_flags_t flags) -+{ -+ reiser4_block_nr one = 1; -+ return reiser4_alloc_blocks(hint, start, &one, flags); -+} -+ -+static inline int reiser4_dealloc_block(const reiser4_block_nr * block, -+ block_stage_t stage, -+ reiser4_ba_flags_t flags) -+{ -+ const reiser4_block_nr one = 1; -+ return reiser4_dealloc_blocks(block, &one, stage, flags); -+} -+ -+#define reiser4_grab_space_force(count, flags) \ -+ reiser4_grab_space(count, flags | BA_FORCE) -+ -+extern void grabbed2free_mark(__u64 mark); -+extern int reiser4_grab_reserved(struct super_block *, -+ __u64, reiser4_ba_flags_t); -+extern void 
reiser4_release_reserved(struct super_block *super); -+ -+/* grabbed -> fake_allocated */ -+ -+/* fake_allocated -> used */ -+ -+/* used -> fake_allocated -> grabbed -> free */ -+ -+extern void flush_reserved2grabbed(txn_atom * atom, __u64 count); -+ -+extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da); -+ -+extern void grabbed2cluster_reserved(int count); -+extern void cluster_reserved2grabbed(int count); -+extern void cluster_reserved2free(int count); -+ -+extern int reiser4_check_block_counters(const struct super_block *); -+ -+#if REISER4_DEBUG -+ -+extern void reiser4_check_block(const reiser4_block_nr *, int); -+ -+#else -+ -+# define reiser4_check_block(beg, val) noop -+ -+#endif -+ -+extern int reiser4_pre_commit_hook(void); -+extern void reiser4_post_commit_hook(void); -+extern void reiser4_post_write_back_hook(void); -+ -+#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/blocknrset.c linux-2.6.24/fs/reiser4/blocknrset.c ---- linux-2.6.24.orig/fs/reiser4/blocknrset.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/blocknrset.c 2008-01-25 11:39:06.892196354 +0300 -@@ -0,0 +1,368 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* This file contains code for various block number sets used by the atom to -+ track the deleted set and wandered block mappings. */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "txnmgr.h" -+#include "context.h" -+ -+#include -+ -+/* The proposed data structure for storing unordered block number sets is a -+ list of elements, each of which contains an array of block number or/and -+ array of block number pairs. 
That element called blocknr_set_entry is used -+ to store block numbers from the beginning and for extents from the end of -+ the data field (char data[...]). The ->nr_blocks and ->nr_pairs fields -+ count numbers of blocks and extents. -+ -+ +------------------- blocknr_set_entry->data ------------------+ -+ |block1|block2| ... ... |pair3|pair2|pair1| -+ +------------------------------------------------------------+ -+ -+ When current blocknr_set_entry is full, allocate a new one. */ -+ -+/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete -+ * set (single blocks and block extents), in that case blocknr pair represent an -+ * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs -+ * there represent a (real block) -> (wandered block) mapping. */ -+ -+/* Protection: blocknr sets belong to reiser4 atom, and -+ * their modifications are performed with the atom lock held */ -+ -+/* The total size of a blocknr_set_entry. */ -+#define BLOCKNR_SET_ENTRY_SIZE 128 -+ -+/* The number of blocks that can fit the blocknr data area. */ -+#define BLOCKNR_SET_ENTRIES_NUMBER \ -+ ((BLOCKNR_SET_ENTRY_SIZE - \ -+ 2 * sizeof (unsigned) - \ -+ sizeof(struct list_head)) / \ -+ sizeof(reiser4_block_nr)) -+ -+/* An entry of the blocknr_set */ -+struct blocknr_set_entry { -+ unsigned nr_singles; -+ unsigned nr_pairs; -+ struct list_head link; -+ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER]; -+}; -+ -+/* A pair of blocks as recorded in the blocknr_set_entry data. */ -+struct blocknr_pair { -+ reiser4_block_nr a; -+ reiser4_block_nr b; -+}; -+ -+/* Return the number of blocknr slots available in a blocknr_set_entry. 
*/ -+/* Audited by: green(2002.06.11) */ -+static unsigned bse_avail(blocknr_set_entry * bse) -+{ -+ unsigned used = bse->nr_singles + 2 * bse->nr_pairs; -+ -+ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used); -+ cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE); -+ -+ return BLOCKNR_SET_ENTRIES_NUMBER - used; -+} -+ -+/* Initialize a blocknr_set_entry. */ -+static void bse_init(blocknr_set_entry *bse) -+{ -+ bse->nr_singles = 0; -+ bse->nr_pairs = 0; -+ INIT_LIST_HEAD(&bse->link); -+} -+ -+/* Allocate and initialize a blocknr_set_entry. */ -+/* Audited by: green(2002.06.11) */ -+static blocknr_set_entry *bse_alloc(void) -+{ -+ blocknr_set_entry *e; -+ -+ if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry), -+ reiser4_ctx_gfp_mask_get())) == NULL) -+ return NULL; -+ -+ bse_init(e); -+ -+ return e; -+} -+ -+/* Free a blocknr_set_entry. */ -+/* Audited by: green(2002.06.11) */ -+static void bse_free(blocknr_set_entry * bse) -+{ -+ kfree(bse); -+} -+ -+/* Add a block number to a blocknr_set_entry */ -+/* Audited by: green(2002.06.11) */ -+static void -+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block) -+{ -+ assert("jmacd-5099", bse_avail(bse) >= 1); -+ -+ bse->entries[bse->nr_singles++] = *block; -+} -+ -+/* Get a pair of block numbers */ -+/* Audited by: green(2002.06.11) */ -+static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse, -+ unsigned pno) -+{ -+ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1)); -+ -+ return (struct blocknr_pair *) (bse->entries + -+ BLOCKNR_SET_ENTRIES_NUMBER - -+ 2 * (pno + 1)); -+} -+ -+/* Add a pair of block numbers to a blocknr_set_entry */ -+/* Audited by: green(2002.06.11) */ -+static void -+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a, -+ const reiser4_block_nr * b) -+{ -+ struct blocknr_pair *pair; -+ -+ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL); -+ -+ pair = bse_get_pair(bse, bse->nr_pairs++); -+ -+ 
pair->a = *a; -+ pair->b = *b; -+} -+ -+/* Add either a block or pair of blocks to the block number set. The first -+ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if -+ @b is non-NULL a pair is added. The block number set belongs to atom, and -+ the call is made with the atom lock held. There may not be enough space in -+ the current blocknr_set_entry. If new_bsep points to a non-NULL -+ blocknr_set_entry then it will be added to the blocknr_set and new_bsep -+ will be set to NULL. If new_bsep contains NULL then the atom lock will be -+ released and a new bse will be allocated in new_bsep. E_REPEAT will be -+ returned with the atom unlocked for the operation to be tried again. If -+ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not -+ used during the call, it will be freed automatically. */ -+static int blocknr_set_add(txn_atom *atom, struct list_head *bset, -+ blocknr_set_entry **new_bsep, const reiser4_block_nr *a, -+ const reiser4_block_nr *b) -+{ -+ blocknr_set_entry *bse; -+ unsigned entries_needed; -+ -+ assert("jmacd-5101", a != NULL); -+ -+ entries_needed = (b == NULL) ? 1 : 2; -+ if (list_empty(bset) || -+ bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) { -+ /* See if a bse was previously allocated. */ -+ if (*new_bsep == NULL) { -+ spin_unlock_atom(atom); -+ *new_bsep = bse_alloc(); -+ return (*new_bsep != NULL) ? -E_REPEAT : -+ RETERR(-ENOMEM); -+ } -+ -+ /* Put it on the head of the list. */ -+ list_add(&((*new_bsep)->link), bset); -+ -+ *new_bsep = NULL; -+ } -+ -+ /* Add the single or pair. */ -+ bse = list_entry(bset->next, blocknr_set_entry, link); -+ if (b == NULL) { -+ bse_put_single(bse, a); -+ } else { -+ bse_put_pair(bse, a, b); -+ } -+ -+ /* If new_bsep is non-NULL then there was an allocation race, free this copy. */ -+ if (*new_bsep != NULL) { -+ bse_free(*new_bsep); -+ *new_bsep = NULL; -+ } -+ -+ return 0; -+} -+ -+/* Add an extent to the block set. 
If the length is 1, it is treated as a -+ single block (e.g., reiser4_set_add_block). */ -+/* Audited by: green(2002.06.11) */ -+/* Auditor note: Entire call chain cannot hold any spinlocks, because -+ kmalloc might schedule. The only exception is atom spinlock, which is -+ properly freed. */ -+int -+blocknr_set_add_extent(txn_atom * atom, -+ struct list_head * bset, -+ blocknr_set_entry ** new_bsep, -+ const reiser4_block_nr * start, -+ const reiser4_block_nr * len) -+{ -+ assert("jmacd-5102", start != NULL && len != NULL && *len > 0); -+ return blocknr_set_add(atom, bset, new_bsep, start, -+ *len == 1 ? NULL : len); -+} -+ -+/* Add a block pair to the block set. It adds exactly a pair, which is checked -+ * by an assertion that both arguments are not null.*/ -+/* Audited by: green(2002.06.11) */ -+/* Auditor note: Entire call chain cannot hold any spinlocks, because -+ kmalloc might schedule. The only exception is atom spinlock, which is -+ properly freed. */ -+int -+blocknr_set_add_pair(txn_atom * atom, -+ struct list_head * bset, -+ blocknr_set_entry ** new_bsep, const reiser4_block_nr * a, -+ const reiser4_block_nr * b) -+{ -+ assert("jmacd-5103", a != NULL && b != NULL); -+ return blocknr_set_add(atom, bset, new_bsep, a, b); -+} -+ -+/* Initialize a blocknr_set. */ -+void blocknr_set_init(struct list_head *bset) -+{ -+ INIT_LIST_HEAD(bset); -+} -+ -+/* Release the entries of a blocknr_set. */ -+void blocknr_set_destroy(struct list_head *bset) -+{ -+ blocknr_set_entry *bse; -+ -+ while (!list_empty(bset)) { -+ bse = list_entry(bset->next, blocknr_set_entry, link); -+ list_del_init(&bse->link); -+ bse_free(bse); -+ } -+} -+ -+/* Merge blocknr_set entries out of @from into @into. */ -+/* Audited by: green(2002.06.11) */ -+/* Auditor comments: This merge does not know if merged sets contain -+ blocks pairs (As for wandered sets) or extents, so it cannot really merge -+ overlapping ranges if there is some. 
So I believe it may lead to -+ some blocks being presented several times in one blocknr_set. To help -+ debugging such problems it might help to check for duplicate entries on -+ actual processing of this set. Testing this kind of stuff right here is -+ also complicated by the fact that these sets are not sorted and going -+ through whole set on each element addition is going to be CPU-heavy task */ -+void blocknr_set_merge(struct list_head * from, struct list_head * into) -+{ -+ blocknr_set_entry *bse_into = NULL; -+ -+ /* If @from is empty, no work to perform. */ -+ if (list_empty(from)) -+ return; -+ /* If @into is not empty, try merging partial-entries. */ -+ if (!list_empty(into)) { -+ -+ /* Neither set is empty, pop the front to members and try to combine them. */ -+ blocknr_set_entry *bse_from; -+ unsigned into_avail; -+ -+ bse_into = list_entry(into->next, blocknr_set_entry, link); -+ list_del_init(&bse_into->link); -+ bse_from = list_entry(from->next, blocknr_set_entry, link); -+ list_del_init(&bse_from->link); -+ -+ /* Combine singles. */ -+ for (into_avail = bse_avail(bse_into); -+ into_avail != 0 && bse_from->nr_singles != 0; -+ into_avail -= 1) { -+ bse_put_single(bse_into, -+ &bse_from->entries[--bse_from-> -+ nr_singles]); -+ } -+ -+ /* Combine pairs. */ -+ for (; into_avail > 1 && bse_from->nr_pairs != 0; -+ into_avail -= 2) { -+ struct blocknr_pair *pair = -+ bse_get_pair(bse_from, --bse_from->nr_pairs); -+ bse_put_pair(bse_into, &pair->a, &pair->b); -+ } -+ -+ /* If bse_from is empty, delete it now. */ -+ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) { -+ bse_free(bse_from); -+ } else { -+ /* Otherwise, bse_into is full or nearly full (e.g., -+ it could have one slot avail and bse_from has one -+ pair left). Push it back onto the list. bse_from -+ becomes bse_into, which will be the new partial. */ -+ list_add(&bse_into->link, into); -+ bse_into = bse_from; -+ } -+ } -+ -+ /* Splice lists together. 
*/ -+ list_splice_init(from, into->prev); -+ -+ /* Add the partial entry back to the head of the list. */ -+ if (bse_into != NULL) -+ list_add(&bse_into->link, into); -+} -+ -+/* Iterate over all blocknr set elements. */ -+int blocknr_set_iterator(txn_atom *atom, struct list_head *bset, -+ blocknr_set_actor_f actor, void *data, int delete) -+{ -+ -+ blocknr_set_entry *entry; -+ -+ assert("zam-429", atom != NULL); -+ assert("zam-430", atom_is_protected(atom)); -+ assert("zam-431", bset != 0); -+ assert("zam-432", actor != NULL); -+ -+ entry = list_entry(bset->next, blocknr_set_entry, link); -+ while (bset != &entry->link) { -+ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link); -+ unsigned int i; -+ int ret; -+ -+ for (i = 0; i < entry->nr_singles; i++) { -+ ret = actor(atom, &entry->entries[i], NULL, data); -+ -+ /* We can't break a loop if delete flag is set. */ -+ if (ret != 0 && !delete) -+ return ret; -+ } -+ -+ for (i = 0; i < entry->nr_pairs; i++) { -+ struct blocknr_pair *ab; -+ -+ ab = bse_get_pair(entry, i); -+ -+ ret = actor(atom, &ab->a, &ab->b, data); -+ -+ if (ret != 0 && !delete) -+ return ret; -+ } -+ -+ if (delete) { -+ list_del(&entry->link); -+ bse_free(entry); -+ } -+ -+ entry = tmp; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/carry.c linux-2.6.24/fs/reiser4/carry.c ---- linux-2.6.24.orig/fs/reiser4/carry.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/carry.c 2008-01-25 11:39:06.896197385 +0300 -@@ -0,0 +1,1391 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* Functions to "carry" tree modification(s) upward. */ -+/* Tree is modified one level at a time. As we modify a level we accumulate a -+ set of changes that need to be propagated to the next level. 
We manage -+ node locking such that any searches that collide with carrying are -+ restarted, from the root if necessary. -+ -+ Insertion of a new item may result in items being moved among nodes and -+ this requires the delimiting key to be updated at the least common parent -+ of the nodes modified to preserve search tree invariants. Also, insertion -+ may require allocation of a new node. A pointer to the new node has to be -+ inserted into some node on the parent level, etc. -+ -+ Tree carrying is meant to be analogous to arithmetic carrying. -+ -+ A carry operation is always associated with some node (&carry_node). -+ -+ Carry process starts with some initial set of operations to be performed -+ and an initial set of already locked nodes. Operations are performed one -+ by one. Performing each single operation has following possible effects: -+ -+ - content of carry node associated with operation is modified -+ - new carry nodes are locked and involved into carry process on this level -+ - new carry operations are posted to the next level -+ -+ After all carry operations on this level are done, process is repeated for -+ the accumulated sequence on carry operations for the next level. This -+ starts by trying to lock (in left to right order) all carry nodes -+ associated with carry operations on the parent level. After this, we decide -+ whether more nodes are required on the left of already locked set. If so, -+ all locks taken on the parent level are released, new carry nodes are -+ added, and locking process repeats. -+ -+ It may happen that balancing process fails owing to unrecoverable error on -+ some of upper levels of a tree (possible causes are io error, failure to -+ allocate new node, etc.). In this case we should unmount the filesystem, -+ rebooting if it is the root, and possibly advise the use of fsck. -+ -+ USAGE: -+ -+ int some_tree_operation( znode *node, ... ) -+ { -+ // Allocate on a stack pool of carry objects: operations and nodes. 
-+ // Most carry processes will only take objects from here, without -+ // dynamic allocation. -+ -+I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans -+ -+ carry_pool pool; -+ carry_level lowest_level; -+ carry_op *op; -+ -+ init_carry_pool( &pool ); -+ init_carry_level( &lowest_level, &pool ); -+ -+ // operation may be one of: -+ // COP_INSERT --- insert new item into node -+ // COP_CUT --- remove part of or whole node -+ // COP_PASTE --- increase size of item -+ // COP_DELETE --- delete pointer from parent node -+ // COP_UPDATE --- update delimiting key in least -+ // common ancestor of two -+ -+ op = reiser4_post_carry( &lowest_level, operation, node, 0 ); -+ if( IS_ERR( op ) || ( op == NULL ) ) { -+ handle error -+ } else { -+ // fill in remaining fields in @op, according to carry.h:carry_op -+ result = carry( &lowest_level, NULL ); -+ } -+ done_carry_pool( &pool ); -+ } -+ -+ When you are implementing node plugin method that participates in carry -+ (shifting, insertion, deletion, etc.), do the following: -+ -+ int foo_node_method( znode *node, ..., carry_level *todo ) -+ { -+ carry_op *op; -+ -+ .... -+ -+ // note, that last argument to reiser4_post_carry() is non-null -+ // here, because @op is to be applied to the parent of @node, rather -+ // than to the @node itself as in the previous case. -+ -+ op = node_post_carry( todo, operation, node, 1 ); -+ // fill in remaining fields in @op, according to carry.h:carry_op -+ -+ .... -+ -+ } -+ -+ BATCHING: -+ -+ One of the main advantages of level-by-level balancing implemented here is -+ ability to batch updates on a parent level and to peform them more -+ efficiently as a result. -+ -+ Description To Be Done (TBD). -+ -+ DIFFICULTIES AND SUBTLE POINTS: -+ -+ 1. complex plumbing is required, because: -+ -+ a. effective allocation through pools is needed -+ -+ b. target of operation is not exactly known when operation is -+ posted. 
This is worked around through bitfields in &carry_node and -+ logic in lock_carry_node() -+ -+ c. of interaction with locking code: node should be added into sibling -+ list when pointer to it is inserted into its parent, which is some time -+ after node was created. Between these moments, node is somewhat in -+ suspended state and is only registered in the carry lists -+ -+ 2. whole balancing logic is implemented here, in particular, insertion -+ logic is coded in make_space(). -+ -+ 3. special cases like insertion (reiser4_add_tree_root()) or deletion -+ (reiser4_kill_tree_root()) of tree root and morphing of paste into insert -+ (insert_paste()) have to be handled. -+ -+ 4. there is non-trivial interdependency between allocation of new nodes -+ and almost everything else. This is mainly due to the (1.c) above. I shall -+ write about this later. -+ -+*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/item/extent.h" -+#include "plugin/node/node.h" -+#include "jnode.h" -+#include "znode.h" -+#include "tree_mod.h" -+#include "tree_walk.h" -+#include "block_alloc.h" -+#include "pool.h" -+#include "tree.h" -+#include "carry.h" -+#include "carry_ops.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include -+ -+/* level locking/unlocking */ -+static int lock_carry_level(carry_level * level); -+static void unlock_carry_level(carry_level * level, int failure); -+static void done_carry_level(carry_level * level); -+static void unlock_carry_node(carry_level * level, carry_node * node, int fail); -+ -+int lock_carry_node(carry_level * level, carry_node * node); -+int lock_carry_node_tail(carry_node * node); -+ -+/* carry processing proper */ -+static int carry_on_level(carry_level * doing, carry_level * todo); -+ -+static carry_op *add_op(carry_level * level, pool_ordering order, -+ carry_op * reference); -+ -+/* handlers for carry operations. 
*/ -+ -+static void fatal_carry_error(carry_level * doing, int ecode); -+static int add_new_root(carry_level * level, carry_node * node, znode * fake); -+ -+static void print_level(const char *prefix, carry_level * level); -+ -+#if REISER4_DEBUG -+typedef enum { -+ CARRY_TODO, -+ CARRY_DOING -+} carry_queue_state; -+static int carry_level_invariant(carry_level * level, carry_queue_state state); -+#endif -+ -+/* main entry point for tree balancing. -+ -+ Tree carry performs operations from @doing and while doing so accumulates -+ information about operations to be performed on the next level ("carried" -+ to the parent level). Carried operations are performed, causing possibly -+ more operations to be carried upward etc. carry() takes care about -+ locking and pinning znodes while operating on them. -+ -+ For usage, see comment at the top of fs/reiser4/carry.c -+ -+*/ -+int reiser4_carry(carry_level * doing /* set of carry operations to be -+ * performed */ , -+ carry_level * done /* set of nodes, already performed -+ * at the previous level. -+ * NULL in most cases */) -+{ -+ int result = 0; -+ /* queue of new requests */ -+ carry_level *todo; -+ ON_DEBUG(STORE_COUNTERS); -+ -+ assert("nikita-888", doing != NULL); -+ BUG_ON(done != NULL); -+ -+ todo = doing + 1; -+ init_carry_level(todo, doing->pool); -+ -+ /* queue of requests preformed on the previous level */ -+ done = todo + 1; -+ init_carry_level(done, doing->pool); -+ -+ /* iterate until there is nothing more to do */ -+ while (result == 0 && doing->ops_num > 0) { -+ carry_level *tmp; -+ -+ /* at this point @done is locked. */ -+ /* repeat lock/do/unlock while -+ -+ (1) lock_carry_level() fails due to deadlock avoidance, or -+ -+ (2) carry_on_level() decides that more nodes have to -+ be involved. -+ -+ (3) some unexpected error occurred while balancing on the -+ upper levels. In this case all changes are rolled back. 
-+ -+ */ -+ while (1) { -+ result = lock_carry_level(doing); -+ if (result == 0) { -+ /* perform operations from @doing and -+ accumulate new requests in @todo */ -+ result = carry_on_level(doing, todo); -+ if (result == 0) -+ break; -+ else if (result != -E_REPEAT || -+ !doing->restartable) { -+ warning("nikita-1043", -+ "Fatal error during carry: %i", -+ result); -+ print_level("done", done); -+ print_level("doing", doing); -+ print_level("todo", todo); -+ /* do some rough stuff like aborting -+ all pending transcrashes and thus -+ pushing tree back to the consistent -+ state. Alternatvely, just panic. -+ */ -+ fatal_carry_error(doing, result); -+ return result; -+ } -+ } else if (result != -E_REPEAT) { -+ fatal_carry_error(doing, result); -+ return result; -+ } -+ unlock_carry_level(doing, 1); -+ } -+ /* at this point @done can be safely unlocked */ -+ done_carry_level(done); -+ -+ /* cyclically shift queues */ -+ tmp = done; -+ done = doing; -+ doing = todo; -+ todo = tmp; -+ init_carry_level(todo, doing->pool); -+ -+ /* give other threads chance to run */ -+ reiser4_preempt_point(); -+ } -+ done_carry_level(done); -+ -+ /* all counters, but x_refs should remain the same. x_refs can change -+ owing to transaction manager */ -+ ON_DEBUG(CHECK_COUNTERS); -+ return result; -+} -+ -+/* perform carry operations on given level. -+ -+ Optimizations proposed by pooh: -+ -+ (1) don't lock all nodes from queue at the same time. Lock nodes lazily as -+ required; -+ -+ (2) unlock node if there are no more operations to be performed upon it and -+ node didn't add any operation to @todo. This can be implemented by -+ attaching to each node two counters: counter of operaions working on this -+ node and counter and operations carried upward from this node. 
-+ -+*/ -+static int carry_on_level(carry_level * doing /* queue of carry operations to -+ * do on this level */ , -+ carry_level * todo /* queue where new carry -+ * operations to be performed on -+ * the * parent level are -+ * accumulated during @doing -+ * processing. */ ) -+{ -+ int result; -+ int (*f) (carry_op *, carry_level *, carry_level *); -+ carry_op *op; -+ carry_op *tmp_op; -+ -+ assert("nikita-1034", doing != NULL); -+ assert("nikita-1035", todo != NULL); -+ -+ /* @doing->nodes are locked. */ -+ -+ /* This function can be split into two phases: analysis and modification. -+ -+ Analysis calculates precisely what items should be moved between -+ nodes. This information is gathered in some structures attached to -+ each carry_node in a @doing queue. Analysis also determines whether -+ new nodes are to be allocated etc. -+ -+ After analysis is completed, actual modification is performed. Here -+ we can take advantage of "batch modification": if there are several -+ operations acting on the same node, modifications can be performed -+ more efficiently when batched together. -+ -+ Above is an optimization left for the future. -+ */ -+ /* Important, but delayed optimization: it's possible to batch -+ operations together and perform them more efficiently as a -+ result. For example, deletion of several neighboring items from a -+ node can be converted to a single ->cut() operation. -+ -+ Before processing queue, it should be scanned and "mergeable" -+ operations merged. -+ */ -+ result = 0; -+ for_all_ops(doing, op, tmp_op) { -+ carry_opcode opcode; -+ -+ assert("nikita-1041", op != NULL); -+ opcode = op->op; -+ assert("nikita-1042", op->op < COP_LAST_OP); -+ f = op_dispatch_table[op->op].handler; -+ result = f(op, doing, todo); -+ /* locking can fail with -E_REPEAT. Any different error is fatal -+ and will be handled by fatal_carry_error() sledgehammer. 
-+ */ -+ if (result != 0) -+ break; -+ } -+ if (result == 0) { -+ carry_plugin_info info; -+ carry_node *scan; -+ carry_node *tmp_scan; -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ assert("nikita-3002", -+ carry_level_invariant(doing, CARRY_DOING)); -+ for_all_nodes(doing, scan, tmp_scan) { -+ znode *node; -+ -+ node = reiser4_carry_real(scan); -+ assert("nikita-2547", node != NULL); -+ if (node_is_empty(node)) { -+ result = -+ node_plugin_by_node(node)-> -+ prepare_removal(node, &info); -+ if (result != 0) -+ break; -+ } -+ } -+ } -+ return result; -+} -+ -+/* post carry operation -+ -+ This is main function used by external carry clients: node layout plugins -+ and tree operations to create new carry operation to be performed on some -+ level. -+ -+ New operation will be included in the @level queue. To actually perform it, -+ call carry( level, ... ). This function takes write lock on @node. Carry -+ manages all its locks by itself, don't worry about this. -+ -+ This function adds operation and node at the end of the queue. It is up to -+ caller to guarantee proper ordering of node queue. -+ -+*/ -+carry_op * reiser4_post_carry(carry_level * level /* queue where new operation -+ * is to be posted at */ , -+ carry_opcode op /* opcode of operation */ , -+ znode * node /* node on which this operation -+ * will operate */ , -+ int apply_to_parent_p /* whether operation will -+ * operate directly on @node -+ * or on it parent. 
*/) -+{ -+ carry_op *result; -+ carry_node *child; -+ -+ assert("nikita-1046", level != NULL); -+ assert("nikita-1788", znode_is_write_locked(node)); -+ -+ result = add_op(level, POOLO_LAST, NULL); -+ if (IS_ERR(result)) -+ return result; -+ child = reiser4_add_carry(level, POOLO_LAST, NULL); -+ if (IS_ERR(child)) { -+ reiser4_pool_free(&level->pool->op_pool, &result->header); -+ return (carry_op *) child; -+ } -+ result->node = child; -+ result->op = op; -+ child->parent = apply_to_parent_p; -+ if (ZF_ISSET(node, JNODE_ORPHAN)) -+ child->left_before = 1; -+ child->node = node; -+ return result; -+} -+ -+/* initialize carry queue */ -+void init_carry_level(carry_level * level /* level to initialize */ , -+ carry_pool * pool /* pool @level will allocate objects -+ * from */ ) -+{ -+ assert("nikita-1045", level != NULL); -+ assert("nikita-967", pool != NULL); -+ -+ memset(level, 0, sizeof *level); -+ level->pool = pool; -+ -+ INIT_LIST_HEAD(&level->nodes); -+ INIT_LIST_HEAD(&level->ops); -+} -+ -+/* allocate carry pool and initialize pools within queue */ -+carry_pool *init_carry_pool(int size) -+{ -+ carry_pool *pool; -+ -+ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level)); -+ pool = kmalloc(size, reiser4_ctx_gfp_mask_get()); -+ if (pool == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE, -+ (char *)pool->op); -+ reiser4_init_pool(&pool->node_pool, sizeof(carry_node), -+ NODES_LOCKED_POOL_SIZE, (char *)pool->node); -+ return pool; -+} -+ -+/* finish with queue pools */ -+void done_carry_pool(carry_pool * pool /* pool to destroy */ ) -+{ -+ reiser4_done_pool(&pool->op_pool); -+ reiser4_done_pool(&pool->node_pool); -+ kfree(pool); -+} -+ -+/* add new carry node to the @level. -+ -+ Returns pointer to the new carry node allocated from pool. It's up to -+ callers to maintain proper order in the @level. 
Assumption is that if carry -+ nodes on one level are already sorted and modifications are peroformed from -+ left to right, carry nodes added on the parent level will be ordered -+ automatically. To control ordering use @order and @reference parameters. -+ -+*/ -+carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add -+ * node to */ , -+ pool_ordering order /* where to insert: -+ * at the beginning of -+ * @level, -+ * before @reference, -+ * after @reference, -+ * at the end of @level -+ */ , -+ carry_node * reference/* reference node for -+ * insertion */) -+{ -+ ON_DEBUG(carry_node * orig_ref = reference); -+ -+ if (order == POOLO_BEFORE) { -+ reference = find_left_carry(reference, level); -+ if (reference == NULL) -+ reference = list_entry(level->nodes.next, carry_node, -+ header.level_linkage); -+ else -+ reference = list_entry(reference->header.level_linkage.next, -+ carry_node, header.level_linkage); -+ } else if (order == POOLO_AFTER) { -+ reference = find_right_carry(reference, level); -+ if (reference == NULL) -+ reference = list_entry(level->nodes.prev, carry_node, -+ header.level_linkage); -+ else -+ reference = list_entry(reference->header.level_linkage.prev, -+ carry_node, header.level_linkage); -+ } -+ assert("nikita-2209", -+ ergo(orig_ref != NULL, -+ reiser4_carry_real(reference) == -+ reiser4_carry_real(orig_ref))); -+ return reiser4_add_carry(level, order, reference); -+} -+ -+carry_node *reiser4_add_carry(carry_level * level /* &carry_level to add node -+ * to */ , -+ pool_ordering order /* where to insert: at the -+ * beginning of @level, before -+ * @reference, after @reference, -+ * at the end of @level */ , -+ carry_node * reference /* reference node for -+ * insertion */ ) -+{ -+ carry_node *result; -+ -+ result = -+ (carry_node *) reiser4_add_obj(&level->pool->node_pool, -+ &level->nodes, -+ order, &reference->header); -+ if (!IS_ERR(result) && (result != NULL)) -+ ++level->nodes_num; -+ return result; -+} -+ -+/* 
add new carry operation to the @level. -+ -+ Returns pointer to the new carry operations allocated from pool. It's up to -+ callers to maintain proper order in the @level. To control ordering use -+ @order and @reference parameters. -+ -+*/ -+static carry_op *add_op(carry_level * level /* &carry_level to add node to */ , -+ pool_ordering order /* where to insert: at the beginning of -+ * @level, before @reference, after -+ * @reference, at the end of @level */ , -+ carry_op * -+ reference /* reference node for insertion */ ) -+{ -+ carry_op *result; -+ -+ result = -+ (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops, -+ order, &reference->header); -+ if (!IS_ERR(result) && (result != NULL)) -+ ++level->ops_num; -+ return result; -+} -+ -+/* Return node on the right of which @node was created. -+ -+ Each node is created on the right of some existing node (or it is new root, -+ which is special case not handled here). -+ -+ @node is new node created on some level, but not yet inserted into its -+ parent, it has corresponding bit (JNODE_ORPHAN) set in zstate. 
-+ -+*/ -+static carry_node *find_begetting_brother(carry_node * node /* node to start search -+ * from */ , -+ carry_level * kin UNUSED_ARG /* level to -+ * scan */ ) -+{ -+ carry_node *scan; -+ -+ assert("nikita-1614", node != NULL); -+ assert("nikita-1615", kin != NULL); -+ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree)); -+ assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL, -+ ZF_ISSET(reiser4_carry_real(node), -+ JNODE_ORPHAN))); -+ for (scan = node;; -+ scan = list_entry(scan->header.level_linkage.prev, carry_node, -+ header.level_linkage)) { -+ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage); -+ if ((scan->node != node->node) && -+ !ZF_ISSET(scan->node, JNODE_ORPHAN)) { -+ assert("nikita-1618", reiser4_carry_real(scan) != NULL); -+ break; -+ } -+ } -+ return scan; -+} -+ -+static cmp_t -+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2) -+{ -+ assert("nikita-2199", n1 != NULL); -+ assert("nikita-2200", n2 != NULL); -+ -+ if (n1 == n2) -+ return EQUAL_TO; -+ while (1) { -+ n1 = carry_node_next(n1); -+ if (carry_node_end(level, n1)) -+ return GREATER_THAN; -+ if (n1 == n2) -+ return LESS_THAN; -+ } -+ impossible("nikita-2201", "End of level reached"); -+} -+ -+carry_node *find_carry_node(carry_level * level, const znode * node) -+{ -+ carry_node *scan; -+ carry_node *tmp_scan; -+ -+ assert("nikita-2202", level != NULL); -+ assert("nikita-2203", node != NULL); -+ -+ for_all_nodes(level, scan, tmp_scan) { -+ if (reiser4_carry_real(scan) == node) -+ return scan; -+ } -+ return NULL; -+} -+ -+znode *reiser4_carry_real(const carry_node * node) -+{ -+ assert("nikita-3061", node != NULL); -+ -+ return node->lock_handle.node; -+} -+ -+carry_node *insert_carry_node(carry_level * doing, carry_level * todo, -+ const znode * node) -+{ -+ carry_node *base; -+ carry_node *scan; -+ carry_node *tmp_scan; -+ carry_node *proj; -+ -+ base = find_carry_node(doing, node); -+ assert("nikita-2204", base != NULL); -+ -+ 
for_all_nodes(todo, scan, tmp_scan) { -+ proj = find_carry_node(doing, scan->node); -+ assert("nikita-2205", proj != NULL); -+ if (carry_node_cmp(doing, proj, base) != LESS_THAN) -+ break; -+ } -+ return scan; -+} -+ -+static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo, -+ znode * node) -+{ -+ carry_node *reference; -+ -+ assert("nikita-2994", doing != NULL); -+ assert("nikita-2995", todo != NULL); -+ assert("nikita-2996", node != NULL); -+ -+ reference = insert_carry_node(doing, todo, node); -+ assert("nikita-2997", reference != NULL); -+ -+ return reiser4_add_carry(todo, POOLO_BEFORE, reference); -+} -+ -+/* like reiser4_post_carry(), but designed to be called from node plugin methods. -+ This function is different from reiser4_post_carry() in that it finds proper -+ place to insert node in the queue. */ -+carry_op *node_post_carry(carry_plugin_info * info /* carry parameters -+ * passed down to node -+ * plugin */ , -+ carry_opcode op /* opcode of operation */ , -+ znode * node /* node on which this -+ * operation will operate */ , -+ int apply_to_parent_p /* whether operation will -+ * operate directly on @node -+ * or on it parent. 
*/ ) -+{ -+ carry_op *result; -+ carry_node *child; -+ -+ assert("nikita-2207", info != NULL); -+ assert("nikita-2208", info->todo != NULL); -+ -+ if (info->doing == NULL) -+ return reiser4_post_carry(info->todo, op, node, -+ apply_to_parent_p); -+ -+ result = add_op(info->todo, POOLO_LAST, NULL); -+ if (IS_ERR(result)) -+ return result; -+ child = add_carry_atplace(info->doing, info->todo, node); -+ if (IS_ERR(child)) { -+ reiser4_pool_free(&info->todo->pool->op_pool, &result->header); -+ return (carry_op *) child; -+ } -+ result->node = child; -+ result->op = op; -+ child->parent = apply_to_parent_p; -+ if (ZF_ISSET(node, JNODE_ORPHAN)) -+ child->left_before = 1; -+ child->node = node; -+ return result; -+} -+ -+/* lock all carry nodes in @level */ -+static int lock_carry_level(carry_level * level /* level to lock */ ) -+{ -+ int result; -+ carry_node *node; -+ carry_node *tmp_node; -+ -+ assert("nikita-881", level != NULL); -+ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO)); -+ -+ /* lock nodes from left to right */ -+ result = 0; -+ for_all_nodes(level, node, tmp_node) { -+ result = lock_carry_node(level, node); -+ if (result != 0) -+ break; -+ } -+ return result; -+} -+ -+/* Synchronize delimiting keys between @node and its left neighbor. -+ -+ To reduce contention on dk key and simplify carry code, we synchronize -+ delimiting keys only when carry ultimately leaves tree level (carrying -+ changes upward) and unlocks nodes at this level. -+ -+ This function first finds left neighbor of @node and then updates left -+ neighbor's right delimiting key to conincide with least key in @node. 
-+ -+*/ -+ -+ON_DEBUG(extern atomic_t delim_key_version; -+ ) -+ -+static void sync_dkeys(znode * spot /* node to update */ ) -+{ -+ reiser4_key pivot; -+ reiser4_tree *tree; -+ -+ assert("nikita-1610", spot != NULL); -+ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk)); -+ -+ tree = znode_get_tree(spot); -+ read_lock_tree(tree); -+ write_lock_dk(tree); -+ -+ assert("nikita-2192", znode_is_loaded(spot)); -+ -+ /* sync left delimiting key of @spot with key in its leftmost item */ -+ if (node_is_empty(spot)) -+ pivot = *znode_get_rd_key(spot); -+ else -+ leftmost_key_in_node(spot, &pivot); -+ -+ znode_set_ld_key(spot, &pivot); -+ -+ /* there can be sequence of empty nodes pending removal on the left of -+ @spot. Scan them and update their left and right delimiting keys to -+ match left delimiting key of @spot. Also, update right delimiting -+ key of first non-empty left neighbor. -+ */ -+ while (1) { -+ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED)) -+ break; -+ -+ spot = spot->left; -+ if (spot == NULL) -+ break; -+ -+ znode_set_rd_key(spot, &pivot); -+ /* don't sink into the domain of another balancing */ -+ if (!znode_is_write_locked(spot)) -+ break; -+ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE)) -+ znode_set_ld_key(spot, &pivot); -+ else -+ break; -+ } -+ -+ write_unlock_dk(tree); -+ read_unlock_tree(tree); -+} -+ -+/* unlock all carry nodes in @level */ -+static void unlock_carry_level(carry_level * level /* level to unlock */ , -+ int failure /* true if unlocking owing to -+ * failure */ ) -+{ -+ carry_node *node; -+ carry_node *tmp_node; -+ -+ assert("nikita-889", level != NULL); -+ -+ if (!failure) { -+ znode *spot; -+ -+ spot = NULL; -+ /* update delimiting keys */ -+ for_all_nodes(level, node, tmp_node) { -+ if (reiser4_carry_real(node) != spot) { -+ spot = reiser4_carry_real(node); -+ sync_dkeys(spot); -+ } -+ } -+ } -+ -+ /* nodes can be unlocked in arbitrary order. In preemptible -+ environment it's better to unlock in reverse order of locking, -+ though. 
-+ */ -+ for_all_nodes_back(level, node, tmp_node) { -+ /* all allocated nodes should be already linked to their -+ parents at this moment. */ -+ assert("nikita-1631", -+ ergo(!failure, !ZF_ISSET(reiser4_carry_real(node), -+ JNODE_ORPHAN))); -+ ON_DEBUG(check_dkeys(reiser4_carry_real(node))); -+ unlock_carry_node(level, node, failure); -+ } -+ level->new_root = NULL; -+} -+ -+/* finish with @level -+ -+ Unlock nodes and release all allocated resources */ -+static void done_carry_level(carry_level * level /* level to finish */ ) -+{ -+ carry_node *node; -+ carry_node *tmp_node; -+ carry_op *op; -+ carry_op *tmp_op; -+ -+ assert("nikita-1076", level != NULL); -+ -+ unlock_carry_level(level, 0); -+ for_all_nodes(level, node, tmp_node) { -+ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link)); -+ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link)); -+ reiser4_pool_free(&level->pool->node_pool, &node->header); -+ } -+ for_all_ops(level, op, tmp_op) -+ reiser4_pool_free(&level->pool->op_pool, &op->header); -+} -+ -+/* helper function to complete locking of carry node -+ -+ Finish locking of carry node. There are several ways in which new carry -+ node can be added into carry level and locked. Normal is through -+ lock_carry_node(), but also from find_{left|right}_neighbor(). This -+ function factors out common final part of all locking scenarios. It -+ supposes that @node -> lock_handle is lock handle for lock just taken and -+ fills ->real_node from this lock handle. -+ -+*/ -+int lock_carry_node_tail(carry_node * node /* node to complete locking of */ ) -+{ -+ assert("nikita-1052", node != NULL); -+ assert("nikita-1187", reiser4_carry_real(node) != NULL); -+ assert("nikita-1188", !node->unlock); -+ -+ node->unlock = 1; -+ /* Load node content into memory and install node plugin by -+ looking at the node header. -+ -+ Most of the time this call is cheap because the node is -+ already in memory. 
-+ -+ Corresponding zrelse() is in unlock_carry_node() -+ */ -+ return zload(reiser4_carry_real(node)); -+} -+ -+/* lock carry node -+ -+ "Resolve" node to real znode, lock it and mark as locked. -+ This requires recursive locking of znodes. -+ -+ When operation is posted to the parent level, node it will be applied to is -+ not yet known. For example, when shifting data between two nodes, -+ delimiting has to be updated in parent or parents of nodes involved. But -+ their parents is not yet locked and, moreover said nodes can be reparented -+ by concurrent balancing. -+ -+ To work around this, carry operation is applied to special "carry node" -+ rather than to the znode itself. Carry node consists of some "base" or -+ "reference" znode and flags indicating how to get to the target of carry -+ operation (->real_node field of carry_node) from base. -+ -+*/ -+int lock_carry_node(carry_level * level /* level @node is in */ , -+ carry_node * node /* node to lock */ ) -+{ -+ int result; -+ znode *reference_point; -+ lock_handle lh; -+ lock_handle tmp_lh; -+ reiser4_tree *tree; -+ -+ assert("nikita-887", level != NULL); -+ assert("nikita-882", node != NULL); -+ -+ result = 0; -+ reference_point = node->node; -+ init_lh(&lh); -+ init_lh(&tmp_lh); -+ if (node->left_before) { -+ /* handling of new nodes, allocated on the previous level: -+ -+ some carry ops were propably posted from the new node, but -+ this node neither has parent pointer set, nor is -+ connected. This will be done in ->create_hook() for -+ internal item. -+ -+ No then less, parent of new node has to be locked. To do -+ this, first go to the "left" in the carry order. This -+ depends on the decision to always allocate new node on the -+ right of existing one. -+ -+ Loop handles case when multiple nodes, all orphans, were -+ inserted. 
-+ -+ Strictly speaking, taking tree lock is not necessary here, -+ because all nodes scanned by loop in -+ find_begetting_brother() are write-locked by this thread, -+ and thus, their sibling linkage cannot change. -+ -+ */ -+ tree = znode_get_tree(reference_point); -+ read_lock_tree(tree); -+ reference_point = find_begetting_brother(node, level)->node; -+ read_unlock_tree(tree); -+ assert("nikita-1186", reference_point != NULL); -+ } -+ if (node->parent && (result == 0)) { -+ result = -+ reiser4_get_parent(&tmp_lh, reference_point, -+ ZNODE_WRITE_LOCK); -+ if (result != 0) { -+ ; /* nothing */ -+ } else if (znode_get_level(tmp_lh.node) == 0) { -+ assert("nikita-1347", znode_above_root(tmp_lh.node)); -+ result = add_new_root(level, node, tmp_lh.node); -+ if (result == 0) { -+ reference_point = level->new_root; -+ move_lh(&lh, &node->lock_handle); -+ } -+ } else if ((level->new_root != NULL) -+ && (level->new_root != -+ znode_parent_nolock(reference_point))) { -+ /* parent of node exists, but this level aready -+ created different new root, so */ -+ warning("nikita-1109", -+ /* it should be "radicis", but tradition is -+ tradition. do banshees read latin? 
*/ -+ "hodie natus est radici frater"); -+ result = -EIO; -+ } else { -+ move_lh(&lh, &tmp_lh); -+ reference_point = lh.node; -+ } -+ } -+ if (node->left && (result == 0)) { -+ assert("nikita-1183", node->parent); -+ assert("nikita-883", reference_point != NULL); -+ result = -+ reiser4_get_left_neighbor(&tmp_lh, reference_point, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result == 0) { -+ done_lh(&lh); -+ move_lh(&lh, &tmp_lh); -+ reference_point = lh.node; -+ } -+ } -+ if (!node->parent && !node->left && !node->left_before) { -+ result = -+ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_HIPRI); -+ } -+ if (result == 0) { -+ move_lh(&node->lock_handle, &lh); -+ result = lock_carry_node_tail(node); -+ } -+ done_lh(&tmp_lh); -+ done_lh(&lh); -+ return result; -+} -+ -+/* release a lock on &carry_node. -+ -+ Release if necessary lock on @node. This opearion is pair of -+ lock_carry_node() and is idempotent: you can call it more than once on the -+ same node. -+ -+*/ -+static void -+unlock_carry_node(carry_level * level, -+ carry_node * node /* node to be released */ , -+ int failure /* 0 if node is unlocked due -+ * to some error */ ) -+{ -+ znode *real_node; -+ -+ assert("nikita-884", node != NULL); -+ -+ real_node = reiser4_carry_real(node); -+ /* pair to zload() in lock_carry_node_tail() */ -+ zrelse(real_node); -+ if (node->unlock && (real_node != NULL)) { -+ assert("nikita-899", real_node == node->lock_handle.node); -+ longterm_unlock_znode(&node->lock_handle); -+ } -+ if (failure) { -+ if (node->deallocate && (real_node != NULL)) { -+ /* free node in bitmap -+ -+ Prepare node for removal. Last zput() will finish -+ with it. 
-+ */ -+ ZF_SET(real_node, JNODE_HEARD_BANSHEE); -+ } -+ if (node->free) { -+ assert("nikita-2177", -+ list_empty_careful(&node->lock_handle.locks_link)); -+ assert("nikita-2112", -+ list_empty_careful(&node->lock_handle.owners_link)); -+ reiser4_pool_free(&level->pool->node_pool, -+ &node->header); -+ } -+ } -+} -+ -+/* fatal_carry_error() - all-catching error handling function -+ -+ It is possible that carry faces unrecoverable error, like unability to -+ insert pointer at the internal level. Our simple solution is just panic in -+ this situation. More sophisticated things like attempt to remount -+ file-system as read-only can be implemented without much difficlties. -+ -+ It is believed, that: -+ -+ 1. in stead of panicking, all current transactions can be aborted rolling -+ system back to the consistent state. -+ -+Umm, if you simply panic without doing anything more at all, then all current -+transactions are aborted and the system is rolled back to a consistent state, -+by virtue of the design of the transactional mechanism. Well, wait, let's be -+precise. If an internal node is corrupted on disk due to hardware failure, -+then there may be no consistent state that can be rolled back to, so instead -+we should say that it will rollback the transactions, which barring other -+factors means rolling back to a consistent state. -+ -+# Nikita: there is a subtle difference between panic and aborting -+# transactions: machine doesn't reboot. Processes aren't killed. Processes -+# don't using reiser4 (not that we care about such processes), or using other -+# reiser4 mounts (about them we do care) will simply continue to run. With -+# some luck, even application using aborted file system can survive: it will -+# get some error, like EBADF, from each file descriptor on failed file system, -+# but applications that do care about tolerance will cope with this (squid -+# will). 
-+ -+It would be a nice feature though to support rollback without rebooting -+followed by remount, but this can wait for later versions. -+ -+ 2. once isolated transactions will be implemented it will be possible to -+ roll back offending transaction. -+ -+2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about -+it more before deciding if it should be done. -Hans -+ -+*/ -+static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level -+ * where -+ * unrecoverable -+ * error -+ * occurred */ , -+ int ecode /* error code */ ) -+{ -+ assert("nikita-1230", doing != NULL); -+ assert("nikita-1231", ecode < 0); -+ -+ reiser4_panic("nikita-1232", "Carry failed: %i", ecode); -+} -+ -+/* add new root to the tree -+ -+ This function itself only manages changes in carry structures and delegates -+ all hard work (allocation of znode for new root, changes of parent and -+ sibling pointers to the reiser4_add_tree_root(). -+ -+ Locking: old tree root is locked by carry at this point. Fake znode is also -+ locked. -+ -+*/ -+static int add_new_root(carry_level * level /* carry level in context of which -+ * operation is performed */ , -+ carry_node * node /* carry node for existing root */ , -+ znode * fake /* "fake" znode already locked by -+ * us */ ) -+{ -+ int result; -+ -+ assert("nikita-1104", level != NULL); -+ assert("nikita-1105", node != NULL); -+ -+ assert("nikita-1403", znode_is_write_locked(node->node)); -+ assert("nikita-1404", znode_is_write_locked(fake)); -+ -+ /* trying to create new root. */ -+ /* @node is root and it's already locked by us. This -+ means that nobody else can be trying to add/remove -+ tree root right now. 
-+ */ -+ if (level->new_root == NULL) -+ level->new_root = reiser4_add_tree_root(node->node, fake); -+ if (!IS_ERR(level->new_root)) { -+ assert("nikita-1210", znode_is_root(level->new_root)); -+ node->deallocate = 1; -+ result = -+ longterm_lock_znode(&node->lock_handle, level->new_root, -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); -+ if (result == 0) -+ zput(level->new_root); -+ } else { -+ result = PTR_ERR(level->new_root); -+ level->new_root = NULL; -+ } -+ return result; -+} -+ -+/* allocate new znode and add the operation that inserts the -+ pointer to it into the parent node into the todo level -+ -+ Allocate new znode, add it into carry queue and post into @todo queue -+ request to add pointer to new node into its parent. -+ -+ This is carry related routing that calls reiser4_new_node() to allocate new -+ node. -+*/ -+carry_node *add_new_znode(znode * brother /* existing left neighbor of new -+ * node */ , -+ carry_node * ref /* carry node after which new -+ * carry node is to be inserted -+ * into queue. This affects -+ * locking. */ , -+ carry_level * doing /* carry queue where new node is -+ * to be added */ , -+ carry_level * todo /* carry queue where COP_INSERT -+ * operation to add pointer to -+ * new node will ne added */ ) -+{ -+ carry_node *fresh; -+ znode *new_znode; -+ carry_op *add_pointer; -+ carry_plugin_info info; -+ -+ assert("nikita-1048", brother != NULL); -+ assert("nikita-1049", todo != NULL); -+ -+ /* There is a lot of possible variations here: to what parent -+ new node will be attached and where. For simplicity, always -+ do the following: -+ -+ (1) new node and @brother will have the same parent. -+ -+ (2) new node is added on the right of @brother -+ -+ */ -+ -+ fresh = reiser4_add_carry_skip(doing, -+ ref ? 
POOLO_AFTER : POOLO_LAST, ref); -+ if (IS_ERR(fresh)) -+ return fresh; -+ -+ fresh->deallocate = 1; -+ fresh->free = 1; -+ -+ new_znode = reiser4_new_node(brother, znode_get_level(brother)); -+ if (IS_ERR(new_znode)) -+ /* @fresh will be deallocated automatically by error -+ handling code in the caller. */ -+ return (carry_node *) new_znode; -+ -+ /* new_znode returned znode with x_count 1. Caller has to decrease -+ it. make_space() does. */ -+ -+ ZF_SET(new_znode, JNODE_ORPHAN); -+ fresh->node = new_znode; -+ -+ while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) { -+ ref = carry_node_prev(ref); -+ assert("nikita-1606", !carry_node_end(doing, ref)); -+ } -+ -+ info.todo = todo; -+ info.doing = doing; -+ add_pointer = node_post_carry(&info, COP_INSERT, -+ reiser4_carry_real(ref), 1); -+ if (IS_ERR(add_pointer)) { -+ /* no need to deallocate @new_znode here: it will be -+ deallocated during carry error handling. */ -+ return (carry_node *) add_pointer; -+ } -+ -+ add_pointer->u.insert.type = COPT_CHILD; -+ add_pointer->u.insert.child = fresh; -+ add_pointer->u.insert.brother = brother; -+ /* initially new node spawns empty key range */ -+ write_lock_dk(znode_get_tree(brother)); -+ znode_set_ld_key(new_znode, -+ znode_set_rd_key(new_znode, -+ znode_get_rd_key(brother))); -+ write_unlock_dk(znode_get_tree(brother)); -+ return fresh; -+} -+ -+/* DEBUGGING FUNCTIONS. -+ -+ Probably we also should leave them on even when -+ debugging is turned off to print dumps at errors. 
-+*/
-+#if REISER4_DEBUG	/* carry_level_invariant(): returns 1 when @level passes the checks below, 0 otherwise */
-+static int carry_level_invariant(carry_level * level, carry_queue_state state)
-+{
-+	carry_node *node;
-+	carry_node *tmp_node;
-+
-+	if (level == NULL)
-+		return 0;
-+
-+	if (level->track_type != 0 &&
-+	    level->track_type != CARRY_TRACK_NODE &&
-+	    level->track_type != CARRY_TRACK_CHANGE)
-+		return 0;
-+
-+	/* check that nodes are in ascending order of their leftmost keys */
-+	for_all_nodes(level, node, tmp_node) {
-+		znode *left;
-+		znode *right;
-+
-+		reiser4_key lkey;
-+		reiser4_key rkey;
-+
-+		if (node != carry_node_front(level)) {
-+			if (state == CARRY_TODO) {
-+				right = node->node;
-+				left = carry_node_prev(node)->node;
-+			} else {
-+				right = reiser4_carry_real(node);
-+				left = reiser4_carry_real(carry_node_prev(node));
-+			}
-+			if (right == NULL || left == NULL)
-+				continue;
-+			if (node_is_empty(right) || node_is_empty(left))
-+				continue;
-+			if (!keyle(leftmost_key_in_node(left, &lkey),
-+				   leftmost_key_in_node(right, &rkey))) {
-+				warning("", "wrong key order");
-+				return 0;
-+			}
-+		}
-+	}
-+	return 1;
-+}
-+#endif
-+
-+/* get symbolic name for boolean: "t" for non-zero, "f" for zero */
-+static const char *tf(int boolean /* truth value */ )
-+{
-+	return boolean ? "t" : "f";
-+}
-+
-+/* symbolic name for carry operation; unknown opcodes are formatted into a static buffer */
-+static const char *carry_op_name(carry_opcode op /* carry opcode */ )
-+{
-+	switch (op) {
-+	case COP_INSERT:
-+		return "COP_INSERT";
-+	case COP_DELETE:
-+		return "COP_DELETE";
-+	case COP_CUT:
-+		return "COP_CUT";
-+	case COP_PASTE:
-+		return "COP_PASTE";
-+	case COP_UPDATE:
-+		return "COP_UPDATE";
-+	case COP_EXTENT:
-+		return "COP_EXTENT";
-+	case COP_INSERT_FLOW:
-+		return "COP_INSERT_FLOW";
-+	default:{
-+			/* not mt safe, but who cares? */
-+			static char buf[32];	/* "unknown op: " + 8 hex digits + NUL needs 21 bytes */
-+
-+			snprintf(buf, sizeof(buf), "unknown op: %x", op);
-+			return buf;
-+		}
-+	}
-+}
-+
-+/* dump information about carry node: its address and its flag bits */
-+static void print_carry(const char *prefix /* prefix to print */ ,
-+			carry_node * node /* node to print */ )
-+{
-+	if (node == NULL) {
-+		printk("%s: null\n", prefix);
-+		return;
-+	}
-+	printk
-+	    ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
-+	     prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
-+	     tf(node->free), tf(node->deallocate));
-+}
-+
-+/* dump information about carry operation: opcode, target node and per-opcode details */
-+static void print_op(const char *prefix /* prefix to print */ ,
-+		     carry_op * op /* operation to print */ )
-+{
-+	if (op == NULL) {
-+		printk("%s: null\n", prefix);
-+		return;
-+	}
-+	printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
-+	print_carry("\tnode", op->node);
-+	switch (op->op) {
-+	case COP_INSERT:
-+	case COP_PASTE:
-+		print_coord("\tcoord",
-+			    op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
-+		reiser4_print_key("\tkey",
-+				  op->u.insert.d ? op->u.insert.d->key : NULL);
-+		print_carry("\tchild", op->u.insert.child);
-+		break;
-+	case COP_DELETE:
-+		print_carry("\tchild", op->u.delete.child);
-+		break;
-+	case COP_CUT:
-+		if (op->u.cut_or_kill.is_cut) {	/* is_cut set: read the ->cut member of the union */
-+			print_coord("\tfrom",
-+				    op->u.cut_or_kill.u.cut->params.from, 0);
-+			print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
-+				    0);
-+		} else {
-+			print_coord("\tfrom",
-+				    op->u.cut_or_kill.u.kill->params.from, 0);
-+			print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
-+				    0);
-+		}
-+		break;
-+	case COP_UPDATE:
-+		print_carry("\tleft", op->u.update.left);
-+		break;
-+	default:
-+		/* do nothing */
-+		break;
-+	}
-+}
-+
-+/* dump information about all nodes and operations in a @level */
-+static void print_level(const char *prefix /* prefix to print */ ,
-+			carry_level * level /* level to print */ )
-+{
-+	carry_node *node;
-+	carry_node *tmp_node;
-+	carry_op *op;
-+	carry_op *tmp_op;
-+
-+	if (level == NULL) {
-+		printk("%s: null\n", prefix);
-+		return;
-+	}
-+	printk("%s: %p, restartable: %s\n",
-+	       prefix, level, tf(level->restartable));
-+
-+	for_all_nodes(level, node, tmp_node)
-+	    print_carry("\tcarry node", node);
-+	for_all_ops(level, op, tmp_op)
-+	    print_op("\tcarry op", op);
-+}
-+
-+/* Make Linus happy.
-+   Local variables:
-+   c-indentation-style: "K&R"
-+   mode-name: "LC"
-+   c-basic-offset: 8
-+   tab-width: 8
-+   fill-column: 120
-+   scroll-step: 1
-+   End:
-+*/
-diff -urN linux-2.6.24.orig/fs/reiser4/carry.h linux-2.6.24/fs/reiser4/carry.h
---- linux-2.6.24.orig/fs/reiser4/carry.h 1970-01-01 03:00:00.000000000 +0300
-+++ linux-2.6.24/fs/reiser4/carry.h 2008-01-25 11:39:06.896197385 +0300
-@@ -0,0 +1,442 @@
-+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
-+
-+/* Functions and data types to "carry" tree modification(s) upward.
-+   See fs/reiser4/carry.c for details.
*/ -+ -+#if !defined( __FS_REISER4_CARRY_H__ ) -+#define __FS_REISER4_CARRY_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "pool.h" -+#include "znode.h" -+ -+#include -+ -+/* &carry_node - "location" of carry node. -+ -+ "location" of node that is involved or going to be involved into -+ carry process. Node where operation will be carried to on the -+ parent level cannot be recorded explicitly. Operation will be carried -+ usually to the parent of some node (where changes are performed at -+ the current level) or, to the left neighbor of its parent. But while -+ modifications are performed at the current level, parent may -+ change. So, we have to allow some indirection (or, positevly, -+ flexibility) in locating carry nodes. -+ -+*/ -+typedef struct carry_node { -+ /* pool linkage */ -+ struct reiser4_pool_header header; -+ -+ /* base node from which real_node is calculated. See -+ fs/reiser4/carry.c:lock_carry_node(). */ -+ znode *node; -+ -+ /* how to get ->real_node */ -+ /* to get ->real_node obtain parent of ->node */ -+ __u32 parent:1; -+ /* to get ->real_node obtain left neighbor of parent of -+ ->node */ -+ __u32 left:1; -+ __u32 left_before:1; -+ -+ /* locking */ -+ -+ /* this node was locked by carry process and should be -+ unlocked when carry leaves a level */ -+ __u32 unlock:1; -+ -+ /* disk block for this node was allocated by carry process and -+ should be deallocated when carry leaves a level */ -+ __u32 deallocate:1; -+ /* this carry node was allocated by carry process and should be -+ freed when carry leaves a level */ -+ __u32 free:1; -+ -+ /* type of lock we want to take on this node */ -+ lock_handle lock_handle; -+} carry_node; -+ -+/* &carry_opcode - elementary operations that can be carried upward -+ -+ Operations that carry() can handle. This list is supposed to be -+ expanded. -+ -+ Each carry operation (cop) is handled by appropriate function defined -+ in fs/reiser4/carry.c. 
For example COP_INSERT is handled by -+ fs/reiser4/carry.c:carry_insert() etc. These functions in turn -+ call plugins of nodes affected by operation to modify nodes' content -+ and to gather operations to be performed on the next level. -+ -+*/ -+typedef enum { -+ /* insert new item into node. */ -+ COP_INSERT, -+ /* delete pointer from parent node */ -+ COP_DELETE, -+ /* remove part of or whole node. */ -+ COP_CUT, -+ /* increase size of item. */ -+ COP_PASTE, -+ /* insert extent (that is sequence of unformatted nodes). */ -+ COP_EXTENT, -+ /* update delimiting key in least common ancestor of two -+ nodes. This is performed when items are moved between two -+ nodes. -+ */ -+ COP_UPDATE, -+ /* insert flow */ -+ COP_INSERT_FLOW, -+ COP_LAST_OP, -+} carry_opcode; -+ -+#define CARRY_FLOW_NEW_NODES_LIMIT 20 -+ -+/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target -+ item is determined. */ -+typedef enum { -+ /* target item is one containing pointer to the ->child node */ -+ COPT_CHILD, -+ /* target item is given explicitly by @coord */ -+ COPT_ITEM_DATA, -+ /* target item is given by key */ -+ COPT_KEY, -+ /* see insert_paste_common() for more comments on this. */ -+ COPT_PASTE_RESTARTED, -+} cop_insert_pos_type; -+ -+/* flags to cut and delete */ -+typedef enum { -+ /* don't kill node even if it became completely empty as results of -+ * cut. This is needed for eottl handling. See carry_extent() for -+ * details. */ -+ DELETE_RETAIN_EMPTY = (1 << 0) -+} cop_delete_flag; -+ -+/* -+ * carry() implements "lock handle tracking" feature. -+ * -+ * Callers supply carry with node where to perform initial operation and lock -+ * handle on this node. Trying to optimize node utilization carry may actually -+ * move insertion point to different node. Callers expect that lock handle -+ * will rebe transferred to the new node also. 
-+ * -+ */ -+typedef enum { -+ /* transfer lock handle along with insertion point */ -+ CARRY_TRACK_CHANGE = 1, -+ /* acquire new lock handle to the node where insertion point is. This -+ * is used when carry() client doesn't initially possess lock handle -+ * on the insertion point node, for example, by extent insertion -+ * code. See carry_extent(). */ -+ CARRY_TRACK_NODE = 2 -+} carry_track_type; -+ -+/* data supplied to COP_{INSERT|PASTE} by callers */ -+typedef struct carry_insert_data { -+ /* position where new item is to be inserted */ -+ coord_t *coord; -+ /* new item description */ -+ reiser4_item_data *data; -+ /* key of new item */ -+ const reiser4_key *key; -+} carry_insert_data; -+ -+/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */ -+struct cut_kill_params { -+ /* coord where cut starts (inclusive) */ -+ coord_t *from; -+ /* coord where cut stops (inclusive, this item/unit will also be -+ * cut) */ -+ coord_t *to; -+ /* starting key. This is necessary when item and unit pos don't -+ * uniquely identify what portion or tree to remove. For example, this -+ * indicates what portion of extent unit will be affected. */ -+ const reiser4_key *from_key; -+ /* exclusive stop key */ -+ const reiser4_key *to_key; -+ /* if this is not NULL, smallest actually removed key is stored -+ * here. */ -+ reiser4_key *smallest_removed; -+ /* kill_node_content() is called for file truncate */ -+ int truncate; -+}; -+ -+struct carry_cut_data { -+ struct cut_kill_params params; -+}; -+ -+struct carry_kill_data { -+ struct cut_kill_params params; -+ /* parameter to be passed to the ->kill_hook() method of item -+ * plugin */ -+ /*void *iplug_params; *//* FIXME: unused currently */ -+ /* if not NULL---inode whose items are being removed. This is needed -+ * for ->kill_hook() of extent item to update VM structures when -+ * removing pages. 
*/ -+ struct inode *inode; -+ /* sibling list maintenance is complicated by existence of eottl. When -+ * eottl whose left and right neighbors are formatted leaves is -+ * removed, one has to connect said leaves in the sibling list. This -+ * cannot be done when extent removal is just started as locking rules -+ * require sibling list update to happen atomically with removal of -+ * extent item. Therefore: 1. pointers to left and right neighbors -+ * have to be passed down to the ->kill_hook() of extent item, and -+ * 2. said neighbors have to be locked. */ -+ lock_handle *left; -+ lock_handle *right; -+ /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */ -+ unsigned flags; -+ char *buf; -+}; -+ -+/* &carry_tree_op - operation to "carry" upward. -+ -+ Description of an operation we want to "carry" to the upper level of -+ a tree: e.g, when we insert something and there is not enough space -+ we allocate a new node and "carry" the operation of inserting a -+ pointer to the new node to the upper level, on removal of empty node, -+ we carry up operation of removing appropriate entry from parent. -+ -+ There are two types of carry ops: when adding or deleting node we -+ node at the parent level where appropriate modification has to be -+ performed is known in advance. When shifting items between nodes -+ (split, merge), delimiting key should be changed in the least common -+ parent of the nodes involved that is not known in advance. -+ -+ For the operations of the first type we store in &carry_op pointer to -+ the &carry_node at the parent level. For the operation of the second -+ type we store &carry_node or parents of the left and right nodes -+ modified and keep track of them upward until they coincide. 
-+ -+*/ -+typedef struct carry_op { -+ /* pool linkage */ -+ struct reiser4_pool_header header; -+ carry_opcode op; -+ /* node on which operation is to be performed: -+ -+ for insert, paste: node where new item is to be inserted -+ -+ for delete: node where pointer is to be deleted -+ -+ for cut: node to cut from -+ -+ for update: node where delimiting key is to be modified -+ -+ for modify: parent of modified node -+ -+ */ -+ carry_node *node; -+ union { -+ struct { -+ /* (sub-)type of insertion/paste. Taken from -+ cop_insert_pos_type. */ -+ __u8 type; -+ /* various operation flags. Taken from -+ cop_insert_flag. */ -+ __u8 flags; -+ carry_insert_data *d; -+ carry_node *child; -+ znode *brother; -+ } insert, paste, extent; -+ -+ struct { -+ int is_cut; -+ union { -+ carry_kill_data *kill; -+ carry_cut_data *cut; -+ } u; -+ } cut_or_kill; -+ -+ struct { -+ carry_node *left; -+ } update; -+ struct { -+ /* changed child */ -+ carry_node *child; -+ /* bitmask of changes. See &cop_modify_flag */ -+ __u32 flag; -+ } modify; -+ struct { -+ /* flags to deletion operation. Are taken from -+ cop_delete_flag */ -+ __u32 flags; -+ /* child to delete from parent. If this is -+ NULL, delete op->node. */ -+ carry_node *child; -+ } delete; -+ struct { -+ /* various operation flags. Taken from -+ cop_insert_flag. */ -+ __u32 flags; -+ flow_t *flow; -+ coord_t *insert_point; -+ reiser4_item_data *data; -+ /* flow insertion is limited by number of new blocks -+ added in that operation which do not get any data -+ but part of flow. This limit is set by macro -+ CARRY_FLOW_NEW_NODES_LIMIT. 
This field stores number -+ of nodes added already during one carry_flow */ -+ int new_nodes; -+ } insert_flow; -+ } u; -+} carry_op; -+ -+/* &carry_op_pool - preallocated pool of carry operations, and nodes */ -+typedef struct carry_pool { -+ carry_op op[CARRIES_POOL_SIZE]; -+ struct reiser4_pool op_pool; -+ carry_node node[NODES_LOCKED_POOL_SIZE]; -+ struct reiser4_pool node_pool; -+} carry_pool; -+ -+/* &carry_tree_level - carry process on given level -+ -+ Description of balancing process on the given level. -+ -+ No need for locking here, as carry_tree_level is essentially per -+ thread thing (for now). -+ -+*/ -+struct carry_level { -+ /* this level may be restarted */ -+ __u32 restartable:1; -+ /* list of carry nodes on this level, ordered by key order */ -+ struct list_head nodes; -+ struct list_head ops; -+ /* pool where new objects are allocated from */ -+ carry_pool *pool; -+ int ops_num; -+ int nodes_num; -+ /* new root created on this level, if any */ -+ znode *new_root; -+ /* This is set by caller (insert_by_key(), rreiser4_esize_item(), etc.) -+ when they want ->tracked to automagically wander to the node where -+ insertion point moved after insert or paste. -+ */ -+ carry_track_type track_type; -+ /* lock handle supplied by user that we are tracking. See -+ above. 
*/ -+ lock_handle *tracked; -+}; -+ -+/* information carry passes to plugin methods that may add new operations to -+ the @todo queue */ -+struct carry_plugin_info { -+ carry_level *doing; -+ carry_level *todo; -+}; -+ -+int reiser4_carry(carry_level * doing, carry_level * done); -+ -+carry_node *reiser4_add_carry(carry_level * level, pool_ordering order, -+ carry_node * reference); -+carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order, -+ carry_node * reference); -+ -+extern carry_node *insert_carry_node(carry_level * doing, -+ carry_level * todo, const znode * node); -+ -+extern carry_pool *init_carry_pool(int); -+extern void done_carry_pool(carry_pool * pool); -+ -+extern void init_carry_level(carry_level * level, carry_pool * pool); -+ -+extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op, -+ znode * node, int apply_to_parent); -+extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op, -+ znode * node, int apply_to_parent_p); -+ -+carry_node *add_new_znode(znode * brother, carry_node * reference, -+ carry_level * doing, carry_level * todo); -+ -+carry_node *find_carry_node(carry_level * level, const znode * node); -+ -+extern znode *reiser4_carry_real(const carry_node * node); -+ -+/* helper macros to iterate over carry queues */ -+ -+#define carry_node_next( node ) \ -+ list_entry((node)->header.level_linkage.next, carry_node, \ -+ header.level_linkage) -+ -+#define carry_node_prev( node ) \ -+ list_entry((node)->header.level_linkage.prev, carry_node, \ -+ header.level_linkage) -+ -+#define carry_node_front( level ) \ -+ list_entry((level)->nodes.next, carry_node, header.level_linkage) -+ -+#define carry_node_back( level ) \ -+ list_entry((level)->nodes.prev, carry_node, header.level_linkage) -+ -+#define carry_node_end( level, node ) \ -+ (&(level)->nodes == &(node)->header.level_linkage) -+ -+/* macro to iterate over all operations in a @level */ -+#define for_all_ops( level /* carry level (of 
type carry_level *) */, \ -+ op /* pointer to carry operation, modified by loop (of \ -+ * type carry_op *) */, \ -+ tmp /* pointer to carry operation (of type carry_op *), \ -+ * used to make iterator stable in the face of \ -+ * deletions from the level */ ) \ -+for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \ -+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \ -+ &op->header.level_linkage != &level->ops; \ -+ op = tmp, \ -+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage)) -+ -+#if 0 -+for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \ -+ tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \ -+ ! pool_level_list_end( &level -> ops, &op -> header ) ; \ -+ op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ) -+#endif -+ -+/* macro to iterate over all nodes in a @level */ \ -+#define for_all_nodes( level /* carry level (of type carry_level *) */, \ -+ node /* pointer to carry node, modified by loop (of \ -+ * type carry_node *) */, \ -+ tmp /* pointer to carry node (of type carry_node *), \ -+ * used to make iterator stable in the face of * \ -+ * deletions from the level */ ) \ -+for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \ -+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \ -+ &node->header.level_linkage != &level->nodes; \ -+ node = tmp, \ -+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage)) -+ -+#if 0 -+for( node = carry_node_front( level ), \ -+ tmp = carry_node_next( node ) ; ! 
carry_node_end( level, node ) ; \ -+ node = tmp, tmp = carry_node_next( node ) ) -+#endif -+ -+/* macro to iterate over all nodes in a @level in reverse order -+ -+ This is used, because nodes are unlocked in reversed order of locking */ -+#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \ -+ node /* pointer to carry node, modified by loop \ -+ * (of type carry_node *) */, \ -+ tmp /* pointer to carry node (of type carry_node \ -+ * *), used to make iterator stable in the \ -+ * face of deletions from the level */ ) \ -+for( node = carry_node_back( level ), \ -+ tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \ -+ node = tmp, tmp = carry_node_prev( node ) ) -+ -+/* __FS_REISER4_CARRY_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/carry_ops.c linux-2.6.24/fs/reiser4/carry_ops.c ---- linux-2.6.24.orig/fs/reiser4/carry_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/carry_ops.c 2008-01-25 11:39:06.900198415 +0300 -@@ -0,0 +1,2131 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* implementation of carry operations */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "pool.h" -+#include "tree_mod.h" -+#include "carry.h" -+#include "carry_ops.h" -+#include "tree.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include -+#include -+ -+static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node, -+ carry_level * doing, carry_level * todo, -+ unsigned int including_insert_coord_p); -+ -+extern int lock_carry_node(carry_level * level, carry_node * 
node);
-+extern int lock_carry_node_tail(carry_node * node);
-+
-+/* find left neighbor of the carry node of @op
-+
-+   Look for left neighbor of op->node and add it to the @doing queue. See
-+   comments in the body.
-+   Returns the neighbor's carry node, NULL when the neighbor is ignored, or an ERR_PTR() value (-E_REPEAT requests a restart of the level).
-+*/
-+static carry_node *find_left_neighbor(carry_op * op /* node to find left
-+						      * neighbor of */ ,
-+				      carry_level * doing /* level to scan */ )
-+{
-+	int result;
-+	carry_node *node;
-+	carry_node *left;
-+	int flags;
-+	reiser4_tree *tree;
-+
-+	node = op->node;
-+
-+	tree = current_tree;
-+	read_lock_tree(tree);
-+	/* first, check whether left neighbor is already in a @doing queue */
-+	if (reiser4_carry_real(node)->left != NULL) {
-+		/* NOTE: there is locking subtlety here. Look into
-+		 * find_right_neighbor() for more info */
-+		if (find_carry_node(doing,
-+				    reiser4_carry_real(node)->left) != NULL) {
-+			read_unlock_tree(tree);
-+			left = node;
-+			do {
-+				left = list_entry(left->header.level_linkage.prev,
-+						  carry_node, header.level_linkage);
-+				assert("nikita-3408", !carry_node_end(doing,
-+								      left));
-+			} while (reiser4_carry_real(left) ==
-+				 reiser4_carry_real(node));
-+			return left;
-+		}
-+	}
-+	read_unlock_tree(tree);
-+
-+	left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
-+	if (IS_ERR(left))
-+		return left;
-+
-+	left->node = node->node;
-+	left->free = 1;
-+
-+	flags = GN_TRY_LOCK;
-+	if (!(op->u.insert.flags & COPI_LOAD_LEFT))	/* mask first, then negate: "!" binds tighter than "&" */
-+		flags |= GN_NO_ALLOC;
-+
-+	/* then, feeling lucky, peek left neighbor in the cache. */
-+	result = reiser4_get_left_neighbor(&left->lock_handle,
-+					   reiser4_carry_real(node),
-+					   ZNODE_WRITE_LOCK, flags);
-+	if (result == 0) {
-+		/* ok, node found and locked. */
-+		result = lock_carry_node_tail(left);
-+		if (result != 0)
-+			left = ERR_PTR(result);
-+	} else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
-+		/* node is leftmost node in a tree, or neighbor wasn't in
-+		   cache, or there is an extent on the left. */
-+		reiser4_pool_free(&doing->pool->node_pool, &left->header);
-+		left = NULL;
-+	} else if (doing->restartable) {
-+		/* if left neighbor is locked, and level is restartable, add
-+		   new node to @doing and restart. */
-+		assert("nikita-913", node->parent != 0);
-+		assert("nikita-914", node->node != NULL);
-+		left->left = 1;
-+		left->free = 0;
-+		left = ERR_PTR(-E_REPEAT);
-+	} else {
-+		/* left neighbor is locked, level cannot be restarted. Just
-+		   ignore left neighbor. */
-+		reiser4_pool_free(&doing->pool->node_pool, &left->header);
-+		left = NULL;
-+	}
-+	return left;
-+}
-+
-+/* find right neighbor of the carry node of @op
-+
-+   Look for right neighbor of op->node and add it to the @doing queue. See
-+   comments in the body.
-+   Returns the neighbor's carry node, NULL when there is no usable neighbor, or an ERR_PTR() value on any other failure.
-+*/
-+static carry_node *find_right_neighbor(carry_op * op /* node to find right
-+							* neighbor of */ ,
-+				       carry_level * doing /* level to scan */ )
-+{
-+	int result;
-+	carry_node *node;
-+	carry_node *right;
-+	lock_handle lh;
-+	int flags;
-+	reiser4_tree *tree;
-+
-+	init_lh(&lh);
-+
-+	node = op->node;
-+
-+	tree = current_tree;
-+	read_lock_tree(tree);
-+	/* first, check whether right neighbor is already in a @doing queue */
-+	if (reiser4_carry_real(node)->right != NULL) {
-+		/*
-+		 * Tree lock is taken here anyway, because, even if _outcome_
-+		 * of (find_carry_node() != NULL) doesn't depend on
-+		 * concurrent updates to ->right, find_carry_node() cannot
-+		 * work with second argument NULL. Hence, following comment is
-+		 * of historic importance only.
-+		 *
-+		 * Subtle:
-+		 *
-+		 * Q: why don't we need tree lock here, looking for the right
-+		 * neighbor?
-+		 *
-+		 * A: even if value of node->real_node->right were changed
-+		 * during find_carry_node() execution, outcome of execution
-+		 * wouldn't change, because (in short) other thread cannot add
-+		 * elements to the @doing, and if node->real_node->right
-+		 * already was in @doing, value of node->real_node->right
-+		 * couldn't change, because node cannot be inserted between
-+		 * locked neighbors.
-+		 */
-+		if (find_carry_node(doing,
-+				    reiser4_carry_real(node)->right) != NULL) {
-+			read_unlock_tree(tree);
-+			/*
-+			 * What we are doing here (this is also applicable to
-+			 * the find_left_neighbor()).
-+			 *
-+			 * tree_walk.c code requires that insertion of a
-+			 * pointer to a child, modification of parent pointer
-+			 * in the child, and insertion of the child into
-+			 * sibling list are atomic (see
-+			 * plugin/item/internal.c:create_hook_internal()).
-+			 *
-+			 * carry allocates new node long before pointer to it
-+			 * is inserted into parent and, actually, long before
-+			 * parent is even known. Such allocated-but-orphaned
-+			 * nodes are only trackable through carry level lists.
-+			 *
-+			 * Situation that is handled here is following: @node
-+			 * has valid ->right pointer, but there is
-+			 * allocated-but-orphaned node in the carry queue that
-+			 * is logically between @node and @node->right. Here
-+			 * we are searching for it. Critical point is that
-+			 * this is only possible if @node->right is also in
-+			 * the carry queue (this is checked above), because
-+			 * this is the only way new orphaned node could be
-+			 * inserted between them (before inserting new node,
-+			 * make_space() first tries to shift to the right, so,
-+			 * right neighbor will be locked and queued).
-+			 *
-+			 */
-+			right = node;
-+			do {
-+				right = list_entry(right->header.level_linkage.next,
-+						   carry_node, header.level_linkage);
-+				assert("nikita-3408", !carry_node_end(doing,
-+								      right));
-+			} while (reiser4_carry_real(right) ==
-+				 reiser4_carry_real(node));
-+			return right;
-+		}
-+	}
-+	read_unlock_tree(tree);
-+
-+	flags = GN_CAN_USE_UPPER_LEVELS;
-+	if (!(op->u.insert.flags & COPI_LOAD_RIGHT))	/* mask first, then negate: "!" binds tighter than "&" */
-+		flags = GN_NO_ALLOC;	/* NOTE(review): plain assignment drops GN_CAN_USE_UPPER_LEVELS, unlike the |= in find_left_neighbor() - confirm this is intended */
-+
-+	/* then, try to lock right neighbor */
-+	init_lh(&lh);
-+	result = reiser4_get_right_neighbor(&lh,
-+					    reiser4_carry_real(node),
-+					    ZNODE_WRITE_LOCK, flags);
-+	if (result == 0) {
-+		/* ok, node found and locked. */
-+		right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
-+		if (!IS_ERR(right)) {
-+			right->node = lh.node;
-+			move_lh(&right->lock_handle, &lh);
-+			right->free = 1;
-+			result = lock_carry_node_tail(right);
-+			if (result != 0)
-+				right = ERR_PTR(result);
-+		}
-+	} else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
-+		/* node is rightmost node in a tree, or neighbor wasn't in
-+		   cache, or there is an extent on the right. */
-+		right = NULL;
-+	} else
-+		right = ERR_PTR(result);
-+	done_lh(&lh);
-+	return right;
-+}
-+
-+/* how much free space in a @node is needed for @op
-+
-+   How much space in @node is required for completion of @op, where @op is
-+   insert or paste operation. Any other opcode trips impossible() and is then sized like COP_INSERT.
-+*/
-+static unsigned int space_needed_for_op(znode * node /* znode data are
-+						       * inserted or
-+						       * pasted in */ ,
-+					carry_op * op /* carry
-+							 operation */ )
-+{
-+	assert("nikita-919", op != NULL);
-+
-+	switch (op->op) {
-+	default:
-+		impossible("nikita-1701", "Wrong opcode");	/* fall through */
-+	case COP_INSERT:
-+		return space_needed(node, NULL, op->u.insert.d->data, 1);
-+	case COP_PASTE:
-+		return space_needed(node, op->u.insert.d->coord,
-+				    op->u.insert.d->data, 0);
-+	}
-+}
-+
-+/* how much space in @node is required to insert or paste @data at
-+   @coord: the item plugin's estimate (or data->length), plus the node plugin's item overhead when inserting. */
-+unsigned int space_needed(const znode * node /* node data are inserted or
-+					      * pasted in */ ,
-+			  const coord_t * coord /* coord where data are
-+						 * inserted or pasted
-+						 * at */ ,
-+			  const reiser4_item_data * data /* data to insert or
-+							  * paste */ ,
-+			  int insertion /* non-0 is inserting, 0---paste */ )
-+{
-+	int result;
-+	item_plugin *iplug;
-+
-+	assert("nikita-917", node != NULL);
-+	assert("nikita-918", node_plugin_by_node(node) != NULL);
-+	assert("vs-230", !insertion || (coord == NULL));
-+
-+	result = 0;
-+	iplug = data->iplug;
-+	if (iplug->b.estimate != NULL) {
-+		/* ask item plugin how much space is needed to insert this
-+		   item */
-+		result += iplug->b.estimate(insertion ? NULL : coord, data);
-+	} else {
-+		/* reasonable default */
-+		result += data->length;
-+	}
-+	if (insertion) {
-+		node_plugin *nplug;
-+
-+		nplug = node->nplug;
-+		/* and add node overhead */
-+		if (nplug->item_overhead != NULL) {
-+			result += nplug->item_overhead(node, NULL);
-+		}
-+	}
-+	return result;
-+}
-+
-+/* find &coord in parent where pointer to new child is to be stored; returns the result of find_new_child_ptr(). */
-+static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
-+						 * insert pointer to new
-+						 * child */ )
-+{
-+	int result;
-+	znode *node;
-+	znode *child;
-+
-+	assert("nikita-941", op != NULL);
-+	assert("nikita-942", op->op == COP_INSERT);
-+
-+	node = reiser4_carry_real(op->node);
-+	assert("nikita-943", node != NULL);
-+	assert("nikita-944", node_plugin_by_node(node) != NULL);
-+
-+	child = reiser4_carry_real(op->u.insert.child);
-+	result =
-+	    find_new_child_ptr(node, child, op->u.insert.brother,
-+			       op->u.insert.d->coord);
-+
-+	build_child_ptr_data(child, op->u.insert.d->data);
-+	return result;
-+}
-+
-+/* additional amount of free space in @node required to complete @op; a value <= 0 means no shortage */
-+static int free_space_shortage(znode * node /* node to check */ ,
-+			       carry_op * op /* operation being performed */ )
-+{
-+	assert("nikita-1061", node != NULL);
-+	assert("nikita-1062", op != NULL);
-+
-+	switch (op->op) {
-+	default:
-+		impossible("nikita-1702", "Wrong opcode");	/* fall through */
-+	case COP_INSERT:
-+	case COP_PASTE:
-+		return space_needed_for_op(node, op) - znode_free_space(node);
-+	case COP_EXTENT:
-+		/* when inserting extent shift data around until insertion
-+		   point is utmost in the node. */
-+		if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
-+			return +1;
-+		else
-+			return -1;
-+	}
-+}
-+
-+/* helper function: update node pointer in operation after insertion
-+   point was probably shifted into @target.
*/
-+static znode *sync_op(carry_op * op, carry_node * target)
-+{
-+	znode *insertion_node;
-+
-+	/* re-read node from coord: shift might have moved insertion coord to
-+	   the neighbor */
-+	insertion_node = op->u.insert.d->coord->node;
-+	/* if insertion point was actually moved into new node,
-+	   update carry node pointer in operation to @target. */
-+	if (insertion_node != reiser4_carry_real(op->node)) {
-+		op->node = target;
-+		assert("nikita-2540",
-+		       reiser4_carry_real(target) == insertion_node);
-+	}
-+	assert("nikita-2541",
-+	       reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
-+	return insertion_node;
-+}
-+
-+/*
-+ * complete make_space() call: update tracked lock handle if necessary. See
-+ * comments for fs/reiser4/carry.h:carry_track_type. Returns 0, or the result of longterm_lock_znode() when the handle is re-taken.
-+ */
-+static int
-+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
-+{
-+	int result;
-+	carry_track_type tracking;
-+	znode *node;
-+
-+	tracking = doing->track_type;
-+	node = op->u.insert.d->coord->node;
-+
-+	if (tracking == CARRY_TRACK_NODE ||
-+	    (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
-+		/* inserting or pasting into node different from original, or the caller asked for a
-+		   lock on the insertion node. Re-take lock handle supplied by caller on @node. */
-+		assert("nikita-1417", doing->tracked != NULL);
-+		done_lh(doing->tracked);
-+		init_lh(doing->tracked);
-+		result = longterm_lock_znode(doing->tracked, node,
-+					     ZNODE_WRITE_LOCK,
-+					     ZNODE_LOCK_HIPRI);
-+	} else
-+		result = 0;
-+	return result;
-+}
-+
-+/* This is insertion policy function. It shifts data to the left and right
-+   neighbors of insertion coord and allocates new nodes until there is enough
-+   free space to complete @op.
-+
-+   See comments in the body.
-+
-+   Assumes that the node format favors insertions at the right end of the node
-+   as node40 does.
-+ -+ See carry_flow() on detail about flow insertion -+*/ -+static int make_space(carry_op * op /* carry operation, insert or paste */ , -+ carry_level * doing /* current carry queue */ , -+ carry_level * todo /* carry queue on the parent level */ ) -+{ -+ znode *node; -+ int result; -+ int not_enough_space; -+ int blk_alloc; -+ znode *orig_node; -+ __u32 flags; -+ -+ coord_t *coord; -+ -+ assert("nikita-890", op != NULL); -+ assert("nikita-891", todo != NULL); -+ assert("nikita-892", -+ op->op == COP_INSERT || -+ op->op == COP_PASTE || op->op == COP_EXTENT); -+ assert("nikita-1607", -+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node); -+ -+ flags = op->u.insert.flags; -+ -+ /* NOTE check that new node can only be allocated after checking left -+ * and right neighbors. This is necessary for proper work of -+ * find_{left,right}_neighbor(). */ -+ assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE, -+ flags & COPI_DONT_SHIFT_LEFT)); -+ assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE, -+ flags & COPI_DONT_SHIFT_RIGHT)); -+ -+ coord = op->u.insert.d->coord; -+ orig_node = node = coord->node; -+ -+ assert("nikita-908", node != NULL); -+ assert("nikita-909", node_plugin_by_node(node) != NULL); -+ -+ result = 0; -+ /* If there is not enough space in a node, try to shift something to -+ the left neighbor. This is a bit tricky, as locking to the left is -+ low priority. This is handled by restart logic in carry(). -+ */ -+ not_enough_space = free_space_shortage(node, op); -+ if (not_enough_space <= 0) -+ /* it is possible that carry was called when there actually -+ was enough space in the node. For example, when inserting -+ leftmost item so that delimiting keys have to be updated. 
-+ */ -+ return make_space_tail(op, doing, orig_node); -+ if (!(flags & COPI_DONT_SHIFT_LEFT)) { -+ carry_node *left; -+ /* make note in statistics of an attempt to move -+ something into the left neighbor */ -+ left = find_left_neighbor(op, doing); -+ if (unlikely(IS_ERR(left))) { -+ if (PTR_ERR(left) == -E_REPEAT) -+ return -E_REPEAT; -+ else { -+ /* some error other than restart request -+ occurred. This shouldn't happen. Issue a -+ warning and continue as if the left neighbor -+ did not exist. -+ */ -+ warning("nikita-924", -+ "Error accessing left neighbor: %li", -+ PTR_ERR(left)); -+ } -+ } else if (left != NULL) { -+ -+ /* shift everything possible on the left of and -+ including insertion coord into the left neighbor */ -+ result = carry_shift_data(LEFT_SIDE, coord, -+ reiser4_carry_real(left), -+ doing, todo, -+ flags & COPI_GO_LEFT); -+ -+ /* reget node from coord: shift_left() might move -+ insertion coord to the left neighbor */ -+ node = sync_op(op, left); -+ -+ not_enough_space = free_space_shortage(node, op); -+ /* There is not enough free space in @node, but -+ maybe there is enough free space in -+ @left. Various balancing decisions are valid here. -+ The same holds for shifting to the right. -+ */ -+ } -+ } -+ /* If there still is not enough space, shift to the right */ -+ if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) { -+ carry_node *right; -+ -+ right = find_right_neighbor(op, doing); -+ if (IS_ERR(right)) { -+ /* any error is only reported here; processing continues -+ as if the right neighbor did not exist */ -+ warning("nikita-1065", -+ "Error accessing right neighbor: %li", -+ PTR_ERR(right)); -+ } else if (right != NULL) { -+ /* node containing insertion point, and its right -+ neighbor node are write locked by now. 
-+ -+ shift everything possible on the right of but -+ excluding insertion coord into the right neighbor -+ */ -+ result = carry_shift_data(RIGHT_SIDE, coord, -+ reiser4_carry_real(right), -+ doing, todo, -+ flags & COPI_GO_RIGHT); -+ /* reget node from coord: shift_right() might move -+ insertion coord to the right neighbor */ -+ node = sync_op(op, right); -+ not_enough_space = free_space_shortage(node, op); -+ } -+ } -+ /* If there is still not enough space, allocate new node(s). -+ -+ We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in -+ the carry operation flags (currently this is needed during flush -+ only). -+ */ -+ for (blk_alloc = 0; -+ not_enough_space > 0 && result == 0 && blk_alloc < 2 && -+ !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) { -+ carry_node *fresh; /* new node we are allocating */ -+ coord_t coord_shadow; /* remembered insertion point before -+ * shifting data into new node */ -+ carry_node *node_shadow; /* remembered insertion node before -+ * shifting */ -+ unsigned int gointo; /* whether insertion point should move -+ * into newly allocated node */ -+ -+ /* allocate new node on the right of @node. Znode and disk -+ fake block number for new node are allocated. -+ -+ add_new_znode() posts carry operation COP_INSERT with -+ COPT_CHILD option to the parent level to add -+ pointer to newly created node to its parent. -+ -+ Subtle point: if several new nodes are required to complete -+ insertion operation at this level, they will be inserted -+ into their parents in the order of creation, which means -+ that @node will be valid "cookie" at the time of insertion. -+ -+ */ -+ fresh = add_new_znode(node, op->node, doing, todo); -+ if (IS_ERR(fresh)) -+ return PTR_ERR(fresh); -+ -+ /* Try to shift into new node. 
*/ -+ result = lock_carry_node(doing, fresh); -+ zput(reiser4_carry_real(fresh)); -+ if (result != 0) { -+ warning("nikita-947", -+ "Cannot lock new node: %i", result); -+ return result; -+ } -+ -+ /* both nodes are write locked by now. -+ -+ shift everything possible on the right of and -+ including insertion coord into the right neighbor. -+ */ -+ coord_dup(&coord_shadow, op->u.insert.d->coord); -+ node_shadow = op->node; -+ /* move insertion point into newly created node if: -+ -+ . insertion point is rightmost in the source node, or -+ . this is not the first node we are allocating in a row. -+ */ -+ gointo = -+ (blk_alloc > 0) || -+ coord_is_after_rightmost(op->u.insert.d->coord); -+ -+ if (gointo && -+ op->op == COP_PASTE && -+ coord_is_existing_item(op->u.insert.d->coord) && -+ is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) { -+ /* paste into solid (atomic) item, which can contain -+ only one unit, so we need to shift it right, where -+ insertion point supposed to be */ -+ -+ assert("edward-1444", op->u.insert.d->data->iplug == -+ item_plugin_by_id(STATIC_STAT_DATA_ID)); -+ assert("edward-1445", -+ op->u.insert.d->data->length > -+ node_plugin_by_node(coord->node)->free_space -+ (coord->node)); -+ -+ op->u.insert.d->coord->between = BEFORE_UNIT; -+ } -+ -+ result = carry_shift_data(RIGHT_SIDE, coord, -+ reiser4_carry_real(fresh), -+ doing, todo, gointo); -+ /* if insertion point was actually moved into new node, -+ update carry node pointer in operation. */ -+ node = sync_op(op, fresh); -+ not_enough_space = free_space_shortage(node, op); -+ if ((not_enough_space > 0) && (node != coord_shadow.node)) { -+ /* there is not enough free in new node. Shift -+ insertion point back to the @shadow_node so that -+ next new node would be inserted between -+ @shadow_node and @fresh. 
-+ */ -+ coord_normalize(&coord_shadow); -+ coord_dup(coord, &coord_shadow); -+ node = coord->node; -+ op->node = node_shadow; -+ /* the "1 ||" makes this check unconditional, whether or not -+ COPI_STEP_BACK is set */ -+ if (1 || (flags & COPI_STEP_BACK)) { -+ /* still not enough space?! Maybe there is -+ enough space in the source node (i.e., node -+ data are moved from) now. -+ */ -+ not_enough_space = -+ free_space_shortage(node, op); -+ } -+ } -+ } -+ if (not_enough_space > 0) { -+ if (!(flags & COPI_DONT_ALLOCATE)) -+ warning("nikita-948", "Cannot insert new item"); -+ result = -E_NODE_FULL; -+ } -+ assert("nikita-1622", ergo(result == 0, -+ reiser4_carry_real(op->node) == coord->node)); -+ assert("nikita-2616", coord == op->u.insert.d->coord); -+ if (result == 0) -+ result = make_space_tail(op, doing, orig_node); -+ return result; -+} -+ -+/* insert_paste_common() - common part of insert and paste operations -+ -+ This function performs common part of COP_INSERT and COP_PASTE. -+ -+ There are three ways in which insertion/paste can be requested: -+ -+ . by directly supplying reiser4_item_data. In this case, op -> -+ u.insert.type is set to COPT_ITEM_DATA. -+ -+ . by supplying a child node, a pointer to which is to be inserted into -+ the parent. In this case op -> u.insert.type == COPT_CHILD. -+ -+ . by supplying key of new item/unit. This is currently only used during -+ extent insertion. In this case op -> u.insert.type == COPT_KEY. -+ -+ This is required, because when new node is allocated we don't know at what -+ position pointer to it is to be stored in the parent. Actually, we don't -+ even know what its parent will be, because parent can be re-balanced -+ concurrently and new node re-parented, and because parent can be full and -+ pointer to the new node will go into some other node. -+ -+ insert_paste_common() resolves pointer to child node into position in the -+ parent by calling find_new_child_coord(), that fills -+ reiser4_item_data. After this, insertion/paste proceeds uniformly. -+ -+ Another complication is with finding free space during pasting. 
It may -+ happen that while shifting items to the neighbors and newly allocated -+ nodes, insertion coord can no longer be in the item we wanted to paste -+ into. At this point, paste becomes (morphs) into insert. Moreover free -+ space analysis has to be repeated, because amount of space required for -+ insertion is different from that of paste (item header overhead, etc). -+ -+ This function "unifies" different insertion modes (by resolving child -+ pointer or key into insertion coord), and then calls make_space() to free -+ enough space in the node by shifting data to the left and right and by -+ allocating new nodes if necessary. Carry operation knows amount of space -+ required for its completion. After enough free space is obtained, caller of -+ this function (carry_{insert,paste,etc.}) performs actual insertion/paste -+ by calling item plugin method. -+ -+*/ -+static int insert_paste_common(carry_op * op /* carry operation being -+ * performed */ , -+ carry_level * doing /* current carry level */ , -+ carry_level * todo /* next carry level */ , -+ carry_insert_data * cdata /* pointer to -+ * cdata */ , -+ coord_t * coord /* insertion/paste coord */ , -+ reiser4_item_data * data /* data to be -+ * inserted/pasted */ ) -+{ -+ assert("nikita-981", op != NULL); -+ assert("nikita-980", todo != NULL); -+ assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE) -+ || (op->op == COP_EXTENT)); -+ -+ if (op->u.insert.type == COPT_PASTE_RESTARTED) { -+ /* nothing to do. Fall through to make_space(). */ -+ ; -+ } else if (op->u.insert.type == COPT_KEY) { -+ node_search_result intra_node; -+ znode *node; -+ /* Problem with doing batching at the lowest level, is that -+ operations here are given by coords where modification is -+ to be performed, and one modification can invalidate coords -+ of all following operations. 
-+ -+ So, we are implementing yet another type for operation that -+ will use (the only) "locator" stable across shifting of -+ data between nodes, etc.: key (COPT_KEY). -+ -+ This clause resolves key to the coord in the node. -+ -+ But node can change also. Probably some pieces have to be -+ added to the lock_carry_node(), to lock node by its key. -+ -+ */ -+ /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain -+ if you need something else. */ -+ op->u.insert.d->coord = coord; -+ node = reiser4_carry_real(op->node); -+ intra_node = node_plugin_by_node(node)->lookup -+ (node, op->u.insert.d->key, FIND_EXACT, -+ op->u.insert.d->coord); -+ if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) { -+ warning("nikita-1715", "Intra node lookup failure: %i", -+ intra_node); -+ return intra_node; -+ } -+ } else if (op->u.insert.type == COPT_CHILD) { -+ /* if we are asked to insert pointer to the child into -+ internal node, first convert pointer to the child into -+ coord within parent node. -+ */ -+ znode *child; -+ int result; -+ -+ op->u.insert.d = cdata; -+ op->u.insert.d->coord = coord; -+ op->u.insert.d->data = data; -+ op->u.insert.d->coord->node = reiser4_carry_real(op->node); -+ result = find_new_child_coord(op); -+ child = reiser4_carry_real(op->u.insert.child); -+ if (result != NS_NOT_FOUND) { -+ warning("nikita-993", -+ "Cannot find a place for child pointer: %i", -+ result); -+ return result; -+ } -+ /* This only happens when we did multiple insertions at -+ the previous level, trying to insert single item and -+ it so happened, that insertion of pointers to all new -+ nodes before this one already caused parent node to -+ split (may be several times). -+ -+ I am going to come up with better solution. -+ -+ You are not expected to understand this. 
-+ -- v6root/usr/sys/ken/slp.c -+ -+ Basically, what happens here is the following: carry came -+ to the parent level and is about to insert internal item -+ pointing to the child node that it just inserted in the -+ level below. Position where internal item is to be inserted -+ was found by find_new_child_coord() above, but node of the -+ current carry operation (that is, parent node of child -+ inserted on the previous level), was determined earlier in -+ the lock_carry_level/lock_carry_node. It could so happen -+ that other carry operations already performed on the parent -+ level already split parent node, so that insertion point -+ moved into another node. Handle this by creating new carry -+ node for insertion point if necessary. -+ */ -+ if (reiser4_carry_real(op->node) != -+ op->u.insert.d->coord->node) { -+ pool_ordering direction; -+ znode *z1; -+ znode *z2; -+ reiser4_key k1; -+ reiser4_key k2; -+ -+ /* -+ * determine in what direction insertion point -+ * moved. Do this by comparing delimiting keys. -+ */ -+ z1 = op->u.insert.d->coord->node; -+ z2 = reiser4_carry_real(op->node); -+ if (keyle(leftmost_key_in_node(z1, &k1), -+ leftmost_key_in_node(z2, &k2))) -+ /* insertion point moved to the left */ -+ direction = POOLO_BEFORE; -+ else -+ /* insertion point moved to the right */ -+ direction = POOLO_AFTER; -+ -+ op->node = reiser4_add_carry_skip(doing, -+ direction, op->node); -+ if (IS_ERR(op->node)) -+ return PTR_ERR(op->node); -+ op->node->node = op->u.insert.d->coord->node; -+ op->node->free = 1; -+ result = lock_carry_node(doing, op->node); -+ if (result != 0) -+ return result; -+ } -+ -+ /* -+ * set up key of an item being inserted: we are inserting -+ * internal item and its key is (by the very definition of -+ * search tree) is leftmost key in the child node. 
-+ */ -+ write_lock_dk(znode_get_tree(child)); -+ op->u.insert.d->key = leftmost_key_in_node(child, -+ znode_get_ld_key(child)); -+ write_unlock_dk(znode_get_tree(child)); -+ op->u.insert.d->data->arg = op->u.insert.brother; -+ } else { -+ assert("vs-243", op->u.insert.d->coord != NULL); -+ op->u.insert.d->coord->node = reiser4_carry_real(op->node); -+ } -+ -+ /* find free space. */ -+ return make_space(op, doing, todo); -+} -+ -+/* handle carry COP_INSERT operation. -+ -+ Insert new item into node. New item can be given in one of two ways: -+ -+ - by passing &tree_coord and &reiser4_item_data as part of @op. This is -+ only applicable at the leaf/twig level. -+ -+ - by passing a child node pointer to which is to be inserted by this -+ operation. -+ -+*/ -+static int carry_insert(carry_op * op /* operation to perform */ , -+ carry_level * doing /* queue of operations @op -+ * is part of */ , -+ carry_level * todo /* queue where new operations -+ * are accumulated */ ) -+{ -+ znode *node; -+ carry_insert_data cdata; -+ coord_t coord; -+ reiser4_item_data data; -+ carry_plugin_info info; -+ int result; -+ -+ assert("nikita-1036", op != NULL); -+ assert("nikita-1037", todo != NULL); -+ assert("nikita-1038", op->op == COP_INSERT); -+ -+ coord_init_zero(&coord); -+ -+ /* perform common functionality of insert and paste. */ -+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data); -+ if (result != 0) -+ return result; -+ -+ node = op->u.insert.d->coord->node; -+ assert("nikita-1039", node != NULL); -+ assert("nikita-1040", node_plugin_by_node(node) != NULL); -+ -+ assert("nikita-949", -+ space_needed_for_op(node, op) <= znode_free_space(node)); -+ -+ /* ask node layout to create new item. 
*/ -+ info.doing = doing; -+ info.todo = todo; -+ result = node_plugin_by_node(node)->create_item -+ (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data, -+ &info); -+ doing->restartable = 0; -+ znode_make_dirty(node); -+ -+ return result; -+} -+ -+/* -+ * Flow insertion code. COP_INSERT_FLOW is special tree operation that is -+ * supplied with a "flow" (that is, a stream of data) and inserts it into tree -+ * by slicing into multiple items. -+ */ -+ -+#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point ) -+#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow ) -+#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data ) -+ -+static size_t item_data_overhead(carry_op * op) -+{ -+ if (flow_insert_data(op)->iplug->b.estimate == NULL) -+ return 0; -+ return (flow_insert_data(op)->iplug->b. -+ estimate(NULL /* estimate insertion */ , flow_insert_data(op)) - -+ flow_insert_data(op)->length); -+} -+ -+/* FIXME-VS: this is called several times during one make_flow_for_insertion -+ and it will always return the same result. Some optimization could be made -+ by calculating this value once at the beginning and passing it around. 
That -+ would reduce some flexibility in future changes -+*/ -+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *); -+static size_t flow_insertion_overhead(carry_op * op) -+{ -+ znode *node; -+ size_t insertion_overhead; -+ -+ node = flow_insert_point(op)->node; -+ insertion_overhead = 0; -+ /* overhead is only charged when the data cannot be pasted at the -+ insert point, that is, when a new item has to be created */ -+ if (node->nplug->item_overhead && -+ !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key, -+ flow_insert_data(op))) -+ insertion_overhead = -+ node->nplug->item_overhead(node, NULL) + -+ item_data_overhead(op); -+ return insertion_overhead; -+} -+ -+/* how many bytes of the flow fit into the insert point node: free space of -+ the node minus insertion overhead, capped by the length of the flow */ -+static int what_can_fit_into_node(carry_op * op) -+{ -+ size_t free, overhead; -+ -+ overhead = flow_insertion_overhead(op); -+ free = znode_free_space(flow_insert_point(op)->node); -+ if (free <= overhead) -+ return 0; -+ free -= overhead; -+ /* FIXME: flow->length is loff_t only to not get overflowed in case of expanding truncate */ -+ if (free < op->u.insert_flow.flow->length) -+ return free; -+ return (int)op->u.insert_flow.flow->length; -+} -+ -+/* in make_space_for_flow_insertion we need to check either whether whole flow -+ fits into a node or whether minimal fraction of flow fits into a node */ -+static int enough_space_for_whole_flow(carry_op * op) -+{ -+ return (unsigned)what_can_fit_into_node(op) == -+ op->u.insert_flow.flow->length; -+} -+ -+/* smallest number of flow bytes that has to fit into a node for the node to -+ be used as insertion target */ -+#define MIN_FLOW_FRACTION 1 -+static int enough_space_for_min_flow_fraction(carry_op * op) -+{ -+ assert("vs-902", coord_is_after_rightmost(flow_insert_point(op))); -+ -+ return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION; -+} -+ -+/* this returns 0 if left neighbor was obtained successfully and everything -+ up to insertion point including it were shifted and left neighbor still has -+ some free space to put minimal fraction of flow into it */ -+static int -+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo) -+{ -+ carry_node *left; -+ znode *orig; -+ -+ left = 
find_left_neighbor(op, doing); -+ if (unlikely(IS_ERR(left))) { -+ warning("vs-899", -+ "make_space_by_shift_left: " -+ "error accessing left neighbor: %li", PTR_ERR(left)); -+ return 1; -+ } -+ if (left == NULL) -+ /* left neighbor either does not exist or is unformatted -+ node */ -+ return 1; -+ -+ orig = flow_insert_point(op)->node; -+ /* try to shift content of node @orig from its head upto insert point -+ including insertion point into the left neighbor */ -+ carry_shift_data(LEFT_SIDE, flow_insert_point(op), -+ reiser4_carry_real(left), doing, todo, -+ 1 /* including insert point */); -+ if (reiser4_carry_real(left) != flow_insert_point(op)->node) { -+ /* insertion point did not move */ -+ return 1; -+ } -+ -+ /* insertion point is set after last item in the node */ -+ assert("vs-900", coord_is_after_rightmost(flow_insert_point(op))); -+ -+ if (!enough_space_for_min_flow_fraction(op)) { -+ /* insertion point node does not have enough free space to put -+ even minimal portion of flow into it, therefore, move -+ insertion point back to orig node (before first item) */ -+ coord_init_before_first_item(flow_insert_point(op), orig); -+ return 1; -+ } -+ -+ /* part of flow is to be written to the end of node */ -+ op->node = left; -+ return 0; -+} -+ -+/* this returns 0 if right neighbor was obtained successfully and everything to -+ the right of insertion point was shifted to it and node got enough free -+ space to put minimal fraction of flow into it */ -+static int -+make_space_by_shift_right(carry_op * op, carry_level * doing, -+ carry_level * todo) -+{ -+ carry_node *right; -+ -+ right = find_right_neighbor(op, doing); -+ if (unlikely(IS_ERR(right))) { -+ warning("nikita-1065", "shift_right_excluding_insert_point: " -+ "error accessing right neighbor: %li", PTR_ERR(right)); -+ return 1; -+ } -+ if (right) { -+ /* shift everything possible on the right of but excluding -+ insertion coord into the right neighbor */ -+ carry_shift_data(RIGHT_SIDE, 
flow_insert_point(op), -+ reiser4_carry_real(right), doing, todo, -+ 0 /* not including insert point */); -+ } else { -+ /* right neighbor either does not exist or is unformatted -+ node */ -+ ; -+ } -+ if (coord_is_after_rightmost(flow_insert_point(op))) { -+ if (enough_space_for_min_flow_fraction(op)) { -+ /* part of flow is to be written to the end of node */ -+ return 0; -+ } -+ } -+ -+ /* new node is to be added if insert point node did not get enough -+ space for whole flow */ -+ return 1; -+} -+ -+/* this returns 0 when insert coord is set at the node end and fraction of flow -+ fits into that node */ -+static int -+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo) -+{ -+ int result; -+ znode *node; -+ carry_node *new; -+ -+ node = flow_insert_point(op)->node; -+ -+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT) -+ return RETERR(-E_NODE_FULL); -+ /* add new node after insert point node */ -+ new = add_new_znode(node, op->node, doing, todo); -+ if (unlikely(IS_ERR(new))) { -+ return PTR_ERR(new); -+ } -+ result = lock_carry_node(doing, new); -+ zput(reiser4_carry_real(new)); -+ if (unlikely(result)) { -+ return result; -+ } -+ op->u.insert_flow.new_nodes++; -+ if (!coord_is_after_rightmost(flow_insert_point(op))) { -+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op), -+ reiser4_carry_real(new), doing, todo, -+ 0 /* not including insert point */); -+ assert("vs-901", -+ coord_is_after_rightmost(flow_insert_point(op))); -+ -+ if (enough_space_for_min_flow_fraction(op)) { -+ return 0; -+ } -+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT) -+ return RETERR(-E_NODE_FULL); -+ -+ /* add one more new node */ -+ new = add_new_znode(node, op->node, doing, todo); -+ if (unlikely(IS_ERR(new))) { -+ return PTR_ERR(new); -+ } -+ result = lock_carry_node(doing, new); -+ zput(reiser4_carry_real(new)); -+ if (unlikely(result)) { -+ return result; -+ } -+ op->u.insert_flow.new_nodes++; -+ } -+ -+ /* move insertion 
point to new node */ -+ coord_init_before_first_item(flow_insert_point(op), -+ reiser4_carry_real(new)); -+ op->node = new; -+ return 0; -+} -+ -+static int -+make_space_for_flow_insertion(carry_op * op, carry_level * doing, -+ carry_level * todo) -+{ -+ __u32 flags = op->u.insert_flow.flags; -+ -+ if (enough_space_for_whole_flow(op)) { -+ /* whole flow fits into insert point node */ -+ return 0; -+ } -+ -+ if (!(flags & COPI_DONT_SHIFT_LEFT) -+ && (make_space_by_shift_left(op, doing, todo) == 0)) { -+ /* insert point is shifted to left neighbor of original insert -+ point node and is set after last unit in that node. It has -+ enough space to fit at least minimal fraction of flow. */ -+ return 0; -+ } -+ -+ if (enough_space_for_whole_flow(op)) { -+ /* whole flow fits into insert point node */ -+ return 0; -+ } -+ -+ if (!(flags & COPI_DONT_SHIFT_RIGHT) -+ && (make_space_by_shift_right(op, doing, todo) == 0)) { -+ /* insert point is still set to the same node, but there is -+ nothing to the right of insert point. 
*/ -+ return 0; -+ } -+ -+ if (enough_space_for_whole_flow(op)) { -+ /* whole flow fits into insert point node */ -+ return 0; -+ } -+ -+ return make_space_by_new_nodes(op, doing, todo); -+} -+ -+/* implements COP_INSERT_FLOW operation */ -+static int -+carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo) -+{ -+ int result; -+ flow_t *f; -+ coord_t *insert_point; -+ node_plugin *nplug; -+ carry_plugin_info info; -+ znode *orig_node; -+ lock_handle *orig_lh; -+ -+ f = op->u.insert_flow.flow; -+ result = 0; -+ -+ /* carry system needs this to work */ -+ info.doing = doing; -+ info.todo = todo; -+ -+ orig_node = flow_insert_point(op)->node; -+ orig_lh = doing->tracked; -+ -+ while (f->length) { -+ result = make_space_for_flow_insertion(op, doing, todo); -+ if (result) -+ break; -+ -+ insert_point = flow_insert_point(op); -+ nplug = node_plugin_by_node(insert_point->node); -+ -+ /* compose item data for insertion/pasting */ -+ flow_insert_data(op)->data = f->data; -+ flow_insert_data(op)->length = what_can_fit_into_node(op); -+ -+ if (can_paste(insert_point, &f->key, flow_insert_data(op))) { -+ /* insert point is set to item of file we are writing to and we have to append to it */ -+ assert("vs-903", insert_point->between == AFTER_UNIT); -+ nplug->change_item_size(insert_point, -+ flow_insert_data(op)->length); -+ flow_insert_data(op)->iplug->b.paste(insert_point, -+ flow_insert_data -+ (op), &info); -+ } else { -+ /* new item must be inserted */ -+ pos_in_node_t new_pos; -+ flow_insert_data(op)->length += item_data_overhead(op); -+ -+ /* FIXME-VS: this is because node40_create_item changes -+ insert_point for obscure reasons */ -+ switch (insert_point->between) { -+ case AFTER_ITEM: -+ new_pos = insert_point->item_pos + 1; -+ break; -+ case EMPTY_NODE: -+ new_pos = 0; -+ break; -+ case BEFORE_ITEM: -+ assert("vs-905", insert_point->item_pos == 0); -+ new_pos = 0; -+ break; -+ default: -+ impossible("vs-906", -+ "carry_insert_flow: invalid coord"); 
-+ new_pos = 0; -+ break; -+ } -+ -+ nplug->create_item(insert_point, &f->key, -+ flow_insert_data(op), &info); -+ coord_set_item_pos(insert_point, new_pos); -+ } -+ coord_init_after_item_end(insert_point); -+ doing->restartable = 0; -+ znode_make_dirty(insert_point->node); -+ -+ move_flow_forward(f, (unsigned)flow_insert_data(op)->length); -+ } -+ -+ if (orig_node != flow_insert_point(op)->node) { -+ /* move lock to new insert point. -+ NOTE(review): this assignment replaces whatever error the -+ loop above left in @result - confirm that this is intended */ -+ done_lh(orig_lh); -+ init_lh(orig_lh); -+ result = -+ longterm_lock_znode(orig_lh, flow_insert_point(op)->node, -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI); -+ } -+ -+ return result; -+} -+ -+/* implements COP_DELETE operation -+ -+ Remove pointer to @op -> u.delete.child from its parent. -+ -+ This function also handles killing of a tree root if the last pointer from -+ it was removed. This is complicated by our handling of "twig" level: root -+ on twig level is never killed. -+ -+*/ -+static int carry_delete(carry_op * op /* operation to be performed */ , -+ carry_level * doing UNUSED_ARG /* current carry -+ * level */ , -+ carry_level * todo /* next carry level */ ) -+{ -+ int result; -+ coord_t coord; -+ coord_t coord2; -+ znode *parent; -+ znode *child; -+ carry_plugin_info info; -+ reiser4_tree *tree; -+ -+ /* -+ * This operation is called to delete internal item pointing to the -+ * child node that was removed by carry from the tree on the previous -+ * tree level. -+ */ -+ -+ assert("nikita-893", op != NULL); -+ assert("nikita-894", todo != NULL); -+ assert("nikita-895", op->op == COP_DELETE); -+ -+ coord_init_zero(&coord); -+ coord_init_zero(&coord2); -+ -+ parent = reiser4_carry_real(op->node); -+ child = op->u.delete.child ? -+ reiser4_carry_real(op->u.delete.child) : op->node->node; -+ tree = znode_get_tree(child); -+ read_lock_tree(tree); -+ -+ /* -+ * @parent was determined when carry entered parent level -+ * (lock_carry_level/lock_carry_node). 
Since then, actual parent of -+ * @child node could change due to other carry operations performed on -+ * the parent level. Check for this. -+ */ -+ -+ if (znode_parent(child) != parent) { -+ /* NOTE-NIKITA add stat counter for this. */ -+ parent = znode_parent(child); -+ assert("nikita-2581", find_carry_node(doing, parent)); -+ } -+ read_unlock_tree(tree); -+ -+ assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL); -+ -+ /* Twig level horrors: tree should be of height at least 2. So, last -+ pointer from the root at twig level is preserved even if child is -+ empty. This is ugly, but so it was architectured. -+ */ -+ -+ if (znode_is_root(parent) && -+ znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT && -+ node_num_items(parent) == 1) { -+ /* Delimiting key manipulations. */ -+ write_lock_dk(tree); -+ znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key())); -+ znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key())); -+ ZF_SET(child, JNODE_DKSET); -+ write_unlock_dk(tree); -+ -+ /* @child escaped imminent death! */ -+ ZF_CLR(child, JNODE_HEARD_BANSHEE); -+ return 0; -+ } -+ -+ /* convert child pointer to the coord_t */ -+ result = find_child_ptr(parent, child, &coord); -+ if (result != NS_FOUND) { -+ warning("nikita-994", "Cannot find child pointer: %i", result); -+ print_coord_content("coord", &coord); -+ return result; -+ } -+ -+ coord_dup(&coord2, &coord); -+ info.doing = doing; -+ info.todo = todo; -+ { -+ /* -+ * Actually kill internal item: prepare structure with -+ * arguments for ->cut_and_kill() method... -+ */ -+ -+ struct carry_kill_data kdata; -+ kdata.params.from = &coord; -+ kdata.params.to = &coord2; -+ kdata.params.from_key = NULL; -+ kdata.params.to_key = NULL; -+ kdata.params.smallest_removed = NULL; -+ kdata.params.truncate = 1; -+ kdata.flags = op->u.delete.flags; -+ kdata.inode = NULL; -+ kdata.left = NULL; -+ kdata.right = NULL; -+ kdata.buf = NULL; -+ /* ... and call it. 
*/ -+ result = node_plugin_by_node(parent)->cut_and_kill(&kdata, -+ &info); -+ } -+ doing->restartable = 0; -+ -+ /* check whether root should be killed violently */ -+ if (znode_is_root(parent) && -+ /* don't kill roots at and lower than twig level */ -+ znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT && -+ node_num_items(parent) == 1) { -+ result = reiser4_kill_tree_root(coord.node); -+ } -+ -+ return result < 0 ? : 0; -+} -+ -+/* implements COP_CUT opration -+ -+ Cuts part or whole content of node. -+ -+*/ -+static int carry_cut(carry_op * op /* operation to be performed */ , -+ carry_level * doing /* current carry level */ , -+ carry_level * todo /* next carry level */ ) -+{ -+ int result; -+ carry_plugin_info info; -+ node_plugin *nplug; -+ -+ assert("nikita-896", op != NULL); -+ assert("nikita-897", todo != NULL); -+ assert("nikita-898", op->op == COP_CUT); -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ nplug = node_plugin_by_node(reiser4_carry_real(op->node)); -+ if (op->u.cut_or_kill.is_cut) -+ result = nplug->cut(op->u.cut_or_kill.u.cut, &info); -+ else -+ result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info); -+ -+ doing->restartable = 0; -+ return result < 0 ? : 0; -+} -+ -+/* helper function for carry_paste(): returns true if @op can be continued as -+ paste */ -+static int -+can_paste(coord_t * icoord, const reiser4_key * key, -+ const reiser4_item_data * data) -+{ -+ coord_t circa; -+ item_plugin *new_iplug; -+ item_plugin *old_iplug; -+ int result = 0; /* to keep gcc shut */ -+ -+ assert("", icoord->between != AT_UNIT); -+ -+ /* obviously, one cannot paste when node is empty---there is nothing -+ to paste into. 
*/ -+ if (node_is_empty(icoord->node)) -+ return 0; -+ /* if insertion point is at the middle of the item, then paste */ -+ if (!coord_is_between_items(icoord)) -+ return 1; -+ coord_dup(&circa, icoord); -+ circa.between = AT_UNIT; -+ -+ old_iplug = item_plugin_by_coord(&circa); -+ new_iplug = data->iplug; -+ -+ /* check whether we can paste to the item @icoord is "at" when we -+ ignore ->between field */ -+ if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) { -+ result = 1; -+ } else if (icoord->between == BEFORE_UNIT -+ || icoord->between == BEFORE_ITEM) { -+ /* otherwise, try to glue to the item at the left, if any */ -+ coord_dup(&circa, icoord); -+ if (coord_set_to_left(&circa)) { -+ result = 0; -+ coord_init_before_item(icoord); -+ } else { -+ old_iplug = item_plugin_by_coord(&circa); -+ /* NOTE(review): the plugin compared here is the one -+ of the left item (&circa), but the key check is -+ made against @icoord, whereas the right-neighbor -+ branch below checks &circa. Confirm that this -+ asymmetry is intended. */ -+ result = (old_iplug == new_iplug) -+ && item_can_contain_key(icoord, key, data); -+ if (result) { -+ coord_dup(icoord, &circa); -+ icoord->between = AFTER_UNIT; -+ } -+ } -+ } else if (icoord->between == AFTER_UNIT -+ || icoord->between == AFTER_ITEM) { -+ coord_dup(&circa, icoord); -+ /* otherwise, try to glue to the item at the right, if any */ -+ if (coord_set_to_right(&circa)) { -+ result = 0; -+ coord_init_after_item(icoord); -+ } else { -+ int (*cck) (const coord_t *, const reiser4_key *, -+ const reiser4_item_data *); -+ -+ old_iplug = item_plugin_by_coord(&circa); -+ -+ cck = old_iplug->b.can_contain_key; -+ if (cck == NULL) -+ /* item doesn't define ->can_contain_key -+ method? So it is not expandable. 
*/ -+ result = 0; -+ else { -+ result = (old_iplug == new_iplug) -+ && cck(&circa /*icoord */ , key, data); -+ if (result) { -+ coord_dup(icoord, &circa); -+ icoord->between = BEFORE_UNIT; -+ } -+ } -+ } -+ } else -+ impossible("nikita-2513", "Nothing works"); -+ if (result) { -+ if (icoord->between == BEFORE_ITEM) { -+ assert("vs-912", icoord->unit_pos == 0); -+ icoord->between = BEFORE_UNIT; -+ } else if (icoord->between == AFTER_ITEM) { -+ coord_init_after_item_end(icoord); -+ } -+ } -+ return result; -+} -+ -+/* implements COP_PASTE operation -+ -+ Paste data into existing item. This is complicated by the fact that after -+ we shifted something to the left or right neighbors trying to free some -+ space, item we were supposed to paste into can be in different node than -+ insertion coord. If so, we are no longer doing paste, but insert. See -+ comments in insert_paste_common(). -+ -+*/ -+static int carry_paste(carry_op * op /* operation to be performed */ , -+ carry_level * doing UNUSED_ARG /* current carry -+ * level */ , -+ carry_level * todo /* next carry level */ ) -+{ -+ znode *node; -+ carry_insert_data cdata; -+ coord_t dcoord; -+ reiser4_item_data data; -+ int result; -+ int real_size; -+ item_plugin *iplug; -+ carry_plugin_info info; -+ coord_t *coord; -+ -+ assert("nikita-982", op != NULL); -+ assert("nikita-983", todo != NULL); -+ assert("nikita-984", op->op == COP_PASTE); -+ -+ coord_init_zero(&dcoord); -+ -+ result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data); -+ if (result != 0) -+ return result; -+ -+ coord = op->u.insert.d->coord; -+ -+ /* handle case when op -> u.insert.coord doesn't point to the item -+ of required type. restart as insert. 
*/ -+ if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) { -+ op->op = COP_INSERT; -+ op->u.insert.type = COPT_PASTE_RESTARTED; -+ result = op_dispatch_table[COP_INSERT].handler(op, doing, todo); -+ -+ return result; -+ } -+ -+ node = coord->node; -+ iplug = item_plugin_by_coord(coord); -+ assert("nikita-992", iplug != NULL); -+ -+ assert("nikita-985", node != NULL); -+ assert("nikita-986", node_plugin_by_node(node) != NULL); -+ -+ assert("nikita-987", -+ space_needed_for_op(node, op) <= znode_free_space(node)); -+ -+ assert("nikita-1286", coord_is_existing_item(coord)); -+ -+ /* -+ * if item is expanded as a result of this operation, we should first -+ * change item size, than call ->b.paste item method. If item is -+ * shrunk, it should be done other way around: first call ->b.paste -+ * method, then reduce item size. -+ */ -+ -+ real_size = space_needed_for_op(node, op); -+ if (real_size > 0) -+ node->nplug->change_item_size(coord, real_size); -+ -+ doing->restartable = 0; -+ info.doing = doing; -+ info.todo = todo; -+ -+ result = iplug->b.paste(coord, op->u.insert.d->data, &info); -+ -+ if (real_size < 0) -+ node->nplug->change_item_size(coord, real_size); -+ -+ /* if we pasted at the beginning of the item, update item's key. */ -+ if (coord->unit_pos == 0 && coord->between != AFTER_UNIT) -+ node->nplug->update_item_key(coord, op->u.insert.d->key, &info); -+ -+ znode_make_dirty(node); -+ return result; -+} -+ -+/* handle carry COP_EXTENT operation. 
*/ -+static int carry_extent(carry_op * op /* operation to perform */ , -+ carry_level * doing /* queue of operations @op -+ * is part of */ , -+ carry_level * todo /* queue where new operations -+ * are accumulated */ ) -+{ -+ znode *node; -+ carry_insert_data cdata; -+ coord_t coord; -+ reiser4_item_data data; -+ carry_op *delete_dummy; -+ carry_op *insert_extent; -+ int result; -+ carry_plugin_info info; -+ -+ assert("nikita-1751", op != NULL); -+ assert("nikita-1752", todo != NULL); -+ assert("nikita-1753", op->op == COP_EXTENT); -+ -+ /* extent insertion overview: -+ -+ extents live on the TWIG LEVEL, which is level one above the leaf -+ one. This complicates extent insertion logic somewhat: it may -+ happen (and going to happen all the time) that in logical key -+ ordering extent has to be placed between items I1 and I2, located -+ at the leaf level, but I1 and I2 are in the same formatted leaf -+ node N1. To insert extent one has to -+ -+ (1) reach node N1 and shift data between N1, its neighbors and -+ possibly newly allocated nodes until I1 and I2 fall into different -+ nodes. Since I1 and I2 are still neighboring items in logical key -+ order, they will be necessary utmost items in their respective -+ nodes. -+ -+ (2) After this new extent item is inserted into node on the twig -+ level. -+ -+ Fortunately this process can reuse almost all code from standard -+ insertion procedure (viz. make_space() and insert_paste_common()), -+ due to the following observation: make_space() only shifts data up -+ to and excluding or including insertion point. It never -+ "over-moves" through insertion point. Thus, one can use -+ make_space() to perform step (1). All required for this is just to -+ instruct free_space_shortage() to keep make_space() shifting data -+ until insertion point is at the node border. -+ -+ */ -+ -+ /* perform common functionality of insert and paste. 
*/ -+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data); -+ if (result != 0) -+ return result; -+ -+ node = op->u.extent.d->coord->node; -+ assert("nikita-1754", node != NULL); -+ assert("nikita-1755", node_plugin_by_node(node) != NULL); -+ assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE); -+ -+ /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that -+ extent fits between items. */ -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ /* there is another complication due to placement of extents on the -+ twig level: extents are "rigid" in the sense that key-range -+ occupied by extent cannot grow indefinitely to the right as it is -+ for the formatted leaf nodes. Because of this when search finds two -+ adjacent extents on the twig level, it has to "drill" to the leaf -+ level, creating new node. Here we are removing this node. -+ */ -+ if (node_is_empty(node)) { -+ delete_dummy = node_post_carry(&info, COP_DELETE, node, 1); -+ if (IS_ERR(delete_dummy)) -+ return PTR_ERR(delete_dummy); -+ delete_dummy->u.delete.child = NULL; -+ delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY; -+ ZF_SET(node, JNODE_HEARD_BANSHEE); -+ } -+ -+ /* proceed with inserting extent item into parent. We are definitely -+ inserting rather than pasting if we get that far. */ -+ insert_extent = node_post_carry(&info, COP_INSERT, node, 1); -+ if (IS_ERR(insert_extent)) -+ /* @delete_dummy will be automatically destroyed on the level -+ exiting */ -+ return PTR_ERR(insert_extent); -+ /* NOTE-NIKITA insertion by key is simplest option here. Another -+ possibility is to insert on the left or right of already existing -+ item. 
-+ */ -+ insert_extent->u.insert.type = COPT_KEY; -+ insert_extent->u.insert.d = op->u.extent.d; -+ assert("nikita-1719", op->u.extent.d->key != NULL); -+ insert_extent->u.insert.d->data->arg = op->u.extent.d->coord; -+ insert_extent->u.insert.flags = -+ znode_get_tree(node)->carry.new_extent_flags; -+ -+ /* -+ * if carry was asked to track lock handle we should actually track -+ * lock handle on the twig node rather than on the leaf where -+ * operation was started from. Transfer tracked lock handle. -+ */ -+ if (doing->track_type) { -+ assert("nikita-3242", doing->tracked != NULL); -+ assert("nikita-3244", todo->tracked == NULL); -+ todo->tracked = doing->tracked; -+ todo->track_type = CARRY_TRACK_NODE; -+ doing->tracked = NULL; -+ doing->track_type = 0; -+ } -+ -+ return 0; -+} -+ -+/* update key in @parent between pointers to @left and @right. -+ -+ Find coords of @left and @right and update delimiting key between them. -+ This is helper function called by carry_update(). Finds position of -+ internal item involved. Updates item key. Updates delimiting keys of child -+ nodes involved. 
-+*/ -+static int update_delimiting_key(znode * parent /* node key is updated -+ * in */ , -+ znode * left /* child of @parent */ , -+ znode * right /* child of @parent */ , -+ carry_level * doing /* current carry -+ * level */ , -+ carry_level * todo /* parent carry -+ * level */ , -+ const char **error_msg /* place to -+ * store error -+ * message */ ) -+{ -+ coord_t left_pos; -+ coord_t right_pos; -+ int result; -+ reiser4_key ldkey; -+ carry_plugin_info info; -+ -+ assert("nikita-1177", right != NULL); -+ /* find position of right left child in a parent */ -+ result = find_child_ptr(parent, right, &right_pos); -+ if (result != NS_FOUND) { -+ *error_msg = "Cannot find position of right child"; -+ return result; -+ } -+ -+ if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) { -+ /* find position of the left child in a parent */ -+ result = find_child_ptr(parent, left, &left_pos); -+ if (result != NS_FOUND) { -+ *error_msg = "Cannot find position of left child"; -+ return result; -+ } -+ assert("nikita-1355", left_pos.node != NULL); -+ } else -+ left_pos.node = NULL; -+ -+ /* check that they are separated by exactly one key and are basically -+ sane */ -+ if (REISER4_DEBUG) { -+ if ((left_pos.node != NULL) -+ && !coord_is_existing_unit(&left_pos)) { -+ *error_msg = "Left child is bastard"; -+ return RETERR(-EIO); -+ } -+ if (!coord_is_existing_unit(&right_pos)) { -+ *error_msg = "Right child is bastard"; -+ return RETERR(-EIO); -+ } -+ if (left_pos.node != NULL && -+ !coord_are_neighbors(&left_pos, &right_pos)) { -+ *error_msg = "Children are not direct siblings"; -+ return RETERR(-EIO); -+ } -+ } -+ *error_msg = NULL; -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ /* -+ * If child node is not empty, new key of internal item is a key of -+ * leftmost item in the child node. If the child is empty, take its -+ * right delimiting key as a new key of the internal item. 
Precise key -+ * in the latter case is not important per se, because the child (and -+ * the internal item) are going to be killed shortly anyway, but we -+ * have to preserve correct order of keys in the parent node. -+ */ -+ -+ if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE)) -+ leftmost_key_in_node(right, &ldkey); -+ else { -+ read_lock_dk(znode_get_tree(parent)); -+ ldkey = *znode_get_rd_key(right); -+ read_unlock_dk(znode_get_tree(parent)); -+ } -+ node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info); -+ doing->restartable = 0; -+ znode_make_dirty(parent); -+ return 0; -+} -+ -+/* implements COP_UPDATE opration -+ -+ Update delimiting keys. -+ -+*/ -+static int carry_update(carry_op * op /* operation to be performed */ , -+ carry_level * doing /* current carry level */ , -+ carry_level * todo /* next carry level */ ) -+{ -+ int result; -+ carry_node *missing UNUSED_ARG; -+ znode *left; -+ znode *right; -+ carry_node *lchild; -+ carry_node *rchild; -+ const char *error_msg; -+ reiser4_tree *tree; -+ -+ /* -+ * This operation is called to update key of internal item. This is -+ * necessary when carry shifted of cut data on the child -+ * level. Arguments of this operation are: -+ * -+ * @right --- child node. Operation should update key of internal -+ * item pointing to @right. -+ * -+ * @left --- left neighbor of @right. This parameter is optional. -+ */ -+ -+ assert("nikita-902", op != NULL); -+ assert("nikita-903", todo != NULL); -+ assert("nikita-904", op->op == COP_UPDATE); -+ -+ lchild = op->u.update.left; -+ rchild = op->node; -+ -+ if (lchild != NULL) { -+ assert("nikita-1001", lchild->parent); -+ assert("nikita-1003", !lchild->left); -+ left = reiser4_carry_real(lchild); -+ } else -+ left = NULL; -+ -+ tree = znode_get_tree(rchild->node); -+ read_lock_tree(tree); -+ right = znode_parent(rchild->node); -+ read_unlock_tree(tree); -+ -+ if (right != NULL) { -+ result = update_delimiting_key(right, -+ lchild ? 
lchild->node : NULL, -+ rchild->node, -+ doing, todo, &error_msg); -+ } else { -+ error_msg = "Cannot find node to update key in"; -+ result = RETERR(-EIO); -+ } -+ /* operation will be reposted to the next level by the -+ ->update_item_key() method of node plugin, if necessary. */ -+ -+ if (result != 0) { -+ warning("nikita-999", "Error updating delimiting key: %s (%i)", -+ error_msg ? : "", result); -+ } -+ return result; -+} -+ -+/* move items from @node during carry */ -+static int carry_shift_data(sideof side /* in what direction to move data */ , -+ coord_t * insert_coord /* coord where new item -+ * is to be inserted */ , -+ znode * node /* node which data are moved from */ , -+ carry_level * doing /* active carry queue */ , -+ carry_level * todo /* carry queue where new -+ * operations are to be put -+ * in */ , -+ unsigned int including_insert_coord_p /* true if -+ * @insertion_coord -+ * can be moved */ ) -+{ -+ int result; -+ znode *source; -+ carry_plugin_info info; -+ node_plugin *nplug; -+ -+ source = insert_coord->node; -+ -+ info.doing = doing; -+ info.todo = todo; -+ -+ nplug = node_plugin_by_node(node); -+ result = nplug->shift(insert_coord, node, -+ (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0, -+ (int)including_insert_coord_p, &info); -+ /* the only error ->shift() method of node plugin can return is -+ -ENOMEM due to carry node/operation allocation. */ -+ assert("nikita-915", result >= 0 || result == -ENOMEM); -+ if (result > 0) { -+ /* -+ * if some number of bytes was actually shifted, mark nodes -+ * dirty, and carry level as non-restartable. 
-+ */ -+ doing->restartable = 0; -+ znode_make_dirty(source); -+ znode_make_dirty(node); -+ } -+ -+ assert("nikita-2077", coord_check(insert_coord)); -+ return 0; -+} -+ -+typedef carry_node *(*carry_iterator) (carry_node * node); -+static carry_node *find_dir_carry(carry_node * node, carry_level * level, -+ carry_iterator iterator); -+ -+static carry_node *pool_level_list_prev(carry_node *node) -+{ -+ return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage); -+} -+ -+/* look for the left neighbor of given carry node in a carry queue. -+ -+ This is used by find_left_neighbor(), but I am not sure that this -+ really gives any advantage. More statistics required. -+ -+*/ -+carry_node *find_left_carry(carry_node * node /* node to find left neighbor -+ * of */ , -+ carry_level * level /* level to scan */ ) -+{ -+ return find_dir_carry(node, level, -+ (carry_iterator) pool_level_list_prev); -+} -+ -+static carry_node *pool_level_list_next(carry_node *node) -+{ -+ return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); -+} -+ -+/* look for the right neighbor of given carry node in a -+ carry queue. -+ -+ This is used by find_right_neighbor(), but I am not sure that this -+ really gives any advantage. More statistics required. -+ -+*/ -+carry_node *find_right_carry(carry_node * node /* node to find right neighbor -+ * of */ , -+ carry_level * level /* level to scan */ ) -+{ -+ return find_dir_carry(node, level, -+ (carry_iterator) pool_level_list_next); -+} -+ -+/* look for the left or right neighbor of given carry node in a carry -+ queue. -+ -+ Helper function used by find_{left|right}_carry(). 
-+*/ -+static carry_node *find_dir_carry(carry_node * node /* node to start scanning -+ * from */ , -+ carry_level * level /* level to scan */ , -+ carry_iterator iterator /* operation to -+ * move to the next -+ * node */ ) -+{ -+ carry_node *neighbor; -+ -+ assert("nikita-1059", node != NULL); -+ assert("nikita-1060", level != NULL); -+ -+ /* scan list of carry nodes on this list dir-ward, skipping all -+ carry nodes referencing the same znode. */ -+ neighbor = node; -+ while (1) { -+ neighbor = iterator(neighbor); -+ if (carry_node_end(level, neighbor)) -+ /* list head is reached */ -+ return NULL; -+ if (reiser4_carry_real(neighbor) != reiser4_carry_real(node)) -+ return neighbor; -+ } -+} -+ -+/* -+ * Memory reservation estimation. -+ * -+ * Carry process proceeds through tree levels upwards. Carry assumes that it -+ * takes tree in consistent state (e.g., that search tree invariants hold), -+ * and leaves tree consistent after it finishes. This means that when some -+ * error occurs carry cannot simply return if there are pending carry -+ * operations. Generic solution for this problem is carry-undo either as -+ * transaction manager feature (requiring checkpoints and isolation), or -+ * through some carry specific mechanism. -+ * -+ * Our current approach is to panic if carry hits an error while tree is -+ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around -+ * this "memory reservation" mechanism was added. -+ * -+ * Memory reservation is implemented by perthread-pages.diff patch from -+ * core-patches. Its API is defined in -+ * -+ * int perthread_pages_reserve(int nrpages, gfp_t gfp); -+ * void perthread_pages_release(int nrpages); -+ * int perthread_pages_count(void); -+ * -+ * carry estimates its worst case memory requirements at the entry, reserved -+ * enough memory, and released unused pages before returning. -+ * -+ * Code below estimates worst case memory requirements for a given carry -+ * queue. 
This is dome by summing worst case memory requirements for each -+ * operation in the queue. -+ * -+ */ -+ -+/* -+ * Memory memory requirements of many operations depends on the tree -+ * height. For example, item insertion requires new node to be inserted at -+ * each tree level in the worst case. What tree height should be used for -+ * estimation? Current tree height is wrong, because tree height can change -+ * between the time when estimation was done and the time when operation is -+ * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT) -+ * is also not desirable, because it would lead to the huge over-estimation -+ * all the time. Plausible solution is "capped tree height": if current tree -+ * height is less than some TREE_HEIGHT_CAP constant, capped tree height is -+ * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is -+ * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely -+ * to be increased even more during short interval of time. -+ */ -+#define TREE_HEIGHT_CAP (5) -+ -+/* return capped tree height for the @tree. See comment above. */ -+static int cap_tree_height(reiser4_tree * tree) -+{ -+ return max_t(int, tree->height, TREE_HEIGHT_CAP); -+} -+ -+/* return capped tree height for the current tree. */ -+static int capped_height(void) -+{ -+ return cap_tree_height(current_tree); -+} -+ -+/* return number of pages required to store given number of bytes */ -+static int bytes_to_pages(int bytes) -+{ -+ return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -+} -+ -+/* how many pages are required to allocate znodes during item insertion. */ -+static int carry_estimate_znodes(void) -+{ -+ /* -+ * Note, that there we have some problem here: there is no way to -+ * reserve pages specifically for the given slab. This means that -+ * these pages can be hijacked for some other end. 
-+ */ -+ -+ /* in the worst case we need 3 new znode on each tree level */ -+ return bytes_to_pages(capped_height() * sizeof(znode) * 3); -+} -+ -+/* -+ * how many pages are required to load bitmaps. One bitmap per level. -+ */ -+static int carry_estimate_bitmaps(void) -+{ -+ if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) { -+ int bytes; -+ -+ bytes = capped_height() * (0 + /* bnode should be added, but its is private to -+ * bitmap.c, skip for now. */ -+ 2 * sizeof(jnode)); /* working and commit jnodes */ -+ return bytes_to_pages(bytes) + 2; /* and their contents */ -+ } else -+ /* bitmaps were pre-loaded during mount */ -+ return 0; -+} -+ -+/* worst case item insertion memory requirements */ -+static int carry_estimate_insert(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */ -+ capped_height() + /* new block on each level */ -+ 1 + /* and possibly extra new block at the leaf level */ -+ 3; /* loading of leaves into memory */ -+} -+ -+/* worst case item deletion memory requirements */ -+static int carry_estimate_delete(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */ -+ 3; /* loading of leaves into memory */ -+} -+ -+/* worst case tree cut memory requirements */ -+static int carry_estimate_cut(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */ -+ 3; /* loading of leaves into memory */ -+} -+ -+/* worst case memory requirements of pasting into item */ -+static int carry_estimate_paste(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */ -+ capped_height() + /* new block on each level */ -+ 1 + /* and possibly extra new block at the leaf level */ -+ 3; /* loading of leaves into memory */ -+} -+ -+/* worst case memory requirements of extent insertion */ -+static 
int carry_estimate_extent(carry_op * op, carry_level * level) -+{ -+ return carry_estimate_insert(op, level) + /* insert extent */ -+ carry_estimate_delete(op, level); /* kill leaf */ -+} -+ -+/* worst case memory requirements of key update */ -+static int carry_estimate_update(carry_op * op, carry_level * level) -+{ -+ return 0; -+} -+ -+/* worst case memory requirements of flow insertion */ -+static int carry_estimate_insert_flow(carry_op * op, carry_level * level) -+{ -+ int newnodes; -+ -+ newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length), -+ CARRY_FLOW_NEW_NODES_LIMIT); -+ /* -+ * roughly estimate insert_flow as a sequence of insertions. -+ */ -+ return newnodes * carry_estimate_insert(op, level); -+} -+ -+/* This is dispatch table for carry operations. It can be trivially -+ abstracted into useful plugin: tunable balancing policy is a good -+ thing. */ -+carry_op_handler op_dispatch_table[COP_LAST_OP] = { -+ [COP_INSERT] = { -+ .handler = carry_insert, -+ .estimate = carry_estimate_insert} -+ , -+ [COP_DELETE] = { -+ .handler = carry_delete, -+ .estimate = carry_estimate_delete} -+ , -+ [COP_CUT] = { -+ .handler = carry_cut, -+ .estimate = carry_estimate_cut} -+ , -+ [COP_PASTE] = { -+ .handler = carry_paste, -+ .estimate = carry_estimate_paste} -+ , -+ [COP_EXTENT] = { -+ .handler = carry_extent, -+ .estimate = carry_estimate_extent} -+ , -+ [COP_UPDATE] = { -+ .handler = carry_update, -+ .estimate = carry_estimate_update} -+ , -+ [COP_INSERT_FLOW] = { -+ .handler = carry_insert_flow, -+ .estimate = carry_estimate_insert_flow} -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/carry_ops.h linux-2.6.24/fs/reiser4/carry_ops.h ---- linux-2.6.24.orig/fs/reiser4/carry_ops.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/carry_ops.h 2008-01-25 11:39:06.904199446 +0300 -@@ -0,0 +1,42 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* implementation of carry operations. See carry_ops.c for details. */ -+ -+#if !defined( __CARRY_OPS_H__ ) -+#define __CARRY_OPS_H__ -+ -+#include "forward.h" -+#include "znode.h" -+#include "carry.h" -+ -+/* carry operation handlers */ -+typedef struct carry_op_handler { -+ /* perform operation */ -+ int (*handler) (carry_op * op, carry_level * doing, carry_level * todo); -+ /* estimate memory requirements for @op */ -+ int (*estimate) (carry_op * op, carry_level * level); -+} carry_op_handler; -+ -+/* This is dispatch table for carry operations. It can be trivially -+ abstracted into useful plugin: tunable balancing policy is a good -+ thing. */ -+extern carry_op_handler op_dispatch_table[COP_LAST_OP]; -+ -+unsigned int space_needed(const znode * node, const coord_t * coord, -+ const reiser4_item_data * data, int inserting); -+extern carry_node *find_left_carry(carry_node * node, carry_level * level); -+extern carry_node *find_right_carry(carry_node * node, carry_level * level); -+ -+/* __CARRY_OPS_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/context.c linux-2.6.24/fs/reiser4/context.c ---- linux-2.6.24.orig/fs/reiser4/context.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/context.c 2008-01-25 11:39:06.904199446 +0300 -@@ -0,0 +1,288 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Manipulation of reiser4_context */ -+ -+/* -+ * global context used during system call. Variable of this type is allocated -+ * on the stack at the beginning of the reiser4 part of the system call and -+ * pointer to it is stored in the current->fs_context. This allows us to avoid -+ * passing pointer to current transaction and current lockstack (both in -+ * one-to-one mapping with threads) all over the call chain. -+ * -+ * It's kind of like those global variables the prof used to tell you not to -+ * use in CS1, except thread specific.;-) Nikita, this was a good idea. -+ * -+ * In some situations it is desirable to have ability to enter reiser4_context -+ * more than once for the same thread (nested contexts). For example, there -+ * are some functions that can be called either directly from VFS/VM or from -+ * already active reiser4 context (->writepage, for example). -+ * -+ * In such situations "child" context acts like dummy: all activity is -+ * actually performed in the top level context, and get_current_context() -+ * always returns top level context. -+ * Of course, reiser4_init_context()/reiser4_done_context() have to be properly -+ * nested any way. -+ * -+ * Note that there is an important difference between reiser4 uses -+ * ->fs_context and the way other file systems use it. Other file systems -+ * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_ -+ * (this is why ->fs_context was initially called ->journal_info). 
This means, -+ * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry -+ * to the file system, they assume that some transaction is already underway, -+ * and usually bail out, because starting nested transaction would most likely -+ * lead to the deadlock. This gives false positives with reiser4, because we -+ * set ->fs_context before starting transaction. -+ */ -+ -+#include "debug.h" -+#include "super.h" -+#include "context.h" -+ -+#include /* balance_dirty_pages() */ -+#include -+ -+static void _reiser4_init_context(reiser4_context * context, -+ struct super_block *super) -+{ -+ memset(context, 0, sizeof(*context)); -+ -+ context->super = super; -+ context->magic = context_magic; -+ context->outer = current->journal_info; -+ current->journal_info = (void *)context; -+ context->nr_children = 0; -+ context->gfp_mask = GFP_KERNEL; -+ -+ init_lock_stack(&context->stack); -+ -+ reiser4_txn_begin(context); -+ -+ /* initialize head of tap list */ -+ INIT_LIST_HEAD(&context->taps); -+#if REISER4_DEBUG -+ context->task = current; -+#endif -+ grab_space_enable(); -+} -+ -+/* initialize context and bind it to the current thread -+ -+ This function should be called at the beginning of reiser4 part of -+ syscall. 
-+*/ -+reiser4_context * reiser4_init_context(struct super_block * super) -+{ -+ reiser4_context *context; -+ -+ assert("nikita-2662", !in_interrupt() && !in_irq()); -+ assert("nikita-3357", super != NULL); -+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super)); -+ -+ context = get_current_context_check(); -+ if (context && context->super == super) { -+ context = (reiser4_context *) current->journal_info; -+ context->nr_children++; -+ return context; -+ } -+ -+ context = kmalloc(sizeof(*context), GFP_KERNEL); -+ if (context == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ _reiser4_init_context(context, super); -+ return context; -+} -+ -+/* this is used in scan_mgr which is called with spinlock held and in -+ reiser4_fill_super magic */ -+void init_stack_context(reiser4_context *context, struct super_block *super) -+{ -+ assert("nikita-2662", !in_interrupt() && !in_irq()); -+ assert("nikita-3357", super != NULL); -+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super)); -+ assert("vs-12", !is_in_reiser4_context()); -+ -+ _reiser4_init_context(context, super); -+ context->on_stack = 1; -+ return; -+} -+ -+/* cast lock stack embedded into reiser4 context up to its container */ -+reiser4_context *get_context_by_lock_stack(lock_stack * owner) -+{ -+ return container_of(owner, reiser4_context, stack); -+} -+ -+/* true if there is already _any_ reiser4 context for the current thread */ -+int is_in_reiser4_context(void) -+{ -+ reiser4_context *ctx; -+ -+ ctx = current->journal_info; -+ return ctx != NULL && ((unsigned long)ctx->magic) == context_magic; -+} -+ -+/* -+ * call balance dirty pages for the current context. -+ * -+ * File system is expected to call balance_dirty_pages_ratelimited() whenever -+ * it dirties a page. 
reiser4 does this for unformatted nodes (that is, during -+ * write---this covers vast majority of all dirty traffic), but we cannot do -+ * this immediately when formatted node is dirtied, because long term lock is -+ * usually held at that time. To work around this, dirtying of formatted node -+ * simply increases ->nr_marked_dirty counter in the current reiser4 -+ * context. When we are about to leave this context, -+ * balance_dirty_pages_ratelimited() is called, if necessary. -+ * -+ * This introduces another problem: sometimes we do not want to run -+ * balance_dirty_pages_ratelimited() when leaving a context, for example -+ * because some important lock (like ->i_mutex on the parent directory) is -+ * held. To achieve this, ->nobalance flag can be set in the current context. -+ */ -+static void balance_dirty_pages_at(reiser4_context *context) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(context->super); -+ -+ /* -+ * call balance_dirty_pages_ratelimited() to process formatted nodes -+ * dirtied during this system call. Do that only if we are not in mount -+ * and there were nodes dirtied in this context and we are not in -+ * writepage (to avoid deadlock) and not in pdflush -+ */ -+ if (sbinfo != NULL && sbinfo->fake != NULL && -+ context->nr_marked_dirty != 0 && -+ !(current->flags & PF_MEMALLOC) && -+ !current_is_pdflush()) -+ balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping); -+} -+ -+/* release resources associated with context. -+ -+ This function should be called at the end of "session" with reiser4, -+ typically just before leaving reiser4 driver back to VFS. -+ -+ This is good place to put some degugging consistency checks, like that -+ thread released all locks and closed transcrash etc. 
-+ -+*/ -+static void reiser4_done_context(reiser4_context * context /* context being released */ ) -+{ -+ assert("nikita-860", context != NULL); -+ assert("nikita-859", context->magic == context_magic); -+ assert("vs-646", (reiser4_context *) current->journal_info == context); -+ assert("zam-686", !in_interrupt() && !in_irq()); -+ -+ /* only do anything when leaving top-level reiser4 context. All nested -+ * contexts are just dummies. */ -+ if (context->nr_children == 0) { -+ assert("jmacd-673", context->trans == NULL); -+ assert("jmacd-1002", lock_stack_isclean(&context->stack)); -+ assert("nikita-1936", reiser4_no_counters_are_held()); -+ assert("nikita-2626", list_empty_careful(reiser4_taps_list())); -+ assert("zam-1004", ergo(get_super_private(context->super), -+ get_super_private(context->super)->delete_mutex_owner != -+ current)); -+ -+ /* release all grabbed but as yet unused blocks */ -+ if (context->grabbed_blocks != 0) -+ all_grabbed2free(); -+ -+ /* -+ * synchronize against longterm_unlock_znode(): -+ * wake_up_requestor() wakes up requestors without holding -+ * zlock (otherwise they will immediately bump into that lock -+ * after wake up on another CPU). To work around (rare) -+ * situation where requestor has been woken up asynchronously -+ * and managed to run until completion (and destroy its -+ * context and lock stack) before wake_up_requestor() called -+ * wake_up() on it, wake_up_requestor() synchronize on lock -+ * stack spin lock. It has actually been observed that spin -+ * lock _was_ locked at this point, because -+ * wake_up_requestor() took interrupt. 
-+ */ -+ spin_lock_stack(&context->stack); -+ spin_unlock_stack(&context->stack); -+ -+ assert("zam-684", context->nr_children == 0); -+ /* restore original ->fs_context value */ -+ current->journal_info = context->outer; -+ if (context->on_stack == 0) -+ kfree(context); -+ } else { -+ context->nr_children--; -+#if REISER4_DEBUG -+ assert("zam-685", context->nr_children >= 0); -+#endif -+ } -+} -+ -+/* -+ * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close -+ * transaction. Call done_context() to do context related book-keeping. -+ */ -+void reiser4_exit_context(reiser4_context * context) -+{ -+ assert("nikita-3021", reiser4_schedulable()); -+ -+ if (context->nr_children == 0) { -+ if (!context->nobalance) { -+ reiser4_txn_restart(context); -+ balance_dirty_pages_at(context); -+ } -+ -+ /* if filesystem is mounted with -o sync or -o dirsync - commit -+ transaction. FIXME: TXNH_DONT_COMMIT is used to avoid -+ commiting on exit_context when inode semaphore is held and -+ to have ktxnmgrd to do commit instead to get better -+ concurrent filesystem accesses. But, when one mounts with -o -+ sync, he cares more about reliability than about -+ performance. So, for now we have this simple mount -o sync -+ support. 
*/ -+ if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) { -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked_nocheck(); -+ if (atom) { -+ atom->flags |= ATOM_FORCE_COMMIT; -+ context->trans->flags &= ~TXNH_DONT_COMMIT; -+ spin_unlock_atom(atom); -+ } -+ } -+ reiser4_txn_end(context); -+ } -+ reiser4_done_context(context); -+} -+ -+void reiser4_ctx_gfp_mask_set(void) -+{ -+ reiser4_context *ctx; -+ -+ ctx = get_current_context(); -+ if (ctx->entd == 0 && -+ list_empty(&ctx->stack.locks) && -+ ctx->trans->atom == NULL) -+ ctx->gfp_mask = GFP_KERNEL; -+ else -+ ctx->gfp_mask = GFP_NOFS; -+} -+ -+void reiser4_ctx_gfp_mask_force (gfp_t mask) -+{ -+ reiser4_context *ctx; -+ ctx = get_current_context(); -+ -+ assert("edward-1454", ctx != NULL); -+ -+ ctx->gfp_mask = mask; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/context.h linux-2.6.24/fs/reiser4/context.h ---- linux-2.6.24.orig/fs/reiser4/context.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/context.h 2008-01-25 11:39:06.904199446 +0300 -@@ -0,0 +1,228 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Reiser4 context. See context.c for details. */ -+ -+#if !defined( __REISER4_CONTEXT_H__ ) -+#define __REISER4_CONTEXT_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "tap.h" -+#include "lock.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block */ -+#include -+#include /* for struct task_struct */ -+ -+/* reiser4 per-thread context */ -+struct reiser4_context { -+ /* magic constant. For identification of reiser4 contexts. */ -+ __u32 magic; -+ -+ /* current lock stack. See lock.[ch]. This is where list of all -+ locks taken by current thread is kept. This is also used in -+ deadlock detection. 
*/ -+ lock_stack stack; -+ -+ /* current transcrash. */ -+ txn_handle *trans; -+ /* transaction handle embedded into reiser4_context. ->trans points -+ * here by default. */ -+ txn_handle trans_in_ctx; -+ -+ /* super block we are working with. To get the current tree -+ use &get_super_private (reiser4_get_current_sb ())->tree. */ -+ struct super_block *super; -+ -+ /* parent fs activation */ -+ struct fs_activation *outer; -+ -+ /* per-thread grabbed (for further allocation) blocks counter */ -+ reiser4_block_nr grabbed_blocks; -+ -+ /* list of taps currently monitored. See tap.c */ -+ struct list_head taps; -+ -+ /* grabbing space is enabled */ -+ unsigned int grab_enabled:1; -+ /* should be set when we are write dirty nodes to disk in jnode_flush or -+ * reiser4_write_logs() */ -+ unsigned int writeout_mode:1; -+ /* true, if current thread is an ent thread */ -+ unsigned int entd:1; -+ /* true, if balance_dirty_pages() should not be run when leaving this -+ * context. This is used to avoid lengthly balance_dirty_pages() -+ * operation when holding some important resource, like directory -+ * ->i_mutex */ -+ unsigned int nobalance:1; -+ -+ /* this bit is used on reiser4_done_context to decide whether context is -+ kmalloc-ed and has to be kfree-ed */ -+ unsigned int on_stack:1; -+ -+ /* count non-trivial jnode_set_dirty() calls */ -+ unsigned long nr_marked_dirty; -+ -+ /* reiser4_sync_inodes calls (via generic_sync_sb_inodes) -+ * reiser4_writepages for each of dirty inodes. Reiser4_writepages -+ * captures pages. 
When number of pages captured in one -+ * reiser4_sync_inodes reaches some threshold - some atoms get -+ * flushed */ -+ int nr_captured; -+ int nr_children; /* number of child contexts */ -+#if REISER4_DEBUG -+ /* debugging information about reiser4 locks held by the current -+ * thread */ -+ reiser4_lock_cnt_info locks; -+ struct task_struct *task; /* so we can easily find owner of the stack */ -+ -+ /* -+ * disk space grabbing debugging support -+ */ -+ /* how many disk blocks were grabbed by the first call to -+ * reiser4_grab_space() in this context */ -+ reiser4_block_nr grabbed_initially; -+ -+ /* list of all threads doing flush currently */ -+ struct list_head flushers_link; -+ /* information about last error encountered by reiser4 */ -+ err_site err; -+#endif -+ void *vp; -+ gfp_t gfp_mask; -+}; -+ -+extern reiser4_context *get_context_by_lock_stack(lock_stack *); -+ -+/* Debugging helps. */ -+#if REISER4_DEBUG -+extern void print_contexts(void); -+#endif -+ -+#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree)) -+#define current_blocksize reiser4_get_current_sb()->s_blocksize -+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits -+ -+extern reiser4_context *reiser4_init_context(struct super_block *); -+extern void init_stack_context(reiser4_context *, struct super_block *); -+extern void reiser4_exit_context(reiser4_context *); -+ -+/* magic constant we store in reiser4_context allocated at the stack. Used to -+ catch accesses to staled or uninitialized contexts. 
*/ -+#define context_magic ((__u32) 0x4b1b5d0b) -+ -+extern int is_in_reiser4_context(void); -+ -+/* -+ * return reiser4_context for the thread @tsk -+ */ -+static inline reiser4_context *get_context(const struct task_struct *tsk) -+{ -+ assert("vs-1682", -+ ((reiser4_context *) tsk->journal_info)->magic == context_magic); -+ return (reiser4_context *) tsk->journal_info; -+} -+ -+/* -+ * return reiser4 context of the current thread, or NULL if there is none. -+ */ -+static inline reiser4_context *get_current_context_check(void) -+{ -+ if (is_in_reiser4_context()) -+ return get_context(current); -+ else -+ return NULL; -+} -+ -+static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */ -+ -+/* return context associated with current thread */ -+static inline reiser4_context *get_current_context(void) -+{ -+ return get_context(current); -+} -+ -+static inline gfp_t reiser4_ctx_gfp_mask_get(void) -+{ -+ reiser4_context *ctx; -+ -+ ctx = get_current_context_check(); -+ return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask; -+} -+ -+void reiser4_ctx_gfp_mask_set(void); -+void reiser4_ctx_gfp_mask_force (gfp_t mask); -+ -+/* -+ * true if current thread is in the write-out mode. Thread enters write-out -+ * mode during jnode_flush and reiser4_write_logs(). 
-+ */ -+static inline int is_writeout_mode(void) -+{ -+ return get_current_context()->writeout_mode; -+} -+ -+/* -+ * enter write-out mode -+ */ -+static inline void writeout_mode_enable(void) -+{ -+ assert("zam-941", !get_current_context()->writeout_mode); -+ get_current_context()->writeout_mode = 1; -+} -+ -+/* -+ * leave write-out mode -+ */ -+static inline void writeout_mode_disable(void) -+{ -+ assert("zam-942", get_current_context()->writeout_mode); -+ get_current_context()->writeout_mode = 0; -+} -+ -+static inline void grab_space_enable(void) -+{ -+ get_current_context()->grab_enabled = 1; -+} -+ -+static inline void grab_space_disable(void) -+{ -+ get_current_context()->grab_enabled = 0; -+} -+ -+static inline void grab_space_set_enabled(int enabled) -+{ -+ get_current_context()->grab_enabled = enabled; -+} -+ -+static inline int is_grab_enabled(reiser4_context * ctx) -+{ -+ return ctx->grab_enabled; -+} -+ -+/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or -+ * flush would be performed when it is closed. This is necessary when handle -+ * has to be closed under some coarse semaphore, like i_mutex of -+ * directory. Commit will be performed by ktxnmgrd. */ -+static inline void context_set_commit_async(reiser4_context * context) -+{ -+ context->nobalance = 1; -+ context->trans->flags |= TXNH_DONT_COMMIT; -+} -+ -+/* __REISER4_CONTEXT_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/coord.c linux-2.6.24/fs/reiser4/coord.c ---- linux-2.6.24.orig/fs/reiser4/coord.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/coord.c 2008-01-25 11:39:06.904199446 +0300 -@@ -0,0 +1,935 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "tree.h" -+#include "plugin/item/item.h" -+#include "znode.h" -+#include "coord.h" -+ -+/* Internal constructor. */ -+static inline void -+coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos, -+ pos_in_node_t unit_pos, between_enum between) -+{ -+ coord->node = (znode *) node; -+ coord_set_item_pos(coord, item_pos); -+ coord->unit_pos = unit_pos; -+ coord->between = between; -+ ON_DEBUG(coord->plug_v = 0); -+ ON_DEBUG(coord->body_v = 0); -+ -+ /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */ -+} -+ -+/* after shifting of node content, coord previously set properly may become -+ invalid, try to "normalize" it. 
*/ -+void coord_normalize(coord_t * coord) -+{ -+ znode *node; -+ -+ node = coord->node; -+ assert("vs-683", node); -+ -+ coord_clear_iplug(coord); -+ -+ if (node_is_empty(node)) { -+ coord_init_first_unit(coord, node); -+ } else if ((coord->between == AFTER_ITEM) -+ || (coord->between == AFTER_UNIT)) { -+ return; -+ } else if (coord->item_pos == coord_num_items(coord) -+ && coord->between == BEFORE_ITEM) { -+ coord_dec_item_pos(coord); -+ coord->between = AFTER_ITEM; -+ } else if (coord->unit_pos == coord_num_units(coord) -+ && coord->between == BEFORE_UNIT) { -+ coord->unit_pos--; -+ coord->between = AFTER_UNIT; -+ } else if (coord->item_pos == coord_num_items(coord) -+ && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) { -+ coord_dec_item_pos(coord); -+ coord->unit_pos = 0; -+ coord->between = AFTER_ITEM; -+ } -+} -+ -+/* Copy a coordinate. */ -+void coord_dup(coord_t * coord, const coord_t * old_coord) -+{ -+ assert("jmacd-9800", coord_check(old_coord)); -+ coord_dup_nocheck(coord, old_coord); -+} -+ -+/* Copy a coordinate without check. Useful when old_coord->node is not -+ loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */ -+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord) -+{ -+ coord->node = old_coord->node; -+ coord_set_item_pos(coord, old_coord->item_pos); -+ coord->unit_pos = old_coord->unit_pos; -+ coord->between = old_coord->between; -+ coord->iplugid = old_coord->iplugid; -+ ON_DEBUG(coord->plug_v = old_coord->plug_v); -+ ON_DEBUG(coord->body_v = old_coord->body_v); -+} -+ -+/* Initialize an invalid coordinate. */ -+void coord_init_invalid(coord_t * coord, const znode * node) -+{ -+ coord_init_values(coord, node, 0, 0, INVALID_COORD); -+} -+ -+void coord_init_first_unit_nocheck(coord_t * coord, const znode * node) -+{ -+ coord_init_values(coord, node, 0, 0, AT_UNIT); -+} -+ -+/* Initialize a coordinate to point at the first unit of the first item. If the node is -+ empty, it is positioned at the EMPTY_NODE. 
*/ -+void coord_init_first_unit(coord_t * coord, const znode * node) -+{ -+ int is_empty = node_is_empty(node); -+ -+ coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT)); -+ -+ assert("jmacd-9801", coord_check(coord)); -+} -+ -+/* Initialize a coordinate to point at the last unit of the last item. If the node is -+ empty, it is positioned at the EMPTY_NODE. */ -+void coord_init_last_unit(coord_t * coord, const znode * node) -+{ -+ int is_empty = node_is_empty(node); -+ -+ coord_init_values(coord, node, -+ (is_empty ? 0 : node_num_items(node) - 1), 0, -+ (is_empty ? EMPTY_NODE : AT_UNIT)); -+ if (!is_empty) -+ coord->unit_pos = coord_last_unit_pos(coord); -+ assert("jmacd-9802", coord_check(coord)); -+} -+ -+/* Initialize a coordinate to before the first item. If the node is empty, it is -+ positioned at the EMPTY_NODE. */ -+void coord_init_before_first_item(coord_t * coord, const znode * node) -+{ -+ int is_empty = node_is_empty(node); -+ -+ coord_init_values(coord, node, 0, 0, -+ (is_empty ? EMPTY_NODE : BEFORE_UNIT)); -+ -+ assert("jmacd-9803", coord_check(coord)); -+} -+ -+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned -+ at the EMPTY_NODE. */ -+void coord_init_after_last_item(coord_t * coord, const znode * node) -+{ -+ int is_empty = node_is_empty(node); -+ -+ coord_init_values(coord, node, -+ (is_empty ? 0 : node_num_items(node) - 1), 0, -+ (is_empty ? EMPTY_NODE : AFTER_ITEM)); -+ -+ assert("jmacd-9804", coord_check(coord)); -+} -+ -+/* Initialize a coordinate to after last unit in the item. Coord must be set -+ already to existing item */ -+void coord_init_after_item_end(coord_t * coord) -+{ -+ coord->between = AFTER_UNIT; -+ coord->unit_pos = coord_last_unit_pos(coord); -+} -+ -+/* Initialize a coordinate to before the item. 
Coord must be set already to existing item */ -+void coord_init_before_item(coord_t * coord) -+{ -+ coord->unit_pos = 0; -+ coord->between = BEFORE_ITEM; -+} -+ -+/* Initialize a coordinate to after the item. Coord must be set already to existing item */ -+void coord_init_after_item(coord_t * coord) -+{ -+ coord->unit_pos = 0; -+ coord->between = AFTER_ITEM; -+} -+ -+/* Initialize a coordinate by 0s. Used in places where init_coord was used and -+ it was not clear how actually */ -+void coord_init_zero(coord_t * coord) -+{ -+ memset(coord, 0, sizeof(*coord)); -+} -+ -+/* Return the number of units at the present item. Asserts coord_is_existing_item(). */ -+unsigned coord_num_units(const coord_t * coord) -+{ -+ assert("jmacd-9806", coord_is_existing_item(coord)); -+ -+ return item_plugin_by_coord(coord)->b.nr_units(coord); -+} -+ -+/* Returns true if the coord was initializewd by coord_init_invalid (). */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_invalid(const coord_t * coord) -+{ -+ return coord->between == INVALID_COORD; -+} -+ -+/* Returns true if the coordinate is positioned at an existing item, not before or after -+ an item. It may be placed at, before, or after any unit within the item, whether -+ existing or not. */ -+int coord_is_existing_item(const coord_t * coord) -+{ -+ switch (coord->between) { -+ case EMPTY_NODE: -+ case BEFORE_ITEM: -+ case AFTER_ITEM: -+ case INVALID_COORD: -+ return 0; -+ -+ case BEFORE_UNIT: -+ case AT_UNIT: -+ case AFTER_UNIT: -+ return coord->item_pos < coord_num_items(coord); -+ } -+ -+ impossible("jmacd-9900", "unreachable coord: %p", coord); -+ return 0; -+} -+ -+/* Returns true if the coordinate is positioned at an existing unit, not before or after a -+ unit. 
*/
-+/* Audited by: green(2002.06.15) */
-+int coord_is_existing_unit(const coord_t * coord)
-+{
-+	switch (coord->between) {
-+	case EMPTY_NODE:
-+	case BEFORE_UNIT:
-+	case AFTER_UNIT:
-+	case BEFORE_ITEM:
-+	case AFTER_ITEM:
-+	case INVALID_COORD:
-+		return 0;
-+
-+	case AT_UNIT:	/* both the item index and the unit index must be in range */
-+		return (coord->item_pos < coord_num_items(coord)
-+			&& coord->unit_pos < coord_num_units(coord));
-+	}
-+
-+	impossible("jmacd-9902", "unreachable");
-+	return 0;
-+}
-+
-+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
-+   true for empty nodes nor coordinates positioned before the first item. Only the
-+   fields of @coord are compared; the node is not consulted. */
-+/* Audited by: green(2002.06.15) */
-+int coord_is_leftmost_unit(const coord_t * coord)
-+{
-+	return (coord->between == AT_UNIT && coord->item_pos == 0
-+		&& coord->unit_pos == 0);
-+}
-+
-+#if REISER4_DEBUG
-+/* For assertions only, checks for a valid coordinate. Returns 1 when @coord
-+   is consistent with its node and 0 otherwise; a coord without a node is
-+   invalid, a coord on the node above the root is always accepted. */
-+int coord_check(const coord_t * coord)
-+{
-+	if (coord->node == NULL) {
-+		return 0;
-+	}
-+	if (znode_above_root(coord->node))
-+		return 1;
-+
-+	switch (coord->between) {
-+	default:
-+	case INVALID_COORD:
-+		return 0;
-+	case EMPTY_NODE:
-+		if (!node_is_empty(coord->node)) {
-+			return 0;
-+		}
-+		return coord->item_pos == 0 && coord->unit_pos == 0;
-+
-+	case BEFORE_UNIT:
-+	case AFTER_UNIT:
-+		if (node_is_empty(coord->node) && (coord->item_pos == 0)
-+		    && (coord->unit_pos == 0))
-+			return 1;	/* (0, 0) is accepted on an empty node */
-+	case AT_UNIT:	/* BEFORE_UNIT and AFTER_UNIT fall through to here */
-+		break;
-+	case AFTER_ITEM:
-+	case BEFORE_ITEM:
-+		/* before/after item should not set unit_pos. */
-+		if (coord->unit_pos != 0) {
-+			return 0;
-+		}
-+		break;
-+	}
-+
-+	if (coord->item_pos >= node_num_items(coord->node)) {
-+		return 0;
-+	}
-+
-+	/* FIXME-VS: we are going to check unit_pos. This makes no sense when
-+	   between is set either AFTER_ITEM or BEFORE_ITEM */
-+	if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
-+		return 1;
-+
-+	/* unit_pos is only verified when the item plugin is already cached in
-+	   the coord */
-+	if (coord_is_iplug_set(coord) &&
-+	    coord->unit_pos >
-+	    item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
-+		return 0;
-+	}
-+	return 1;
-+}
-+#endif
-+
-+/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
-+   @items is the current number of items in the node, @is_next is non-zero when the
-+   caller is about to move right. Returns 1 if the new position does not exist (the
-+   caller returns 1 as well), 0 if the caller may go on. */
-+static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
-+{
-+	/* If the node is invalid, leave it. */
-+	if (coord->between == INVALID_COORD) {
-+		return 1;
-+	}
-+
-+	/* If the node is empty, set it appropriately. */
-+	if (items == 0) {
-+		coord->between = EMPTY_NODE;
-+		coord_set_item_pos(coord, 0);
-+		coord->unit_pos = 0;
-+		return 1;
-+	}
-+
-+	/* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
-+	if (coord->between == EMPTY_NODE) {
-+		coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
-+		coord_set_item_pos(coord, 0);
-+		coord->unit_pos = 0;
-+		return 0;
-+	}
-+
-+	/* If the item_pos is out-of-range, set it appropriately: after the last item. */
-+	if (coord->item_pos >= items) {
-+		coord->between = AFTER_ITEM;
-+		coord_set_item_pos(coord, items - 1);
-+		coord->unit_pos = 0;
-+		/* If is_next, return 1 (can't go any further). */
-+		return is_next;
-+	}
-+
-+	return 0;
-+}
-+
-+/* Advances the coordinate by one unit to the right. If empty, no change. If
-+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
-+   existing unit, 1 otherwise. */
-+int coord_next_unit(coord_t * coord)
-+{
-+	unsigned items = coord_num_items(coord);
-+
-+	if (coord_adjust_items(coord, items, 1) == 1) {
-+		return 1;
-+	}
-+
-+	switch (coord->between) {
-+	case BEFORE_UNIT:
-+		/* Now it is positioned at the same unit. */
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case AFTER_UNIT:
-+	case AT_UNIT:
-+		/* If it was at or after a unit and there are more units in this item,
-+		   advance to the next one. */
-+		if (coord->unit_pos < coord_last_unit_pos(coord)) {
-+			coord->unit_pos += 1;
-+			coord->between = AT_UNIT;
-+			return 0;
-+		}
-+
-+		/* Otherwise, it is crossing an item boundary and treated as if it was
-+		   after the current item. */
-+		coord->between = AFTER_ITEM;
-+		coord->unit_pos = 0;
-+		/* FALLTHROUGH */
-+
-+	case AFTER_ITEM:
-+		/* Check for end-of-node. The coord is left AFTER_ITEM at the last item. */
-+		if (coord->item_pos == items - 1) {
-+			return 1;
-+		}
-+
-+		coord_inc_item_pos(coord);
-+		coord->unit_pos = 0;
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case BEFORE_ITEM:
-+		/* The adjust_items checks ensure that we are valid here. */
-+		coord->unit_pos = 0;
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case INVALID_COORD:
-+	case EMPTY_NODE:
-+		/* Handled in coord_adjust_items(). */
-+		break;
-+	}
-+
-+	impossible("jmacd-9902", "unreachable");
-+	return 0;
-+}
-+
-+/* Advances the coordinate by one item to the right. If empty, no change. If
-+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
-+   an existing item, 1 otherwise. On success the coord is at unit 0 of the new item. */
-+int coord_next_item(coord_t * coord)
-+{
-+	unsigned items = coord_num_items(coord);
-+
-+	if (coord_adjust_items(coord, items, 1) == 1) {
-+		return 1;
-+	}
-+
-+	switch (coord->between) {
-+	case AFTER_UNIT:
-+	case AT_UNIT:
-+	case BEFORE_UNIT:
-+	case AFTER_ITEM:
-+		/* Check for end-of-node. */
-+		if (coord->item_pos == items - 1) {
-+			coord->between = AFTER_ITEM;
-+			coord->unit_pos = 0;
-+			coord_clear_iplug(coord);
-+			return 1;
-+		}
-+
-+		/* Anywhere in an item, go to the next one. */
-+		coord->between = AT_UNIT;
-+		coord_inc_item_pos(coord);
-+		coord->unit_pos = 0;
-+		return 0;
-+
-+	case BEFORE_ITEM:
-+		/* The out-of-range check ensures that we are valid here. */
-+		coord->unit_pos = 0;
-+		coord->between = AT_UNIT;
-+		return 0;
-+	case INVALID_COORD:
-+	case EMPTY_NODE:
-+		/* Handled in coord_adjust_items(). */
-+		break;
-+	}
-+
-+	impossible("jmacd-9903", "unreachable");
-+	return 0;
-+}
-+
-+/* Advances the coordinate by one unit to the left. If empty, no change. If
-+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
-+   is an existing unit, 1 otherwise. */
-+int coord_prev_unit(coord_t * coord)
-+{
-+	unsigned items = coord_num_items(coord);
-+
-+	if (coord_adjust_items(coord, items, 0) == 1) {
-+		return 1;
-+	}
-+
-+	switch (coord->between) {
-+	case AT_UNIT:
-+	case BEFORE_UNIT:
-+		if (coord->unit_pos > 0) {
-+			coord->unit_pos -= 1;
-+			coord->between = AT_UNIT;
-+			return 0;
-+		}
-+
-+		if (coord->item_pos == 0) {
-+			coord->between = BEFORE_ITEM;
-+			return 1;
-+		}
-+
-+		coord_dec_item_pos(coord);	/* first move to the previous item ... */
-+		coord->unit_pos = coord_last_unit_pos(coord);	/* ... then take its last unit */
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case AFTER_UNIT:
-+		/* What if unit_pos is out-of-range? */
-+		assert("jmacd-5442",
-+		       coord->unit_pos <= coord_last_unit_pos(coord));
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case BEFORE_ITEM:
-+		if (coord->item_pos == 0) {
-+			return 1;
-+		}
-+
-+		coord_dec_item_pos(coord);
-+		/* FALLTHROUGH */
-+
-+	case AFTER_ITEM:
-+		coord->between = AT_UNIT;
-+		coord->unit_pos = coord_last_unit_pos(coord);
-+		return 0;
-+
-+	case INVALID_COORD:
-+	case EMPTY_NODE:	/* handled in coord_adjust_items() */
-+		break;
-+	}
-+
-+	impossible("jmacd-9904", "unreachable");
-+	return 0;
-+}
-+
-+/* Advances the coordinate by one item to the left. If empty, no change. If
-+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
-+   is an existing item.
*/ -+int coord_prev_item(coord_t * coord) -+{ -+ unsigned items = coord_num_items(coord); -+ -+ if (coord_adjust_items(coord, items, 0) == 1) { -+ return 1; -+ } -+ -+ switch (coord->between) { -+ case AT_UNIT: -+ case AFTER_UNIT: -+ case BEFORE_UNIT: -+ case BEFORE_ITEM: -+ -+ if (coord->item_pos == 0) { -+ coord->between = BEFORE_ITEM; -+ coord->unit_pos = 0; -+ return 1; -+ } -+ -+ coord_dec_item_pos(coord); -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case AFTER_ITEM: -+ coord->between = AT_UNIT; -+ coord->unit_pos = 0; -+ return 0; -+ -+ case INVALID_COORD: -+ case EMPTY_NODE: -+ break; -+ } -+ -+ impossible("jmacd-9905", "unreachable"); -+ return 0; -+} -+ -+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */ -+void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir) -+{ -+ assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE); -+ if (dir == LEFT_SIDE) { -+ coord_init_first_unit(coord, node); -+ } else { -+ coord_init_last_unit(coord, node); -+ } -+} -+ -+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof -+ argument. */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_after_sideof_unit(coord_t * coord, sideof dir) -+{ -+ assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE); -+ if (dir == LEFT_SIDE) { -+ return coord_is_before_leftmost(coord); -+ } else { -+ return coord_is_after_rightmost(coord); -+ } -+} -+ -+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. 
*/ -+/* Audited by: green(2002.06.15) */ -+int coord_sideof_unit(coord_t * coord, sideof dir) -+{ -+ assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE); -+ if (dir == LEFT_SIDE) { -+ return coord_prev_unit(coord); -+ } else { -+ return coord_next_unit(coord); -+ } -+} -+ -+#if REISER4_DEBUG -+int coords_equal(const coord_t * c1, const coord_t * c2) -+{ -+ assert("nikita-2840", c1 != NULL); -+ assert("nikita-2841", c2 != NULL); -+ -+ return -+ c1->node == c2->node && -+ c1->item_pos == c2->item_pos && -+ c1->unit_pos == c2->unit_pos && c1->between == c2->between; -+} -+#endif /* REISER4_DEBUG */ -+ -+/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost -+ return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */ -+/* Audited by: green(2002.06.15) */ -+coord_wrt_node coord_wrt(const coord_t * coord) -+{ -+ if (coord_is_before_leftmost(coord)) { -+ return COORD_ON_THE_LEFT; -+ } -+ -+ if (coord_is_after_rightmost(coord)) { -+ return COORD_ON_THE_RIGHT; -+ } -+ -+ return COORD_INSIDE; -+} -+ -+/* Returns true if the coordinate is positioned after the last item or after the last unit -+ of the last item or it is an empty node. */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_after_rightmost(const coord_t * coord) -+{ -+ assert("jmacd-7313", coord_check(coord)); -+ -+ switch (coord->between) { -+ case INVALID_COORD: -+ case AT_UNIT: -+ case BEFORE_UNIT: -+ case BEFORE_ITEM: -+ return 0; -+ -+ case EMPTY_NODE: -+ return 1; -+ -+ case AFTER_ITEM: -+ return (coord->item_pos == node_num_items(coord->node) - 1); -+ -+ case AFTER_UNIT: -+ return ((coord->item_pos == node_num_items(coord->node) - 1) && -+ coord->unit_pos == coord_last_unit_pos(coord)); -+ } -+ -+ impossible("jmacd-9908", "unreachable"); -+ return 0; -+} -+ -+/* Returns true if the coordinate is positioned before the first item or it is an empty -+ node. 
*/ -+int coord_is_before_leftmost(const coord_t * coord) -+{ -+ /* FIXME-VS: coord_check requires node to be loaded whereas it is not -+ necessary to check if coord is set before leftmost -+ assert ("jmacd-7313", coord_check (coord)); */ -+ switch (coord->between) { -+ case INVALID_COORD: -+ case AT_UNIT: -+ case AFTER_ITEM: -+ case AFTER_UNIT: -+ return 0; -+ -+ case EMPTY_NODE: -+ return 1; -+ -+ case BEFORE_ITEM: -+ case BEFORE_UNIT: -+ return (coord->item_pos == 0) && (coord->unit_pos == 0); -+ } -+ -+ impossible("jmacd-9908", "unreachable"); -+ return 0; -+} -+ -+/* Returns true if the coordinate is positioned after a item, before a item, after the -+ last unit of an item, before the first unit of an item, or at an empty node. */ -+/* Audited by: green(2002.06.15) */ -+int coord_is_between_items(const coord_t * coord) -+{ -+ assert("jmacd-7313", coord_check(coord)); -+ -+ switch (coord->between) { -+ case INVALID_COORD: -+ case AT_UNIT: -+ return 0; -+ -+ case AFTER_ITEM: -+ case BEFORE_ITEM: -+ case EMPTY_NODE: -+ return 1; -+ -+ case BEFORE_UNIT: -+ return coord->unit_pos == 0; -+ -+ case AFTER_UNIT: -+ return coord->unit_pos == coord_last_unit_pos(coord); -+ } -+ -+ impossible("jmacd-9908", "unreachable"); -+ return 0; -+} -+ -+#if REISER4_DEBUG -+/* Returns true if the coordinates are positioned at adjacent units, regardless of -+ before-after or item boundaries. 
*/ -+int coord_are_neighbors(coord_t * c1, coord_t * c2) -+{ -+ coord_t *left; -+ coord_t *right; -+ -+ assert("nikita-1241", c1 != NULL); -+ assert("nikita-1242", c2 != NULL); -+ assert("nikita-1243", c1->node == c2->node); -+ assert("nikita-1244", coord_is_existing_unit(c1)); -+ assert("nikita-1245", coord_is_existing_unit(c2)); -+ -+ left = right = NULL; -+ switch (coord_compare(c1, c2)) { -+ case COORD_CMP_ON_LEFT: -+ left = c1; -+ right = c2; -+ break; -+ case COORD_CMP_ON_RIGHT: -+ left = c2; -+ right = c1; -+ break; -+ case COORD_CMP_SAME: -+ return 0; -+ default: -+ wrong_return_value("nikita-1246", "compare_coords()"); -+ } -+ assert("vs-731", left && right); -+ if (left->item_pos == right->item_pos) { -+ return left->unit_pos + 1 == right->unit_pos; -+ } else if (left->item_pos + 1 == right->item_pos) { -+ return (left->unit_pos == coord_last_unit_pos(left)) -+ && (right->unit_pos == 0); -+ } else { -+ return 0; -+ } -+} -+#endif /* REISER4_DEBUG */ -+ -+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT, -+ COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */ -+/* Audited by: green(2002.06.15) */ -+coord_cmp coord_compare(coord_t * c1, coord_t * c2) -+{ -+ assert("vs-209", c1->node == c2->node); -+ assert("vs-194", coord_is_existing_unit(c1) -+ && coord_is_existing_unit(c2)); -+ -+ if (c1->item_pos > c2->item_pos) -+ return COORD_CMP_ON_RIGHT; -+ if (c1->item_pos < c2->item_pos) -+ return COORD_CMP_ON_LEFT; -+ if (c1->unit_pos > c2->unit_pos) -+ return COORD_CMP_ON_RIGHT; -+ if (c1->unit_pos < c2->unit_pos) -+ return COORD_CMP_ON_LEFT; -+ return COORD_CMP_SAME; -+} -+ -+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and -+ non-zero if there is no position to the right. 
*/
-+int coord_set_to_right(coord_t * coord)
-+{
-+	unsigned items = coord_num_items(coord);
-+
-+	if (coord_adjust_items(coord, items, 1) == 1)
-+		return 1;
-+
-+	switch (coord->between) {
-+	case AT_UNIT:
-+		/* already at a unit: nothing to shift */
-+		return 0;
-+
-+	case BEFORE_UNIT:
-+	case BEFORE_ITEM:
-+		/* the position to the right is the one the coord names */
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case AFTER_UNIT:
-+		/* more units in this item: take the next one */
-+		if (coord->unit_pos < coord_last_unit_pos(coord)) {
-+			coord->unit_pos += 1;
-+			coord->between = AT_UNIT;
-+			return 0;
-+		}
-+
-+		/* crossing the item boundary */
-+		coord->unit_pos = 0;
-+		if (coord->item_pos == items - 1) {
-+			/* it was the last item: park after it */
-+			coord->between = AFTER_ITEM;
-+			return 1;
-+		}
-+		coord_inc_item_pos(coord);
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case AFTER_ITEM:
-+		/* after the last item there is nothing to the right; the
-+		   coord is left untouched */
-+		if (coord->item_pos == items - 1)
-+			return 1;
-+
-+		coord_inc_item_pos(coord);
-+		coord->unit_pos = 0;
-+		coord->between = AT_UNIT;
-+		return 0;
-+
-+	case EMPTY_NODE:
-+		return 1;
-+
-+	case INVALID_COORD:
-+		break;
-+	}
-+
-+	impossible("jmacd-9920", "unreachable");
-+	return 0;
-+}
-+
-+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
-+   non-zero if there is no position to the left.
*/ -+int coord_set_to_left(coord_t * coord) -+{ -+ unsigned items = coord_num_items(coord); -+ -+ if (coord_adjust_items(coord, items, 0) == 1) { -+ return 1; -+ } -+ -+ switch (coord->between) { -+ case AT_UNIT: -+ return 0; -+ -+ case AFTER_UNIT: -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case AFTER_ITEM: -+ coord->between = AT_UNIT; -+ coord->unit_pos = coord_last_unit_pos(coord); -+ return 0; -+ -+ case BEFORE_UNIT: -+ if (coord->unit_pos > 0) { -+ coord->unit_pos -= 1; -+ coord->between = AT_UNIT; -+ return 0; -+ } else { -+ -+ if (coord->item_pos == 0) { -+ coord->between = BEFORE_ITEM; -+ return 1; -+ } -+ -+ coord->unit_pos = coord_last_unit_pos(coord); -+ coord_dec_item_pos(coord); -+ coord->between = AT_UNIT; -+ return 0; -+ } -+ -+ case BEFORE_ITEM: -+ if (coord->item_pos == 0) { -+ return 1; -+ } -+ -+ coord_dec_item_pos(coord); -+ coord->unit_pos = coord_last_unit_pos(coord); -+ coord->between = AT_UNIT; -+ return 0; -+ -+ case EMPTY_NODE: -+ return 1; -+ -+ case INVALID_COORD: -+ break; -+ } -+ -+ impossible("jmacd-9920", "unreachable"); -+ return 0; -+} -+ -+static const char *coord_tween_tostring(between_enum n) -+{ -+ switch (n) { -+ case BEFORE_UNIT: -+ return "before unit"; -+ case BEFORE_ITEM: -+ return "before item"; -+ case AT_UNIT: -+ return "at unit"; -+ case AFTER_UNIT: -+ return "after unit"; -+ case AFTER_ITEM: -+ return "after item"; -+ case EMPTY_NODE: -+ return "empty node"; -+ case INVALID_COORD: -+ return "invalid"; -+ default: -+ { -+ static char buf[30]; -+ -+ sprintf(buf, "unknown: %i", n); -+ return buf; -+ } -+ } -+} -+ -+void print_coord(const char *mes, const coord_t * coord, int node) -+{ -+ if (coord == NULL) { -+ printk("%s: null\n", mes); -+ return; -+ } -+ printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n", -+ mes, coord->item_pos, coord->unit_pos, -+ coord_tween_tostring(coord->between), coord->iplugid); -+} -+ -+int -+item_utmost_child_real_block(const coord_t * coord, sideof side, -+ reiser4_block_nr * 
blk) -+{ -+ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord, -+ side, -+ blk); -+} -+ -+int item_utmost_child(const coord_t * coord, sideof side, jnode ** child) -+{ -+ return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child); -+} -+ -+/* @count bytes of flow @f got written, update correspondingly f->length, -+ f->data and f->key */ -+void move_flow_forward(flow_t * f, unsigned count) -+{ -+ if (f->data) -+ f->data += count; -+ f->length -= count; -+ set_key_offset(&f->key, get_key_offset(&f->key) + count); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/coord.h linux-2.6.24/fs/reiser4/coord.h ---- linux-2.6.24.orig/fs/reiser4/coord.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/coord.h 2008-01-25 11:39:06.908200476 +0300 -@@ -0,0 +1,389 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Coords */ -+ -+#if !defined( __REISER4_COORD_H__ ) -+#define __REISER4_COORD_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+ -+/* insertions happen between coords in the tree, so we need some means -+ of specifying the sense of betweenness. */ -+typedef enum { -+ BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. */ -+ AT_UNIT, -+ AFTER_UNIT, -+ BEFORE_ITEM, -+ AFTER_ITEM, -+ INVALID_COORD, -+ EMPTY_NODE, -+} between_enum; -+ -+/* location of coord w.r.t. 
its node */ -+typedef enum { -+ COORD_ON_THE_LEFT = -1, -+ COORD_ON_THE_RIGHT = +1, -+ COORD_INSIDE = 0 -+} coord_wrt_node; -+ -+typedef enum { -+ COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1 -+} coord_cmp; -+ -+struct coord { -+ /* node in a tree */ -+ /* 0 */ znode *node; -+ -+ /* position of item within node */ -+ /* 4 */ pos_in_node_t item_pos; -+ /* position of unit within item */ -+ /* 6 */ pos_in_node_t unit_pos; -+ /* optimization: plugin of item is stored in coord_t. Until this was -+ implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid -+ is invalidated (set to 0xff) on each modification of ->item_pos, -+ and all such modifications are funneled through coord_*_item_pos() -+ functions below. -+ */ -+ /* 8 */ char iplugid; -+ /* position of coord w.r.t. to neighboring items and/or units. -+ Values are taken from &between_enum above. -+ */ -+ /* 9 */ char between; -+ /* padding. It will be added by the compiler anyway to conform to the -+ * C language alignment requirements. We keep it here to be on the -+ * safe side and to have a clear picture of the memory layout of this -+ * structure. 
*/ -+ /* 10 */ __u16 pad; -+ /* 12 */ int offset; -+#if REISER4_DEBUG -+ unsigned long plug_v; -+ unsigned long body_v; -+#endif -+}; -+ -+#define INVALID_PLUGID ((char)((1 << 8) - 1)) -+#define INVALID_OFFSET -1 -+ -+static inline void coord_clear_iplug(coord_t * coord) -+{ -+ assert("nikita-2835", coord != NULL); -+ coord->iplugid = INVALID_PLUGID; -+ coord->offset = INVALID_OFFSET; -+} -+ -+static inline int coord_is_iplug_set(const coord_t * coord) -+{ -+ assert("nikita-2836", coord != NULL); -+ return coord->iplugid != INVALID_PLUGID; -+} -+ -+static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos) -+{ -+ assert("nikita-2478", coord != NULL); -+ coord->item_pos = pos; -+ coord_clear_iplug(coord); -+} -+ -+static inline void coord_dec_item_pos(coord_t * coord) -+{ -+ assert("nikita-2480", coord != NULL); -+ --coord->item_pos; -+ coord_clear_iplug(coord); -+} -+ -+static inline void coord_inc_item_pos(coord_t * coord) -+{ -+ assert("nikita-2481", coord != NULL); -+ ++coord->item_pos; -+ coord_clear_iplug(coord); -+} -+ -+static inline void coord_add_item_pos(coord_t * coord, int delta) -+{ -+ assert("nikita-2482", coord != NULL); -+ coord->item_pos += delta; -+ coord_clear_iplug(coord); -+} -+ -+static inline void coord_invalid_item_pos(coord_t * coord) -+{ -+ assert("nikita-2832", coord != NULL); -+ coord->item_pos = (unsigned short)~0; -+ coord_clear_iplug(coord); -+} -+ -+/* Reverse a direction. */ -+static inline sideof sideof_reverse(sideof side) -+{ -+ return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE; -+} -+ -+/* NOTE: There is a somewhat odd mixture of the following opposed terms: -+ -+ "first" and "last" -+ "next" and "prev" -+ "before" and "after" -+ "leftmost" and "rightmost" -+ -+ But I think the chosen names are decent the way they are. -+*/ -+ -+/* COORD INITIALIZERS */ -+ -+/* Initialize an invalid coordinate. 
*/ -+extern void coord_init_invalid(coord_t * coord, const znode * node); -+ -+extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node); -+ -+/* Initialize a coordinate to point at the first unit of the first item. If the node is -+ empty, it is positioned at the EMPTY_NODE. */ -+extern void coord_init_first_unit(coord_t * coord, const znode * node); -+ -+/* Initialize a coordinate to point at the last unit of the last item. If the node is -+ empty, it is positioned at the EMPTY_NODE. */ -+extern void coord_init_last_unit(coord_t * coord, const znode * node); -+ -+/* Initialize a coordinate to before the first item. If the node is empty, it is -+ positioned at the EMPTY_NODE. */ -+extern void coord_init_before_first_item(coord_t * coord, const znode * node); -+ -+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned -+ at the EMPTY_NODE. */ -+extern void coord_init_after_last_item(coord_t * coord, const znode * node); -+ -+/* Initialize a coordinate to after last unit in the item. Coord must be set -+ already to existing item */ -+void coord_init_after_item_end(coord_t * coord); -+ -+/* Initialize a coordinate to before the item. Coord must be set already to existing item */ -+void coord_init_before_item(coord_t *); -+/* Initialize a coordinate to after the item. Coord must be set already to existing item */ -+void coord_init_after_item(coord_t *); -+ -+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */ -+extern void coord_init_sideof_unit(coord_t * coord, const znode * node, -+ sideof dir); -+ -+/* Initialize a coordinate by 0s. Used in places where init_coord was used and -+ it was not clear how actually -+ FIXME-VS: added by vs (2002, june, 8) */ -+extern void coord_init_zero(coord_t * coord); -+ -+/* COORD METHODS */ -+ -+/* after shifting of node content, coord previously set properly may become -+ invalid, try to "normalize" it. 
*/ -+void coord_normalize(coord_t * coord); -+ -+/* Copy a coordinate. */ -+extern void coord_dup(coord_t * coord, const coord_t * old_coord); -+ -+/* Copy a coordinate without check. */ -+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord); -+ -+unsigned coord_num_units(const coord_t * coord); -+ -+/* Return the last valid unit number at the present item (i.e., -+ coord_num_units() - 1). */ -+static inline unsigned coord_last_unit_pos(const coord_t * coord) -+{ -+ return coord_num_units(coord) - 1; -+} -+ -+#if REISER4_DEBUG -+/* For assertions only, checks for a valid coordinate. */ -+extern int coord_check(const coord_t * coord); -+ -+extern unsigned long znode_times_locked(const znode * z); -+ -+static inline void coord_update_v(coord_t * coord) -+{ -+ coord->plug_v = coord->body_v = znode_times_locked(coord->node); -+} -+#endif -+ -+extern int coords_equal(const coord_t * c1, const coord_t * c2); -+ -+extern void print_coord(const char *mes, const coord_t * coord, int print_node); -+ -+/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost -+ return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */ -+extern coord_wrt_node coord_wrt(const coord_t * coord); -+ -+/* Returns true if the coordinates are positioned at adjacent units, regardless of -+ before-after or item boundaries. */ -+extern int coord_are_neighbors(coord_t * c1, coord_t * c2); -+ -+/* Assuming two coordinates are positioned in the same node, return NCOORD_CMP_ON_RIGHT, -+ NCOORD_CMP_ON_LEFT, or NCOORD_CMP_SAME depending on c1's position relative to c2. */ -+extern coord_cmp coord_compare(coord_t * c1, coord_t * c2); -+ -+/* COORD PREDICATES */ -+ -+/* Returns true if the coord was initializewd by coord_init_invalid (). */ -+extern int coord_is_invalid(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned at an existing item, not before or after -+ an item. 
It may be placed at, before, or after any unit within the item, whether -+ existing or not. If this is true you can call methods of the item plugin. */ -+extern int coord_is_existing_item(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned after a item, before a item, after the -+ last unit of an item, before the first unit of an item, or at an empty node. */ -+extern int coord_is_between_items(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned at an existing unit, not before or after a -+ unit. */ -+extern int coord_is_existing_unit(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned at an empty node. */ -+extern int coord_is_empty(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned at the first unit of the first item. Not -+ true for empty nodes nor coordinates positioned before the first item. */ -+extern int coord_is_leftmost_unit(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned after the last item or after the last unit -+ of the last item or it is an empty node. */ -+extern int coord_is_after_rightmost(const coord_t * coord); -+ -+/* Returns true if the coordinate is positioned before the first item or it is an empty -+ node. */ -+extern int coord_is_before_leftmost(const coord_t * coord); -+ -+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof -+ argument. */ -+extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir); -+ -+/* COORD MODIFIERS */ -+ -+/* Advances the coordinate by one unit to the right. If empty, no change. If -+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is -+ an existing unit. */ -+extern int coord_next_unit(coord_t * coord); -+ -+/* Advances the coordinate by one item to the right. If empty, no change. If -+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is -+ an existing item. 
*/ -+extern int coord_next_item(coord_t * coord); -+ -+/* Advances the coordinate by one unit to the left. If empty, no change. If -+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position -+ is an existing unit. */ -+extern int coord_prev_unit(coord_t * coord); -+ -+/* Advances the coordinate by one item to the left. If empty, no change. If -+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position -+ is an existing item. */ -+extern int coord_prev_item(coord_t * coord); -+ -+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and -+ non-zero if there is no position to the right. */ -+extern int coord_set_to_right(coord_t * coord); -+ -+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and -+ non-zero if there is no position to the left. */ -+extern int coord_set_to_left(coord_t * coord); -+ -+/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success -+ and non-zero if the unit did not exist. */ -+extern int coord_set_after_unit(coord_t * coord); -+ -+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */ -+extern int coord_sideof_unit(coord_t * coord, sideof dir); -+ -+/* iterate over all units in @node */ -+#define for_all_units( coord, node ) \ -+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \ -+ coord_next_unit( coord ) == 0 ; ) -+ -+/* iterate over all items in @node */ -+#define for_all_items( coord, node ) \ -+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \ -+ coord_next_item( coord ) == 0 ; ) -+ -+/* COORD/ITEM METHODS */ -+ -+extern int item_utmost_child_real_block(const coord_t * coord, sideof side, -+ reiser4_block_nr * blk); -+extern int item_utmost_child(const coord_t * coord, sideof side, -+ jnode ** child); -+ -+/* a flow is a sequence of bytes being written to or read from the tree. 
The -+ tree will slice the flow into items while storing it into nodes, but all of -+ that is hidden from anything outside the tree. */ -+ -+struct flow { -+ reiser4_key key; /* key of start of flow's sequence of bytes */ -+ loff_t length; /* length of flow's sequence of bytes */ -+ char *data; /* start of flow's sequence of bytes */ -+ int user; /* if 1 data is user space, 0 - kernel space */ -+ rw_op op; /* NIKITA-FIXME-HANS: comment is where? */ -+}; -+ -+void move_flow_forward(flow_t * f, unsigned count); -+ -+/* &reiser4_item_data - description of data to be inserted or pasted -+ -+ Q: articulate the reasons for the difference between this and flow. -+ -+ A: Becides flow we insert into tree other things: stat data, directory -+ entry, etc. To insert them into tree one has to provide this structure. If -+ one is going to insert flow - he can use insert_flow, where this structure -+ does not have to be created -+*/ -+struct reiser4_item_data { -+ /* actual data to be inserted. If NULL, ->create_item() will not -+ do xmemcpy itself, leaving this up to the caller. This can -+ save some amount of unnecessary memory copying, for example, -+ during insertion of stat data. -+ -+ */ -+ char *data; -+ /* 1 if 'char * data' contains pointer to user space and 0 if it is -+ kernel space */ -+ int user; -+ /* amount of data we are going to insert or paste */ -+ int length; -+ /* "Arg" is opaque data that is passed down to the -+ ->create_item() method of node layout, which in turn -+ hands it to the ->create_hook() of item being created. This -+ arg is currently used by: -+ -+ . ->create_hook() of internal item -+ (fs/reiser4/plugin/item/internal.c:internal_create_hook()), -+ . ->paste() method of directory item. -+ . ->create_hook() of extent item -+ -+ For internal item, this is left "brother" of new node being -+ inserted and it is used to add new node into sibling list -+ after parent to it was just inserted into parent. 
-+ -+ While ->arg does look somewhat of unnecessary compication, -+ it actually saves a lot of headache in many places, because -+ all data necessary to insert or paste new data into tree are -+ collected in one place, and this eliminates a lot of extra -+ argument passing and storing everywhere. -+ -+ */ -+ void *arg; -+ /* plugin of item we are inserting */ -+ item_plugin *iplug; -+}; -+ -+/* __REISER4_COORD_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/debug.c linux-2.6.24/fs/reiser4/debug.c ---- linux-2.6.24.orig/fs/reiser4/debug.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/debug.c 2008-01-25 11:39:06.908200476 +0300 -@@ -0,0 +1,308 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Debugging facilities. */ -+ -+/* -+ * This file contains generic debugging functions used by reiser4. Roughly -+ * following: -+ * -+ * panicking: reiser4_do_panic(), reiser4_print_prefix(). -+ * -+ * locking: -+ * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(), -+ * reiser4_no_counters_are_held(), reiser4_commit_check_locks() -+ * -+ * error code monitoring (see comment before RETERR macro): -+ * reiser4_return_err(), reiser4_report_err(). -+ * -+ * stack back-tracing: fill_backtrace() -+ * -+ * miscellaneous: reiser4_preempt_point(), call_on_each_assert(), -+ * reiser4_debugtrap(). 
-+ * -+ */ -+ -+#include "reiser4.h" -+#include "context.h" -+#include "super.h" -+#include "txnmgr.h" -+#include "znode.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#if 0 -+#if REISER4_DEBUG -+static void reiser4_report_err(void); -+#else -+#define reiser4_report_err() noop -+#endif -+#endif /* 0 */ -+ -+/* -+ * global buffer where message given to reiser4_panic is formatted. -+ */ -+static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE]; -+ -+/* -+ * lock protecting consistency of panic_buf under concurrent panics -+ */ -+static DEFINE_SPINLOCK(panic_guard); -+ -+/* Your best friend. Call it on each occasion. This is called by -+ fs/reiser4/debug.h:reiser4_panic(). */ -+void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ ) -+{ -+ static int in_panic = 0; -+ va_list args; -+ -+ /* -+ * check for recursive panic. -+ */ -+ if (in_panic == 0) { -+ in_panic = 1; -+ -+ spin_lock(&panic_guard); -+ va_start(args, format); -+ vsnprintf(panic_buf, sizeof(panic_buf), format, args); -+ va_end(args); -+ printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf); -+ spin_unlock(&panic_guard); -+ -+ /* -+ * if kernel debugger is configured---drop in. Early dropping -+ * into kgdb is not always convenient, because panic message -+ * is not yet printed most of the times. But: -+ * -+ * (1) message can be extracted from printk_buf[] -+ * (declared static inside of printk()), and -+ * -+ * (2) sometimes serial/kgdb combo dies while printing -+ * long panic message, so it's more prudent to break into -+ * debugger earlier. 
-+ * -+ */ -+ DEBUGON(1); -+ } -+ /* to make gcc happy about noreturn attribute */ -+ panic("%s", panic_buf); -+} -+ -+#if 0 -+void -+reiser4_print_prefix(const char *level, int reperr, const char *mid, -+ const char *function, const char *file, int lineno) -+{ -+ const char *comm; -+ int pid; -+ -+ if (unlikely(in_interrupt() || in_irq())) { -+ comm = "interrupt"; -+ pid = 0; -+ } else { -+ comm = current->comm; -+ pid = current->pid; -+ } -+ printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n", -+ level, comm, pid, function, file, lineno, mid); -+ if (reperr) -+ reiser4_report_err(); -+} -+#endif /* 0 */ -+ -+/* Preemption point: this should be called periodically during long running -+ operations (carry, allocate, and squeeze are best examples) */ -+int reiser4_preempt_point(void) -+{ -+ assert("nikita-3008", reiser4_schedulable()); -+ cond_resched(); -+ return signal_pending(current); -+} -+ -+#if REISER4_DEBUG -+/* Debugging aid: return struct where information about locks taken by current -+ thread is accumulated. This can be used to formulate lock ordering -+ constraints and various assertions. -+ -+*/ -+reiser4_lock_cnt_info *reiser4_lock_counters(void) -+{ -+ reiser4_context *ctx = get_current_context(); -+ assert("jmacd-1123", ctx != NULL); -+ return &ctx->locks; -+} -+ -+/* -+ * print human readable information about locks held by the reiser4 context. 
-+ */ -+static void print_lock_counters(const char *prefix, -+ const reiser4_lock_cnt_info * info) -+{ -+ printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n" -+ "jload: %i, " -+ "txnh: %i, atom: %i, stack: %i, txnmgr: %i, " -+ "ktxnmgrd: %i, fq: %i\n" -+ "inode: %i, " -+ "cbk_cache: %i (r:%i,w%i), " -+ "eflush: %i, " -+ "zlock: %i,\n" -+ "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n" -+ "d: %i, x: %i, t: %i\n", prefix, -+ info->spin_locked_jnode, -+ info->rw_locked_tree, info->read_locked_tree, -+ info->write_locked_tree, -+ info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk, -+ info->spin_locked_jload, -+ info->spin_locked_txnh, -+ info->spin_locked_atom, info->spin_locked_stack, -+ info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd, -+ info->spin_locked_fq, -+ info->spin_locked_inode, -+ info->rw_locked_cbk_cache, -+ info->read_locked_cbk_cache, -+ info->write_locked_cbk_cache, -+ info->spin_locked_super_eflush, -+ info->spin_locked_zlock, -+ info->spin_locked, -+ info->long_term_locked_znode, -+ info->inode_sem_r, info->inode_sem_w, -+ info->d_refs, info->x_refs, info->t_refs); -+} -+ -+/* check that no spinlocks are held */ -+int reiser4_schedulable(void) -+{ -+ if (get_current_context_check() != NULL) { -+ if (!LOCK_CNT_NIL(spin_locked)) { -+ print_lock_counters("in atomic", reiser4_lock_counters()); -+ return 0; -+ } -+ } -+ might_sleep(); -+ return 1; -+} -+/* -+ * return true, iff no locks are held. 
-+ */ -+int reiser4_no_counters_are_held(void) -+{ -+ reiser4_lock_cnt_info *counters; -+ -+ counters = reiser4_lock_counters(); -+ return -+ (counters->spin_locked_zlock == 0) && -+ (counters->spin_locked_jnode == 0) && -+ (counters->rw_locked_tree == 0) && -+ (counters->read_locked_tree == 0) && -+ (counters->write_locked_tree == 0) && -+ (counters->rw_locked_dk == 0) && -+ (counters->read_locked_dk == 0) && -+ (counters->write_locked_dk == 0) && -+ (counters->spin_locked_txnh == 0) && -+ (counters->spin_locked_atom == 0) && -+ (counters->spin_locked_stack == 0) && -+ (counters->spin_locked_txnmgr == 0) && -+ (counters->spin_locked_inode == 0) && -+ (counters->spin_locked == 0) && -+ (counters->long_term_locked_znode == 0) && -+ (counters->inode_sem_r == 0) && -+ (counters->inode_sem_w == 0) && (counters->d_refs == 0); -+} -+ -+/* -+ * return true, iff transaction commit can be done under locks held by the -+ * current thread. -+ */ -+int reiser4_commit_check_locks(void) -+{ -+ reiser4_lock_cnt_info *counters; -+ int inode_sem_r; -+ int inode_sem_w; -+ int result; -+ -+ /* -+ * inode's read/write semaphore is the only reiser4 lock that can be -+ * held during commit. -+ */ -+ -+ counters = reiser4_lock_counters(); -+ inode_sem_r = counters->inode_sem_r; -+ inode_sem_w = counters->inode_sem_w; -+ -+ counters->inode_sem_r = counters->inode_sem_w = 0; -+ result = reiser4_no_counters_are_held(); -+ counters->inode_sem_r = inode_sem_r; -+ counters->inode_sem_w = inode_sem_w; -+ return result; -+} -+ -+/* -+ * fill "error site" in the current reiser4 context. See comment before RETERR -+ * macro for more details. -+ */ -+void reiser4_return_err(int code, const char *file, int line) -+{ -+ if (code < 0 && is_in_reiser4_context()) { -+ reiser4_context *ctx = get_current_context(); -+ -+ if (ctx != NULL) { -+ ctx->err.code = code; -+ ctx->err.file = file; -+ ctx->err.line = line; -+ } -+ } -+} -+ -+#if 0 -+/* -+ * report error information recorder by reiser4_return_err(). 
-+ */ -+static void reiser4_report_err(void) -+{ -+ reiser4_context *ctx = get_current_context_check(); -+ -+ if (ctx != NULL) { -+ if (ctx->err.code != 0) { -+ printk("code: %i at %s:%i\n", -+ ctx->err.code, ctx->err.file, ctx->err.line); -+ } -+ } -+} -+#endif /* 0 */ -+ -+#endif /* REISER4_DEBUG */ -+ -+#if KERNEL_DEBUGGER -+ -+/* -+ * this functions just drops into kernel debugger. It is a convenient place to -+ * put breakpoint in. -+ */ -+void reiser4_debugtrap(void) -+{ -+ /* do nothing. Put break point here. */ -+#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE) -+ extern void breakpoint(void); -+ breakpoint(); -+#endif -+} -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/debug.h linux-2.6.24/fs/reiser4/debug.h ---- linux-2.6.24.orig/fs/reiser4/debug.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/debug.h 2008-01-25 11:39:06.908200476 +0300 -@@ -0,0 +1,350 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Declarations of debug macros. */ -+ -+#if !defined( __FS_REISER4_DEBUG_H__ ) -+#define __FS_REISER4_DEBUG_H__ -+ -+#include "forward.h" -+#include "reiser4.h" -+ -+/* generic function to produce formatted output, decorating it with -+ whatever standard prefixes/postfixes we want. "Fun" is a function -+ that will be actually called, can be printk, panic etc. -+ This is for use by other debugging macros, not by users. */ -+#define DCALL(lev, fun, reperr, label, format, ...) \ -+({ \ -+ fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \ -+ current->comm, current->pid, __FUNCTION__, \ -+ __FILE__, __LINE__, label, ## __VA_ARGS__); \ -+}) -+ -+/* -+ * cause kernel to crash -+ */ -+#define reiser4_panic(mid, format, ...) 
\ -+ DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__) -+ -+/* print message with indication of current process, file, line and -+ function */ -+#define reiser4_log(label, format, ...) \ -+ DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__) -+ -+/* Assertion checked during compilation. -+ If "cond" is false (0) we get duplicate case label in switch. -+ Use this to check something like famous -+ cassert (sizeof(struct reiserfs_journal_commit) == 4096) ; -+ in 3.x journal.c. If cassertion fails you get compiler error, -+ so no "maintainer-id". -+*/ -+#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } }) -+ -+#define noop do {;} while(0) -+ -+#if REISER4_DEBUG -+/* version of info that only actually prints anything when _d_ebugging -+ is on */ -+#define dinfo(format, ...) printk(format , ## __VA_ARGS__) -+/* macro to catch logical errors. Put it into `default' clause of -+ switch() statement. */ -+#define impossible(label, format, ...) \ -+ reiser4_panic(label, "impossible: " format , ## __VA_ARGS__) -+/* assert assures that @cond is true. If it is not, reiser4_panic() is -+ called. Use this for checking logical consistency and _never_ call -+ this to check correctness of external data: disk blocks and user-input . */ -+#define assert(label, cond) \ -+({ \ -+ /* call_on_each_assert(); */ \ -+ if (cond) { \ -+ /* put negated check to avoid using !(cond) that would lose \ -+ * warnings for things like assert(a = b); */ \ -+ ; \ -+ } else { \ -+ DEBUGON(1); \ -+ reiser4_panic(label, "assertion failed: %s", #cond); \ -+ } \ -+}) -+ -+/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */ -+#define check_me( label, expr ) assert( label, ( expr ) ) -+ -+#define ON_DEBUG( exp ) exp -+ -+extern int reiser4_schedulable(void); -+extern void call_on_each_assert(void); -+ -+#else -+ -+#define dinfo( format, args... ) noop -+#define impossible( label, format, args... 
) noop -+#define assert( label, cond ) noop -+#define check_me( label, expr ) ( ( void ) ( expr ) ) -+#define ON_DEBUG( exp ) -+#define reiser4_schedulable() might_sleep() -+ -+/* REISER4_DEBUG */ -+#endif -+ -+#if REISER4_DEBUG -+/* per-thread information about lock acquired by this thread. Used by lock -+ * ordering checking in spin_macros.h */ -+typedef struct reiser4_lock_cnt_info { -+ int rw_locked_tree; -+ int read_locked_tree; -+ int write_locked_tree; -+ -+ int rw_locked_dk; -+ int read_locked_dk; -+ int write_locked_dk; -+ -+ int rw_locked_cbk_cache; -+ int read_locked_cbk_cache; -+ int write_locked_cbk_cache; -+ -+ int spin_locked_zlock; -+ int spin_locked_jnode; -+ int spin_locked_jload; -+ int spin_locked_txnh; -+ int spin_locked_atom; -+ int spin_locked_stack; -+ int spin_locked_txnmgr; -+ int spin_locked_ktxnmgrd; -+ int spin_locked_fq; -+ int spin_locked_inode; -+ int spin_locked_super_eflush; -+ int spin_locked; -+ int long_term_locked_znode; -+ -+ int inode_sem_r; -+ int inode_sem_w; -+ -+ int d_refs; -+ int x_refs; -+ int t_refs; -+} reiser4_lock_cnt_info; -+ -+extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void); -+#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b)) -+ -+/* increment lock-counter @counter, if present */ -+#define LOCK_CNT_INC(counter) \ -+ IN_CONTEXT(++(reiser4_lock_counters()->counter), 0) -+ -+/* decrement lock-counter @counter, if present */ -+#define LOCK_CNT_DEC(counter) \ -+ IN_CONTEXT(--(reiser4_lock_counters()->counter), 0) -+ -+/* check that lock-counter is zero. This is for use in assertions */ -+#define LOCK_CNT_NIL(counter) \ -+ IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1) -+ -+/* check that lock-counter is greater than zero. 
This is for use in -+ * assertions */ -+#define LOCK_CNT_GTZ(counter) \ -+ IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1) -+#define LOCK_CNT_LT(counter,n) \ -+ IN_CONTEXT(reiser4_lock_counters()->counter < n, 1) -+ -+#else /* REISER4_DEBUG */ -+ -+/* no-op versions on the above */ -+ -+typedef struct reiser4_lock_cnt_info { -+} reiser4_lock_cnt_info; -+ -+#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL) -+#define LOCK_CNT_INC(counter) noop -+#define LOCK_CNT_DEC(counter) noop -+#define LOCK_CNT_NIL(counter) (1) -+#define LOCK_CNT_GTZ(counter) (1) -+#define LOCK_CNT_LT(counter,n) (1) -+ -+#endif /* REISER4_DEBUG */ -+ -+#define assert_spin_not_locked(lock) BUG_ON(0) -+#define assert_rw_write_locked(lock) BUG_ON(0) -+#define assert_rw_read_locked(lock) BUG_ON(0) -+#define assert_rw_locked(lock) BUG_ON(0) -+#define assert_rw_not_write_locked(lock) BUG_ON(0) -+#define assert_rw_not_read_locked(lock) BUG_ON(0) -+#define assert_rw_not_locked(lock) BUG_ON(0) -+ -+/* flags controlling debugging behavior. Are set through debug_flags=N mount -+ option. */ -+typedef enum { -+ /* print a lot of information during panic. When this is on all jnodes -+ * are listed. This can be *very* large output. Usually you don't want -+ * this. Especially over serial line. */ -+ REISER4_VERBOSE_PANIC = 0x00000001, -+ /* print a lot of information during umount */ -+ REISER4_VERBOSE_UMOUNT = 0x00000002, -+ /* print gathered statistics on umount */ -+ REISER4_STATS_ON_UMOUNT = 0x00000004, -+ /* check node consistency */ -+ REISER4_CHECK_NODE = 0x00000008 -+} reiser4_debug_flags; -+ -+extern int is_in_reiser4_context(void); -+ -+/* -+ * evaluate expression @e only if with reiser4 context -+ */ -+#define ON_CONTEXT(e) do { \ -+ if(is_in_reiser4_context()) { \ -+ e; \ -+ } } while(0) -+ -+/* -+ * evaluate expression @e only when within reiser4_context and debugging is -+ * on. 
-+ */ -+#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) ) -+ -+/* -+ * complain about unexpected function result and crash. Used in "default" -+ * branches of switch statements and alike to assert that invalid results are -+ * not silently ignored. -+ */ -+#define wrong_return_value( label, function ) \ -+ impossible( label, "wrong return value from " function ) -+ -+/* Issue different types of reiser4 messages to the console */ -+#define warning( label, format, ... ) \ -+ DCALL( KERN_WARNING, \ -+ printk, 1, label, "WARNING: " format , ## __VA_ARGS__ ) -+#define notice( label, format, ... ) \ -+ DCALL( KERN_NOTICE, \ -+ printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ ) -+ -+/* mark not yet implemented functionality */ -+#define not_yet( label, format, ... ) \ -+ reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ ) -+ -+extern void reiser4_do_panic(const char *format, ...) -+ __attribute__ ((noreturn, format(printf, 1, 2))); -+ -+extern int reiser4_preempt_point(void); -+extern void reiser4_print_stats(void); -+ -+#if REISER4_DEBUG -+extern int reiser4_no_counters_are_held(void); -+extern int reiser4_commit_check_locks(void); -+#else -+#define reiser4_no_counters_are_held() (1) -+#define reiser4_commit_check_locks() (1) -+#endif -+ -+/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */ -+#define IS_POW(i) \ -+({ \ -+ typeof(i) __i; \ -+ \ -+ __i = (i); \ -+ !(__i & (__i - 1)); \ -+}) -+ -+#define KERNEL_DEBUGGER (1) -+ -+#if KERNEL_DEBUGGER -+ -+extern void reiser4_debugtrap(void); -+ -+/* -+ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If -+ * kgdb is not compiled in, do nothing. -+ */ -+#define DEBUGON(cond) \ -+({ \ -+ if (unlikely(cond)) \ -+ reiser4_debugtrap(); \ -+}) -+#else -+#define DEBUGON(cond) noop -+#endif -+ -+/* -+ * Error code tracing facility. (Idea is borrowed from XFS code.) 
-+ * -+ * Suppose some strange and/or unexpected code is returned from some function -+ * (for example, write(2) returns -EEXIST). It is possible to place a -+ * breakpoint in the reiser4_write(), but it is too late here. How to find out -+ * in what particular place -EEXIST was generated first? -+ * -+ * In reiser4 all places where actual error codes are produced (that is, -+ * statements of the form -+ * -+ * return -EFOO; // (1), or -+ * -+ * result = -EFOO; // (2) -+ * -+ * are replaced with -+ * -+ * return RETERR(-EFOO); // (1a), and -+ * -+ * result = RETERR(-EFOO); // (2a) respectively -+ * -+ * RETERR() macro fills a backtrace in reiser4_context. This back-trace is -+ * printed in error and warning messages. Moreover, it's possible to put a -+ * conditional breakpoint in reiser4_return_err (low-level function called -+ * by RETERR() to do the actual work) to break into debugger immediately -+ * when particular error happens. -+ * -+ */ -+ -+#if REISER4_DEBUG -+ -+/* -+ * data-type to store information about where error happened ("error site"). -+ */ -+typedef struct err_site { -+ int code; /* error code */ -+ const char *file; /* source file, filled by __FILE__ */ -+ int line; /* source file line, filled by __LINE__ */ -+} err_site; -+ -+extern void reiser4_return_err(int code, const char *file, int line); -+ -+/* -+ * fill &get_current_context()->err_site with error information. -+ */ -+#define RETERR(code) \ -+({ \ -+ typeof(code) __code; \ -+ \ -+ __code = (code); \ -+ reiser4_return_err(__code, __FILE__, __LINE__); \ -+ __code; \ -+}) -+ -+#else -+ -+/* -+ * no-op versions of the above -+ */ -+ -+typedef struct err_site { -+} err_site; -+#define RETERR(code) code -+#endif -+ -+#if REISER4_LARGE_KEY -+/* -+ * conditionally compile arguments only if REISER4_LARGE_KEY is on. -+ */ -+#define ON_LARGE_KEY(...) __VA_ARGS__ -+#else -+#define ON_LARGE_KEY(...) -+#endif -+ -+/* __FS_REISER4_DEBUG_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/dformat.h linux-2.6.24/fs/reiser4/dformat.h ---- linux-2.6.24.orig/fs/reiser4/dformat.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/dformat.h 2008-01-25 11:39:06.908200476 +0300 -@@ -0,0 +1,70 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Formats of on-disk data and conversion functions. */ -+ -+/* put all item formats in the files describing the particular items, -+ our model is, everything you need to do to add an item to reiser4, -+ (excepting the changes to the plugin that uses the item which go -+ into the file defining that plugin), you put into one file. */ -+/* Data on disk are stored in little-endian format. -+ To declare fields of on-disk structures, use d8, d16, d32 and d64. -+ d??tocpu() and cputod??() to convert. */ -+ -+#if !defined( __FS_REISER4_DFORMAT_H__ ) -+#define __FS_REISER4_DFORMAT_H__ -+ -+#include -+#include -+#include -+ -+typedef __u8 d8; -+typedef __le16 d16; -+typedef __le32 d32; -+typedef __le64 d64; -+ -+#define PACKED __attribute__((packed)) -+ -+/* data-type for block number */ -+typedef __u64 reiser4_block_nr; -+ -+/* data-type for block number on disk, disk format */ -+typedef __le64 reiser4_dblock_nr; -+ -+/** -+ * disk_addr_eq - compare disk addresses -+ * @b1: pointer to block number ot compare -+ * @b2: pointer to block number ot compare -+ * -+ * Returns true if if disk addresses are the same -+ */ -+static inline int disk_addr_eq(const reiser4_block_nr *b1, -+ const reiser4_block_nr * b2) -+{ -+ assert("nikita-1033", b1 != NULL); -+ assert("nikita-1266", b2 != NULL); -+ -+ return !memcmp(b1, b2, sizeof *b1); -+} -+ -+/* structure of master reiser4 super block */ -+typedef struct reiser4_master_sb { -+ char magic[16]; /* "ReIsEr4" */ -+ __le16 disk_plugin_id; /* id of disk 
layout plugin */ -+ __le16 blocksize; -+ char uuid[16]; /* unique id */ -+ char label[16]; /* filesystem label */ -+ __le64 diskmap; /* location of the diskmap. 0 if not present */ -+} reiser4_master_sb; -+ -+/* __FS_REISER4_DFORMAT_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/dscale.c linux-2.6.24/fs/reiser4/dscale.c ---- linux-2.6.24.orig/fs/reiser4/dscale.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/dscale.c 2008-01-25 11:55:43.884539336 +0300 -@@ -0,0 +1,192 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Scalable on-disk integers */ -+ -+/* -+ * Various on-disk structures contain integer-like structures. Stat-data -+ * contain [yes, "data" is plural, check the dictionary] file size, link -+ * count; extent unit contains extent width etc. To accommodate for general -+ * case enough space is reserved to keep largest possible value. 64 bits in -+ * all cases above. But in overwhelming majority of cases numbers actually -+ * stored in these fields will be comparatively small and reserving 8 bytes is -+ * a waste of precious disk bandwidth. -+ * -+ * Scalable integers are one way to solve this problem. dscale_write() -+ * function stores __u64 value in the given area consuming from 1 to 9 bytes, -+ * depending on the magnitude of the value supplied. dscale_read() reads value -+ * previously stored by dscale_write(). -+ * -+ * dscale_write() produces format not completely unlike of UTF: two highest -+ * bits of the first byte are used to store "tag". One of 4 possible tag -+ * values is chosen depending on the number being encoded: -+ * -+ * 0 ... 0x3f => 0 [table 1] -+ * 0x40 ... 0x3fff => 1 -+ * 0x4000 ... 0x3fffffff => 2 -+ * 0x40000000 ... 
0xffffffffffffffff => 3 -+ * -+ * (see dscale_range() function) -+ * -+ * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes -+ * to be stored, so in this case there is no place in the first byte to store -+ * tag. For such values tag is stored in an extra 9th byte. -+ * -+ * As _highest_ bits are used for the test (which is natural) scaled integers -+ * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which -+ * uses LITTLE-ENDIAN. -+ * -+ */ -+ -+#include "debug.h" -+#include "dscale.h" -+ -+/* return tag of scaled integer stored at @address */ -+static int gettag(const unsigned char *address) -+{ -+ /* tag is stored in two highest bits */ -+ return (*address) >> 6; -+} -+ -+/* clear tag from value. Clear tag embedded into @value. */ -+static void cleartag(__u64 * value, int tag) -+{ -+ /* -+ * W-w-what ?! -+ * -+ * Actually, this is rather simple: @value passed here was read by -+ * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by -+ * zeroes. Tag is still stored in the highest (arithmetically) -+ * non-zero bits of @value, but relative position of tag within __u64 -+ * depends on @tag. -+ * -+ * For example if @tag is 0, it's stored 2 highest bits of lowest -+ * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits. -+ * -+ * If tag is 1, it's stored in two highest bits of 2nd lowest byte, -+ * and it's offset if (2 * 8) - 2 == 14 bits. -+ * -+ * See table 1 above for details. -+ * -+ * All these cases are captured by the formula: -+ */ -+ *value &= ~(3 << (((1 << tag) << 3) - 2)); -+ /* -+ * That is, clear two (3 == 0t11) bits at the offset -+ * -+ * 8 * (2 ^ tag) - 2, -+ * -+ * that is, two highest bits of (2 ^ tag)-th byte of @value. -+ */ -+} -+ -+/* return tag for @value. See table 1 above for details. 
*/ -+static int dscale_range(__u64 value) -+{ -+ if (value > 0x3fffffff) -+ return 3; -+ if (value > 0x3fff) -+ return 2; -+ if (value > 0x3f) -+ return 1; -+ return 0; -+} -+ -+/* restore value stored at @adderss by dscale_write() and return number of -+ * bytes consumed */ -+int dscale_read(unsigned char *address, __u64 * value) -+{ -+ int tag; -+ -+ /* read tag */ -+ tag = gettag(address); -+ switch (tag) { -+ case 3: -+ /* In this case tag is stored in an extra byte, skip this byte -+ * and decode value stored in the next 8 bytes.*/ -+ *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1))); -+ /* worst case: 8 bytes for value itself plus one byte for -+ * tag. */ -+ return 9; -+ case 0: -+ *value = get_unaligned(address); -+ break; -+ case 1: -+ *value = __be16_to_cpu(get_unaligned((__be16 *)address)); -+ break; -+ case 2: -+ *value = __be32_to_cpu(get_unaligned((__be32 *)address)); -+ break; -+ default: -+ return RETERR(-EIO); -+ } -+ /* clear tag embedded into @value */ -+ cleartag(value, tag); -+ /* number of bytes consumed is (2 ^ tag)---see table 1. */ -+ return 1 << tag; -+} -+ -+/* number of bytes consumed */ -+int dscale_bytes_to_read(unsigned char *address) -+{ -+ int tag; -+ -+ tag = gettag(address); -+ switch (tag) { -+ case 0: -+ case 1: -+ case 2: -+ return 1 << tag; -+ case 3: -+ return 9; -+ default: -+ return RETERR(-EIO); -+ } -+} -+ -+/* store @value at @address and return number of bytes consumed */ -+int dscale_write(unsigned char *address, __u64 value) -+{ -+ int tag; -+ int shift; -+ __be64 v; -+ unsigned char *valarr; -+ -+ tag = dscale_range(value); -+ v = __cpu_to_be64(value); -+ valarr = (unsigned char *)&v; -+ shift = (tag == 3) ? 
1 : 0; -+ memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag); -+ *address |= (tag << 6); -+ return shift + (1 << tag); -+} -+ -+/* number of bytes required to store @value */ -+int dscale_bytes_to_write(__u64 value) -+{ -+ int bytes; -+ -+ bytes = 1 << dscale_range(value); -+ if (bytes == 8) -+ ++bytes; -+ return bytes; -+} -+ -+/* returns true if @value and @other require the same number of bytes to be -+ * stored. Used by detect when data structure (like stat-data) has to be -+ * expanded or contracted. */ -+int dscale_fit(__u64 value, __u64 other) -+{ -+ return dscale_range(value) == dscale_range(other); -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/dscale.h linux-2.6.24/fs/reiser4/dscale.h ---- linux-2.6.24.orig/fs/reiser4/dscale.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/dscale.h 2008-01-25 11:55:43.884539336 +0300 -@@ -0,0 +1,28 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Scalable on-disk integers. See dscale.h for details. */ -+ -+#if !defined( __FS_REISER4_DSCALE_H__ ) -+#define __FS_REISER4_DSCALE_H__ -+ -+#include "dformat.h" -+ -+extern int dscale_read(unsigned char *address, __u64 * value); -+extern int dscale_write(unsigned char *address, __u64 value); -+extern int dscale_bytes_to_read(unsigned char *address); -+extern int dscale_bytes_to_write(__u64 value); -+extern int dscale_fit(__u64 value, __u64 other); -+ -+/* __FS_REISER4_DSCALE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/entd.c linux-2.6.24/fs/reiser4/entd.c ---- linux-2.6.24.orig/fs/reiser4/entd.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/entd.c 2008-01-25 11:39:06.912201506 +0300 -@@ -0,0 +1,335 @@ -+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Ent daemon. */ -+ -+#include "debug.h" -+#include "txnmgr.h" -+#include "tree.h" -+#include "entd.h" -+#include "super.h" -+#include "context.h" -+#include "reiser4.h" -+#include "vfs_ops.h" -+#include "page_cache.h" -+#include "inode.h" -+ -+#include /* struct task_struct */ -+#include -+#include -+#include -+#include /* INITIAL_JIFFIES */ -+#include /* bdi_write_congested */ -+#include -+#include -+#include -+ -+#define DEF_PRIORITY 12 -+#define MAX_ENTD_ITERS 10 -+ -+static void entd_flush(struct super_block *, struct wbq *); -+static int entd(void *arg); -+ -+/* -+ * set ->comm field of end thread to make its state visible to the user level -+ */ -+#define entd_set_comm(state) \ -+ snprintf(current->comm, sizeof(current->comm), \ -+ "ent:%s%s", super->s_id, (state)) -+ -+/** -+ * reiser4_init_entd - initialize entd context and start kernel daemon -+ * @super: super block to start ent thread for -+ * -+ * Creates entd contexts, starts kernel thread and waits until it -+ * initializes. 
-+ */ -+int reiser4_init_entd(struct super_block *super) -+{ -+ entd_context *ctx; -+ -+ assert("nikita-3104", super != NULL); -+ -+ ctx = get_entd_context(super); -+ -+ memset(ctx, 0, sizeof *ctx); -+ spin_lock_init(&ctx->guard); -+ init_waitqueue_head(&ctx->wait); -+#if REISER4_DEBUG -+ INIT_LIST_HEAD(&ctx->flushers_list); -+#endif -+ /* lists of writepage requests */ -+ INIT_LIST_HEAD(&ctx->todo_list); -+ INIT_LIST_HEAD(&ctx->done_list); -+ /* start entd */ -+ ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id); -+ if (IS_ERR(ctx->tsk)) -+ return PTR_ERR(ctx->tsk); -+ return 0; -+} -+ -+static void put_wbq(struct wbq *rq) -+{ -+ iput(rq->mapping->host); -+ complete(&rq->completion); -+} -+ -+/* ent should be locked */ -+static struct wbq *__get_wbq(entd_context * ent) -+{ -+ struct wbq *wbq; -+ -+ if (list_empty(&ent->todo_list)) -+ return NULL; -+ -+ ent->nr_todo_reqs --; -+ wbq = list_entry(ent->todo_list.next, struct wbq, link); -+ list_del_init(&wbq->link); -+ return wbq; -+} -+ -+/* ent thread function */ -+static int entd(void *arg) -+{ -+ struct super_block *super; -+ entd_context *ent; -+ int done = 0; -+ -+ super = arg; -+ /* do_fork() just copies task_struct into the new -+ thread. ->fs_context shouldn't be copied of course. This shouldn't -+ be a problem for the rest of the code though. 
-+ */ -+ current->journal_info = NULL; -+ -+ ent = get_entd_context(super); -+ -+ while (!done) { -+ try_to_freeze(); -+ -+ spin_lock(&ent->guard); -+ while (ent->nr_todo_reqs != 0) { -+ struct wbq *rq; -+ -+ assert("", list_empty(&ent->done_list)); -+ -+ /* take request from the queue head */ -+ rq = __get_wbq(ent); -+ assert("", rq != NULL); -+ ent->cur_request = rq; -+ spin_unlock(&ent->guard); -+ -+ entd_set_comm("!"); -+ entd_flush(super, rq); -+ -+ put_wbq(rq); -+ -+ /* -+ * wakeup all requestors and iput their inodes -+ */ -+ spin_lock(&ent->guard); -+ while (!list_empty(&ent->done_list)) { -+ rq = list_entry(ent->done_list.next, struct wbq, link); -+ list_del_init(&rq->link); -+ ent->nr_done_reqs --; -+ spin_unlock(&ent->guard); -+ assert("", rq->written == 1); -+ put_wbq(rq); -+ spin_lock(&ent->guard); -+ } -+ } -+ spin_unlock(&ent->guard); -+ -+ entd_set_comm("."); -+ -+ { -+ DEFINE_WAIT(__wait); -+ -+ do { -+ prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE); -+ if (kthread_should_stop()) { -+ done = 1; -+ break; -+ } -+ if (ent->nr_todo_reqs != 0) -+ break; -+ schedule(); -+ } while (0); -+ finish_wait(&ent->wait, &__wait); -+ } -+ } -+ BUG_ON(ent->nr_todo_reqs != 0); -+ return 0; -+} -+ -+/** -+ * reiser4_done_entd - stop entd kernel thread -+ * @super: super block to stop ent thread for -+ * -+ * It is called on umount. Sends stop signal to entd and wait until it handles -+ * it. 
-+ */ -+void reiser4_done_entd(struct super_block *super) -+{ -+ entd_context *ent; -+ -+ assert("nikita-3103", super != NULL); -+ -+ ent = get_entd_context(super); -+ assert("zam-1055", ent->tsk != NULL); -+ kthread_stop(ent->tsk); -+} -+ -+/* called at the beginning of jnode_flush to register flusher thread with ent -+ * daemon */ -+void reiser4_enter_flush(struct super_block *super) -+{ -+ entd_context *ent; -+ -+ assert("zam-1029", super != NULL); -+ ent = get_entd_context(super); -+ -+ assert("zam-1030", ent != NULL); -+ -+ spin_lock(&ent->guard); -+ ent->flushers++; -+#if REISER4_DEBUG -+ list_add(&get_current_context()->flushers_link, &ent->flushers_list); -+#endif -+ spin_unlock(&ent->guard); -+} -+ -+/* called at the end of jnode_flush */ -+void reiser4_leave_flush(struct super_block *super) -+{ -+ entd_context *ent; -+ int wake_up_ent; -+ -+ assert("zam-1027", super != NULL); -+ ent = get_entd_context(super); -+ -+ assert("zam-1028", ent != NULL); -+ -+ spin_lock(&ent->guard); -+ ent->flushers--; -+ wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0); -+#if REISER4_DEBUG -+ list_del_init(&get_current_context()->flushers_link); -+#endif -+ spin_unlock(&ent->guard); -+ if (wake_up_ent) -+ wake_up(&ent->wait); -+} -+ -+#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX -+ -+static void entd_flush(struct super_block *super, struct wbq *rq) -+{ -+ reiser4_context ctx; -+ int tmp; -+ -+ init_stack_context(&ctx, super); -+ ctx.entd = 1; -+ ctx.gfp_mask = GFP_NOFS; -+ -+ rq->wbc->range_start = page_offset(rq->page); -+ rq->wbc->range_end = rq->wbc->range_start + -+ (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT); -+ tmp = rq->wbc->nr_to_write; -+ rq->mapping->a_ops->writepages(rq->mapping, rq->wbc); -+ -+ if (rq->wbc->nr_to_write > 0) { -+ rq->wbc->range_start = 0; -+ rq->wbc->range_end = LLONG_MAX; -+ generic_sync_sb_inodes(super, rq->wbc); -+ } -+ rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST; -+ reiser4_writeout(super, rq->wbc); -+ -+ 
context_set_commit_async(&ctx); -+ reiser4_exit_context(&ctx); -+} -+ -+/** -+ * write_page_by_ent - ask entd thread to flush this page as part of slum -+ * @page: page to be written -+ * @wbc: writeback control passed to reiser4_writepage -+ * -+ * Creates a request, puts it on entd list of requests, wakeups entd if -+ * necessary, waits until entd completes with the request. -+ */ -+int write_page_by_ent(struct page *page, struct writeback_control *wbc) -+{ -+ struct super_block *sb; -+ struct inode *inode; -+ entd_context *ent; -+ struct wbq rq; -+ -+ assert("", PageLocked(page)); -+ assert("", page->mapping != NULL); -+ -+ sb = page->mapping->host->i_sb; -+ ent = get_entd_context(sb); -+ assert("", ent && ent->done == 0); -+ -+ /* -+ * we are going to unlock page and ask ent thread to write the -+ * page. Re-dirty page before unlocking so that if ent thread fails to -+ * write it - it will remain dirty -+ */ -+ reiser4_set_page_dirty_internal(page); -+ -+ /* -+ * pin inode in memory, unlock page, entd_flush will iput. We can not -+ * iput here becasue we can not allow delete_inode to be called here -+ */ -+ inode = igrab(page->mapping->host); -+ unlock_page(page); -+ if (inode == NULL) -+ /* inode is getting freed */ -+ return 0; -+ -+ /* init wbq */ -+ INIT_LIST_HEAD(&rq.link); -+ rq.magic = WBQ_MAGIC; -+ rq.wbc = wbc; -+ rq.page = page; -+ rq.mapping = inode->i_mapping; -+ rq.node = NULL; -+ rq.written = 0; -+ init_completion(&rq.completion); -+ -+ /* add request to entd's list of writepage requests */ -+ spin_lock(&ent->guard); -+ ent->nr_todo_reqs++; -+ list_add_tail(&rq.link, &ent->todo_list); -+ if (ent->nr_todo_reqs == 1) -+ wake_up(&ent->wait); -+ -+ spin_unlock(&ent->guard); -+ -+ /* wait until entd finishes */ -+ wait_for_completion(&rq.completion); -+ -+ if (rq.written) -+ /* Eventually ENTD has written the page to disk. 
*/ -+ return 0; -+ return 0; -+} -+ -+int wbq_available(void) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ entd_context *ent = get_entd_context(sb); -+ return ent->nr_todo_reqs; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/entd.h linux-2.6.24/fs/reiser4/entd.h ---- linux-2.6.24.orig/fs/reiser4/entd.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/entd.h 2008-01-25 11:39:06.912201506 +0300 -@@ -0,0 +1,90 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Ent daemon. */ -+ -+#ifndef __ENTD_H__ -+#define __ENTD_H__ -+ -+#include "context.h" -+ -+#include -+#include -+#include -+#include -+#include /* for struct task_struct */ -+ -+#define WBQ_MAGIC 0x7876dc76 -+ -+/* write-back request. */ -+struct wbq { -+ int magic; -+ struct list_head link; /* list head of this list is in entd context */ -+ struct writeback_control *wbc; -+ struct page *page; -+ struct address_space *mapping; -+ struct completion completion; -+ jnode *node; /* set if ent thread captured requested page */ -+ int written; /* set if ent thread wrote requested page */ -+}; -+ -+/* ent-thread context. This is used to synchronize starting/stopping ent -+ * threads. */ -+typedef struct entd_context { -+ /* wait queue that ent thread waits on for more work. It's -+ * signaled by write_page_by_ent(). */ -+ wait_queue_head_t wait; -+ /* spinlock protecting other fields */ -+ spinlock_t guard; -+ /* ent thread */ -+ struct task_struct *tsk; -+ /* set to indicate that ent thread should leave. 
*/ -+ int done; -+ /* counter of active flushers */ -+ int flushers; -+ /* -+ * when reiser4_writepage asks entd to write a page - it adds struct -+ * wbq to this list -+ */ -+ struct list_head todo_list; -+ /* number of elements on the above list */ -+ int nr_todo_reqs; -+ -+ struct wbq *cur_request; -+ /* -+ * when entd writes a page it moves write-back request from todo_list -+ * to done_list. This list is used at the end of entd iteration to -+ * wakeup requestors and iput inodes. -+ */ -+ struct list_head done_list; -+ /* number of elements on the above list */ -+ int nr_done_reqs; -+ -+#if REISER4_DEBUG -+ /* list of all active flushers */ -+ struct list_head flushers_list; -+#endif -+} entd_context; -+ -+extern int reiser4_init_entd(struct super_block *); -+extern void reiser4_done_entd(struct super_block *); -+ -+extern void reiser4_enter_flush(struct super_block *); -+extern void reiser4_leave_flush(struct super_block *); -+ -+extern int write_page_by_ent(struct page *, struct writeback_control *); -+extern int wbq_available(void); -+extern void ent_writes_page(struct super_block *, struct page *); -+ -+extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *); -+/* __ENTD_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/eottl.c linux-2.6.24/fs/reiser4/eottl.c ---- linux-2.6.24.orig/fs/reiser4/eottl.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/eottl.c 2008-01-25 11:39:06.912201506 +0300 -@@ -0,0 +1,509 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "tree_mod.h" -+#include "carry.h" -+#include "tree.h" -+#include "super.h" -+ -+#include /* for __u?? */ -+ -+/* -+ * Extents on the twig level (EOTTL) handling. -+ * -+ * EOTTL poses some problems to the tree traversal, that are better explained -+ * by example. -+ * -+ * Suppose we have block B1 on the twig level with the following items: -+ * -+ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id, -+ * offset) -+ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each -+ * 2. internal item I2 with key (10:0:0:0) -+ * -+ * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and -+ * then intra-node lookup is done. This lookup finished on the E1, because the -+ * key we are looking for is larger than the key of E1 and is smaller than key -+ * the of I2. -+ * -+ * Here search is stuck. -+ * -+ * After some thought it is clear what is wrong here: extents on the twig level -+ * break some basic property of the *search* tree (on the pretext, that they -+ * restore property of balanced tree). -+ * -+ * Said property is the following: if in the internal node of the search tree -+ * we have [ ... Key1 Pointer Key2 ... 
] then, all data that are or will be -+ * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible -+ * through the Pointer. -+ * -+ * This is not true, when Pointer is Extent-Pointer, simply because extent -+ * cannot expand indefinitely to the right to include any item with -+ * -+ * Key1 <= Key <= Key2. -+ * -+ * For example, our E1 extent is only responsible for the data with keys -+ * -+ * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and -+ * -+ * so, key range -+ * -+ * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) ) -+ * -+ * is orphaned: there is no way to get there from the tree root. -+ * -+ * In other words, extent pointers are different than normal child pointers as -+ * far as search tree is concerned, and this creates such problems. -+ * -+ * Possible solution for this problem is to insert our item into node pointed -+ * to by I2. There are some problems through: -+ * -+ * (1) I2 can be in a different node. -+ * (2) E1 can be immediately followed by another extent E2. -+ * -+ * (1) is solved by calling reiser4_get_right_neighbor() and accounting -+ * for locks/coords as necessary. -+ * -+ * (2) is more complex. Solution here is to insert new empty leaf node and -+ * insert internal item between E1 and E2 pointing to said leaf node. This is -+ * further complicated by possibility that E2 is in a different node, etc. -+ * -+ * Problems: -+ * -+ * (1) if there was internal item I2 immediately on the right of an extent E1 -+ * we and we decided to insert new item S1 into node N2 pointed to by I2, then -+ * key of S1 will be less than smallest key in the N2. Normally, search key -+ * checks that key we are looking for is in the range of keys covered by the -+ * node key is being looked in. To work around of this situation, while -+ * preserving useful consistency check new flag CBK_TRUST_DK was added to the -+ * cbk falgs bitmask. 
This flag is automatically set on entrance to the -+ * coord_by_key() and is only cleared when we are about to enter situation -+ * described above. -+ * -+ * (2) If extent E1 is immediately followed by another extent E2 and we are -+ * searching for the key that is between E1 and E2 we only have to insert new -+ * empty leaf node when coord_by_key was called for insertion, rather than just -+ * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to -+ * the cbk falgs bitmask. This flag is automatically set by coord_by_key calls -+ * performed by insert_by_key() and friends. -+ * -+ * (3) Insertion of new empty leaf node (possibly) requires balancing. In any -+ * case it requires modification of node content which is only possible under -+ * write lock. It may well happen that we only have read lock on the node where -+ * new internal pointer is to be inserted (common case: lookup of non-existent -+ * stat-data that fells between two extents). If only read lock is held, tree -+ * traversal is restarted with lock_level modified so that next time we hit -+ * this problem, write lock will be held. Once we have write lock, balancing -+ * will be performed. -+ */ -+ -+/** -+ * is_next_item_internal - check whether next item is internal -+ * @coord: coordinate of extent item in twig node -+ * @key: search key -+ * @lh: twig node lock handle -+ * -+ * Looks at the unit next to @coord. If it is an internal one - 1 is returned, -+ * @coord is set to that unit. If that unit is in right neighbor, @lh is moved -+ * to that node, @coord is set to its first unit. If next item is not internal -+ * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2 -+ * is returned if search restart has to be done. 
-+ */ -+static int -+is_next_item_internal(coord_t *coord, const reiser4_key *key, -+ lock_handle *lh) -+{ -+ coord_t next; -+ lock_handle rn; -+ int result; -+ -+ coord_dup(&next, coord); -+ if (coord_next_unit(&next) == 0) { -+ /* next unit is in this node */ -+ if (item_is_internal(&next)) { -+ coord_dup(coord, &next); -+ return 1; -+ } -+ assert("vs-3", item_is_extent(&next)); -+ return 0; -+ } -+ -+ /* -+ * next unit either does not exist or is in right neighbor. If it is in -+ * right neighbor we have to check right delimiting key because -+ * concurrent thread could get their first and insert item with a key -+ * smaller than @key -+ */ -+ read_lock_dk(current_tree); -+ result = keycmp(key, znode_get_rd_key(coord->node)); -+ read_unlock_dk(current_tree); -+ assert("vs-6", result != EQUAL_TO); -+ if (result == GREATER_THAN) -+ return 2; -+ -+ /* lock right neighbor */ -+ init_lh(&rn); -+ result = reiser4_get_right_neighbor(&rn, coord->node, -+ znode_is_wlocked(coord->node) ? -+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result == -E_NO_NEIGHBOR) { -+ /* we are on the rightmost edge of the tree */ -+ done_lh(&rn); -+ return 0; -+ } -+ -+ if (result) { -+ assert("vs-4", result < 0); -+ done_lh(&rn); -+ return result; -+ } -+ -+ /* -+ * check whether concurrent thread managed to insert item with a key -+ * smaller than @key -+ */ -+ read_lock_dk(current_tree); -+ result = keycmp(key, znode_get_ld_key(rn.node)); -+ read_unlock_dk(current_tree); -+ assert("vs-6", result != EQUAL_TO); -+ if (result == GREATER_THAN) { -+ done_lh(&rn); -+ return 2; -+ } -+ -+ result = zload(rn.node); -+ if (result) { -+ assert("vs-5", result < 0); -+ done_lh(&rn); -+ return result; -+ } -+ -+ coord_init_first_unit(&next, rn.node); -+ if (item_is_internal(&next)) { -+ /* -+ * next unit is in right neighbor and it is an unit of internal -+ * item. Unlock coord->node. Move @lh to right neighbor. @coord -+ * is set to the first unit of right neighbor. 
-+ */ -+ coord_dup(coord, &next); -+ zrelse(rn.node); -+ done_lh(lh); -+ move_lh(lh, &rn); -+ return 1; -+ } -+ -+ /* -+ * next unit is unit of extent item. Return without chaning @lh and -+ * @coord. -+ */ -+ assert("vs-6", item_is_extent(&next)); -+ zrelse(rn.node); -+ done_lh(&rn); -+ return 0; -+} -+ -+/** -+ * rd_key - calculate key of an item next to the given one -+ * @coord: position in a node -+ * @key: storage for result key -+ * -+ * @coord is set between items or after the last item in a node. Calculate key -+ * of item to the right of @coord. -+ */ -+static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key) -+{ -+ coord_t dup; -+ -+ assert("nikita-2281", coord_is_between_items(coord)); -+ coord_dup(&dup, coord); -+ -+ if (coord_set_to_right(&dup) == 0) -+ /* next item is in this node. Return its key. */ -+ unit_key_by_coord(&dup, key); -+ else { -+ /* -+ * next item either does not exist or is in right -+ * neighbor. Return znode's right delimiting key. -+ */ -+ read_lock_dk(current_tree); -+ *key = *znode_get_rd_key(coord->node); -+ read_unlock_dk(current_tree); -+ } -+ return key; -+} -+ -+/** -+ * add_empty_leaf - insert empty leaf between two extents -+ * @insert_coord: position in twig node between two extents -+ * @lh: twig node lock handle -+ * @key: left delimiting key of new node -+ * @rdkey: right delimiting key of new node -+ * -+ * Inserts empty leaf node between two extent items. It is necessary when we -+ * have to insert an item on leaf level between two extents (items on the twig -+ * level). 
-+ */ -+static int -+add_empty_leaf(coord_t *insert_coord, lock_handle *lh, -+ const reiser4_key *key, const reiser4_key *rdkey) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *todo; -+ reiser4_item_data *item; -+ carry_insert_data *cdata; -+ carry_op *op; -+ znode *node; -+ reiser4_tree *tree; -+ -+ assert("vs-49827", znode_contains_key_lock(insert_coord->node, key)); -+ tree = znode_get_tree(insert_coord->node); -+ node = reiser4_new_node(insert_coord->node, LEAF_LEVEL); -+ if (IS_ERR(node)) -+ return PTR_ERR(node); -+ -+ /* setup delimiting keys for node being inserted */ -+ write_lock_dk(tree); -+ znode_set_ld_key(node, key); -+ znode_set_rd_key(node, rdkey); -+ ON_DEBUG(node->creator = current); -+ ON_DEBUG(node->first_key = *key); -+ write_unlock_dk(tree); -+ -+ ZF_SET(node, JNODE_ORPHAN); -+ -+ /* -+ * allocate carry_pool, 3 carry_level-s, reiser4_item_data and -+ * carry_insert_data -+ */ -+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) + -+ sizeof(*item) + sizeof(*cdata)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ todo = (carry_level *) (pool + 1); -+ init_carry_level(todo, pool); -+ -+ item = (reiser4_item_data *) (todo + 3); -+ cdata = (carry_insert_data *) (item + 1); -+ -+ op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0); -+ if (!IS_ERR(op)) { -+ cdata->coord = insert_coord; -+ cdata->key = key; -+ cdata->data = item; -+ op->u.insert.d = cdata; -+ op->u.insert.type = COPT_ITEM_DATA; -+ build_child_ptr_data(node, item); -+ item->arg = NULL; -+ /* have @insert_coord to be set at inserted item after -+ insertion is done */ -+ todo->track_type = CARRY_TRACK_CHANGE; -+ todo->tracked = lh; -+ -+ result = reiser4_carry(todo, NULL); -+ if (result == 0) { -+ /* -+ * pin node in memory. This is necessary for -+ * znode_make_dirty() below. 
-+ */ -+ result = zload(node); -+ if (result == 0) { -+ lock_handle local_lh; -+ -+ /* -+ * if we inserted new child into tree we have -+ * to mark it dirty so that flush will be able -+ * to process it. -+ */ -+ init_lh(&local_lh); -+ result = longterm_lock_znode(&local_lh, node, -+ ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_LOPRI); -+ if (result == 0) { -+ znode_make_dirty(node); -+ -+ /* -+ * when internal item pointing to @node -+ * was inserted into twig node -+ * create_hook_internal did not connect -+ * it properly because its right -+ * neighbor was not known. Do it -+ * here -+ */ -+ write_lock_tree(tree); -+ assert("nikita-3312", -+ znode_is_right_connected(node)); -+ assert("nikita-2984", -+ node->right == NULL); -+ ZF_CLR(node, JNODE_RIGHT_CONNECTED); -+ write_unlock_tree(tree); -+ result = -+ connect_znode(insert_coord, node); -+ ON_DEBUG(if (result == 0) check_dkeys(node);); -+ -+ done_lh(lh); -+ move_lh(lh, &local_lh); -+ assert("vs-1676", node_is_empty(node)); -+ coord_init_first_unit(insert_coord, -+ node); -+ } else { -+ warning("nikita-3136", -+ "Cannot lock child"); -+ } -+ done_lh(&local_lh); -+ zrelse(node); -+ } -+ } -+ } else -+ result = PTR_ERR(op); -+ zput(node); -+ done_carry_pool(pool); -+ return result; -+} -+ -+/** -+ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal -+ * @h: search handle -+ * @outcome: flag saying whether search has to restart or is done -+ * -+ * Handles search on twig level. If this function completes search itself then -+ * it returns 1. If search has to go one level down then 0 is returned. If -+ * error happens then LOOKUP_DONE is returned via @outcome and error code is saved -+ * in @h->result. -+ */ -+int handle_eottl(cbk_handle *h, int *outcome) -+{ -+ int result; -+ reiser4_key key; -+ coord_t *coord; -+ -+ coord = h->coord; -+ -+ if (h->level != TWIG_LEVEL || -+ (coord_is_existing_item(coord) && item_is_internal(coord))) { -+ /* Continue to traverse tree downward. 
*/ -+ return 0; -+ } -+ -+ /* -+ * make sure that @h->coord is set to twig node and that it is either -+ * set to extent item or after extent item -+ */ -+ assert("vs-356", h->level == TWIG_LEVEL); -+ assert("vs-357", ( { -+ coord_t lcoord; -+ coord_dup(&lcoord, coord); -+ check_me("vs-733", coord_set_to_left(&lcoord) == 0); -+ item_is_extent(&lcoord); -+ } -+ )); -+ -+ if (*outcome == NS_FOUND) { -+ /* we have found desired key on twig level in extent item */ -+ h->result = CBK_COORD_FOUND; -+ *outcome = LOOKUP_DONE; -+ return 1; -+ } -+ -+ if (!(h->flags & CBK_FOR_INSERT)) { -+ /* tree traversal is not for insertion. Just return -+ CBK_COORD_NOTFOUND. */ -+ h->result = CBK_COORD_NOTFOUND; -+ *outcome = LOOKUP_DONE; -+ return 1; -+ } -+ -+ /* take a look at the item to the right of h -> coord */ -+ result = is_next_item_internal(coord, h->key, h->active_lh); -+ if (unlikely(result < 0)) { -+ h->error = "get_right_neighbor failed"; -+ h->result = result; -+ *outcome = LOOKUP_DONE; -+ return 1; -+ } -+ if (result == 0) { -+ /* -+ * item to the right is also an extent one. Allocate a new node -+ * and insert pointer to it after item h -> coord. -+ * -+ * This is a result of extents being located at the twig -+ * level. For explanation, see comment just above -+ * is_next_item_internal(). 
-+ */ -+ znode *loaded; -+ -+ if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) { -+ /* -+ * we got node read locked, restart coord_by_key to -+ * have write lock on twig level -+ */ -+ h->lock_level = TWIG_LEVEL; -+ h->lock_mode = ZNODE_WRITE_LOCK; -+ *outcome = LOOKUP_REST; -+ return 1; -+ } -+ -+ loaded = coord->node; -+ result = -+ add_empty_leaf(coord, h->active_lh, h->key, -+ rd_key(coord, &key)); -+ if (result) { -+ h->error = "could not add empty leaf"; -+ h->result = result; -+ *outcome = LOOKUP_DONE; -+ return 1; -+ } -+ /* added empty leaf is locked (h->active_lh), its parent node -+ is unlocked, h->coord is set as EMPTY */ -+ assert("vs-13", coord->between == EMPTY_NODE); -+ assert("vs-14", znode_is_write_locked(coord->node)); -+ assert("vs-15", -+ WITH_DATA(coord->node, node_is_empty(coord->node))); -+ assert("vs-16", jnode_is_leaf(ZJNODE(coord->node))); -+ assert("vs-17", coord->node == h->active_lh->node); -+ *outcome = LOOKUP_DONE; -+ h->result = CBK_COORD_NOTFOUND; -+ return 1; -+ } else if (result == 1) { -+ /* -+ * this is special case mentioned in the comment on -+ * tree.h:cbk_flags. We have found internal item immediately on -+ * the right of extent, and we are going to insert new item -+ * there. Key of item we are going to insert is smaller than -+ * leftmost key in the node pointed to by said internal item -+ * (otherwise search wouldn't come to the extent in the first -+ * place). -+ * -+ * This is a result of extents being located at the twig -+ * level. For explanation, see comment just above -+ * is_next_item_internal(). 
-+ */ -+ h->flags &= ~CBK_TRUST_DK; -+ } else { -+ assert("vs-8", result == 2); -+ *outcome = LOOKUP_REST; -+ return 1; -+ } -+ assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord))); -+ return 0; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/estimate.c linux-2.6.24/fs/reiser4/estimate.c ---- linux-2.6.24.orig/fs/reiser4/estimate.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/estimate.c 2008-01-25 11:39:06.912201506 +0300 -@@ -0,0 +1,120 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "tree.h" -+#include "carry.h" -+#include "inode.h" -+#include "plugin/cluster.h" -+#include "plugin/item/ctail.h" -+ -+/* this returns how many nodes might get dirty and added nodes if @children nodes are dirtied -+ -+ Amount of internals which will get dirty or get allocated we estimate as 5% of the childs + 1 balancing. 1 balancing -+ is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour nodes + the current (or 1 -+ neighbour and 1 new and the current) on twig level, 2 neighbour nodes on upper levels and 1 for a new root. So 5 for -+ leaf level, 3 for twig level, 2 on upper + 1 for root. -+ -+ Do not calculate the current node of the lowest level here - this is overhead only. -+ -+ children is almost always 1 here. Exception is flow insertion -+*/ -+static reiser4_block_nr -+max_balance_overhead(reiser4_block_nr childen, tree_level tree_height) -+{ -+ reiser4_block_nr ten_percent; -+ -+ ten_percent = ((103 * childen) >> 10); -+ -+ /* If we have too many balancings at the time, tree height can raise on more -+ then 1. Assume that if tree_height is 5, it can raise on 1 only. */ -+ return ((tree_height < 5 ? 
5 : tree_height) * 2 + (4 + ten_percent)); -+} -+ -+/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to -+ perform insertion of one item into the tree */ -+/* it is only called when tree height changes, or gets initialized */ -+reiser4_block_nr calc_estimate_one_insert(tree_level height) -+{ -+ return 1 + max_balance_overhead(1, height); -+} -+ -+reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree) -+{ -+ return tree->estimate_one_insert; -+} -+ -+/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to -+ perform insertion of one unit into an item in the tree */ -+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree) -+{ -+ /* estimate insert into item just like item insertion */ -+ return tree->estimate_one_insert; -+} -+ -+reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree) -+{ -+ /* on item removal reiser4 does not try to pack nodes more complact, so, only one node may be dirtied on leaf -+ level */ -+ return tree->estimate_one_insert; -+} -+ -+/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and -+ both its neighbors). Max_balance_overhead should estimate number of blocks which may change/get added on internal -+ levels */ -+reiser4_block_nr estimate_insert_flow(tree_level height) -+{ -+ return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 + -+ CARRY_FLOW_NEW_NODES_LIMIT, -+ height); -+} -+ -+/* returnes max number of nodes can be occupied by disk cluster */ -+static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped) -+{ -+ int per_cluster; -+ per_cluster = (unprepped ? 
1 : cluster_nrpages(inode)); -+ return 3 + per_cluster + -+ max_balance_overhead(3 + per_cluster, -+ REISER4_MAX_ZTREE_HEIGHT); -+} -+ -+/* how many nodes might get dirty and added -+ during insertion of a disk cluster */ -+reiser4_block_nr estimate_insert_cluster(struct inode * inode) -+{ -+ return estimate_cluster(inode, 1); /* 24 */ -+} -+ -+/* how many nodes might get dirty and added -+ during update of a (prepped or unprepped) disk cluster */ -+reiser4_block_nr estimate_update_cluster(struct inode * inode) -+{ -+ return estimate_cluster(inode, 0); /* 44, for 64K-cluster */ -+} -+ -+/* How many nodes occupied by a disk cluster might get dirty. -+ Note that this estimation is not precise (i.e. disk cluster -+ can occupy more nodes). -+ Q: Why we don't use precise estimation? -+ A: 1.Because precise estimation is fairly bad: 65536 nodes -+ for 64K logical cluster, it means 256M of dead space on -+ a partition -+ 2.It is a very rare case when disk cluster occupies more -+ nodes then this estimation returns. -+*/ -+reiser4_block_nr estimate_dirty_cluster(struct inode * inode) -+{ -+ return cluster_nrpages(inode) + 4; -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/export_ops.c linux-2.6.24/fs/reiser4/export_ops.c ---- linux-2.6.24.orig/fs/reiser4/export_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/export_ops.c 2008-01-25 12:03:29.960445090 +0300 -@@ -0,0 +1,319 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "inode.h" -+#include "plugin/plugin.h" -+ -+/* -+ * Supported file-handle types -+ */ -+typedef enum { -+ FH_WITH_PARENT = 0x10, /* file handle with parent */ -+ FH_WITHOUT_PARENT = 0x11 /* file handle without parent */ -+} reiser4_fhtype; -+ -+#define NFSERROR (255) -+ -+/* initialize place-holder for object */ -+static void object_on_wire_init(reiser4_object_on_wire *o) -+{ -+ o->plugin = NULL; -+} -+ -+/* finish with @o */ -+static void object_on_wire_done(reiser4_object_on_wire *o) -+{ -+ if (o->plugin != NULL) -+ o->plugin->wire.done(o); -+} -+ -+/* -+ * read serialized object identity from @addr and store information about -+ * object in @obj. This is dual to encode_inode(). -+ */ -+static char *decode_inode(struct super_block *s, char *addr, -+ reiser4_object_on_wire * obj) -+{ -+ file_plugin *fplug; -+ -+ /* identifier of object plugin is stored in the first two bytes, -+ * followed by... */ -+ fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr); -+ if (fplug != NULL) { -+ addr += sizeof(d16); -+ obj->plugin = fplug; -+ assert("nikita-3520", fplug->wire.read != NULL); -+ /* plugin specific encoding of object identity. 
*/ -+ addr = fplug->wire.read(addr, obj); -+ } else -+ addr = ERR_PTR(RETERR(-EINVAL)); -+ return addr; -+} -+ -+static struct dentry *reiser4_get_dentry(struct super_block *super, -+ void *data); -+/** -+ * reiser4_decode_fh: decode on-wire object - helper function -+ * for fh_to_dentry, fh_to_parent export operations; -+ * @super: super block; -+ * @addr: onwire object to be decoded; -+ * -+ * Returns dentry referring to the object being decoded. -+ */ -+static struct dentry *reiser4_decode_fh(struct super_block * super, -+ char * addr) -+{ -+ reiser4_object_on_wire object; -+ -+ object_on_wire_init(&object); -+ -+ addr = decode_inode(super, addr, &object); -+ if (!IS_ERR(addr)) { -+ struct dentry *d; -+ d = reiser4_get_dentry(super, &object); -+ if (d != NULL && !IS_ERR(d)) -+ /* FIXME check for -ENOMEM */ -+ reiser4_get_dentry_fsdata(d)->stateless = 1; -+ addr = (char *)d; -+ } -+ object_on_wire_done(&object); -+ return (void *)addr; -+} -+ -+static struct dentry *reiser4_fh_to_dentry(struct super_block *sb, -+ struct fid *fid, -+ int fh_len, int fh_type) -+{ -+ reiser4_context *ctx; -+ struct dentry *d; -+ -+ assert("edward-1536", -+ fh_type == FH_WITH_PARENT || fh_type == FH_WITHOUT_PARENT); -+ -+ ctx = reiser4_init_context(sb); -+ if (IS_ERR(ctx)) -+ return (struct dentry *)ctx; -+ -+ d = reiser4_decode_fh(sb, (char *)fid->raw); -+ -+ reiser4_exit_context(ctx); -+ return d; -+} -+ -+static struct dentry *reiser4_fh_to_parent(struct super_block *sb, -+ struct fid *fid, -+ int fh_len, int fh_type) -+{ -+ char * addr; -+ struct dentry * d; -+ reiser4_context *ctx; -+ file_plugin *fplug; -+ -+ if (fh_type == FH_WITHOUT_PARENT) -+ return NULL; -+ assert("edward-1537", fh_type == FH_WITH_PARENT); -+ -+ ctx = reiser4_init_context(sb); -+ if (IS_ERR(ctx)) -+ return (struct dentry *)ctx; -+ addr = (char *)fid->raw; -+ /* extract 2-bytes file plugin id */ -+ fplug = file_plugin_by_disk_id(reiser4_get_tree(sb), (d16 *)addr); -+ if (fplug == NULL) { -+ d = 
ERR_PTR(RETERR(-EINVAL)); -+ goto exit; -+ } -+ addr += sizeof(d16); -+ /* skip previously encoded object */ -+ addr = fplug->wire.read(addr, NULL /* skip */); -+ if (IS_ERR(addr)) { -+ d = (struct dentry *)addr; -+ goto exit; -+ } -+ /* @extract and decode parent object */ -+ d = reiser4_decode_fh(sb, addr); -+ exit: -+ reiser4_exit_context(ctx); -+ return d; -+} -+ -+/* -+ * Object serialization support. -+ * -+ * To support knfsd file system provides export_operations that are used to -+ * construct and interpret NFS file handles. As a generalization of this, -+ * reiser4 object plugins have serialization support: it provides methods to -+ * create on-wire representation of identity of reiser4 object, and -+ * re-create/locate object given its on-wire identity. -+ * -+ */ -+ -+/* -+ * return number of bytes that on-wire representation of @inode's identity -+ * consumes. -+ */ -+static int encode_inode_size(struct inode *inode) -+{ -+ assert("nikita-3514", inode != NULL); -+ assert("nikita-3515", inode_file_plugin(inode) != NULL); -+ assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL); -+ -+ return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16); -+} -+ -+/* -+ * store on-wire representation of @inode's identity at the area beginning at -+ * @start. -+ */ -+static char *encode_inode(struct inode *inode, char *start) -+{ -+ assert("nikita-3517", inode != NULL); -+ assert("nikita-3518", inode_file_plugin(inode) != NULL); -+ assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL); -+ -+ /* -+ * first, store two-byte identifier of object plugin, then -+ */ -+ save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)), -+ (d16 *) start); -+ start += sizeof(d16); -+ /* -+ * call plugin to serialize object's identity -+ */ -+ return inode_file_plugin(inode)->wire.write(inode, start); -+} -+ -+/* this returns number of 32 bit long numbers encoded in @lenp. 
255 is -+ * returned if file handle can not be stored */ -+/** -+ * reiser4_encode_fh - encode_fh of export operations -+ * @dentry: -+ * @fh: -+ * @lenp: -+ * @need_parent: -+ * -+ */ -+static int -+reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp, -+ int need_parent) -+{ -+ struct inode *inode; -+ struct inode *parent; -+ char *addr; -+ int need; -+ int delta; -+ int result; -+ reiser4_context *ctx; -+ -+ /* -+ * knfsd asks as to serialize object in @dentry, and, optionally its -+ * parent (if need_parent != 0). -+ * -+ * encode_inode() and encode_inode_size() is used to build -+ * representation of object and its parent. All hard work is done by -+ * object plugins. -+ */ -+ inode = dentry->d_inode; -+ parent = dentry->d_parent->d_inode; -+ -+ addr = (char *)fh; -+ -+ need = encode_inode_size(inode); -+ if (need < 0) -+ return NFSERROR; -+ if (need_parent) { -+ delta = encode_inode_size(parent); -+ if (delta < 0) -+ return NFSERROR; -+ need += delta; -+ } -+ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ if (need <= sizeof(__u32) * (*lenp)) { -+ addr = encode_inode(inode, addr); -+ if (need_parent) -+ addr = encode_inode(parent, addr); -+ -+ /* store in lenp number of 32bit words required for file -+ * handle. */ -+ *lenp = (need + sizeof(__u32) - 1) >> 2; -+ result = need_parent ? 
FH_WITH_PARENT : FH_WITHOUT_PARENT; -+ } else -+ /* no enough space in file handle */ -+ result = NFSERROR; -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * reiser4_get_dentry_parent - get_parent of export operations -+ * @child: -+ * -+ */ -+static struct dentry *reiser4_get_dentry_parent(struct dentry *child) -+{ -+ struct inode *dir; -+ dir_plugin *dplug; -+ -+ assert("nikita-3527", child != NULL); -+ /* see comment in reiser4_get_dentry() about following assertion */ -+ assert("nikita-3528", is_in_reiser4_context()); -+ -+ dir = child->d_inode; -+ assert("nikita-3529", dir != NULL); -+ dplug = inode_dir_plugin(dir); -+ assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL)); -+ if (dplug != NULL) -+ return dplug->get_parent(dir); -+ else -+ return ERR_PTR(RETERR(-ENOTDIR)); -+} -+ -+/** -+ * reiser4_get_dentry - get_dentry of export operations -+ * @super: -+ * @data: -+ * -+ * -+ */ -+static struct dentry *reiser4_get_dentry(struct super_block *super, void *data) -+{ -+ reiser4_object_on_wire *o; -+ -+ assert("nikita-3522", super != NULL); -+ assert("nikita-3523", data != NULL); -+ /* -+ * this is only supposed to be called by -+ * -+ * reiser4_decode_fh->find_exported_dentry -+ * -+ * so, reiser4_context should be here already. 
-+ */ -+ assert("nikita-3526", is_in_reiser4_context()); -+ -+ o = (reiser4_object_on_wire *)data; -+ assert("nikita-3524", o->plugin != NULL); -+ assert("nikita-3525", o->plugin->wire.get != NULL); -+ -+ return o->plugin->wire.get(super, o); -+} -+ -+struct export_operations reiser4_export_operations = { -+ .encode_fh = reiser4_encode_fh, -+ .fh_to_dentry = reiser4_fh_to_dentry, -+ .fh_to_parent = reiser4_fh_to_parent, -+ .get_parent = reiser4_get_dentry_parent, -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/flush.c linux-2.6.24/fs/reiser4/flush.c ---- linux-2.6.24.orig/fs/reiser4/flush.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/flush.c 2008-01-25 11:39:06.000000000 +0300 -@@ -0,0 +1,3625 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* The design document for this file is at http://www.namesys.com/v4/v4.html. 
*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/plugin.h" -+#include "plugin/object.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "carry.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "page_cache.h" -+#include "wander.h" -+#include "super.h" -+#include "entd.h" -+#include "reiser4.h" -+#include "flush.h" -+#include "writeout.h" -+ -+#include -+#include /* for struct super_block */ -+#include /* for struct page */ -+#include /* for struct bio */ -+#include -+#include -+ -+/* IMPLEMENTATION NOTES */ -+ -+/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total -+ order to the nodes of the tree in which the parent is placed before its children, which -+ are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it -+ describes the node that "came before in forward parent-first order". When we speak of a -+ "parent-first follower", it describes the node that "comes next in parent-first -+ order" (alternatively the node that "came before in reverse parent-first order"). -+ -+ The following pseudo-code prints the nodes of a tree in forward parent-first order: -+ -+ void parent_first (node) -+ { -+ print_node (node); -+ if (node->level > leaf) { -+ for (i = 0; i < num_children; i += 1) { -+ parent_first (node->child[i]); -+ } -+ } -+ } -+*/ -+ -+/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so -+ that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order) -+ can be accomplished with sequential reads, which results in reading nodes in their -+ parent-first order. 
This is a read-optimization aspect of the flush algorithm, and -+ there is also a write-optimization aspect, which is that we wish to make large -+ sequential writes to the disk by allocating or reallocating blocks so that they can be -+ written in sequence. Sometimes the read-optimization and write-optimization goals -+ conflict with each other, as we discuss in more detail below. -+*/ -+ -+/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are -+ the relevant jnode->state bits and their relevance to flush: -+ -+ JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it -+ must be allocated first. In order to be considered allocated, the jnode must have -+ exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and -+ all dirtied jnodes eventually have one of these bits set during each transaction. -+ -+ JNODE_CREATED: The node was freshly created in its transaction and has no previous -+ block address, so it is unconditionally assigned to be relocated, although this is -+ mainly for code-convenience. It is not being 'relocated' from anything, but in -+ almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit -+ remains set even after JNODE_RELOC is set, so the actual relocate can be -+ distinguished from the created-and-allocated set easily: relocate-set members -+ (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which -+ have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set. -+ -+ JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the -+ decision to maintain the pre-existing location for this node and it will be written -+ to the wandered-log. -+ -+ JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was -+ not created, see note above). 
A block with JNODE_RELOC set is eligible for -+ early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC -+ bit is set on a znode, the parent node's internal item is modified and the znode is -+ rehashed. -+ -+ JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node -+ and calls plugin->f.squeeze() method for its items. By this technology we update disk -+ clusters of cryptcompress objects. Also if leftmost point that was found by flush scan -+ has this flag (races with write(), rare case) the flush algorithm makes the decision -+ to pass it to squalloc() in spite of its flushprepped status for squeezing, not for -+ repeated allocation. -+ -+ JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its -+ flush queue. This means the jnode is not on any clean or dirty list, instead it is -+ moved to one of the flush queue (see flush_queue.h) object private list. This -+ prevents multiple concurrent flushes from attempting to start flushing from the -+ same node. -+ -+ (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up -+ squeeze-and-allocate on a node while its children are actively being squeezed and -+ allocated. This flag was created to avoid submitting a write request for a node -+ while its children are still being allocated and squeezed. Then flush queue was -+ re-implemented to allow an unlimited number of nodes to be queued. This flag support was -+ commented out in source code because we decided that there was no reason to submit -+ queued nodes before jnode_flush() finishes. However, current code calls fq_write() -+ during a slum traversal and may submit "busy nodes" to disk. Probably we can -+ re-enable the JNODE_FLUSH_BUSY bit support in future. -+ -+ With these state bits, we describe a test used frequently in the code below, -+ jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). 
The -+ test for "flushprepped" returns true if any of the following are true: -+ -+ - The node is not dirty -+ - The node has JNODE_RELOC set -+ - The node has JNODE_OVRWR set -+ -+ If either the node is not dirty or it has already been processed by flush (and assigned -+ JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns -+ false then flush has work to do on that node. -+*/ -+ -+/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never -+ flushprepped twice (unless an explicit call to flush_unprep is made as described in -+ detail below). For example a node is dirtied, allocated, and then early-flushed to -+ disk and set clean. Before the transaction commits, the page is dirtied again and, due -+ to memory pressure, the node is flushed again. The flush algorithm will not relocate -+ the node to a new disk location, it will simply write it to the same, previously -+ relocated position again. -+*/ -+ -+/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we -+ start at a leaf node and allocate in parent-first order by iterating to the right. At -+ each step of the iteration, we check for the right neighbor. Before advancing to the -+ right neighbor, we check if the current position and the right neighbor share the same -+ parent. If they do not share the same parent, the parent is allocated before the right -+ neighbor. -+ -+ This process goes recursively up the tree and squeezes nodes level by level as long as -+ the right neighbor and the current position have different parents, then it allocates -+ the right-neighbors-with-different-parents on the way back down. This process is -+ described in more detail in flush_squalloc_changed_ancestor and the recursive function -+ squalloc_one_changed_ancestor. But the purpose here is not to discuss the -+ specifics of the bottom-up approach as it is to contrast the bottom-up and top-down -+ approaches. 
-+ -+ The top-down algorithm was implemented earlier (April-May 2002). In the top-down -+ approach, we find a starting point by scanning left along each level past dirty nodes, -+ then going up and repeating the process until the left node and the parent node are -+ clean. We then perform a parent-first traversal from the starting point, which makes -+ allocating in parent-first order trivial. After one subtree has been allocated in this -+ manner, we move to the right, try moving upward, then repeat the parent-first -+ traversal. -+ -+ Both approaches have problems that need to be addressed. Both are approximately the -+ same amount of code, but the bottom-up approach has advantages in the order it acquires -+ locks which, at the very least, make it the better approach. At first glance each one -+ makes the other one look simpler, so it is important to remember a few of the problems -+ with each one. -+ -+ Main problem with the top-down approach: When you encounter a clean child during the -+ parent-first traversal, what do you do? You would like to avoid searching through a -+ large tree of nodes just to find a few dirty leaves at the bottom, and there is not an -+ obvious solution. One of the advantages of the top-down approach is that during the -+ parent-first traversal you check every child of a parent to see if it is dirty. In -+ this way, the top-down approach easily handles the main problem of the bottom-up -+ approach: unallocated children. -+ -+ The unallocated children problem is that before writing a node to disk we must make -+ sure that all of its children are allocated. Otherwise, writing the node means -+ extra I/O because the node will have to be written again when the child is finally -+ allocated. -+ -+ WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. 
Except for bugs, this -+ should not cause any file system corruption, it only degrades I/O performance because a -+ node may be written when it is sure to be written at least one more time in the same -+ transaction when the remaining children are allocated. What follows is a description -+ of how we will solve the problem. -+*/ -+ -+/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then, -+ proceeding in parent first order, allocate some of its left-children, then encounter a -+ clean child in the middle of the parent. We do not allocate the clean child, but there -+ may remain unallocated (dirty) children to the right of the clean child. If we were to -+ stop flushing at this moment and write everything to disk, the parent might still -+ contain unallocated children. -+ -+ We could try to allocate all the descendents of every node that we allocate, but this -+ is not necessary. Doing so could result in allocating the entire tree: if the root -+ node is allocated then every unallocated node would have to be allocated before -+ flushing. Actually, we do not have to write a node just because we allocate it. It is -+ possible to allocate but not write a node during flush, when it still has unallocated -+ children. However, this approach is probably not optimal for the following reason. -+ -+ The flush algorithm is designed to allocate nodes in parent-first order in an attempt -+ to optimize reads that occur in the same order. Thus we are read-optimizing for a -+ left-to-right scan through all the leaves in the system, and we are hoping to -+ write-optimize at the same time because those nodes will be written together in batch. -+ What happens, however, if we assign a block number to a node in its read-optimized -+ order but then avoid writing it because it has unallocated children? 
In that -+ situation, we lose out on the write-optimization aspect because a node will have to be -+ written again to its location on the device, later, which likely means seeking back -+ to that location. -+ -+ So there are tradeoffs. We can choose either: -+ -+ A. Allocate all unallocated children to preserve both write-optimization and -+ read-optimization, but this is not always desirable because it may mean having to -+ allocate and flush very many nodes at once. -+ -+ B. Defer writing nodes with unallocated children, keep their read-optimized locations, -+ but sacrifice write-optimization because those nodes will be written again. -+ -+ C. Defer writing nodes with unallocated children, but do not keep their read-optimized -+ locations. Instead, choose to write-optimize them later, when they are written. To -+ facilitate this, we "undo" the read-optimized allocation that was given to the node so -+ that later it can be write-optimized, thus "unpreparing" the flush decision. This is a -+ case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a -+ call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit; -+ if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block -+ location, and set the JNODE_CREATED bit, effectively setting the node back to an -+ unallocated state. -+ -+ We will take the following approach in v4.0: for twig nodes we will always finish -+ allocating unallocated children (A). For nodes with (level > TWIG) we will defer -+ writing and choose write-optimization (C). -+ -+ To summarize, there are several parts to a solution that avoids the problem with -+ unallocated children: -+ -+ FIXME-ZAM: Still no one approach is implemented to eliminate the "UNALLOCATED CHILDREN" -+ problem because an experiment which was done showed that we have 1-2 nodes -+ with unallocated children for thousands of written nodes. 
The experiment was simple -+ like copying / deletion of linux kernel sources. However the problem can arise in more -+ complex tests. I think we have jnode_io_hook to insert a check for unallocated -+ children and see what kind of problem we have. -+ -+ 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling -+ squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to -+ implement: should be simple -- amounts to adding a while loop to jnode_flush, see -+ comments in that function. -+ -+ 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still -+ have unallocated children. If the twig level has unallocated children it is an -+ assertion failure. If a higher-level node has unallocated children, then it should be -+ explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement: -+ should be simple. -+ -+ 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more -+ CPU cycles than we would like, and it is possible (but medium complexity) to optimize -+ this somewhat in the case where large sub-trees are flushed. The following observation -+ helps: if both the left- and right-neighbor of a node are processed by the flush -+ algorithm then the node itself is guaranteed to have all of its children allocated. -+ However, the cost of this check may not be so expensive after all: it is not needed for -+ leaves and flush can guarantee this property for twigs. That leaves only (level > -+ TWIG) nodes that have to be checked, so this optimization only helps if at least three -+ (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless -+ there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes -+ then the number of blocks being written will be very large, so the savings may be -+ insignificant. 
That said, the idea is to maintain both the left and right edges of -+ nodes that are processed in flush. When flush_empty_queue() is called, a relatively -+ simple test will tell whether the (level > TWIG) node is on the edge. If it is on the -+ edge, the slow check is necessary, but if it is in the interior then it can be assumed -+ to have all of its children allocated. FIXME: medium complexity to implement, but -+ simple to verify given that we must have a slow check anyway. -+ -+ 4. (Optional) This part is optional, not for v4.0--flush should work independently of -+ whether this option is used or not. Called RAPID_SCAN, the idea is to amend the -+ left-scan operation to take unallocated children into account. Normally, the left-scan -+ operation goes left as long as adjacent nodes are dirty up until some large maximum -+ value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left -+ may stop at a position where there are unallocated children to the left with the same -+ parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after -+ FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds -+ with a rapid scan. The rapid scan skips all the interior children of a node--if the -+ leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the -+ twig to the left). If the left neighbor of the leftmost child is also dirty, then -+ continue the scan at the left twig and repeat. This option will cause flush to -+ allocate more twigs in a single pass, but it also has the potential to write many more -+ nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN -+ was partially implemented, code removed August 12, 2002 by JMACD. -+*/ -+ -+/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the -+ starting point for flush is a leaf node, but actually the flush code cares very little -+ about whether or not this is true.
It is possible that all the leaf nodes are flushed -+ and dirty parent nodes still remain, in which case jnode_flush() is called on a -+ non-leaf argument. Flush doesn't care--it treats the argument node as if it were a -+ leaf, even when it is not. This is a simple approach, and there may be a more optimal -+ policy but until a problem with this approach is discovered, simplest is probably best. -+ -+ NOTE: In this case, the ordering produced by flush is parent-first only if you ignore -+ the leaves. This is done as a matter of simplicity and there is only one (shaky) -+ justification. When an atom commits, it flushes all leaf level nodes first, followed -+ by twigs, and so on. With flushing done in this order, if flush is eventually called -+ on a non-leaf node it means that (somehow) we reached a point where all leaves are -+ clean and only internal nodes need to be flushed. If that is the case, then it means -+ there were no leaves that were the parent-first preceder/follower of the parent. This -+ is expected to be a rare case, which is why we do nothing special about it. However, -+ memory pressure may pass an internal node to flush when there are still dirty leaf -+ nodes that need to be flushed, which could prove our original assumptions -+ "inoperative". If this needs to be fixed, then scan_left/right should have -+ special checks for the non-leaf levels. For example, instead of passing from a node to -+ the left neighbor, it should pass from the node to the left neighbor's rightmost -+ descendent (if dirty). -+ -+*/ -+ -+/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting -+ it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the -+ logical device--the left (starting) end of the device if we are walking from left to right, the right end of the -+ device if we are walking from right to left.
We then make passes in alternating directions, and as we do this the -+ device becomes sorted such that tree order and block number order fully correlate. -+ -+ Resizing is done by shifting everything either all the way to the left or all the way -+ to the right, and then reporting the last block. -+*/ -+ -+/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This -+ describes the policy from the highest level: -+ -+ The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the -+ leaf level during flush-scan (right, left), then we unconditionally decide to relocate -+ leaf nodes. -+ -+ Otherwise, there are two contexts in which we make a decision to relocate: -+ -+ 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test(). -+ During the initial stages of flush, after scan-right completes, we want to ask the -+ question: should we relocate this leaf node and thus dirty the parent node. Then if -+ the node is a leftmost child its parent is its own parent-first preceder, thus we repeat -+ the question at the next level up, and so on. In these cases we are moving in the -+ reverse-parent first direction. -+ -+ There is another case which is considered the reverse direction, which comes at the end -+ of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may -+ reach a point where there is a clean twig to the right with a dirty leftmost child. In -+ this case, we may wish to relocate the child by testing if it should be relocated -+ relative to its parent. -+ -+ 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in -+ allocate_znode. What distinguishes the forward parent-first case from the -+ reverse-parent first case is that the preceder has already been allocated in the -+ forward case, whereas in the reverse case we don't know what the preceder is until we -+ finish "going in reverse".
That simplifies the forward case considerably, and there we -+ actually use the block allocator to determine whether, e.g., a block closer to the -+ preceder is available. -+*/ -+ -+/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we -+ finish scan-left and find a starting point, if the parent's left neighbor is dirty then -+ squeeze the parent's left neighbor and the parent. This may change the -+ flush-starting-node's parent. Repeat until the child's parent is stable. If the child -+ is a leftmost child, repeat this left-edge squeezing operation at the next level up. -+ Note that we cannot allocate extents during this or they will be out of parent-first -+ order. There is also some difficult coordinate maintenence issues. We can't do a tree -+ search to find coordinates again (because we hold locks), we have to determine them -+ from the two nodes being squeezed. Looks difficult, but has potential to increase -+ space utilization. */ -+ -+/* Flush-scan helper functions. */ -+static void scan_init(flush_scan * scan); -+static void scan_done(flush_scan * scan); -+ -+/* Flush-scan algorithm. */ -+static int scan_left(flush_scan * scan, flush_scan * right, jnode * node, -+ unsigned limit); -+static int scan_right(flush_scan * scan, jnode * node, unsigned limit); -+static int scan_common(flush_scan * scan, flush_scan * other); -+static int scan_formatted(flush_scan * scan); -+static int scan_unformatted(flush_scan * scan, flush_scan * other); -+static int scan_by_coord(flush_scan * scan); -+ -+/* Initial flush-point ancestor allocation. */ -+static int alloc_pos_and_ancestors(flush_pos_t * pos); -+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos); -+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos); -+ -+/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */ -+static int squalloc(flush_pos_t * pos); -+ -+/* Flush squeeze implementation. 
*/ -+static int squeeze_right_non_twig(znode * left, znode * right); -+static int shift_one_internal_unit(znode * left, znode * right); -+ -+/* Flush reverse parent-first relocation routines. */ -+static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, -+ const reiser4_block_nr * nblk); -+static int reverse_relocate_test(jnode * node, const coord_t * parent_coord, -+ flush_pos_t * pos); -+static int reverse_relocate_check_dirty_parent(jnode * node, -+ const coord_t * parent_coord, -+ flush_pos_t * pos); -+ -+/* Flush allocate write-queueing functions: */ -+static int allocate_znode(znode * node, const coord_t * parent_coord, -+ flush_pos_t * pos); -+static int allocate_znode_update(znode * node, const coord_t * parent_coord, -+ flush_pos_t * pos); -+static int lock_parent_and_allocate_znode(znode *, flush_pos_t *); -+ -+/* Flush helper functions: */ -+static int jnode_lock_parent_coord(jnode * node, -+ coord_t * coord, -+ lock_handle * parent_lh, -+ load_count * parent_zh, -+ znode_lock_mode mode, int try); -+static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side, -+ znode_lock_mode mode, int check_dirty, int expected); -+static int znode_same_parents(znode * a, znode * b); -+ -+static int znode_check_flushprepped(znode * node) -+{ -+ return jnode_check_flushprepped(ZJNODE(node)); -+} -+ -+/* Flush position functions */ -+static void pos_init(flush_pos_t * pos); -+static int pos_valid(flush_pos_t * pos); -+static void pos_done(flush_pos_t * pos); -+static int pos_stop(flush_pos_t * pos); -+ -+/* check that @org is first jnode extent unit, if extent is unallocated, -+ * because all jnodes of unallocated extent are dirty and of the same atom. 
*/ -+#define checkchild(scan) \ -+assert("nikita-3435", \ -+ ergo(scan->direction == LEFT_SIDE && \ -+ (scan->parent_coord.node->level == TWIG_LEVEL) && \ -+ jnode_is_unformatted(scan->node) && \ -+ extent_is_unallocated(&scan->parent_coord), \ -+ extent_unit_index(&scan->parent_coord) == index_jnode(scan->node))) -+ -+/* This flush_cnt variable is used to track the number of concurrent flush operations, -+ useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has -+ no static initializer function...) */ -+ON_DEBUG(atomic_t flush_cnt; -+ ) -+ -+/* check fs backing device for write congestion */ -+static int check_write_congestion(void) -+{ -+ struct super_block *sb; -+ struct backing_dev_info *bdi; -+ -+ sb = reiser4_get_current_sb(); -+ bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info; -+ return bdi_write_congested(bdi); -+} -+ -+/* conditionally write flush queue */ -+static int write_prepped_nodes(flush_pos_t * pos) -+{ -+ int ret; -+ -+ assert("zam-831", pos); -+ assert("zam-832", pos->fq); -+ -+ if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS)) -+ return 0; -+ -+ if (check_write_congestion()) -+ return 0; -+ -+ ret = reiser4_write_fq(pos->fq, pos->nr_written, -+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM); -+ return ret; -+} -+ -+/* Proper release all flush pos. 
resources then move flush position to new -+ locked node */ -+static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock, -+ load_count * new_load, const coord_t * new_coord) -+{ -+ assert("zam-857", new_lock->node == new_load->node); -+ -+ if (new_coord) { -+ assert("zam-858", new_coord->node == new_lock->node); -+ coord_dup(&pos->coord, new_coord); -+ } else { -+ coord_init_first_unit(&pos->coord, new_lock->node); -+ } -+ -+ if (pos->child) { -+ jput(pos->child); -+ pos->child = NULL; -+ } -+ -+ move_load_count(&pos->load, new_load); -+ done_lh(&pos->lock); -+ move_lh(&pos->lock, new_lock); -+} -+ -+/* delete empty node which link from the parent still exists. */ -+static int delete_empty_node(znode * node) -+{ -+ reiser4_key smallest_removed; -+ -+ assert("zam-1019", node != NULL); -+ assert("zam-1020", node_is_empty(node)); -+ assert("zam-1023", znode_is_wlocked(node)); -+ -+ return reiser4_delete_node(node, &smallest_removed, NULL, 1); -+} -+ -+/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */ -+static int prepare_flush_pos(flush_pos_t * pos, jnode * org) -+{ -+ int ret; -+ load_count load; -+ lock_handle lock; -+ -+ init_lh(&lock); -+ init_load_count(&load); -+ -+ if (jnode_is_znode(org)) { -+ ret = longterm_lock_znode(&lock, JZNODE(org), -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI); -+ if (ret) -+ return ret; -+ -+ ret = incr_load_count_znode(&load, JZNODE(org)); -+ if (ret) -+ return ret; -+ -+ pos->state = -+ (jnode_get_level(org) == -+ LEAF_LEVEL) ? 
POS_ON_LEAF : POS_ON_INTERNAL; -+ move_flush_pos(pos, &lock, &load, NULL); -+ } else { -+ coord_t parent_coord; -+ ret = jnode_lock_parent_coord(org, &parent_coord, &lock, -+ &load, ZNODE_WRITE_LOCK, 0); -+ if (ret) -+ goto done; -+ if (!item_is_extent(&parent_coord)) { -+ /* file was converted to tail, org became HB, we found internal -+ item */ -+ ret = -EAGAIN; -+ goto done; -+ } -+ -+ pos->state = POS_ON_EPOINT; -+ move_flush_pos(pos, &lock, &load, &parent_coord); -+ pos->child = jref(org); -+ if (extent_is_unallocated(&parent_coord) -+ && extent_unit_index(&parent_coord) != index_jnode(org)) { -+ /* @org is not first child of its parent unit. This may happen -+ because longerm lock of its parent node was released between -+ scan_left and scan_right. For now work around this having flush to repeat */ -+ ret = -EAGAIN; -+ } -+ } -+ -+ done: -+ done_load_count(&load); -+ done_lh(&lock); -+ return ret; -+} -+ -+/* TODO LIST (no particular order): */ -+/* I have labelled most of the legitimate FIXME comments in this file with letters to -+ indicate which issue they relate to. There are a few miscellaneous FIXMEs with -+ specific names mentioned instead that need to be inspected/resolved. */ -+/* B. There is an issue described in reverse_relocate_test having to do with an -+ imprecise is_preceder? check having to do with partially-dirty extents. The code that -+ sets preceder hints and computes the preceder is basically untested. Careful testing -+ needs to be done that preceder calculations are done correctly, since if it doesn't -+ affect correctness we will not catch this stuff during regular testing. */ -+/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are -+ considered expected but unlikely conditions. Flush currently returns 0 (i.e., success -+ but no progress, i.e., restart) whenever it receives any of these in jnode_flush(). 
-+ Many of the calls that may produce one of these return values (i.e., -+ longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these -+ values themselves and, for instance, stop flushing instead of resulting in a restart. -+ If any of these results are true error conditions then flush will go into a busy-loop, -+ as we noticed during testing when a corrupt tree caused find_child_ptr to return -+ ENOENT. It needs careful thought and testing of corner conditions. -+*/ -+/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created -+ block is assigned a block number then early-flushed to disk. It is dirtied again and -+ flush is called again. Concurrently, that block is deleted, and the de-allocation of -+ its block number does not need to be deferred, since it is not part of the preserve set -+ (i.e., it didn't exist before the transaction). I think there may be a race condition -+ where flush writes the dirty, created block after the non-deferred deallocated block -+ number is re-allocated, making it possible to write deleted data on top of non-deleted -+ data. It's just a theory, but it needs to be thought out. */ -+/* F. bio_alloc() failure is not handled gracefully. */ -+/* G. Unallocated children. */ -+/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */ -+/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */ -+ -+/* JNODE_FLUSH: MAIN ENTRY POINT */ -+/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty -+ neighborhood is named "slum"). Jnode_flush() is called if reiser4 has to write dirty -+ blocks to disk, it happens when Linux VM decides to reduce number of dirty pages or as -+ a part of transaction commit. -+ -+ Our objective here is to prep and flush the slum the jnode belongs to.
We want to -+ squish the slum together, and allocate the nodes in it as we squish because allocation -+ of children affects squishing of parents. -+ -+ The "argument" @node tells flush where to start. From there, flush finds the left edge -+ of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a -+ "better place" to start squalloc first we perform a flush_scan. -+ -+ Flush-scanning may be performed in both left and right directions, but for different -+ purposes. When scanning to the left, we are searching for a node that precedes a -+ sequence of parent-first-ordered nodes which we will then flush in parent-first order. -+ During flush-scanning, we also take the opportunity to count the number of consecutive -+ leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we -+ make a decision to reallocate leaf nodes (thus favoring write-optimization). -+ -+ Since the flush argument node can be anywhere in a sequence of dirty leaves, there may -+ also be dirty nodes to the right of the argument. If the scan-left operation does not -+ count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan -+ operation to see whether there are, in fact, enough nodes to meet the relocate -+ threshold. Each right- and left-scan operation uses a single flush_scan object. -+ -+ After left-scan and possibly right-scan, we prepare a flush_position object with the -+ starting flush point or parent coordinate, which was determined using scan-left. -+ -+ Next we call the main flush routine, squalloc, which iterates along the -+ leaf level, squeezing and allocating nodes (and placing them into the flush queue). -+ -+ After squalloc returns we take extra steps to ensure that all the children -+ of the final twig node are allocated--this involves repeating squalloc -+ until we finish at a twig with no unallocated children. -+ -+ Finally, we call flush_empty_queue to submit write-requests to disk.
If we encounter -+ any above-twig nodes during flush_empty_queue that still have unallocated children, we -+ flush_unprep them. -+ -+ Flush treats several "failure" cases as non-failures, essentially causing them to start -+ over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should -+ probably be handled properly rather than restarting, but there are a bunch of cases to -+ audit. -+*/ -+ -+static int -+jnode_flush(jnode * node, long nr_to_write, long *nr_written, -+ flush_queue_t * fq, int flags) -+{ -+ long ret = 0; -+ flush_scan *right_scan; -+ flush_scan *left_scan; -+ flush_pos_t *flush_pos; -+ int todo; -+ struct super_block *sb; -+ reiser4_super_info_data *sbinfo; -+ jnode *leftmost_in_slum = NULL; -+ -+ assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack())); -+ assert("nikita-3022", reiser4_schedulable()); -+ -+ assert("nikita-3185", -+ get_current_super_private()->delete_mutex_owner != current); -+ -+ /* allocate right_scan, left_scan and flush_pos */ -+ right_scan = -+ kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos), -+ reiser4_ctx_gfp_mask_get()); -+ if (right_scan == NULL) -+ return RETERR(-ENOMEM); -+ left_scan = right_scan + 1; -+ flush_pos = (flush_pos_t *) (left_scan + 1); -+ -+ sb = reiser4_get_current_sb(); -+ sbinfo = get_super_private(sb); -+ -+ /* Flush-concurrency debug code */ -+#if REISER4_DEBUG -+ atomic_inc(&flush_cnt); -+#endif -+ -+ reiser4_enter_flush(sb); -+ -+ /* Initialize a flush position. */ -+ pos_init(flush_pos); -+ -+ flush_pos->nr_written = nr_written; -+ flush_pos->fq = fq; -+ flush_pos->flags = flags; -+ flush_pos->nr_to_write = nr_to_write; -+ -+ scan_init(right_scan); -+ scan_init(left_scan); -+ -+ /* First scan left and remember the leftmost scan position. If the leftmost -+ position is unformatted we remember its parent_coord. We scan until counting -+ FLUSH_SCAN_MAXNODES. 
-+ -+ If starting @node is unformatted, at the beginning of left scan its -+ parent (twig level node, containing extent item) will be long term -+ locked and lock handle will be stored in the -+ @right_scan->parent_lock. This lock is used to start the rightward -+ scan without redoing the tree traversal (necessary to find parent) -+ and, hence, is kept during leftward scan. As a result, we have to -+ use try-lock when taking long term locks during the leftward scan. -+ */ -+ ret = scan_left(left_scan, right_scan, -+ node, sbinfo->flush.scan_maxnodes); -+ if (ret != 0) -+ goto failed; -+ -+ leftmost_in_slum = jref(left_scan->node); -+ scan_done(left_scan); -+ -+ /* Then possibly go right to decide if we will use a policy of relocating leaves. -+ This is only done if we did not scan past (and count) enough nodes during the -+ leftward scan. If we do scan right, we only care to go far enough to establish -+ that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The -+ scan limit is the difference between left_scan.count and the threshold. */ -+ -+ todo = sbinfo->flush.relocate_threshold - left_scan->count; -+ /* scan right is inherently deadlock prone, because we are -+ * (potentially) holding a lock on the twig node at this moment. -+ * FIXME: this is incorrect comment: lock is not held */ -+ if (todo > 0) { -+ ret = scan_right(right_scan, node, (unsigned)todo); -+ if (ret != 0) -+ goto failed; -+ } -+ -+ /* Only the right-scan count is needed, release any rightward locks right away. */ -+ scan_done(right_scan); -+ -+ /* ... and the answer is: we should relocate leaf nodes if at least -+ FLUSH_RELOCATE_THRESHOLD nodes were found. */ -+ flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) || -+ (left_scan->count + right_scan->count >= -+ sbinfo->flush.relocate_threshold); -+ -+ /* Funny business here. We set the 'point' in the flush_position at prior to -+ starting squalloc regardless of whether the first point is -+ formatted or unformatted. 
Without this there would be an invariant, in the -+ rest of the code, that if the flush_position is unformatted then -+ flush_position->point is NULL and flush_position->parent_{lock,coord} is set, -+ and if the flush_position is formatted then flush_position->point is non-NULL -+ and no parent info is set. -+ -+ This seems lazy, but it makes the initial calls to reverse_relocate_test -+ (which ask "is it the pos->point the leftmost child of its parent") much easier -+ because we know the first child already. Nothing is broken by this, but the -+ reasoning is subtle. Holding an extra reference on a jnode during flush can -+ cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not -+ removed from sibling lists until they have zero reference count. Flush would -+ never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only -+ deleted to the right. So if nothing is broken, why fix it? -+ -+ NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any -+ point and in any moment, because of the concurrent file system -+ activity (for example, truncate). */ -+ -+ /* Check jnode state after flush_scan completed. Having a lock on this -+ node or its parent (in case of unformatted) helps us in case of -+ concurrent flushing. */ -+ if (jnode_check_flushprepped(leftmost_in_slum) -+ && !jnode_convertible(leftmost_in_slum)) { -+ ret = 0; -+ goto failed; -+ } -+ -+ /* Now setup flush_pos using scan_left's endpoint. 
*/ -+ ret = prepare_flush_pos(flush_pos, leftmost_in_slum); -+ if (ret) -+ goto failed; -+ -+ if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL -+ && node_is_empty(flush_pos->coord.node)) { -+ znode *empty = flush_pos->coord.node; -+ -+ assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE)); -+ ret = delete_empty_node(empty); -+ goto failed; -+ } -+ -+ if (jnode_check_flushprepped(leftmost_in_slum) -+ && !jnode_convertible(leftmost_in_slum)) { -+ ret = 0; -+ goto failed; -+ } -+ -+ /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */ -+ ret = alloc_pos_and_ancestors(flush_pos); -+ if (ret) -+ goto failed; -+ -+ /* Do the main rightward-bottom-up squeeze and allocate loop. */ -+ ret = squalloc(flush_pos); -+ pos_stop(flush_pos); -+ if (ret) -+ goto failed; -+ -+ /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children. -+ First, the pos_stop() and pos_valid() routines should be modified -+ so that pos_stop() sets a flush_position->stop flag to 1 without -+ releasing the current position immediately--instead release it in -+ pos_done(). This is a better implementation than the current one anyway. -+ -+ It is not clear that all fields of the flush_position should not be released, -+ but at the very least the parent_lock, parent_coord, and parent_load should -+ remain held because they are hold the last twig when pos_stop() is -+ called. -+ -+ When we reach this point in the code, if the parent_coord is set to after the -+ last item then we know that flush reached the end of a twig (and according to -+ the new flush queueing design, we will return now). If parent_coord is not -+ past the last item, we should check if the current twig has any unallocated -+ children to the right (we are not concerned with unallocated children to the -+ left--in that case the twig itself should not have been allocated). 
If the -+ twig has unallocated children to the right, set the parent_coord to that -+ position and then repeat the call to squalloc. -+ -+ Testing for unallocated children may be defined in two ways: if any internal -+ item has a fake block number, it is unallocated; if any extent item is -+ unallocated then all of its children are unallocated. But there is a more -+ aggressive approach: if there are any dirty children of the twig to the right -+ of the current position, we may wish to relocate those nodes now. Checking for -+ potential relocation is more expensive as it requires knowing whether there are -+ any dirty children that are not unallocated. The extent_needs_allocation -+ should be used after setting the correct preceder. -+ -+ When we reach the end of a twig at this point in the code, if the flush can -+ continue (when the queue is ready) it will need some information on the future -+ starting point. That should be stored away in the flush_handle using a seal, I -+ believe. Holding a jref() on the future starting point may break other code -+ that deletes that node. -+ */ -+ -+ /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called -+ above the twig level. If the VM calls flush above the twig level, do nothing -+ and return (but figure out why this happens). The txnmgr should be modified to -+ only flush its leaf-level dirty list. This will do all the necessary squeeze -+ and allocate steps but leave unallocated branches and possibly unallocated -+ twigs (when the twig's leftmost child is not dirty). After flushing the leaf -+ level, the remaining unallocated nodes should be given write-optimized -+ locations. (Possibly, the remaining unallocated twigs should be allocated just -+ before their leftmost child.) -+ */ -+ -+ /* Any failure reaches this point. 
*/ -+ failed: -+ -+ switch (ret) { -+ case -E_REPEAT: -+ case -EINVAL: -+ case -E_DEADLOCK: -+ case -E_NO_NEIGHBOR: -+ case -ENOENT: -+ /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly -+ in each case. They already are handled in many cases. */ -+ /* Something bad happened, but difficult to avoid... Try again! */ -+ ret = 0; -+ } -+ -+ if (leftmost_in_slum) -+ jput(leftmost_in_slum); -+ -+ pos_done(flush_pos); -+ scan_done(left_scan); -+ scan_done(right_scan); -+ kfree(right_scan); -+ -+ ON_DEBUG(atomic_dec(&flush_cnt)); -+ -+ reiser4_leave_flush(sb); -+ -+ return ret; -+} -+ -+/* The reiser4 flush subsystem can be turned into "rapid flush mode" means that -+ * flusher should submit all prepped nodes immediately without keeping them in -+ * flush queues for long time. The reason for rapid flush mode is to free -+ * memory as fast as possible. */ -+ -+#if REISER4_USE_RAPID_FLUSH -+ -+/** -+ * submit all prepped nodes if rapid flush mode is set, -+ * turn rapid flush mode off. -+ */ -+ -+static int rapid_flush(flush_pos_t * pos) -+{ -+ if (!wbq_available()) -+ return 0; -+ -+ return write_prepped_nodes(pos); -+} -+ -+#else -+ -+#define rapid_flush(pos) (0) -+ -+#endif /* REISER4_USE_RAPID_FLUSH */ -+ -+static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom, -+ flush_queue_t *fq, int *nr_queued, -+ int flags) -+{ -+ jnode * node; -+ -+ if (start != NULL) { -+ spin_lock_jnode(start); -+ if (!jnode_is_flushprepped(start)) { -+ assert("zam-1056", start->atom == atom); -+ node = start; -+ goto enter; -+ } -+ spin_unlock_jnode(start); -+ } -+ /* -+ * In this loop we process all already prepped (RELOC or OVRWR) and dirtied again -+ * nodes. The atom spin lock is not released until all dirty nodes processed or -+ * not prepped node found in the atom dirty lists. 
-+ */ -+ while ((node = find_first_dirty_jnode(atom, flags))) { -+ spin_lock_jnode(node); -+ enter: -+ assert("zam-881", JF_ISSET(node, JNODE_DIRTY)); -+ assert("zam-898", !JF_ISSET(node, JNODE_OVRWR)); -+ -+ if (JF_ISSET(node, JNODE_WRITEBACK)) { -+ /* move node to the end of atom's writeback list */ -+ list_move_tail(&node->capture_link, ATOM_WB_LIST(atom)); -+ -+ /* -+ * jnode is not necessarily on dirty list: if it was dirtied when -+ * it was on flush queue - it does not get moved to dirty list -+ */ -+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), -+ WB_LIST, 1)); -+ -+ } else if (jnode_is_znode(node) -+ && znode_above_root(JZNODE(node))) { -+ /* -+ * A special case for znode-above-root. The above-root (fake) -+ * znode is captured and dirtied when the tree height changes or -+ * when the root node is relocated. This causes atoms to fuse so -+ * that changes at the root are serialized. However, this node is -+ * never flushed. This special case used to be in lock.c to -+ * prevent the above-root node from ever being captured, but now -+ * that it is captured we simply prevent it from flushing. The -+ * log-writer code relies on this to properly log superblock -+ * modifications of the tree height. -+ */ -+ jnode_make_wander_nolock(node); -+ } else if (JF_ISSET(node, JNODE_RELOC)) { -+ queue_jnode(fq, node); -+ ++(*nr_queued); -+ } else -+ break; -+ -+ spin_unlock_jnode(node); -+ } -+ return node; -+} -+ -+/* Flush some nodes of current atom, usually slum, return -E_REPEAT if there are more nodes -+ * to flush, return 0 if atom's dirty lists empty and keep current atom locked, return -+ * other errors as they are. 
*/ -+int -+flush_current_atom(int flags, long nr_to_write, long *nr_submitted, -+ txn_atom ** atom, jnode *start) -+{ -+ reiser4_super_info_data *sinfo = get_current_super_private(); -+ flush_queue_t *fq = NULL; -+ jnode *node; -+ int nr_queued; -+ int ret; -+ -+ assert("zam-889", atom != NULL && *atom != NULL); -+ assert_spin_locked(&((*atom)->alock)); -+ assert("zam-892", get_current_context()->trans->atom == *atom); -+ -+ nr_to_write = LONG_MAX; -+ while (1) { -+ ret = reiser4_fq_by_atom(*atom, &fq); -+ if (ret != -E_REPEAT) -+ break; -+ *atom = get_current_atom_locked(); -+ } -+ if (ret) -+ return ret; -+ -+ assert_spin_locked(&((*atom)->alock)); -+ -+ /* parallel flushers limit */ -+ if (sinfo->tmgr.atom_max_flushers != 0) { -+ while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) { -+ /* An reiser4_atom_send_event() call is inside -+ reiser4_fq_put_nolock() which is called when flush is -+ finished and nr_flushers is decremented. */ -+ reiser4_atom_wait_event(*atom); -+ *atom = get_current_atom_locked(); -+ } -+ } -+ -+ /* count ourself as a flusher */ -+ (*atom)->nr_flushers++; -+ -+ writeout_mode_enable(); -+ -+ nr_queued = 0; -+ node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags); -+ -+ if (node == NULL) { -+ if (nr_queued == 0) { -+ (*atom)->nr_flushers--; -+ reiser4_fq_put_nolock(fq); -+ reiser4_atom_send_event(*atom); -+ /* current atom remains locked */ -+ writeout_mode_disable(); -+ return 0; -+ } -+ spin_unlock_atom(*atom); -+ } else { -+ jref(node); -+ BUG_ON((*atom)->super != node->tree->super); -+ spin_unlock_atom(*atom); -+ spin_unlock_jnode(node); -+ BUG_ON(nr_to_write == 0); -+ ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags); -+ jput(node); -+ } -+ -+ ret = -+ reiser4_write_fq(fq, nr_submitted, -+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM); -+ -+ *atom = get_current_atom_locked(); -+ (*atom)->nr_flushers--; -+ reiser4_fq_put_nolock(fq); -+ reiser4_atom_send_event(*atom); -+ spin_unlock_atom(*atom); 
-+ -+ writeout_mode_disable(); -+ -+ if (ret == 0) -+ ret = -E_REPEAT; -+ -+ return ret; -+} -+ -+/* REVERSE PARENT-FIRST RELOCATION POLICIES */ -+ -+/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the -+ reverse parent-first relocate context. Here all we know is the preceder and the block -+ number. Since we are going in reverse, the preceder may still be relocated as well, so -+ we can't ask the block allocator "is there a closer block available to relocate?" here. -+ In the _forward_ parent-first relocate context (not here) we actually call the block -+ allocator to try and find a closer location. */ -+static int -+reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, -+ const reiser4_block_nr * nblk) -+{ -+ reiser4_block_nr dist; -+ -+ assert("jmacd-7710", *pblk != 0 && *nblk != 0); -+ assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk)); -+ assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk)); -+ -+ /* Distance is the absolute value. */ -+ dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk); -+ -+ /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder -+ block, do not relocate. */ -+ if (dist <= get_current_super_private()->flush.relocate_distance) { -+ return 0; -+ } -+ -+ return 1; -+} -+ -+/* This function is a predicate that tests for relocation. Always called in the -+ reverse-parent-first context, when we are asking whether the current node should be -+ relocated in order to expand the flush by dirtying the parent level (and thus -+ proceeding to flush that level). When traversing in the forward parent-first direction -+ (not here), relocation decisions are handled in two places: allocate_znode() and -+ extent_needs_allocation(). 
*/ -+static int -+reverse_relocate_test(jnode * node, const coord_t * parent_coord, -+ flush_pos_t * pos) -+{ -+ reiser4_block_nr pblk = 0; -+ reiser4_block_nr nblk = 0; -+ -+ assert("jmacd-8989", !jnode_is_root(node)); -+ -+ /* -+ * This function is called only from the -+ * reverse_relocate_check_dirty_parent() and only if the parent -+ * node is clean. This implies that the parent has the real (i.e., not -+ * fake) block number, and, so does the child, because otherwise the -+ * parent would be dirty. -+ */ -+ -+ /* New nodes are treated as if they are being relocated. */ -+ if (JF_ISSET (node, JNODE_CREATED) || -+ (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) { -+ return 1; -+ } -+ -+ /* Find the preceder. FIXME(B): When the child is an unformatted, previously -+ existing node, the coord may be leftmost even though the child is not the -+ parent-first preceder of the parent. If the first dirty node appears somewhere -+ in the middle of the first extent unit, this preceder calculation is wrong. -+ Needs more logic in here. */ -+ if (coord_is_leftmost_unit(parent_coord)) { -+ pblk = *znode_get_block(parent_coord->node); -+ } else { -+ pblk = pos->preceder.blk; -+ } -+ check_preceder(pblk); -+ -+ /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */ -+ if (pblk == 0) { -+ return 1; -+ } -+ -+ nblk = *jnode_get_block(node); -+ -+ if (reiser4_blocknr_is_fake(&nblk)) -+ /* child is unallocated, mark parent dirty */ -+ return 1; -+ -+ return reverse_relocate_if_close_enough(&pblk, &nblk); -+} -+ -+/* This function calls reverse_relocate_test to make a reverse-parent-first -+ relocation decision and then, if yes, it marks the parent dirty. 
*/ -+static int -+reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord, -+ flush_pos_t * pos) -+{ -+ int ret; -+ -+ if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) { -+ -+ ret = reverse_relocate_test(node, parent_coord, pos); -+ if (ret < 0) { -+ return ret; -+ } -+ -+ /* FIXME-ZAM -+ if parent is already relocated - we do not want to grab space, right? */ -+ if (ret == 1) { -+ int grabbed; -+ -+ grabbed = get_current_context()->grabbed_blocks; -+ if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) != -+ 0) -+ reiser4_panic("umka-1250", -+ "No space left during flush."); -+ -+ assert("jmacd-18923", -+ znode_is_write_locked(parent_coord->node)); -+ znode_make_dirty(parent_coord->node); -+ grabbed2free_mark(grabbed); -+ } -+ } -+ -+ return 0; -+} -+ -+/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD -+ PARENT-FIRST LOOP BEGINS) */ -+ -+/* Get the leftmost child for given coord. */ -+static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child) -+{ -+ int ret; -+ -+ ret = item_utmost_child(coord, LEFT_SIDE, child); -+ -+ if (ret) -+ return ret; -+ -+ if (IS_ERR(*child)) -+ return PTR_ERR(*child); -+ -+ return 0; -+} -+ -+/* This step occurs after the left- and right-scans are completed, before starting the -+ forward parent-first traversal. Here we attempt to allocate ancestors of the starting -+ flush point, which means continuing in the reverse parent-first direction to the -+ parent, grandparent, and so on (as long as the child is a leftmost child). This -+ routine calls a recursive process, alloc_one_ancestor, which does the real work, -+ except there is special-case handling here for the first ancestor, which may be a twig. -+ At each level (here and alloc_one_ancestor), we check for relocation and then, if -+ the child is a leftmost child, repeat at the next level. On the way back down (the -+ recursion), we allocate the ancestors in parent-first order. 
*/ -+static int alloc_pos_and_ancestors(flush_pos_t * pos) -+{ -+ int ret = 0; -+ lock_handle plock; -+ load_count pload; -+ coord_t pcoord; -+ -+ if (znode_check_flushprepped(pos->lock.node)) -+ return 0; -+ -+ coord_init_invalid(&pcoord, NULL); -+ init_lh(&plock); -+ init_load_count(&pload); -+ -+ if (pos->state == POS_ON_EPOINT) { -+ /* a special case for pos on twig level, where we already have -+ a lock on parent node. */ -+ /* The parent may not be dirty, in which case we should decide -+ whether to relocate the child now. If decision is made to -+ relocate the child, the parent is marked dirty. */ -+ ret = -+ reverse_relocate_check_dirty_parent(pos->child, &pos->coord, -+ pos); -+ if (ret) -+ goto exit; -+ -+ /* FIXME_NFQUCMPD: We only need to allocate the twig (if child -+ is leftmost) and the leaf/child, so recursion is not needed. -+ Levels above the twig will be allocated for -+ write-optimization before the transaction commits. */ -+ -+ /* Do the recursive step, allocating zero or more of our -+ * ancestors. */ -+ ret = alloc_one_ancestor(&pos->coord, pos); -+ -+ } else { -+ if (!znode_is_root(pos->lock.node)) { -+ /* all formatted nodes except tree root */ -+ ret = -+ reiser4_get_parent(&plock, pos->lock.node, -+ ZNODE_WRITE_LOCK); -+ if (ret) -+ goto exit; -+ -+ ret = incr_load_count_znode(&pload, plock.node); -+ if (ret) -+ goto exit; -+ -+ ret = -+ find_child_ptr(plock.node, pos->lock.node, &pcoord); -+ if (ret) -+ goto exit; -+ -+ ret = -+ reverse_relocate_check_dirty_parent(ZJNODE -+ (pos->lock. -+ node), &pcoord, -+ pos); -+ if (ret) -+ goto exit; -+ -+ ret = alloc_one_ancestor(&pcoord, pos); -+ if (ret) -+ goto exit; -+ } -+ -+ ret = allocate_znode(pos->lock.node, &pcoord, pos); -+ } -+ exit: -+ done_load_count(&pload); -+ done_lh(&plock); -+ return ret; -+} -+ -+/* This is the recursive step described in alloc_pos_and_ancestors, above. 
Ignoring the -+ call to set_preceder, which is the next function described, this checks if the -+ child is a leftmost child and returns if it is not. If the child is a leftmost child -+ it checks for relocation, possibly dirtying the parent. Then it performs the recursive -+ step. */ -+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos) -+{ -+ int ret = 0; -+ lock_handle alock; -+ load_count aload; -+ coord_t acoord; -+ -+ /* As we ascend at the left-edge of the region to flush, take this opportunity at -+ the twig level to find our parent-first preceder unless we have already set -+ it. */ -+ if (pos->preceder.blk == 0) { -+ ret = set_preceder(coord, pos); -+ if (ret != 0) -+ return ret; -+ } -+ -+ /* If the ancestor is clean or already allocated, or if the child is not a -+ leftmost child, stop going up, even leaving coord->node not flushprepped. */ -+ if (znode_check_flushprepped(coord->node) -+ || !coord_is_leftmost_unit(coord)) -+ return 0; -+ -+ init_lh(&alock); -+ init_load_count(&aload); -+ coord_init_invalid(&acoord, NULL); -+ -+ /* Only ascend to the next level if it is a leftmost child, but write-lock the -+ parent in case we will relocate the child. */ -+ if (!znode_is_root(coord->node)) { -+ -+ ret = -+ jnode_lock_parent_coord(ZJNODE(coord->node), &acoord, -+ &alock, &aload, ZNODE_WRITE_LOCK, -+ 0); -+ if (ret != 0) { -+ /* FIXME(C): check EINVAL, E_DEADLOCK */ -+ goto exit; -+ } -+ -+ ret = -+ reverse_relocate_check_dirty_parent(ZJNODE(coord->node), -+ &acoord, pos); -+ if (ret != 0) { -+ goto exit; -+ } -+ -+ /* Recursive call. */ -+ if (!znode_check_flushprepped(acoord.node)) { -+ ret = alloc_one_ancestor(&acoord, pos); -+ if (ret) -+ goto exit; -+ } -+ } -+ -+ /* Note: we call allocate with the parent write-locked (except at the root) in -+ case we relocate the child, in which case it will modify the parent during this -+ call. 
*/ -+ ret = allocate_znode(coord->node, &acoord, pos); -+ -+ exit: -+ done_load_count(&aload); -+ done_lh(&alock); -+ return ret; -+} -+ -+/* During the reverse parent-first alloc_pos_and_ancestors process described above there is -+ a call to this function at the twig level. During alloc_pos_and_ancestors we may ask: -+ should this node be relocated (in reverse parent-first context)? We repeat this -+ process as long as the child is the leftmost child, eventually reaching an ancestor of -+ the flush point that is not a leftmost child. The preceder of that ancestors, which is -+ not a leftmost child, is actually on the leaf level. The preceder of that block is the -+ left-neighbor of the flush point. The preceder of that block is the rightmost child of -+ the twig on the left. So, when alloc_pos_and_ancestors passes upward through the twig -+ level, it stops momentarily to remember the block of the rightmost child of the twig on -+ the left and sets it to the flush_position's preceder_hint. -+ -+ There is one other place where we may set the flush_position's preceder hint, which is -+ during scan-left. -+*/ -+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos) -+{ -+ int ret; -+ coord_t coord; -+ lock_handle left_lock; -+ load_count left_load; -+ -+ coord_dup(&coord, coord_in); -+ -+ init_lh(&left_lock); -+ init_load_count(&left_load); -+ -+ /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test. -+ coord_is_leftmost_unit is not the right test if the unformatted child is in the -+ middle of the first extent unit. */ -+ if (!coord_is_leftmost_unit(&coord)) { -+ coord_prev_unit(&coord); -+ } else { -+ ret = -+ reiser4_get_left_neighbor(&left_lock, coord.node, -+ ZNODE_READ_LOCK, GN_SAME_ATOM); -+ if (ret) { -+ /* If we fail for any reason it doesn't matter because the -+ preceder is only a hint. We are low-priority at this point, so -+ this must be the case. 
*/ -+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR || -+ ret == -ENOENT || ret == -EINVAL -+ || ret == -E_DEADLOCK) { -+ ret = 0; -+ } -+ goto exit; -+ } -+ -+ ret = incr_load_count_znode(&left_load, left_lock.node); -+ if (ret) -+ goto exit; -+ -+ coord_init_last_unit(&coord, left_lock.node); -+ } -+ -+ ret = -+ item_utmost_child_real_block(&coord, RIGHT_SIDE, -+ &pos->preceder.blk); -+ exit: -+ check_preceder(pos->preceder.blk); -+ done_load_count(&left_load); -+ done_lh(&left_lock); -+ return ret; -+} -+ -+/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */ -+ -+/* This procedure implements the outer loop of the flush algorithm. To put this in -+ context, here is the general list of steps taken by the flush routine as a whole: -+ -+ 1. Scan-left -+ 2. Scan-right (maybe) -+ 3. Allocate initial flush position and its ancestors -+ 4. -+ 5. -+ 6. -+ -+ This procedure implements the loop in steps 4 through 6 in the above listing. -+ -+ Step 4: if the current flush position is an extent item (position on the twig level), -+ it allocates the extent (allocate_extent_item_in_place) then shifts to the next -+ coordinate. If the next coordinate's leftmost child needs flushprep, we will continue. -+ If the next coordinate is an internal item, we descend back to the leaf level, -+ otherwise we repeat a step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate" -+ brings us past the end of the twig level, then we call -+ reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to -+ step #5 which moves to the right. -+ -+ Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the -+ tree to allocate any ancestors of the next-right flush position that are not also -+ ancestors of the current position. Those ancestors (in top-down order) are the next in -+ parent-first order. We squeeze adjacent nodes on the way up until the right node and -+ current node share the same parent, then allocate on the way back down. 
Finally, this -+ step sets the flush position to the next-right node. Then repeat steps 4 and 5. -+*/ -+ -+/* SQUEEZE CODE */ -+ -+/* squalloc_right_twig helper function, cut a range of extent items from -+ cut node to->node from the beginning up to coord @to. */ -+static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key, -+ znode * left) -+{ -+ coord_t from; -+ reiser4_key from_key; -+ -+ coord_init_first_unit(&from, to->node); -+ item_key_by_coord(&from, &from_key); -+ -+ return cut_node_content(&from, to, &from_key, to_key, NULL); -+} -+ -+/* Copy as much of the leading extents from @right to @left, allocating -+ unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or -+ SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an -+ internal item it calls shift_one_internal_unit and may then return -+ SUBTREE_MOVED. */ -+static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos) -+{ -+ int ret = SUBTREE_MOVED; -+ coord_t coord; /* used to iterate over items */ -+ reiser4_key stop_key; -+ -+ assert("jmacd-2008", !node_is_empty(right)); -+ coord_init_first_unit(&coord, right); -+ -+ /* FIXME: can be optimized to cut once */ -+ while (!node_is_empty(coord.node) && item_is_extent(&coord)) { -+ ON_DEBUG(void *vp); -+ -+ assert("vs-1468", coord_is_leftmost_unit(&coord)); -+ ON_DEBUG(vp = shift_check_prepare(left, coord.node)); -+ -+ /* stop_key is used to find what was copied and what to cut */ -+ stop_key = *reiser4_min_key(); -+ ret = squalloc_extent(left, &coord, pos, &stop_key); -+ if (ret != SQUEEZE_CONTINUE) { -+ ON_DEBUG(kfree(vp)); -+ break; -+ } -+ assert("vs-1465", !keyeq(&stop_key, reiser4_min_key())); -+ -+ /* Helper function to do the cutting. 
*/ -+ set_key_offset(&stop_key, get_key_offset(&stop_key) - 1); -+ check_me("vs-1466", -+ squalloc_right_twig_cut(&coord, &stop_key, left) == 0); -+ -+ ON_DEBUG(shift_check(vp, left, coord.node)); -+ } -+ -+ if (node_is_empty(coord.node)) -+ ret = SQUEEZE_SOURCE_EMPTY; -+ -+ if (ret == SQUEEZE_TARGET_FULL) { -+ goto out; -+ } -+ -+ if (node_is_empty(right)) { -+ /* The whole right node was copied into @left. */ -+ assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY); -+ goto out; -+ } -+ -+ coord_init_first_unit(&coord, right); -+ -+ if (!item_is_internal(&coord)) { -+ /* we do not want to squeeze anything else to left neighbor because "slum" -+ is over */ -+ ret = SQUEEZE_TARGET_FULL; -+ goto out; -+ } -+ assert("jmacd-433", item_is_internal(&coord)); -+ -+ /* Shift an internal unit. The child must be allocated before shifting any more -+ extents, so we stop here. */ -+ ret = shift_one_internal_unit(left, right); -+ -+ out: -+ assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL -+ || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY); -+ -+ if (ret == SQUEEZE_TARGET_FULL) { -+ /* We submit prepped nodes here and expect that this @left twig -+ * will not be modified again during this jnode_flush() call. */ -+ int ret1; -+ -+ /* NOTE: seems like io is done under long term locks. 
*/ -+ ret1 = write_prepped_nodes(pos); -+ if (ret1 < 0) -+ return ret1; -+ } -+ -+ return ret; -+} -+ -+#if REISER4_DEBUG -+static void item_convert_invariant(flush_pos_t * pos) -+{ -+ assert("edward-1225", coord_is_existing_item(&pos->coord)); -+ if (chaining_data_present(pos)) { -+ item_plugin *iplug = item_convert_plug(pos); -+ -+ assert("edward-1000", -+ iplug == item_plugin_by_coord(&pos->coord)); -+ assert("edward-1001", iplug->f.convert != NULL); -+ } else -+ assert("edward-1226", pos->child == NULL); -+} -+#else -+ -+#define item_convert_invariant(pos) noop -+ -+#endif -+ -+/* Scan node items starting from the first one and apply for each -+ item its flush ->convert() method (if any). This method may -+ resize/kill the item so the tree will be changed. -+*/ -+static int convert_node(flush_pos_t * pos, znode * node) -+{ -+ int ret = 0; -+ item_plugin *iplug; -+ -+ assert("edward-304", pos != NULL); -+ assert("edward-305", pos->child == NULL); -+ assert("edward-475", znode_convertible(node)); -+ assert("edward-669", znode_is_wlocked(node)); -+ assert("edward-1210", !node_is_empty(node)); -+ -+ if (znode_get_level(node) != LEAF_LEVEL) -+ /* unsupported */ -+ goto exit; -+ -+ coord_init_first_unit(&pos->coord, node); -+ -+ while (1) { -+ ret = 0; -+ coord_set_to_left(&pos->coord); -+ item_convert_invariant(pos); -+ -+ iplug = item_plugin_by_coord(&pos->coord); -+ assert("edward-844", iplug != NULL); -+ -+ if (iplug->f.convert) { -+ ret = iplug->f.convert(pos); -+ if (ret) -+ goto exit; -+ } -+ assert("edward-307", pos->child == NULL); -+ -+ if (coord_next_item(&pos->coord)) { -+ /* node is over */ -+ -+ if (!chaining_data_present(pos)) -+ /* finished this node */ -+ break; -+ if (should_chain_next_node(pos)) { -+ /* go to next node */ -+ move_chaining_data(pos, 0 /* to next node */ ); -+ break; -+ } -+ /* repeat this node */ -+ move_chaining_data(pos, 1 /* this node */ ); -+ continue; -+ } -+ /* Node is not over. -+ Check if there is attached convert data. 
-+ If so roll one item position back and repeat -+ on this node -+ */ -+ if (chaining_data_present(pos)) { -+ -+ if (iplug != item_plugin_by_coord(&pos->coord)) -+ set_item_convert_count(pos, 0); -+ -+ ret = coord_prev_item(&pos->coord); -+ assert("edward-1003", !ret); -+ -+ move_chaining_data(pos, 1 /* this node */ ); -+ } -+ } -+ JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE); -+ znode_make_dirty(node); -+ exit: -+ assert("edward-1004", !ret); -+ return ret; -+} -+ -+/* Squeeze and allocate the right neighbor. This is called after @left and -+ its current children have been squeezed and allocated already. This -+ procedure's job is to squeeze and items from @right to @left. -+ -+ If at the leaf level, use the shift_everything_left memcpy-optimized -+ version of shifting (squeeze_right_leaf). -+ -+ If at the twig level, extents are allocated as they are shifted from @right -+ to @left (squalloc_right_twig). -+ -+ At any other level, shift one internal item and return to the caller -+ (squalloc_parent_first) so that the shifted-subtree can be processed in -+ parent-first order. -+ -+ When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is -+ returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is -+ returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL -+ is returned. 
-+*/ -+ -+static int squeeze_right_neighbor(flush_pos_t * pos, znode * left, -+ znode * right) -+{ -+ int ret; -+ -+ /* FIXME it is possible to see empty hasn't-heard-banshee node in a -+ * tree owing to error (for example, ENOSPC) in write */ -+ /* assert("jmacd-9321", !node_is_empty(left)); */ -+ assert("jmacd-9322", !node_is_empty(right)); -+ assert("jmacd-9323", znode_get_level(left) == znode_get_level(right)); -+ -+ switch (znode_get_level(left)) { -+ case TWIG_LEVEL: -+ /* Shift with extent allocating until either an internal item -+ is encountered or everything is shifted or no free space -+ left in @left */ -+ ret = squeeze_right_twig(left, right, pos); -+ break; -+ -+ default: -+ /* All other levels can use shift_everything until we implement per-item -+ flush plugins. */ -+ ret = squeeze_right_non_twig(left, right); -+ break; -+ } -+ -+ assert("jmacd-2011", (ret < 0 || -+ ret == SQUEEZE_SOURCE_EMPTY -+ || ret == SQUEEZE_TARGET_FULL -+ || ret == SUBTREE_MOVED)); -+ return ret; -+} -+ -+static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos, -+ znode * right) -+{ -+ int ret; -+ -+ ret = squeeze_right_twig(pos->lock.node, right, pos); -+ if (ret < 0) -+ return ret; -+ if (ret > 0) { -+ coord_init_after_last_item(&pos->coord, pos->lock.node); -+ return ret; -+ } -+ -+ coord_init_last_unit(&pos->coord, pos->lock.node); -+ return 0; -+} -+ -+/* forward declaration */ -+static int squalloc_upper_levels(flush_pos_t *, znode *, znode *); -+ -+/* do a fast check for "same parents" condition before calling -+ * squalloc_upper_levels() */ -+static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos, -+ znode * left, -+ znode * right) -+{ -+ if (znode_same_parents(left, right)) -+ return 0; -+ -+ return squalloc_upper_levels(pos, left, right); -+} -+ -+/* Check whether the parent of given @right node needs to be processes -+ ((re)allocated) prior to processing of the child. 
If @left and @right do not -+ share at least the parent of the @right is after the @left but before the -+ @right in parent-first order, we have to (re)allocate it before the @right -+ gets (re)allocated. */ -+static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right) -+{ -+ int ret; -+ -+ lock_handle left_parent_lock; -+ lock_handle right_parent_lock; -+ -+ load_count left_parent_load; -+ load_count right_parent_load; -+ -+ init_lh(&left_parent_lock); -+ init_lh(&right_parent_lock); -+ -+ init_load_count(&left_parent_load); -+ init_load_count(&right_parent_load); -+ -+ ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK); -+ if (ret) -+ goto out; -+ -+ ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK); -+ if (ret) -+ goto out; -+ -+ /* Check for same parents */ -+ if (left_parent_lock.node == right_parent_lock.node) -+ goto out; -+ -+ if (znode_check_flushprepped(right_parent_lock.node)) { -+ /* Keep parent-first order. In the order, the right parent node stands -+ before the @right node. If it is already allocated, we set the -+ preceder (next block search start point) to its block number, @right -+ node should be allocated after it. -+ -+ However, preceder is set only if the right parent is on twig level. -+ The explanation is the following: new branch nodes are allocated over -+ already allocated children while the tree grows, it is difficult to -+ keep tree ordered, we assume that only leaves and twings are correctly -+ allocated. So, only twigs are used as a preceder for allocating of the -+ rest of the slum. 
*/ -+ if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) { -+ pos->preceder.blk = -+ *znode_get_block(right_parent_lock.node); -+ check_preceder(pos->preceder.blk); -+ } -+ goto out; -+ } -+ -+ ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node); -+ if (ret) -+ goto out; -+ -+ ret = -+ squeeze_right_neighbor(pos, left_parent_lock.node, -+ right_parent_lock.node); -+ /* We stop if error. We stop if some items/units were shifted (ret == 0) -+ * and thus @right changed its parent. It means we have not process -+ * right_parent node prior to processing of @right. Positive return -+ * values say that shifting items was not happen because of "empty -+ * source" or "target full" conditions. */ -+ if (ret <= 0) -+ goto out; -+ -+ /* parent(@left) and parent(@right) may have different parents also. We -+ * do a recursive call for checking that. */ -+ ret = -+ check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node, -+ right_parent_lock.node); -+ if (ret) -+ goto out; -+ -+ /* allocate znode when going down */ -+ ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos); -+ -+ out: -+ done_load_count(&left_parent_load); -+ done_load_count(&right_parent_load); -+ -+ done_lh(&left_parent_lock); -+ done_lh(&right_parent_lock); -+ -+ return ret; -+} -+ -+/* Check the leftmost child "flushprepped" status, also returns true if child -+ * node was not found in cache. */ -+static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord) -+{ -+ int ret; -+ int prepped; -+ -+ jnode *child; -+ -+ ret = get_leftmost_child_of_unit(coord, &child); -+ -+ if (ret) -+ return ret; -+ -+ if (child) { -+ prepped = jnode_check_flushprepped(child); -+ jput(child); -+ } else { -+ /* We consider not existing child as a node which slum -+ processing should not continue to. Not cached node is clean, -+ so it is flushprepped. 
*/ -+ prepped = 1; -+ } -+ -+ return prepped; -+} -+ -+/* (re)allocate znode with automated getting parent node */ -+static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos) -+{ -+ int ret; -+ lock_handle parent_lock; -+ load_count parent_load; -+ coord_t pcoord; -+ -+ assert("zam-851", znode_is_write_locked(node)); -+ -+ init_lh(&parent_lock); -+ init_load_count(&parent_load); -+ -+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&parent_load, parent_lock.node); -+ if (ret) -+ goto out; -+ -+ ret = find_child_ptr(parent_lock.node, node, &pcoord); -+ if (ret) -+ goto out; -+ -+ ret = allocate_znode(node, &pcoord, pos); -+ -+ out: -+ done_load_count(&parent_load); -+ done_lh(&parent_lock); -+ return ret; -+} -+ -+/* Process nodes on leaf level until unformatted node or rightmost node in the -+ * slum reached. */ -+static int handle_pos_on_formatted(flush_pos_t * pos) -+{ -+ int ret; -+ lock_handle right_lock; -+ load_count right_load; -+ -+ init_lh(&right_lock); -+ init_load_count(&right_load); -+ -+ if (should_convert_node(pos, pos->lock.node)) { -+ ret = convert_node(pos, pos->lock.node); -+ if (ret) -+ return ret; -+ } -+ -+ while (1) { -+ int expected; -+ expected = should_convert_next_node(pos); -+ ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE, -+ ZNODE_WRITE_LOCK, !expected, expected); -+ if (ret) { -+ if (expected) -+ warning("edward-1495", -+ "Expected neighbor not found (ret = %d). Fsck?", -+ ret); -+ break; -+ } -+ -+ /* we don't prep(allocate) nodes for flushing twice. This can be suboptimal, or it -+ * can be optimal. For now we choose to live with the risk that it will -+ * be suboptimal because it would be quite complex to code it to be -+ * smarter. 
*/ -+ if (znode_check_flushprepped(right_lock.node) -+ && !znode_convertible(right_lock.node)) { -+ assert("edward-1005", !should_convert_next_node(pos)); -+ pos_stop(pos); -+ break; -+ } -+ -+ ret = incr_load_count_znode(&right_load, right_lock.node); -+ if (ret) -+ break; -+ if (should_convert_node(pos, right_lock.node)) { -+ ret = convert_node(pos, right_lock.node); -+ if (ret) -+ break; -+ if (node_is_empty(right_lock.node)) { -+ /* node became empty after converting, repeat */ -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ continue; -+ } -+ } -+ -+ /* squeeze _before_ going upward. */ -+ ret = -+ squeeze_right_neighbor(pos, pos->lock.node, -+ right_lock.node); -+ if (ret < 0) -+ break; -+ -+ if (znode_check_flushprepped(right_lock.node)) { -+ if (should_convert_next_node(pos)) { -+ /* in spite of flushprepped status of the node, -+ its right slum neighbor should be converted */ -+ assert("edward-953", convert_data(pos)); -+ assert("edward-954", item_convert_data(pos)); -+ -+ if (node_is_empty(right_lock.node)) { -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ } else -+ move_flush_pos(pos, &right_lock, -+ &right_load, NULL); -+ continue; -+ } -+ pos_stop(pos); -+ break; -+ } -+ -+ if (node_is_empty(right_lock.node)) { -+ /* repeat if right node was squeezed completely */ -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ continue; -+ } -+ -+ /* parent(right_lock.node) has to be processed before -+ * (right_lock.node) due to "parent-first" allocation order. 
*/ -+ ret = -+ check_parents_and_squalloc_upper_levels(pos, pos->lock.node, -+ right_lock.node); -+ if (ret) -+ break; -+ /* (re)allocate _after_ going upward */ -+ ret = lock_parent_and_allocate_znode(right_lock.node, pos); -+ if (ret) -+ break; -+ if (should_terminate_squalloc(pos)) { -+ set_item_convert_count(pos, 0); -+ break; -+ } -+ -+ /* advance the flush position to the right neighbor */ -+ move_flush_pos(pos, &right_lock, &right_load, NULL); -+ -+ ret = rapid_flush(pos); -+ if (ret) -+ break; -+ } -+ check_convert_info(pos); -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ -+ /* This function indicates via pos whether to stop or go to twig or continue on current -+ * level. */ -+ return ret; -+ -+} -+ -+/* Process nodes on leaf level until unformatted node or rightmost node in the -+ * slum reached. */ -+static int handle_pos_on_leaf(flush_pos_t * pos) -+{ -+ int ret; -+ -+ assert("zam-845", pos->state == POS_ON_LEAF); -+ -+ ret = handle_pos_on_formatted(pos); -+ -+ if (ret == -E_NO_NEIGHBOR) { -+ /* cannot get right neighbor, go process extents. */ -+ pos->state = POS_TO_TWIG; -+ return 0; -+ } -+ -+ return ret; -+} -+ -+/* Process slum on level > 1 */ -+static int handle_pos_on_internal(flush_pos_t * pos) -+{ -+ assert("zam-850", pos->state == POS_ON_INTERNAL); -+ return handle_pos_on_formatted(pos); -+} -+ -+/* check whether squalloc should stop before processing given extent */ -+static int squalloc_extent_should_stop(flush_pos_t * pos) -+{ -+ assert("zam-869", item_is_extent(&pos->coord)); -+ -+ /* pos->child is a jnode handle_pos_on_extent() should start with in -+ * stead of the first child of the first extent unit. 
*/ -+ if (pos->child) { -+ int prepped; -+ -+ assert("vs-1383", jnode_is_unformatted(pos->child)); -+ prepped = jnode_check_flushprepped(pos->child); -+ pos->pos_in_unit = -+ jnode_get_index(pos->child) - -+ extent_unit_index(&pos->coord); -+ assert("vs-1470", -+ pos->pos_in_unit < extent_unit_width(&pos->coord)); -+ assert("nikita-3434", -+ ergo(extent_is_unallocated(&pos->coord), -+ pos->pos_in_unit == 0)); -+ jput(pos->child); -+ pos->child = NULL; -+ -+ return prepped; -+ } -+ -+ pos->pos_in_unit = 0; -+ if (extent_is_unallocated(&pos->coord)) -+ return 0; -+ -+ return leftmost_child_of_unit_check_flushprepped(&pos->coord); -+} -+ -+/* Handle the case when regular reiser4 tree (znodes connected one to its -+ * neighbors by sibling pointers) is interrupted on leaf level by one or more -+ * unformatted nodes. By having a lock on twig level and use extent code -+ * routines to process unformatted nodes we swim around an irregular part of -+ * reiser4 tree. */ -+static int handle_pos_on_twig(flush_pos_t * pos) -+{ -+ int ret; -+ -+ assert("zam-844", pos->state == POS_ON_EPOINT); -+ assert("zam-843", item_is_extent(&pos->coord)); -+ -+ /* We decide should we continue slum processing with current extent -+ unit: if leftmost child of current extent unit is flushprepped -+ (i.e. clean or already processed by flush) we stop squalloc(). There -+ is a fast check for unallocated extents which we assume contain all -+ not flushprepped nodes. */ -+ /* FIXME: Here we implement simple check, we are only looking on the -+ leftmost child. 
*/ -+ ret = squalloc_extent_should_stop(pos); -+ if (ret != 0) { -+ pos_stop(pos); -+ return ret; -+ } -+ -+ while (pos_valid(pos) && coord_is_existing_unit(&pos->coord) -+ && item_is_extent(&pos->coord)) { -+ ret = reiser4_alloc_extent(pos); -+ if (ret) { -+ break; -+ } -+ coord_next_unit(&pos->coord); -+ } -+ -+ if (coord_is_after_rightmost(&pos->coord)) { -+ pos->state = POS_END_OF_TWIG; -+ return 0; -+ } -+ if (item_is_internal(&pos->coord)) { -+ pos->state = POS_TO_LEAF; -+ return 0; -+ } -+ -+ assert("zam-860", item_is_extent(&pos->coord)); -+ -+ /* "slum" is over */ -+ pos->state = POS_INVALID; -+ return 0; -+} -+ -+/* When we about to return flush position from twig to leaf level we can process -+ * the right twig node or move position to the leaf. This processes right twig -+ * if it is possible and jump to leaf level if not. */ -+static int handle_pos_end_of_twig(flush_pos_t * pos) -+{ -+ int ret; -+ lock_handle right_lock; -+ load_count right_load; -+ coord_t at_right; -+ jnode *child = NULL; -+ -+ assert("zam-848", pos->state == POS_END_OF_TWIG); -+ assert("zam-849", coord_is_after_rightmost(&pos->coord)); -+ -+ init_lh(&right_lock); -+ init_load_count(&right_load); -+ -+ /* We get a lock on the right twig node even it is not dirty because -+ * slum continues or discontinues on leaf level not on next twig. This -+ * lock on the right twig is needed for getting its leftmost child. */ -+ ret = -+ reiser4_get_right_neighbor(&right_lock, pos->lock.node, -+ ZNODE_WRITE_LOCK, GN_SAME_ATOM); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&right_load, right_lock.node); -+ if (ret) -+ goto out; -+ -+ /* right twig could be not dirty */ -+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) { -+ /* If right twig node is dirty we always attempt to squeeze it -+ * content to the left... 
*/ -+ became_dirty: -+ ret = -+ squeeze_right_twig_and_advance_coord(pos, right_lock.node); -+ if (ret <= 0) { -+ /* pos->coord is on internal item, go to leaf level, or -+ * we have an error which will be caught in squalloc() */ -+ pos->state = POS_TO_LEAF; -+ goto out; -+ } -+ -+ /* If right twig was squeezed completely we wave to re-lock -+ * right twig. now it is done through the top-level squalloc -+ * routine. */ -+ if (node_is_empty(right_lock.node)) -+ goto out; -+ -+ /* ... and prep it if it is not yet prepped */ -+ if (!znode_check_flushprepped(right_lock.node)) { -+ /* As usual, process parent before ... */ -+ ret = -+ check_parents_and_squalloc_upper_levels(pos, -+ pos->lock. -+ node, -+ right_lock. -+ node); -+ if (ret) -+ goto out; -+ -+ /* ... processing the child */ -+ ret = -+ lock_parent_and_allocate_znode(right_lock.node, -+ pos); -+ if (ret) -+ goto out; -+ } -+ } else { -+ coord_init_first_unit(&at_right, right_lock.node); -+ -+ /* check first child of next twig, should we continue there ? */ -+ ret = get_leftmost_child_of_unit(&at_right, &child); -+ if (ret || child == NULL || jnode_check_flushprepped(child)) { -+ pos_stop(pos); -+ goto out; -+ } -+ -+ /* check clean twig for possible relocation */ -+ if (!znode_check_flushprepped(right_lock.node)) { -+ ret = -+ reverse_relocate_check_dirty_parent(child, -+ &at_right, pos); -+ if (ret) -+ goto out; -+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) -+ goto became_dirty; -+ } -+ } -+ -+ assert("zam-875", znode_check_flushprepped(right_lock.node)); -+ -+ /* Update the preceder by a block number of just processed right twig -+ * node. The code above could miss the preceder updating because -+ * allocate_znode() could not be called for this node. 
*/ -+ pos->preceder.blk = *znode_get_block(right_lock.node); -+ check_preceder(pos->preceder.blk); -+ -+ coord_init_first_unit(&at_right, right_lock.node); -+ assert("zam-868", coord_is_existing_unit(&at_right)); -+ -+ pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF; -+ move_flush_pos(pos, &right_lock, &right_load, &at_right); -+ -+ out: -+ done_load_count(&right_load); -+ done_lh(&right_lock); -+ -+ if (child) -+ jput(child); -+ -+ return ret; -+} -+ -+/* Move the pos->lock to leaf node pointed by pos->coord, check should we -+ * continue there. */ -+static int handle_pos_to_leaf(flush_pos_t * pos) -+{ -+ int ret; -+ lock_handle child_lock; -+ load_count child_load; -+ jnode *child; -+ -+ assert("zam-846", pos->state == POS_TO_LEAF); -+ assert("zam-847", item_is_internal(&pos->coord)); -+ -+ init_lh(&child_lock); -+ init_load_count(&child_load); -+ -+ ret = get_leftmost_child_of_unit(&pos->coord, &child); -+ if (ret) -+ return ret; -+ if (child == NULL) { -+ pos_stop(pos); -+ return 0; -+ } -+ -+ if (jnode_check_flushprepped(child)) { -+ pos->state = POS_INVALID; -+ goto out; -+ } -+ -+ ret = -+ longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_LOPRI); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&child_load, JZNODE(child)); -+ if (ret) -+ goto out; -+ -+ ret = allocate_znode(JZNODE(child), &pos->coord, pos); -+ if (ret) -+ goto out; -+ -+ /* move flush position to leaf level */ -+ pos->state = POS_ON_LEAF; -+ move_flush_pos(pos, &child_lock, &child_load, NULL); -+ -+ if (node_is_empty(JZNODE(child))) { -+ ret = delete_empty_node(JZNODE(child)); -+ pos->state = POS_INVALID; -+ } -+ out: -+ done_load_count(&child_load); -+ done_lh(&child_lock); -+ jput(child); -+ -+ return ret; -+} -+ -+/* move pos from leaf to twig, and move lock from leaf to twig. 
*/ -+/* Move pos->lock to upper (twig) level */ -+static int handle_pos_to_twig(flush_pos_t * pos) -+{ -+ int ret; -+ -+ lock_handle parent_lock; -+ load_count parent_load; -+ coord_t pcoord; -+ -+ assert("zam-852", pos->state == POS_TO_TWIG); -+ -+ init_lh(&parent_lock); -+ init_load_count(&parent_load); -+ -+ ret = -+ reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK); -+ if (ret) -+ goto out; -+ -+ ret = incr_load_count_znode(&parent_load, parent_lock.node); -+ if (ret) -+ goto out; -+ -+ ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord); -+ if (ret) -+ goto out; -+ -+ assert("zam-870", item_is_internal(&pcoord)); -+ coord_next_item(&pcoord); -+ -+ if (coord_is_after_rightmost(&pcoord)) -+ pos->state = POS_END_OF_TWIG; -+ else if (item_is_extent(&pcoord)) -+ pos->state = POS_ON_EPOINT; -+ else { -+ /* Here we understand that getting -E_NO_NEIGHBOR in -+ * handle_pos_on_leaf() was because of just a reaching edge of -+ * slum */ -+ pos_stop(pos); -+ goto out; -+ } -+ -+ move_flush_pos(pos, &parent_lock, &parent_load, &pcoord); -+ -+ out: -+ done_load_count(&parent_load); -+ done_lh(&parent_lock); -+ -+ return ret; -+} -+ -+typedef int (*pos_state_handle_t) (flush_pos_t *); -+static pos_state_handle_t flush_pos_handlers[] = { -+ /* process formatted nodes on leaf level, keep lock on a leaf node */ -+ [POS_ON_LEAF] = handle_pos_on_leaf, -+ /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently -+ * being processed */ -+ [POS_ON_EPOINT] = handle_pos_on_twig, -+ /* move a lock from leaf node to its parent for further processing of unformatted nodes */ -+ [POS_TO_TWIG] = handle_pos_to_twig, -+ /* move a lock from twig to leaf level when a processing of unformatted nodes finishes, -+ * pos->coord points to the leaf node we jump to */ -+ [POS_TO_LEAF] = handle_pos_to_leaf, -+ /* after processing last extent in the twig node, attempting to shift items from the twigs -+ * right neighbor and process them 
while shifting */ -+ [POS_END_OF_TWIG] = handle_pos_end_of_twig, -+ /* process formatted nodes on internal level, keep lock on an internal node */ -+ [POS_ON_INTERNAL] = handle_pos_on_internal -+}; -+ -+/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze, -+ * encrypt) nodes and their ancestors in "parent-first" order */ -+static int squalloc(flush_pos_t * pos) -+{ -+ int ret = 0; -+ -+ /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for -+ * greater CPU efficiency? Measure and see.... -Hans */ -+ while (pos_valid(pos)) { -+ ret = flush_pos_handlers[pos->state] (pos); -+ if (ret < 0) -+ break; -+ -+ ret = rapid_flush(pos); -+ if (ret) -+ break; -+ } -+ -+ /* any positive value or -E_NO_NEIGHBOR are legal return codes for handle_pos* -+ routines, -E_NO_NEIGHBOR means that slum edge was reached */ -+ if (ret > 0 || ret == -E_NO_NEIGHBOR) -+ ret = 0; -+ -+ return ret; -+} -+ -+static void update_ldkey(znode * node) -+{ -+ reiser4_key ldkey; -+ -+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); -+ if (node_is_empty(node)) -+ return; -+ -+ znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey)); -+} -+ -+/* this is to be called after calling of shift node's method to shift data from @right to -+ @left. 
It sets left delimiting keys of @left and @right to keys of first items of @left -+ and @right correspondingly and sets right delimiting key of @left to first key of @right */ -+static void update_znode_dkeys(znode * left, znode * right) -+{ -+ assert_rw_write_locked(&(znode_get_tree(right)->dk_lock)); -+ assert("vs-1629", (znode_is_write_locked(left) && -+ znode_is_write_locked(right))); -+ -+ /* we need to update left delimiting of left if it was empty before shift */ -+ update_ldkey(left); -+ update_ldkey(right); -+ if (node_is_empty(right)) -+ znode_set_rd_key(left, znode_get_rd_key(right)); -+ else -+ znode_set_rd_key(left, znode_get_ld_key(right)); -+} -+ -+/* try to shift everything from @right to @left. If everything was shifted - -+ @right is removed from the tree. Result is the number of bytes shifted. */ -+static int -+shift_everything_left(znode * right, znode * left, carry_level * todo) -+{ -+ coord_t from; -+ node_plugin *nplug; -+ carry_plugin_info info; -+ -+ coord_init_after_last_item(&from, right); -+ -+ nplug = node_plugin_by_node(right); -+ info.doing = NULL; -+ info.todo = todo; -+ return nplug->shift(&from, left, SHIFT_LEFT, -+ 1 /* delete @right if it becomes empty */ , -+ 1 -+ /* move coord @from to node @left if everything will be shifted */ -+ , -+ &info); -+} -+ -+/* Shift as much as possible from @right to @left using the memcpy-optimized -+ shift_everything_left. @left and @right are formatted neighboring nodes on -+ leaf level. 
*/ -+static int squeeze_right_non_twig(znode * left, znode * right) -+{ -+ int ret; -+ carry_pool *pool; -+ carry_level *todo; -+ -+ assert("nikita-2246", znode_get_level(left) == znode_get_level(right)); -+ -+ if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) || -+ !JF_ISSET(ZJNODE(right), JNODE_DIRTY)) -+ return SQUEEZE_TARGET_FULL; -+ -+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ todo = (carry_level *) (pool + 1); -+ init_carry_level(todo, pool); -+ -+ ret = shift_everything_left(right, left, todo); -+ if (ret > 0) { -+ /* something was shifted */ -+ reiser4_tree *tree; -+ __u64 grabbed; -+ -+ znode_make_dirty(left); -+ znode_make_dirty(right); -+ -+ /* update delimiting keys of nodes which participated in -+ shift. FIXME: it would be better to have this in shift -+ node's operation. But it can not be done there. Nobody -+ remembers why, though */ -+ tree = znode_get_tree(left); -+ write_lock_dk(tree); -+ update_znode_dkeys(left, right); -+ write_unlock_dk(tree); -+ -+ /* Carry is called to update delimiting key and, maybe, to remove empty -+ node. */ -+ grabbed = get_current_context()->grabbed_blocks; -+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED); -+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */ -+ ret = reiser4_carry(todo, NULL /* previous level */ ); -+ grabbed2free_mark(grabbed); -+ } else { -+ /* Shifting impossible, we return appropriate result code */ -+ ret = -+ node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY : -+ SQUEEZE_TARGET_FULL; -+ } -+ -+ done_carry_pool(pool); -+ -+ return ret; -+} -+ -+#if REISER4_DEBUG -+static int sibling_link_is_ok(const znode *left, const znode *right) -+{ -+ int result; -+ -+ read_lock_tree(znode_get_tree(left)); -+ result = (left->right == right && left == right->left); -+ read_unlock_tree(znode_get_tree(left)); -+ return result; -+} -+#endif -+ -+/* Shift first unit of first item if it is an internal one. 
Return -+ SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return -+ SUBTREE_MOVED. */ -+static int shift_one_internal_unit(znode * left, znode * right) -+{ -+ int ret; -+ carry_pool *pool; -+ carry_level *todo; -+ coord_t *coord; -+ carry_plugin_info *info; -+ int size, moved; -+ -+ assert("nikita-2247", znode_get_level(left) == znode_get_level(right)); -+ assert("nikita-2435", znode_is_write_locked(left)); -+ assert("nikita-2436", znode_is_write_locked(right)); -+ assert("nikita-2434", sibling_link_is_ok(left, right)); -+ -+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) + -+ sizeof(*coord) + sizeof(*info) -+#if REISER4_DEBUG -+ + sizeof(*coord) + 2 * sizeof(reiser4_key) -+#endif -+ ); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ todo = (carry_level *) (pool + 1); -+ init_carry_level(todo, pool); -+ -+ coord = (coord_t *) (todo + 3); -+ coord_init_first_unit(coord, right); -+ info = (carry_plugin_info *) (coord + 1); -+ -+#if REISER4_DEBUG -+ if (!node_is_empty(left)) { -+ coord_t *last; -+ reiser4_key *right_key; -+ reiser4_key *left_key; -+ -+ last = (coord_t *) (info + 1); -+ right_key = (reiser4_key *) (last + 1); -+ left_key = right_key + 1; -+ coord_init_last_unit(last, left); -+ -+ assert("nikita-2463", -+ keyle(item_key_by_coord(last, left_key), -+ item_key_by_coord(coord, right_key))); -+ } -+#endif -+ -+ assert("jmacd-2007", item_is_internal(coord)); -+ -+ size = item_length_by_coord(coord); -+ info->todo = todo; -+ info->doing = NULL; -+ -+ ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT, -+ 1 -+ /* delete @right if it becomes empty */ -+ , -+ 0 -+ /* do not move coord @coord to node @left */ -+ , -+ info); -+ -+ /* If shift returns positive, then we shifted the item. 
*/ -+ assert("vs-423", ret <= 0 || size == ret); -+ moved = (ret > 0); -+ -+ if (moved) { -+ /* something was moved */ -+ reiser4_tree *tree; -+ int grabbed; -+ -+ znode_make_dirty(left); -+ znode_make_dirty(right); -+ tree = znode_get_tree(left); -+ write_lock_dk(tree); -+ update_znode_dkeys(left, right); -+ write_unlock_dk(tree); -+ -+ /* reserve space for delimiting keys after shifting */ -+ grabbed = get_current_context()->grabbed_blocks; -+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED); -+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */ -+ -+ ret = reiser4_carry(todo, NULL /* previous level */ ); -+ grabbed2free_mark(grabbed); -+ } -+ -+ done_carry_pool(pool); -+ -+ if (ret != 0) { -+ /* Shift or carry operation failed. */ -+ assert("jmacd-7325", ret < 0); -+ return ret; -+ } -+ -+ return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL; -+} -+ -+/* Make the final relocate/wander decision during forward parent-first squalloc for a -+ znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */ -+static int -+allocate_znode_loaded(znode * node, -+ const coord_t * parent_coord, flush_pos_t * pos) -+{ -+ int ret; -+ reiser4_super_info_data *sbinfo = get_current_super_private(); -+ /* FIXME(D): We have the node write-locked and should have checked for ! -+ allocated() somewhere before reaching this point, but there can be a race, so -+ this assertion is bogus. */ -+ assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node))); -+ assert("jmacd-7988", znode_is_write_locked(node)); -+ assert("jmacd-7989", coord_is_invalid(parent_coord) -+ || znode_is_write_locked(parent_coord->node)); -+ -+ if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) || -+ znode_is_root(node) || -+ /* We have enough nodes to relocate no matter what. */ -+ (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) { -+ /* No need to decide with new nodes, they are treated the same as -+ relocate. 
If the root node is dirty, relocate. */ -+ if (pos->preceder.blk == 0) { -+ /* preceder is unknown and we have decided to relocate node -- -+ using of default value for search start is better than search -+ from block #0. */ -+ get_blocknr_hint_default(&pos->preceder.blk); -+ check_preceder(pos->preceder.blk); -+ } -+ -+ goto best_reloc; -+ -+ } else if (pos->preceder.blk == 0) { -+ /* If we don't know the preceder, leave it where it is. */ -+ jnode_make_wander(ZJNODE(node)); -+ } else { -+ /* Make a decision based on block distance. */ -+ reiser4_block_nr dist; -+ reiser4_block_nr nblk = *znode_get_block(node); -+ -+ assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk)); -+ assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk)); -+ assert("jmacd-6174", pos->preceder.blk != 0); -+ -+ if (pos->preceder.blk == nblk - 1) { -+ /* Ideal. */ -+ jnode_make_wander(ZJNODE(node)); -+ } else { -+ -+ dist = -+ (nblk < -+ pos->preceder.blk) ? (pos->preceder.blk - -+ nblk) : (nblk - -+ pos->preceder.blk); -+ -+ /* See if we can find a closer block (forward direction only). */ -+ pos->preceder.max_dist = -+ min((reiser4_block_nr) sbinfo->flush. -+ relocate_distance, dist); -+ pos->preceder.level = znode_get_level(node); -+ -+ ret = allocate_znode_update(node, parent_coord, pos); -+ -+ pos->preceder.max_dist = 0; -+ -+ if (ret && (ret != -ENOSPC)) -+ return ret; -+ -+ if (ret == 0) { -+ /* Got a better allocation. */ -+ znode_make_reloc(node, pos->fq); -+ } else if (dist < sbinfo->flush.relocate_distance) { -+ /* The present allocation is good enough. */ -+ jnode_make_wander(ZJNODE(node)); -+ } else { -+ /* Otherwise, try to relocate to the best position. */ -+ best_reloc: -+ ret = -+ allocate_znode_update(node, parent_coord, -+ pos); -+ if (ret != 0) -+ return ret; -+ -+ /* set JNODE_RELOC bit _after_ node gets allocated */ -+ znode_make_reloc(node, pos->fq); -+ } -+ } -+ } -+ -+ /* This is the new preceder. 
*/ -+ pos->preceder.blk = *znode_get_block(node); -+ check_preceder(pos->preceder.blk); -+ pos->alloc_cnt += 1; -+ -+ assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk)); -+ -+ return 0; -+} -+ -+static int -+allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos) -+{ -+ /* -+ * perform znode allocation with znode pinned in memory to avoid races -+ * with asynchronous emergency flush (which plays with -+ * JNODE_FLUSH_RESERVED bit). -+ */ -+ return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos)); -+} -+ -+/* A subroutine of allocate_znode, this is called first to see if there is a close -+ position to relocate to. It may return ENOSPC if there is no close position. If there -+ is no close position it may not relocate. This takes care of updating the parent node -+ with the relocated block address. */ -+static int -+allocate_znode_update(znode * node, const coord_t * parent_coord, -+ flush_pos_t * pos) -+{ -+ int ret; -+ reiser4_block_nr blk; -+ lock_handle uber_lock; -+ int flush_reserved_used = 0; -+ int grabbed; -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ init_lh(&uber_lock); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ grabbed = ctx->grabbed_blocks; -+ -+ /* discard e-flush allocation */ -+ ret = zload(node); -+ if (ret) -+ return ret; -+ -+ if (ZF_ISSET(node, JNODE_CREATED)) { -+ assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node))); -+ pos->preceder.block_stage = BLOCK_UNALLOCATED; -+ } else { -+ pos->preceder.block_stage = BLOCK_GRABBED; -+ -+ /* The disk space for relocating the @node is already reserved in "flush reserved" -+ * counter if @node is leaf, otherwise we grab space using BA_RESERVED (means grab -+ * space from whole disk not from only 95%). 
*/ -+ if (znode_get_level(node) == LEAF_LEVEL) { -+ /* -+ * earlier (during do_jnode_make_dirty()) we decided -+ * that @node can possibly go into overwrite set and -+ * reserved block for its wandering location. -+ */ -+ txn_atom *atom = get_current_atom_locked(); -+ assert("nikita-3449", -+ ZF_ISSET(node, JNODE_FLUSH_RESERVED)); -+ flush_reserved2grabbed(atom, (__u64) 1); -+ spin_unlock_atom(atom); -+ /* -+ * we are trying to move node into relocate -+ * set. Allocation of relocated position "uses" -+ * reserved block. -+ */ -+ ZF_CLR(node, JNODE_FLUSH_RESERVED); -+ flush_reserved_used = 1; -+ } else { -+ ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED); -+ if (ret != 0) -+ goto exit; -+ } -+ } -+ -+ /* We may do not use 5% of reserved disk space here and flush will not pack tightly. */ -+ ret = reiser4_alloc_block(&pos->preceder, &blk, -+ BA_FORMATTED | BA_PERMANENT); -+ if (ret) -+ goto exit; -+ -+ if (!ZF_ISSET(node, JNODE_CREATED) && -+ (ret = -+ reiser4_dealloc_block(znode_get_block(node), 0, -+ BA_DEFER | BA_FORMATTED))) -+ goto exit; -+ -+ if (likely(!znode_is_root(node))) { -+ item_plugin *iplug; -+ -+ iplug = item_plugin_by_coord(parent_coord); -+ assert("nikita-2954", iplug->f.update != NULL); -+ iplug->f.update(parent_coord, &blk); -+ -+ znode_make_dirty(parent_coord->node); -+ -+ } else { -+ reiser4_tree *tree = znode_get_tree(node); -+ znode *uber; -+ -+ /* We take a longterm lock on the fake node in order to change -+ the root block number. This may cause atom fusion. */ -+ ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI, -+ &uber_lock); -+ /* The fake node cannot be deleted, and we must have priority -+ here, and may not be confused with ENOSPC. 
*/ -+ assert("jmacd-74412", -+ ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC); -+ -+ if (ret) -+ goto exit; -+ -+ uber = uber_lock.node; -+ -+ write_lock_tree(tree); -+ tree->root_block = blk; -+ write_unlock_tree(tree); -+ -+ znode_make_dirty(uber); -+ } -+ -+ ret = znode_rehash(node, &blk); -+ exit: -+ if (ret) { -+ /* Get flush reserved block back if something fails, because -+ * callers assume that on error block wasn't relocated and its -+ * flush reserved block wasn't used. */ -+ if (flush_reserved_used) { -+ /* -+ * ok, we failed to move node into relocate -+ * set. Restore status quo. -+ */ -+ grabbed2flush_reserved((__u64) 1); -+ ZF_SET(node, JNODE_FLUSH_RESERVED); -+ } -+ } -+ zrelse(node); -+ done_lh(&uber_lock); -+ grabbed2free_mark(grabbed); -+ return ret; -+} -+ -+/* JNODE INTERFACE */ -+ -+/* Lock a node (if formatted) and then get its parent locked, set the child's -+ coordinate in the parent. If the child is the root node, the above_root -+ znode is returned but the coord is not set. This function may cause atom -+ fusion, but it is only used for read locks (at this point) and therefore -+ fusion only occurs when the parent is already dirty. */ -+/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent -+ pointer in jnodes. */ -+static int -+jnode_lock_parent_coord(jnode * node, -+ coord_t * coord, -+ lock_handle * parent_lh, -+ load_count * parent_zh, -+ znode_lock_mode parent_mode, int try) -+{ -+ int ret; -+ -+ assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node)); -+ assert("edward-54", jnode_is_unformatted(node) -+ || znode_is_any_locked(JZNODE(node))); -+ -+ if (!jnode_is_znode(node)) { -+ reiser4_key key; -+ tree_level stop_level = TWIG_LEVEL; -+ lookup_bias bias = FIND_EXACT; -+ -+ assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP)); -+ -+ /* The case when node is not znode, but can have parent coord -+ (unformatted node, node which represents cluster page, -+ etc..). 
Generate a key for the appropriate entry, search -+ in the tree using coord_by_key, which handles locking for -+ us. */ -+ -+ /* -+ * nothing is locked at this moment, so, nothing prevents -+ * concurrent truncate from removing jnode from inode. To -+ * prevent this spin-lock jnode. jnode can be truncated just -+ * after call to the jnode_build_key(), but this is ok, -+ * because coord_by_key() will just fail to find appropriate -+ * extent. -+ */ -+ spin_lock_jnode(node); -+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) { -+ jnode_build_key(node, &key); -+ ret = 0; -+ } else -+ ret = RETERR(-ENOENT); -+ spin_unlock_jnode(node); -+ -+ if (ret != 0) -+ return ret; -+ -+ if (jnode_is_cluster_page(node)) -+ stop_level = LEAF_LEVEL; -+ -+ assert("jmacd-1812", coord != NULL); -+ -+ ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh, -+ parent_mode, bias, stop_level, stop_level, -+ CBK_UNIQUE, NULL /*ra_info */ ); -+ switch (ret) { -+ case CBK_COORD_NOTFOUND: -+ assert("edward-1038", -+ ergo(jnode_is_cluster_page(node), -+ JF_ISSET(node, JNODE_HEARD_BANSHEE))); -+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) -+ warning("nikita-3177", "Parent not found"); -+ return ret; -+ case CBK_COORD_FOUND: -+ if (coord->between != AT_UNIT) { -+ /* FIXME: comment needed */ -+ done_lh(parent_lh); -+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) { -+ warning("nikita-3178", -+ "Found but not happy: %i", -+ coord->between); -+ } -+ return RETERR(-ENOENT); -+ } -+ ret = incr_load_count_znode(parent_zh, parent_lh->node); -+ if (ret != 0) -+ return ret; -+ /* if (jnode_is_cluster_page(node)) { -+ races with write() are possible -+ check_child_cluster (parent_lh->node); -+ } -+ */ -+ break; -+ default: -+ return ret; -+ } -+ -+ } else { -+ int flags; -+ znode *z; -+ -+ z = JZNODE(node); -+ /* Formatted node case: */ -+ assert("jmacd-2061", !znode_is_root(z)); -+ -+ flags = GN_ALLOW_NOT_CONNECTED; -+ if (try) -+ flags |= GN_TRY_LOCK; -+ -+ ret = -+ reiser4_get_parent_flags(parent_lh, z, 
parent_mode, flags); -+ if (ret != 0) -+ /* -E_REPEAT is ok here, it is handled by the caller. */ -+ return ret; -+ -+ /* Make the child's position "hint" up-to-date. (Unless above -+ root, which caller must check.) */ -+ if (coord != NULL) { -+ -+ ret = incr_load_count_znode(parent_zh, parent_lh->node); -+ if (ret != 0) { -+ warning("jmacd-976812386", -+ "incr_load_count_znode failed: %d", -+ ret); -+ return ret; -+ } -+ -+ ret = find_child_ptr(parent_lh->node, z, coord); -+ if (ret != 0) { -+ warning("jmacd-976812", -+ "find_child_ptr failed: %d", ret); -+ return ret; -+ } -+ } -+ } -+ -+ return 0; -+} -+ -+/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom. -+ If there is no next neighbor or the neighbor is not in memory or if there is a -+ neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned. -+ In some cases the slum may include nodes which are not dirty, if so @check_dirty should be 0 */ -+static int neighbor_in_slum(znode * node, /* starting point */ -+ lock_handle * lock, /* lock on starting point */ -+ sideof side, /* left or right direction we seek the next node in */ -+ znode_lock_mode mode, /* kind of lock we want */ -+ int check_dirty, /* true if the neighbor should be dirty */ -+ int use_upper_levels /* get neighbor by going though -+ upper levels */) -+{ -+ int ret; -+ int flags; -+ -+ assert("jmacd-6334", znode_is_connected(node)); -+ -+ flags = GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0); -+ if (use_upper_levels) -+ flags |= GN_CAN_USE_UPPER_LEVELS; -+ -+ ret = reiser4_get_neighbor(lock, node, mode, flags); -+ if (ret) { -+ /* May return -ENOENT or -E_NO_NEIGHBOR. 
*/ -+ /* FIXME(C): check EINVAL, E_DEADLOCK */ -+ if (ret == -ENOENT) { -+ ret = RETERR(-E_NO_NEIGHBOR); -+ } -+ return ret; -+ } -+ if (!check_dirty) -+ return 0; -+ /* Check dirty bit of locked znode, no races here */ -+ if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY)) -+ return 0; -+ -+ done_lh(lock); -+ return RETERR(-E_NO_NEIGHBOR); -+} -+ -+/* Return true if two znodes have the same parent. This is called with both nodes -+ write-locked (for squeezing) so no tree lock is needed. */ -+static int znode_same_parents(znode * a, znode * b) -+{ -+ int result; -+ -+ assert("jmacd-7011", znode_is_write_locked(a)); -+ assert("jmacd-7012", znode_is_write_locked(b)); -+ -+ /* We lock the whole tree for this check.... I really don't like whole tree -+ * locks... -Hans */ -+ read_lock_tree(znode_get_tree(a)); -+ result = (znode_parent(a) == znode_parent(b)); -+ read_unlock_tree(znode_get_tree(a)); -+ return result; -+} -+ -+/* FLUSH SCAN */ -+ -+/* Initialize the flush_scan data structure. */ -+static void scan_init(flush_scan * scan) -+{ -+ memset(scan, 0, sizeof(*scan)); -+ init_lh(&scan->node_lock); -+ init_lh(&scan->parent_lock); -+ init_load_count(&scan->parent_load); -+ init_load_count(&scan->node_load); -+ coord_init_invalid(&scan->parent_coord, NULL); -+} -+ -+/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */ -+static void scan_done(flush_scan * scan) -+{ -+ done_load_count(&scan->node_load); -+ if (scan->node != NULL) { -+ jput(scan->node); -+ scan->node = NULL; -+ } -+ done_load_count(&scan->parent_load); -+ done_lh(&scan->parent_lock); -+ done_lh(&scan->node_lock); -+} -+ -+/* Returns true if flush scanning is finished. */ -+int reiser4_scan_finished(flush_scan * scan) -+{ -+ return scan->stop || (scan->direction == RIGHT_SIDE && -+ scan->count >= scan->max_count); -+} -+ -+/* Return true if the scan should continue to the @tonode. True if the node meets the -+ same_slum_check condition. 
If not, deref the "left" node and stop the scan. */ -+int reiser4_scan_goto(flush_scan * scan, jnode * tonode) -+{ -+ int go = same_slum_check(scan->node, tonode, 1, 0); -+ -+ if (!go) { -+ scan->stop = 1; -+ jput(tonode); -+ } -+ -+ return go; -+} -+ -+/* Set the current scan->node, refcount it, increment count by the @add_count (number to -+ count, e.g., skipped unallocated nodes), deref previous current, and copy the current -+ parent coordinate. */ -+int -+scan_set_current(flush_scan * scan, jnode * node, unsigned add_count, -+ const coord_t * parent) -+{ -+ /* Release the old references, take the new reference. */ -+ done_load_count(&scan->node_load); -+ -+ if (scan->node != NULL) { -+ jput(scan->node); -+ } -+ scan->node = node; -+ scan->count += add_count; -+ -+ /* This next stmt is somewhat inefficient. The reiser4_scan_extent() code could -+ delay this update step until it finishes and update the parent_coord only once. -+ It did that before, but there was a bug and this was the easiest way to make it -+ correct. */ -+ if (parent != NULL) { -+ coord_dup(&scan->parent_coord, parent); -+ } -+ -+ /* Failure may happen at the incr_load_count call, but the caller can assume the reference -+ is safely taken. */ -+ return incr_load_count_jnode(&scan->node_load, node); -+} -+ -+/* Return true if scanning in the leftward direction. */ -+int reiser4_scanning_left(flush_scan * scan) -+{ -+ return scan->direction == LEFT_SIDE; -+} -+ -+/* Performs leftward scanning starting from either kind of node. Counts the starting -+ node. The right-scan object is passed in for the left-scan in order to copy the parent -+ of an unformatted starting position. This way we avoid searching for the unformatted -+ node's parent when scanning in each direction. If we search for the parent once it is -+ set in both scan objects. The limit parameter tells flush-scan when to stop. 
-+ -+ Rapid scanning is used only during scan_left, where we are interested in finding the -+ 'leftpoint' where we begin flushing. We are interested in stopping at the left child -+ of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The -+ problem is finding a way to flush only those nodes without unallocated children, and it -+ is difficult to solve in the bottom-up flushing algorithm we are currently using. The -+ problem can be solved by scanning left at every level as we go upward, but this would -+ basically bring us back to using a top-down allocation strategy, which we already tried -+ (see BK history from May 2002), and has a different set of problems. The top-down -+ strategy makes avoiding unallocated children easier, but makes it difficult to -+ propertly flush dirty children with clean parents that would otherwise stop the -+ top-down flush, only later to dirty the parent once the children are flushed. So we -+ solve the problem in the bottom-up algorithm with a special case for twigs and leaves -+ only. -+ -+ The first step in solving the problem is this rapid leftward scan. After we determine -+ that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we -+ are no longer interested in the exact count, we are only interested in finding a the -+ best place to start the flush. We could choose one of two possibilities: -+ -+ 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor. -+ This requires checking one leaf per rapid-scan twig -+ -+ 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig -+ to the left. This requires checking possibly all of the in-memory children of each -+ twig during the rapid scan. -+ -+ For now we implement the first policy. 
-+*/ -+static int -+scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit) -+{ -+ int ret = 0; -+ -+ scan->max_count = limit; -+ scan->direction = LEFT_SIDE; -+ -+ ret = scan_set_current(scan, jref(node), 1, NULL); -+ if (ret != 0) { -+ return ret; -+ } -+ -+ ret = scan_common(scan, right); -+ if (ret != 0) { -+ return ret; -+ } -+ -+ /* Before rapid scanning, we need a lock on scan->node so that we can get its -+ parent, only if formatted. */ -+ if (jnode_is_znode(scan->node)) { -+ ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node), -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); -+ } -+ -+ /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */ -+ return ret; -+} -+ -+/* Performs rightward scanning... Does not count the starting node. The limit parameter -+ is described in scan_left. If the starting node is unformatted then the -+ parent_coord was already set during scan_left. The rapid_after parameter is not used -+ during right-scanning. -+ -+ scan_right is only called if the scan_left operation does not count at least -+ FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter is set to -+ the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning -+ scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */ -+static int scan_right(flush_scan * scan, jnode * node, unsigned limit) -+{ -+ int ret; -+ -+ scan->max_count = limit; -+ scan->direction = RIGHT_SIDE; -+ -+ ret = scan_set_current(scan, jref(node), 0, NULL); -+ if (ret != 0) { -+ return ret; -+ } -+ -+ return scan_common(scan, NULL); -+} -+ -+/* Common code to perform left or right scanning. */ -+static int scan_common(flush_scan * scan, flush_scan * other) -+{ -+ int ret; -+ -+ assert("nikita-2376", scan->node != NULL); -+ assert("edward-54", jnode_is_unformatted(scan->node) -+ || jnode_is_znode(scan->node)); -+ -+ /* Special case for starting at an unformatted node. 
Optimization: we only want -+ to search for the parent (which requires a tree traversal) once. Obviously, we -+ shouldn't have to call it once for the left scan and once for the right scan. -+ For this reason, if we search for the parent during scan-left we then duplicate -+ the coord/lock/load into the scan-right object. */ -+ if (jnode_is_unformatted(scan->node)) { -+ ret = scan_unformatted(scan, other); -+ if (ret != 0) -+ return ret; -+ } -+ /* This loop expects to start at a formatted position and performs chaining of -+ formatted regions */ -+ while (!reiser4_scan_finished(scan)) { -+ -+ ret = scan_formatted(scan); -+ if (ret != 0) { -+ return ret; -+ } -+ } -+ -+ return 0; -+} -+ -+static int scan_unformatted(flush_scan * scan, flush_scan * other) -+{ -+ int ret = 0; -+ int try = 0; -+ -+ if (!coord_is_invalid(&scan->parent_coord)) -+ goto scan; -+ -+ /* set parent coord from */ -+ if (!jnode_is_unformatted(scan->node)) { -+ /* formatted position */ -+ -+ lock_handle lock; -+ assert("edward-301", jnode_is_znode(scan->node)); -+ init_lh(&lock); -+ -+ /* -+ * when flush starts from unformatted node, first thing it -+ * does is tree traversal to find formatted parent of starting -+ * node. This parent is then kept lock across scans to the -+ * left and to the right. This means that during scan to the -+ * left we cannot take left-ward lock, because this is -+ * dead-lock prone. So, if we are scanning to the left and -+ * there is already lock held by this thread, -+ * jnode_lock_parent_coord() should use try-lock. -+ */ -+ try = reiser4_scanning_left(scan) -+ && !lock_stack_isclean(get_current_lock_stack()); -+ /* Need the node locked to get the parent lock, We have to -+ take write lock since there is at least one call path -+ where this znode is already write-locked by us. */ -+ ret = -+ longterm_lock_znode(&lock, JZNODE(scan->node), -+ ZNODE_WRITE_LOCK, -+ reiser4_scanning_left(scan) ? 
-+ ZNODE_LOCK_LOPRI : -+ ZNODE_LOCK_HIPRI); -+ if (ret != 0) -+ /* EINVAL or E_DEADLOCK here mean... try again! At this point we've -+ scanned too far and can't back out, just start over. */ -+ return ret; -+ -+ ret = jnode_lock_parent_coord(scan->node, -+ &scan->parent_coord, -+ &scan->parent_lock, -+ &scan->parent_load, -+ ZNODE_WRITE_LOCK, try); -+ -+ /* FIXME(C): check EINVAL, E_DEADLOCK */ -+ done_lh(&lock); -+ if (ret == -E_REPEAT) { -+ scan->stop = 1; -+ return 0; -+ } -+ if (ret) -+ return ret; -+ -+ } else { -+ /* unformatted position */ -+ -+ ret = -+ jnode_lock_parent_coord(scan->node, &scan->parent_coord, -+ &scan->parent_lock, -+ &scan->parent_load, -+ ZNODE_WRITE_LOCK, try); -+ -+ if (IS_CBKERR(ret)) -+ return ret; -+ -+ if (ret == CBK_COORD_NOTFOUND) -+ /* FIXME(C): check EINVAL, E_DEADLOCK */ -+ return ret; -+ -+ /* parent was found */ -+ assert("jmacd-8661", other != NULL); -+ /* Duplicate the reference into the other flush_scan. */ -+ coord_dup(&other->parent_coord, &scan->parent_coord); -+ copy_lh(&other->parent_lock, &scan->parent_lock); -+ copy_load_count(&other->parent_load, &scan->parent_load); -+ } -+ scan: -+ return scan_by_coord(scan); -+} -+ -+/* Performs left- or rightward scanning starting from a formatted node. Follow left -+ pointers under tree lock as long as: -+ -+ - node->left/right is non-NULL -+ - node->left/right is connected, dirty -+ - node->left/right belongs to the same atom -+ - scan has not reached maximum count -+*/ -+static int scan_formatted(flush_scan * scan) -+{ -+ int ret; -+ znode *neighbor = NULL; -+ -+ assert("jmacd-1401", !reiser4_scan_finished(scan)); -+ -+ do { -+ znode *node = JZNODE(scan->node); -+ -+ /* Node should be connected, but if not stop the scan. */ -+ if (!znode_is_connected(node)) { -+ scan->stop = 1; -+ break; -+ } -+ -+ /* Lock the tree, check-for and reference the next sibling. 
*/ -+ read_lock_tree(znode_get_tree(node)); -+ -+ /* It may be that a node is inserted or removed between a node and its -+ left sibling while the tree lock is released, but the flush-scan count -+ does not need to be precise. Thus, we release the tree lock as soon as -+ we get the neighboring node. */ -+ neighbor = -+ reiser4_scanning_left(scan) ? node->left : node->right; -+ if (neighbor != NULL) { -+ zref(neighbor); -+ } -+ -+ read_unlock_tree(znode_get_tree(node)); -+ -+ /* If neighbor is NULL at the leaf level, need to check for an unformatted -+ sibling using the parent--break in any case. */ -+ if (neighbor == NULL) { -+ break; -+ } -+ -+ /* Check the condition for going left, break if it is not met. This also -+ releases (jputs) the neighbor if false. */ -+ if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) { -+ break; -+ } -+ -+ /* Advance the flush_scan state to the left, repeat. */ -+ ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL); -+ if (ret != 0) { -+ return ret; -+ } -+ -+ } while (!reiser4_scan_finished(scan)); -+ -+ /* If neighbor is NULL then we reached the end of a formatted region, or else the -+ sibling is out of memory, now check for an extent to the left (as long as -+ LEAF_LEVEL). */ -+ if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL -+ || reiser4_scan_finished(scan)) { -+ scan->stop = 1; -+ return 0; -+ } -+ /* Otherwise, calls scan_by_coord for the right(left)most item of the -+ left(right) neighbor on the parent level, then possibly continue. */ -+ -+ coord_init_invalid(&scan->parent_coord, NULL); -+ return scan_unformatted(scan, NULL); -+} -+ -+/* NOTE-EDWARD: -+ This scans adjacent items of the same type and calls scan flush plugin for each one. -+ Performs left(right)ward scanning starting from a (possibly) unformatted node. If we start -+ from unformatted node, then we continue only if the next neighbor is also unformatted. 
-+ When called from scan_formatted, we skip first iteration (to make sure that -+ right(left)most item of the left(right) neighbor on the parent level is of the same -+ type and set appropriate coord). */ -+static int scan_by_coord(flush_scan * scan) -+{ -+ int ret = 0; -+ int scan_this_coord; -+ lock_handle next_lock; -+ load_count next_load; -+ coord_t next_coord; -+ jnode *child; -+ item_plugin *iplug; -+ -+ init_lh(&next_lock); -+ init_load_count(&next_load); -+ scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0); -+ -+ /* set initial item id */ -+ iplug = item_plugin_by_coord(&scan->parent_coord); -+ -+ for (; !reiser4_scan_finished(scan); scan_this_coord = 1) { -+ if (scan_this_coord) { -+ /* Here we expect that unit is scannable. it would not be so due -+ * to race with extent->tail conversion. */ -+ if (iplug->f.scan == NULL) { -+ scan->stop = 1; -+ ret = -E_REPEAT; -+ /* skip the check at the end. */ -+ goto race; -+ } -+ -+ ret = iplug->f.scan(scan); -+ if (ret != 0) -+ goto exit; -+ -+ if (reiser4_scan_finished(scan)) { -+ checkchild(scan); -+ break; -+ } -+ } else { -+ /* the same race against truncate as above is possible -+ * here, it seems */ -+ -+ /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan -+ the first coordinate. */ -+ assert("jmacd-1231", -+ item_is_internal(&scan->parent_coord)); -+ } -+ -+ if (iplug->f.utmost_child == NULL -+ || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) { -+ /* stop this coord and continue on parrent level */ -+ ret = -+ scan_set_current(scan, -+ ZJNODE(zref -+ (scan->parent_coord.node)), -+ 1, NULL); -+ if (ret != 0) -+ goto exit; -+ break; -+ } -+ -+ /* Either way, the invariant is that scan->parent_coord is set to the -+ parent of scan->node. Now get the next unit. */ -+ coord_dup(&next_coord, &scan->parent_coord); -+ coord_sideof_unit(&next_coord, scan->direction); -+ -+ /* If off-the-end of the twig, try the next twig. 
*/ -+ if (coord_is_after_sideof_unit(&next_coord, scan->direction)) { -+ /* We take the write lock because we may start flushing from this -+ * coordinate. */ -+ ret = neighbor_in_slum(next_coord.node, -+ &next_lock, -+ scan->direction, -+ ZNODE_WRITE_LOCK, -+ 1 /* check dirty */, -+ 0 /* don't go though upper -+ levels */); -+ if (ret == -E_NO_NEIGHBOR) { -+ scan->stop = 1; -+ ret = 0; -+ break; -+ } -+ -+ if (ret != 0) { -+ goto exit; -+ } -+ -+ ret = incr_load_count_znode(&next_load, next_lock.node); -+ if (ret != 0) { -+ goto exit; -+ } -+ -+ coord_init_sideof_unit(&next_coord, next_lock.node, -+ sideof_reverse(scan->direction)); -+ } -+ -+ iplug = item_plugin_by_coord(&next_coord); -+ -+ /* Get the next child. */ -+ ret = -+ iplug->f.utmost_child(&next_coord, -+ sideof_reverse(scan->direction), -+ &child); -+ if (ret != 0) -+ goto exit; -+ /* If the next child is not in memory, or, item_utmost_child -+ failed (due to race with unlink, most probably), stop -+ here. */ -+ if (child == NULL || IS_ERR(child)) { -+ scan->stop = 1; -+ checkchild(scan); -+ break; -+ } -+ -+ assert("nikita-2374", jnode_is_unformatted(child) -+ || jnode_is_znode(child)); -+ -+ /* See if it is dirty, part of the same atom. */ -+ if (!reiser4_scan_goto(scan, child)) { -+ checkchild(scan); -+ break; -+ } -+ -+ /* If so, make this child current. */ -+ ret = scan_set_current(scan, child, 1, &next_coord); -+ if (ret != 0) -+ goto exit; -+ -+ /* Now continue. If formatted we release the parent lock and return, then -+ proceed. */ -+ if (jnode_is_znode(child)) -+ break; -+ -+ /* Otherwise, repeat the above loop with next_coord. 
*/ -+ if (next_load.node != NULL) { -+ done_lh(&scan->parent_lock); -+ move_lh(&scan->parent_lock, &next_lock); -+ move_load_count(&scan->parent_load, &next_load); -+ } -+ } -+ -+ assert("jmacd-6233", -+ reiser4_scan_finished(scan) || jnode_is_znode(scan->node)); -+ exit: -+ checkchild(scan); -+ race: /* skip the above check */ -+ if (jnode_is_znode(scan->node)) { -+ done_lh(&scan->parent_lock); -+ done_load_count(&scan->parent_load); -+ } -+ -+ done_load_count(&next_load); -+ done_lh(&next_lock); -+ return ret; -+} -+ -+/* FLUSH POS HELPERS */ -+ -+/* Initialize the fields of a flush_position. */ -+static void pos_init(flush_pos_t * pos) -+{ -+ memset(pos, 0, sizeof *pos); -+ -+ pos->state = POS_INVALID; -+ coord_init_invalid(&pos->coord, NULL); -+ init_lh(&pos->lock); -+ init_load_count(&pos->load); -+ -+ reiser4_blocknr_hint_init(&pos->preceder); -+} -+ -+/* The flush loop inside squalloc periodically checks pos_valid to -+ determine when "enough flushing" has been performed. This will return true until one -+ of the following conditions is met: -+ -+ 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush" -+ parameter, meaning we have flushed as many blocks as the kernel requested. When -+ flushing to commit, this parameter is NULL. -+ -+ 2. pos_stop() is called because squalloc discovers that the "next" node in the -+ flush order is either non-existant, not dirty, or not in the same atom. -+*/ -+ -+static int pos_valid(flush_pos_t * pos) -+{ -+ return pos->state != POS_INVALID; -+} -+ -+/* Release any resources of a flush_position. Called when jnode_flush finishes. */ -+static void pos_done(flush_pos_t * pos) -+{ -+ pos_stop(pos); -+ reiser4_blocknr_hint_done(&pos->preceder); -+ if (convert_data(pos)) -+ free_convert_data(pos); -+} -+ -+/* Reset the point and parent. Called during flush subroutines to terminate the -+ squalloc loop. 
*/ -+static int pos_stop(flush_pos_t * pos) -+{ -+ pos->state = POS_INVALID; -+ done_lh(&pos->lock); -+ done_load_count(&pos->load); -+ coord_init_invalid(&pos->coord, NULL); -+ -+ if (pos->child) { -+ jput(pos->child); -+ pos->child = NULL; -+ } -+ -+ return 0; -+} -+ -+/* Return the flush_position's block allocator hint. */ -+reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos) -+{ -+ return &pos->preceder; -+} -+ -+flush_queue_t * reiser4_pos_fq(flush_pos_t * pos) -+{ -+ return pos->fq; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 90 -+ LocalWords: preceder -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/flush.h linux-2.6.24/fs/reiser4/flush.h ---- linux-2.6.24.orig/fs/reiser4/flush.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/flush.h 2008-01-25 11:39:06.924204598 +0300 -@@ -0,0 +1,290 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* DECLARATIONS: */ -+ -+#if !defined(__REISER4_FLUSH_H__) -+#define __REISER4_FLUSH_H__ -+ -+#include "plugin/cluster.h" -+ -+/* The flush_scan data structure maintains the state of an in-progress flush-scan on a -+ single level of the tree. A flush-scan is used for counting the number of adjacent -+ nodes to flush, which is used to determine whether we should relocate, and it is also -+ used to find a starting point for flush. A flush-scan object can scan in both right -+ and left directions via the scan_left() and scan_right() interfaces. The -+ right- and left-variations are similar but perform different functions. When scanning -+ left we (optionally perform rapid scanning and then) longterm-lock the endpoint node. -+ When scanning right we are simply counting the number of adjacent, dirty nodes. */ -+struct flush_scan { -+ -+ /* The current number of nodes scanned on this level. 
*/ -+ unsigned count; -+ -+ /* There may be a maximum number of nodes for a scan on any single level. When -+ going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */ -+ unsigned max_count; -+ -+ /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */ -+ sideof direction; -+ -+ /* Initially @stop is set to false then set true once some condition stops the -+ search (e.g., we found a clean node before reaching max_count or we found a -+ node belonging to another atom). */ -+ int stop; -+ -+ /* The current scan position. If @node is non-NULL then its reference count has -+ been incremented to reflect this reference. */ -+ jnode *node; -+ -+ /* A handle for zload/zrelse of current scan position node. */ -+ load_count node_load; -+ -+ /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the -+ node is locked using this lock handle. The endpoint needs to be locked for -+ transfer to the flush_position object after scanning finishes. */ -+ lock_handle node_lock; -+ -+ /* When the position is unformatted, its parent, coordinate, and parent -+ zload/zrelse handle. */ -+ lock_handle parent_lock; -+ coord_t parent_coord; -+ load_count parent_load; -+ -+ /* The block allocator preceder hint. Sometimes flush_scan determines what the -+ preceder is and if so it sets it here, after which it is copied into the -+ flush_position. Otherwise, the preceder is computed later. 
*/ -+ reiser4_block_nr preceder_blk; -+}; -+ -+struct convert_item_info { -+ dc_item_stat d_cur; /* disk cluster state of the current item */ -+ dc_item_stat d_next; /* disk cluster state of the next slum item */ -+ int cluster_shift; /* disk cluster shift */ -+ flow_t flow; /* disk cluster data */ -+}; -+ -+struct convert_info { -+ int count; /* for squalloc terminating */ -+ item_plugin *iplug; /* current item plugin */ -+ struct convert_item_info *itm; /* current item info */ -+ struct cluster_handle clust; /* transform cluster */ -+}; -+ -+typedef enum flush_position_state { -+ POS_INVALID, /* Invalid or stopped pos, do not continue slum -+ * processing */ -+ POS_ON_LEAF, /* pos points to already prepped, locked formatted node at -+ * leaf level */ -+ POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used -+ * to traverse unformatted nodes */ -+ POS_TO_LEAF, /* pos is being moved to leaf level */ -+ POS_TO_TWIG, /* pos is being moved to twig level */ -+ POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is after -+ * rightmost unit of the current twig */ -+ POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */ -+} flushpos_state_t; -+ -+/* An encapsulation of the current flush point and all the parameters that are passed -+ through the entire squeeze-and-allocate stage of the flush routine. A single -+ flush_position object is constructed after left- and right-scanning finishes. */ -+struct flush_position { -+ flushpos_state_t state; -+ -+ coord_t coord; /* coord to traverse unformatted nodes */ -+ lock_handle lock; /* current lock we hold */ -+ load_count load; /* load status for current locked formatted node */ -+ -+ jnode *child; /* for passing a reference to unformatted child -+ * across pos state changes */ -+ -+ reiser4_blocknr_hint preceder; /* The flush 'hint' state. */ -+ int leaf_relocate; /* True if enough leaf-level nodes were -+ * found to suggest a relocate policy. 
*/ -+ int alloc_cnt; /* The number of nodes allocated during squeeze and allococate. */ -+ int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */ -+ flush_queue_t *fq; -+ long *nr_written; /* number of nodes submitted to disk */ -+ int flags; /* a copy of jnode_flush flags argument */ -+ -+ znode *prev_twig; /* previous parent pointer value, used to catch -+ * processing of new twig node */ -+ struct convert_info *sq; /* convert info */ -+ -+ unsigned long pos_in_unit; /* for extents only. Position -+ within an extent unit of first -+ jnode of slum */ -+ long nr_to_write; /* number of unformatted nodes to handle on flush */ -+}; -+ -+static inline int item_convert_count(flush_pos_t * pos) -+{ -+ return pos->sq->count; -+} -+static inline void inc_item_convert_count(flush_pos_t * pos) -+{ -+ pos->sq->count++; -+} -+static inline void set_item_convert_count(flush_pos_t * pos, int count) -+{ -+ pos->sq->count = count; -+} -+static inline item_plugin *item_convert_plug(flush_pos_t * pos) -+{ -+ return pos->sq->iplug; -+} -+ -+static inline struct convert_info *convert_data(flush_pos_t * pos) -+{ -+ return pos->sq; -+} -+ -+static inline struct convert_item_info *item_convert_data(flush_pos_t * pos) -+{ -+ assert("edward-955", convert_data(pos)); -+ return pos->sq->itm; -+} -+ -+static inline struct tfm_cluster * tfm_cluster_sq(flush_pos_t * pos) -+{ -+ return &pos->sq->clust.tc; -+} -+ -+static inline struct tfm_stream * tfm_stream_sq(flush_pos_t * pos, -+ tfm_stream_id id) -+{ -+ assert("edward-854", pos->sq != NULL); -+ return get_tfm_stream(tfm_cluster_sq(pos), id); -+} -+ -+static inline int chaining_data_present(flush_pos_t * pos) -+{ -+ return convert_data(pos) && item_convert_data(pos); -+} -+ -+/* Returns true if next node contains next item of the disk cluster -+ so item convert data should be moved to the right slum neighbor. 
-+*/ -+static inline int should_chain_next_node(flush_pos_t * pos) -+{ -+ int result = 0; -+ -+ assert("edward-1007", chaining_data_present(pos)); -+ -+ switch (item_convert_data(pos)->d_next) { -+ case DC_CHAINED_ITEM: -+ result = 1; -+ break; -+ case DC_AFTER_CLUSTER: -+ break; -+ default: -+ impossible("edward-1009", "bad state of next slum item"); -+ } -+ return result; -+} -+ -+/* update item state in a disk cluster to assign conversion mode */ -+static inline void -+move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ ) -+{ -+ -+ assert("edward-1010", chaining_data_present(pos)); -+ -+ if (this_node == 0) { -+ /* next item is on the right neighbor */ -+ assert("edward-1011", -+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM || -+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM); -+ assert("edward-1012", -+ item_convert_data(pos)->d_next == DC_CHAINED_ITEM); -+ -+ item_convert_data(pos)->d_cur = DC_CHAINED_ITEM; -+ item_convert_data(pos)->d_next = DC_INVALID_STATE; -+ } else { -+ /* next item is on the same node */ -+ assert("edward-1013", -+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM || -+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM); -+ assert("edward-1227", -+ item_convert_data(pos)->d_next == DC_AFTER_CLUSTER || -+ item_convert_data(pos)->d_next == DC_INVALID_STATE); -+ -+ item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER; -+ item_convert_data(pos)->d_next = DC_INVALID_STATE; -+ } -+} -+ -+static inline int should_convert_node(flush_pos_t * pos, znode * node) -+{ -+ return znode_convertible(node); -+} -+ -+/* true if there is attached convert item info */ -+static inline int should_convert_next_node(flush_pos_t * pos) -+{ -+ return convert_data(pos) && item_convert_data(pos); -+} -+ -+#define SQUALLOC_THRESHOLD 256 -+ -+static inline int should_terminate_squalloc(flush_pos_t * pos) -+{ -+ return convert_data(pos) && -+ !item_convert_data(pos) && -+ item_convert_count(pos) >= SQUALLOC_THRESHOLD; -+} -+ -+#if 1 -+#define 
check_convert_info(pos) \ -+do { \ -+ if (unlikely(should_convert_next_node(pos))){ \ -+ warning("edward-1006", "unprocessed chained data"); \ -+ printk("d_cur = %d, d_next = %d, flow.len = %llu\n", \ -+ item_convert_data(pos)->d_cur, \ -+ item_convert_data(pos)->d_next, \ -+ item_convert_data(pos)->flow.length); \ -+ } \ -+} while (0) -+#else -+#define check_convert_info(pos) -+#endif /* REISER4_DEBUG */ -+ -+void free_convert_data(flush_pos_t * pos); -+/* used in extent.c */ -+int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size, -+ const coord_t * parent); -+int reiser4_scan_finished(flush_scan * scan); -+int reiser4_scanning_left(flush_scan * scan); -+int reiser4_scan_goto(flush_scan * scan, jnode * tonode); -+txn_atom *atom_locked_by_fq(flush_queue_t * fq); -+int reiser4_alloc_extent(flush_pos_t *flush_pos); -+squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *, -+ reiser4_key *stop_key); -+extern int reiser4_init_fqs(void); -+extern void reiser4_done_fqs(void); -+ -+#if REISER4_DEBUG -+ -+extern void reiser4_check_fq(const txn_atom *atom); -+extern atomic_t flush_cnt; -+ -+#define check_preceder(blk) \ -+assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb())); -+extern void check_pos(flush_pos_t * pos); -+#else -+#define check_preceder(b) noop -+#define check_pos(pos) noop -+#endif -+ -+/* __REISER4_FLUSH_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 90 -+ LocalWords: preceder -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/flush_queue.c linux-2.6.24/fs/reiser4/flush_queue.c ---- linux-2.6.24.orig/fs/reiser4/flush_queue.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/flush_queue.c 2008-01-25 11:54:46.665843146 +0300 -@@ -0,0 +1,674 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "debug.h" -+#include "super.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "page_cache.h" -+#include "wander.h" -+#include "vfs_ops.h" -+#include "writeout.h" -+#include "flush.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+/* A flush queue object is an accumulator for keeping jnodes prepared -+ by the jnode_flush() function for writing to disk. Those "queued" jnodes are -+ kept on the flush queue until memory pressure or atom commit asks -+ flush queues to write some or all from their jnodes. */ -+ -+/* -+ LOCKING: -+ -+ fq->guard spin lock protects fq->atom pointer and nothing else. fq->prepped -+ list protected by atom spin lock. fq->prepped list uses the following -+ locking: -+ -+ two ways to protect fq->prepped list for read-only list traversal: -+ -+ 1. atom spin-lock atom. -+ 2. fq is IN_USE, atom->nr_running_queues increased. -+ -+ and one for list modification: -+ -+ 1. atom is spin-locked and one condition is true: fq is IN_USE or -+ atom->nr_running_queues == 0. -+ -+ The deadlock-safe order for flush queues and atoms is: first lock atom, then -+ lock flush queue, then lock jnode. 
-+*/ -+ -+#define fq_in_use(fq) ((fq)->state & FQ_IN_USE) -+#define fq_ready(fq) (!fq_in_use(fq)) -+ -+#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0) -+#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0) -+ -+/* get lock on atom from locked flush queue object */ -+static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq) -+{ -+ /* This code is similar to jnode_get_atom(), look at it for the -+ * explanation. */ -+ txn_atom *atom; -+ -+ assert_spin_locked(&(fq->guard)); -+ -+ while (1) { -+ atom = fq->atom; -+ if (atom == NULL) -+ break; -+ -+ if (spin_trylock_atom(atom)) -+ break; -+ -+ atomic_inc(&atom->refcount); -+ spin_unlock(&(fq->guard)); -+ spin_lock_atom(atom); -+ spin_lock(&(fq->guard)); -+ -+ if (fq->atom == atom) { -+ atomic_dec(&atom->refcount); -+ break; -+ } -+ -+ spin_unlock(&(fq->guard)); -+ atom_dec_and_unlock(atom); -+ spin_lock(&(fq->guard)); -+ } -+ -+ return atom; -+} -+ -+txn_atom *atom_locked_by_fq(flush_queue_t * fq) -+{ -+ txn_atom *atom; -+ -+ spin_lock(&(fq->guard)); -+ atom = atom_locked_by_fq_nolock(fq); -+ spin_unlock(&(fq->guard)); -+ return atom; -+} -+ -+static void init_fq(flush_queue_t * fq) -+{ -+ memset(fq, 0, sizeof *fq); -+ -+ atomic_set(&fq->nr_submitted, 0); -+ -+ INIT_LIST_HEAD(ATOM_FQ_LIST(fq)); -+ -+ init_waitqueue_head(&fq->wait); -+ spin_lock_init(&fq->guard); -+} -+ -+/* slab for flush queues */ -+static struct kmem_cache *fq_slab; -+ -+/** -+ * reiser4_init_fqs - create flush queue cache -+ * -+ * Initializes slab cache of flush queues. It is part of reiser4 module -+ * initialization. -+ */ -+int reiser4_init_fqs(void) -+{ -+ fq_slab = kmem_cache_create("fq", -+ sizeof(flush_queue_t), -+ 0, SLAB_HWCACHE_ALIGN, NULL); -+ if (fq_slab == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+/** -+ * reiser4_done_fqs - delete flush queue cache -+ * -+ * This is called on reiser4 module unloading or system shutdown. 
-+ */ -+void reiser4_done_fqs(void) -+{ -+ destroy_reiser4_cache(&fq_slab); -+} -+ -+/* create new flush queue object */ -+static flush_queue_t *create_fq(gfp_t gfp) -+{ -+ flush_queue_t *fq; -+ -+ fq = kmem_cache_alloc(fq_slab, gfp); -+ if (fq) -+ init_fq(fq); -+ -+ return fq; -+} -+ -+/* adjust atom's and flush queue's counters of queued nodes */ -+static void count_enqueued_node(flush_queue_t * fq) -+{ -+ ON_DEBUG(fq->atom->num_queued++); -+} -+ -+static void count_dequeued_node(flush_queue_t * fq) -+{ -+ assert("zam-993", fq->atom->num_queued > 0); -+ ON_DEBUG(fq->atom->num_queued--); -+} -+ -+/* attach flush queue object to the atom */ -+static void attach_fq(txn_atom *atom, flush_queue_t *fq) -+{ -+ assert_spin_locked(&(atom->alock)); -+ list_add(&fq->alink, &atom->flush_queues); -+ fq->atom = atom; -+ ON_DEBUG(atom->nr_flush_queues++); -+} -+ -+static void detach_fq(flush_queue_t * fq) -+{ -+ assert_spin_locked(&(fq->atom->alock)); -+ -+ spin_lock(&(fq->guard)); -+ list_del_init(&fq->alink); -+ assert("vs-1456", fq->atom->nr_flush_queues > 0); -+ ON_DEBUG(fq->atom->nr_flush_queues--); -+ fq->atom = NULL; -+ spin_unlock(&(fq->guard)); -+} -+ -+/* destroy flush queue object */ -+static void done_fq(flush_queue_t * fq) -+{ -+ assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq))); -+ assert("zam-766", atomic_read(&fq->nr_submitted) == 0); -+ -+ kmem_cache_free(fq_slab, fq); -+} -+ -+/* */ -+static void mark_jnode_queued(flush_queue_t * fq, jnode * node) -+{ -+ JF_SET(node, JNODE_FLUSH_QUEUED); -+ count_enqueued_node(fq); -+} -+ -+/* Putting jnode into the flush queue. Both atom and jnode should be -+ spin-locked. 
*/ -+void queue_jnode(flush_queue_t * fq, jnode * node) -+{ -+ assert_spin_locked(&(node->guard)); -+ assert("zam-713", node->atom != NULL); -+ assert_spin_locked(&(node->atom->alock)); -+ assert("zam-716", fq->atom != NULL); -+ assert("zam-717", fq->atom == node->atom); -+ assert("zam-907", fq_in_use(fq)); -+ -+ assert("zam-714", JF_ISSET(node, JNODE_DIRTY)); -+ assert("zam-826", JF_ISSET(node, JNODE_RELOC)); -+ assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); -+ assert("vs-1481", NODE_LIST(node) != FQ_LIST); -+ -+ mark_jnode_queued(fq, node); -+ list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq)); -+ -+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node), -+ FQ_LIST, 1)); -+} -+ -+/* repeatable process for waiting io completion on a flush queue object */ -+static int wait_io(flush_queue_t * fq, int *nr_io_errors) -+{ -+ assert("zam-738", fq->atom != NULL); -+ assert_spin_locked(&(fq->atom->alock)); -+ assert("zam-736", fq_in_use(fq)); -+ assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq))); -+ -+ if (atomic_read(&fq->nr_submitted) != 0) { -+ struct super_block *super; -+ -+ spin_unlock_atom(fq->atom); -+ -+ assert("nikita-3013", reiser4_schedulable()); -+ -+ super = reiser4_get_current_sb(); -+ -+ /* FIXME: this is instead of blk_run_queues() */ -+ blk_run_address_space(reiser4_get_super_fake(super)->i_mapping); -+ -+ if (!(super->s_flags & MS_RDONLY)) -+ wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0); -+ -+ /* Ask the caller to re-acquire the locks and call this -+ function again. Note: this technique is commonly used in -+ the txnmgr code. 
*/ -+ return -E_REPEAT; -+ } -+ -+ *nr_io_errors += atomic_read(&fq->nr_errors); -+ return 0; -+} -+ -+/* wait on I/O completion, re-submit dirty nodes to write */ -+static int finish_fq(flush_queue_t * fq, int *nr_io_errors) -+{ -+ int ret; -+ txn_atom *atom = fq->atom; -+ -+ assert("zam-801", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ assert("zam-762", fq_in_use(fq)); -+ -+ ret = wait_io(fq, nr_io_errors); -+ if (ret) -+ return ret; -+ -+ detach_fq(fq); -+ done_fq(fq); -+ -+ reiser4_atom_send_event(atom); -+ -+ return 0; -+} -+ -+/* wait for all i/o for given atom to be completed, actually do one iteration -+ on that and return -E_REPEAT if there more iterations needed */ -+static int finish_all_fq(txn_atom * atom, int *nr_io_errors) -+{ -+ flush_queue_t *fq; -+ -+ assert_spin_locked(&(atom->alock)); -+ -+ if (list_empty_careful(&atom->flush_queues)) -+ return 0; -+ -+ list_for_each_entry(fq, &atom->flush_queues, alink) { -+ if (fq_ready(fq)) { -+ int ret; -+ -+ mark_fq_in_use(fq); -+ assert("vs-1247", fq->owner == NULL); -+ ON_DEBUG(fq->owner = current); -+ ret = finish_fq(fq, nr_io_errors); -+ -+ if (*nr_io_errors) -+ reiser4_handle_error(); -+ -+ if (ret) { -+ reiser4_fq_put(fq); -+ return ret; -+ } -+ -+ spin_unlock_atom(atom); -+ -+ return -E_REPEAT; -+ } -+ } -+ -+ /* All flush queues are in use; atom remains locked */ -+ return -EBUSY; -+} -+ -+/* wait all i/o for current atom */ -+int current_atom_finish_all_fq(void) -+{ -+ txn_atom *atom; -+ int nr_io_errors = 0; -+ int ret = 0; -+ -+ do { -+ while (1) { -+ atom = get_current_atom_locked(); -+ ret = finish_all_fq(atom, &nr_io_errors); -+ if (ret != -EBUSY) -+ break; -+ reiser4_atom_wait_event(atom); -+ } -+ } while (ret == -E_REPEAT); -+ -+ /* we do not need locked atom after this function finishes, SUCCESS or -+ -EBUSY are two return codes when atom remains locked after -+ finish_all_fq */ -+ if (!ret) -+ spin_unlock_atom(atom); -+ -+ assert_spin_not_locked(&(atom->alock)); -+ -+ if (ret) 
-+ return ret; -+ -+ if (nr_io_errors) -+ return RETERR(-EIO); -+ -+ return 0; -+} -+ -+/* change node->atom field for all jnode from given list */ -+static void -+scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom) -+{ -+ jnode *cur; -+ -+ list_for_each_entry(cur, list, capture_link) { -+ spin_lock_jnode(cur); -+ cur->atom = atom; -+ spin_unlock_jnode(cur); -+ } -+} -+ -+/* support for atom fusion operation */ -+void reiser4_fuse_fq(txn_atom *to, txn_atom *from) -+{ -+ flush_queue_t *fq; -+ -+ assert_spin_locked(&(to->alock)); -+ assert_spin_locked(&(from->alock)); -+ -+ list_for_each_entry(fq, &from->flush_queues, alink) { -+ scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to); -+ spin_lock(&(fq->guard)); -+ fq->atom = to; -+ spin_unlock(&(fq->guard)); -+ } -+ -+ list_splice_init(&from->flush_queues, to->flush_queues.prev); -+ -+#if REISER4_DEBUG -+ to->num_queued += from->num_queued; -+ to->nr_flush_queues += from->nr_flush_queues; -+ from->nr_flush_queues = 0; -+#endif -+} -+ -+#if REISER4_DEBUG -+int atom_fq_parts_are_clean(txn_atom * atom) -+{ -+ assert("zam-915", atom != NULL); -+ return list_empty_careful(&atom->flush_queues); -+} -+#endif -+/* Bio i/o completion routine for reiser4 write operations. */ -+static void -+end_io_handler(struct bio *bio, int err) -+{ -+ int i; -+ int nr_errors = 0; -+ flush_queue_t *fq; -+ -+ assert("zam-958", bio->bi_rw & WRITE); -+ -+ if (err == -EOPNOTSUPP) -+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); -+ -+ /* we expect that bio->private is set to NULL or fq object which is used -+ * for synchronization and error counting. */ -+ fq = bio->bi_private; -+ /* Check all elements of io_vec for correct write completion. */ -+ for (i = 0; i < bio->bi_vcnt; i += 1) { -+ struct page *pg = bio->bi_io_vec[i].bv_page; -+ -+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { -+ SetPageError(pg); -+ nr_errors++; -+ } -+ -+ { -+ /* jnode WRITEBACK ("write is in progress bit") is -+ * atomically cleared here. 
*/ -+ jnode *node; -+ -+ assert("zam-736", pg != NULL); -+ assert("zam-736", PagePrivate(pg)); -+ node = jprivate(pg); -+ -+ JF_CLR(node, JNODE_WRITEBACK); -+ } -+ -+ end_page_writeback(pg); -+ page_cache_release(pg); -+ } -+ -+ if (fq) { -+ /* count i/o error in fq object */ -+ atomic_add(nr_errors, &fq->nr_errors); -+ -+ /* If all write requests registered in this "fq" are done we up -+ * the waiter. */ -+ if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted)) -+ wake_up(&fq->wait); -+ } -+ -+ bio_put(bio); -+} -+ -+/* Count I/O requests which will be submitted by @bio in given flush queues -+ @fq */ -+void add_fq_to_bio(flush_queue_t * fq, struct bio *bio) -+{ -+ bio->bi_private = fq; -+ bio->bi_end_io = end_io_handler; -+ -+ if (fq) -+ atomic_add(bio->bi_vcnt, &fq->nr_submitted); -+} -+ -+/* Move all queued nodes out from @fq->prepped list. */ -+static void release_prepped_list(flush_queue_t * fq) -+{ -+ txn_atom *atom; -+ -+ assert("zam-904", fq_in_use(fq)); -+ atom = atom_locked_by_fq(fq); -+ -+ while (!list_empty(ATOM_FQ_LIST(fq))) { -+ jnode *cur; -+ -+ cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link); -+ list_del_init(&cur->capture_link); -+ -+ count_dequeued_node(fq); -+ spin_lock_jnode(cur); -+ assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR)); -+ assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC)); -+ assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED)); -+ JF_CLR(cur, JNODE_FLUSH_QUEUED); -+ -+ if (JF_ISSET(cur, JNODE_DIRTY)) { -+ list_add_tail(&cur->capture_link, -+ ATOM_DIRTY_LIST(atom, jnode_get_level(cur))); -+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST, -+ DIRTY_LIST, 1)); -+ } else { -+ list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom)); -+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST, -+ CLEAN_LIST, 1)); -+ } -+ -+ spin_unlock_jnode(cur); -+ } -+ -+ if (--atom->nr_running_queues == 0) -+ reiser4_atom_send_event(atom); -+ -+ spin_unlock_atom(atom); -+} -+ -+/* Submit write requests for nodes on the already filled flush queue @fq. 
-+ -+ @fq: flush queue object which contains jnodes we can (and will) write. -+ @return: number of submitted blocks (>=0) if success, otherwise -- an error -+ code (<0). */ -+int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags) -+{ -+ int ret; -+ txn_atom *atom; -+ -+ while (1) { -+ atom = atom_locked_by_fq(fq); -+ assert("zam-924", atom); -+ /* do not write fq in parallel. */ -+ if (atom->nr_running_queues == 0 -+ || !(flags & WRITEOUT_SINGLE_STREAM)) -+ break; -+ reiser4_atom_wait_event(atom); -+ } -+ -+ atom->nr_running_queues++; -+ spin_unlock_atom(atom); -+ -+ ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags); -+ release_prepped_list(fq); -+ -+ return ret; -+} -+ -+/* Getting flush queue object for exclusive use by one thread. May require -+ several iterations which is indicated by -E_REPEAT return code. -+ -+ This function does not contain code for obtaining an atom lock because an -+ atom lock is obtained by different ways in different parts of reiser4, -+ usually it is current atom, but we need a possibility for getting fq for the -+ atom of given jnode. 
*/ -+static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp) -+{ -+ flush_queue_t *fq; -+ -+ assert_spin_locked(&(atom->alock)); -+ -+ fq = list_entry(atom->flush_queues.next, flush_queue_t, alink); -+ while (&atom->flush_queues != &fq->alink) { -+ spin_lock(&(fq->guard)); -+ -+ if (fq_ready(fq)) { -+ mark_fq_in_use(fq); -+ assert("vs-1246", fq->owner == NULL); -+ ON_DEBUG(fq->owner = current); -+ spin_unlock(&(fq->guard)); -+ -+ if (*new_fq) -+ done_fq(*new_fq); -+ -+ *new_fq = fq; -+ -+ return 0; -+ } -+ -+ spin_unlock(&(fq->guard)); -+ -+ fq = list_entry(fq->alink.next, flush_queue_t, alink); -+ } -+ -+ /* Use previously allocated fq object */ -+ if (*new_fq) { -+ mark_fq_in_use(*new_fq); -+ assert("vs-1248", (*new_fq)->owner == 0); -+ ON_DEBUG((*new_fq)->owner = current); -+ attach_fq(atom, *new_fq); -+ -+ return 0; -+ } -+ -+ spin_unlock_atom(atom); -+ -+ *new_fq = create_fq(gfp); -+ -+ if (*new_fq == NULL) -+ return RETERR(-ENOMEM); -+ -+ return RETERR(-E_REPEAT); -+} -+ -+int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq) -+{ -+ return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get()); -+} -+ -+/* A wrapper around reiser4_fq_by_atom for getting a flush queue -+ object for current atom, if success fq->atom remains locked. 
*/ -+flush_queue_t *get_fq_for_current_atom(void) -+{ -+ flush_queue_t *fq = NULL; -+ txn_atom *atom; -+ int ret; -+ -+ do { -+ atom = get_current_atom_locked(); -+ ret = reiser4_fq_by_atom(atom, &fq); -+ } while (ret == -E_REPEAT); -+ -+ if (ret) -+ return ERR_PTR(ret); -+ return fq; -+} -+ -+/* Releasing flush queue object after exclusive use */ -+void reiser4_fq_put_nolock(flush_queue_t *fq) -+{ -+ assert("zam-747", fq->atom != NULL); -+ assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq))); -+ mark_fq_ready(fq); -+ assert("vs-1245", fq->owner == current); -+ ON_DEBUG(fq->owner = NULL); -+} -+ -+void reiser4_fq_put(flush_queue_t * fq) -+{ -+ txn_atom *atom; -+ -+ spin_lock(&(fq->guard)); -+ atom = atom_locked_by_fq_nolock(fq); -+ -+ assert("zam-746", atom != NULL); -+ -+ reiser4_fq_put_nolock(fq); -+ reiser4_atom_send_event(atom); -+ -+ spin_unlock(&(fq->guard)); -+ spin_unlock_atom(atom); -+} -+ -+/* A part of atom object initialization related to the embedded flush queue -+ list head */ -+ -+void init_atom_fq_parts(txn_atom *atom) -+{ -+ INIT_LIST_HEAD(&atom->flush_queues); -+} -+ -+#if REISER4_DEBUG -+ -+void reiser4_check_fq(const txn_atom *atom) -+{ -+ /* check number of nodes on all atom's flush queues */ -+ flush_queue_t *fq; -+ int count; -+ struct list_head *pos; -+ -+ count = 0; -+ list_for_each_entry(fq, &atom->flush_queues, alink) { -+ spin_lock(&(fq->guard)); -+ /* calculate number of jnodes on fq' list of prepped jnodes */ -+ list_for_each(pos, ATOM_FQ_LIST(fq)) -+ count++; -+ spin_unlock(&(fq->guard)); -+ } -+ if (count != atom->fq) -+ warning("", "fq counter %d, real %d\n", atom->fq, count); -+ -+} -+ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/forward.h linux-2.6.24/fs/reiser4/forward.h ---- linux-2.6.24.orig/fs/reiser4/forward.h 1970-01-01 03:00:00.000000000 
+0300 -+++ linux-2.6.24/fs/reiser4/forward.h 2008-01-25 11:39:06.928205628 +0300 -@@ -0,0 +1,252 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Forward declarations. Thank you Kernighan. */ -+ -+#if !defined( __REISER4_FORWARD_H__ ) -+#define __REISER4_FORWARD_H__ -+ -+#include -+#include -+ -+typedef struct zlock zlock; -+typedef struct lock_stack lock_stack; -+typedef struct lock_handle lock_handle; -+typedef struct znode znode; -+typedef struct flow flow_t; -+typedef struct coord coord_t; -+typedef struct tree_access_pointer tap_t; -+typedef struct reiser4_object_create_data reiser4_object_create_data; -+typedef union reiser4_plugin reiser4_plugin; -+typedef __u16 reiser4_plugin_id; -+typedef __u64 reiser4_plugin_groups; -+typedef struct item_plugin item_plugin; -+typedef struct jnode_plugin jnode_plugin; -+typedef struct reiser4_item_data reiser4_item_data; -+typedef union reiser4_key reiser4_key; -+typedef struct reiser4_tree reiser4_tree; -+typedef struct carry_cut_data carry_cut_data; -+typedef struct carry_kill_data carry_kill_data; -+typedef struct carry_tree_op carry_tree_op; -+typedef struct carry_tree_node carry_tree_node; -+typedef struct carry_plugin_info carry_plugin_info; -+typedef struct reiser4_journal reiser4_journal; -+typedef struct txn_atom txn_atom; -+typedef struct txn_handle txn_handle; -+typedef struct txn_mgr txn_mgr; -+typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc; -+typedef struct reiser4_context reiser4_context; -+typedef struct carry_level carry_level; -+typedef struct blocknr_set_entry blocknr_set_entry; -+/* super_block->s_fs_info points to this */ -+typedef struct reiser4_super_info_data reiser4_super_info_data; -+/* next two objects are fields of reiser4_super_info_data */ -+typedef struct reiser4_oid_allocator reiser4_oid_allocator; -+typedef struct reiser4_space_allocator reiser4_space_allocator; -+ -+typedef struct flush_scan flush_scan; -+typedef struct 
flush_position flush_pos_t; -+ -+typedef unsigned short pos_in_node_t; -+#define MAX_POS_IN_NODE 65535 -+ -+typedef struct jnode jnode; -+typedef struct reiser4_blocknr_hint reiser4_blocknr_hint; -+ -+typedef struct uf_coord uf_coord_t; -+typedef struct hint hint_t; -+ -+typedef struct ktxnmgrd_context ktxnmgrd_context; -+ -+struct inode; -+struct page; -+struct file; -+struct dentry; -+struct super_block; -+ -+/* return values of coord_by_key(). cbk == coord_by_key */ -+typedef enum { -+ CBK_COORD_FOUND = 0, -+ CBK_COORD_NOTFOUND = -ENOENT, -+} lookup_result; -+ -+/* results of lookup with directory file */ -+typedef enum { -+ FILE_NAME_FOUND = 0, -+ FILE_NAME_NOTFOUND = -ENOENT, -+ FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */ -+ FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */ -+} file_lookup_result; -+ -+/* behaviors of lookup. If coord we are looking for is actually in a tree, -+ both coincide. */ -+typedef enum { -+ /* search exactly for the coord with key given */ -+ FIND_EXACT, -+ /* search for coord with the maximal key not greater than one -+ given */ -+ FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */ -+} lookup_bias; -+ -+typedef enum { -+ /* number of leaf level of the tree -+ The fake root has (tree_level=0). */ -+ LEAF_LEVEL = 1, -+ -+ /* number of level one above leaf level of the tree. -+ -+ It is supposed that internal tree used by reiser4 to store file -+ system data and meta data will have height 2 initially (when -+ created by mkfs). -+ */ -+ TWIG_LEVEL = 2, -+} tree_level; -+ -+/* The "real" maximum ztree height is the 0-origin size of any per-level -+ array, since the zero'th level is not used. */ -+#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL) -+ -+/* enumeration of possible mutual position of item and coord. This enum is -+ return type of ->is_in_item() item plugin method which see. 
*/ -+typedef enum { -+ /* coord is on the left of an item */ -+ IP_ON_THE_LEFT, -+ /* coord is inside item */ -+ IP_INSIDE, -+ /* coord is inside item, but to the right of the rightmost unit of -+ this item */ -+ IP_RIGHT_EDGE, -+ /* coord is on the right of an item */ -+ IP_ON_THE_RIGHT -+} interposition; -+ -+/* type of lock to acquire on znode before returning it to caller */ -+typedef enum { -+ ZNODE_NO_LOCK = 0, -+ ZNODE_READ_LOCK = 1, -+ ZNODE_WRITE_LOCK = 2, -+} znode_lock_mode; -+ -+/* type of lock request */ -+typedef enum { -+ ZNODE_LOCK_LOPRI = 0, -+ ZNODE_LOCK_HIPRI = (1 << 0), -+ -+ /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep -+ waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately -+ return the value -E_REPEAT. */ -+ ZNODE_LOCK_NONBLOCK = (1 << 1), -+ /* An option for longterm_lock_znode which prevents atom fusion */ -+ ZNODE_LOCK_DONT_FUSE = (1 << 2) -+} znode_lock_request; -+ -+typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op; -+ -+/* used to specify direction of shift. These must be -1 and 1 */ -+typedef enum { -+ SHIFT_LEFT = 1, -+ SHIFT_RIGHT = -1 -+} shift_direction; -+ -+typedef enum { -+ LEFT_SIDE, -+ RIGHT_SIDE -+} sideof; -+ -+#define round_up( value, order ) \ -+ ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \ -+ ~( ( order ) - 1 ) ) ) -+ -+/* values returned by squalloc_right_neighbor and its auxiliary functions */ -+typedef enum { -+ /* unit of internal item is moved */ -+ SUBTREE_MOVED = 0, -+ /* nothing else can be squeezed into left neighbor */ -+ SQUEEZE_TARGET_FULL = 1, -+ /* all content of node is squeezed into its left neighbor */ -+ SQUEEZE_SOURCE_EMPTY = 2, -+ /* one more item is copied (this is only returned by -+ allocate_and_copy_extent to squalloc_twig)) */ -+ SQUEEZE_CONTINUE = 3 -+} squeeze_result; -+ -+/* Do not change items ids. 
If you do - there will be format change */ -+typedef enum { -+ STATIC_STAT_DATA_ID = 0x0, -+ SIMPLE_DIR_ENTRY_ID = 0x1, -+ COMPOUND_DIR_ID = 0x2, -+ NODE_POINTER_ID = 0x3, -+ EXTENT_POINTER_ID = 0x5, -+ FORMATTING_ID = 0x6, -+ CTAIL_ID = 0x7, -+ BLACK_BOX_ID = 0x8, -+ LAST_ITEM_ID = 0x9 -+} item_id; -+ -+/* Flags passed to jnode_flush() to allow it to distinguish default settings based on -+ whether commit() was called or VM memory pressure was applied. */ -+typedef enum { -+ /* submit flush queue to disk at jnode_flush completion */ -+ JNODE_FLUSH_WRITE_BLOCKS = 1, -+ -+ /* flush is called for commit */ -+ JNODE_FLUSH_COMMIT = 2, -+ /* not implemented */ -+ JNODE_FLUSH_MEMORY_FORMATTED = 4, -+ -+ /* not implemented */ -+ JNODE_FLUSH_MEMORY_UNFORMATTED = 8, -+} jnode_flush_flags; -+ -+/* Flags to insert/paste carry operations. Currently they only used in -+ flushing code, but in future, they can be used to optimize for repetitive -+ accesses. */ -+typedef enum { -+ /* carry is not allowed to shift data to the left when trying to find -+ free space */ -+ COPI_DONT_SHIFT_LEFT = (1 << 0), -+ /* carry is not allowed to shift data to the right when trying to find -+ free space */ -+ COPI_DONT_SHIFT_RIGHT = (1 << 1), -+ /* carry is not allowed to allocate new node(s) when trying to find -+ free space */ -+ COPI_DONT_ALLOCATE = (1 << 2), -+ /* try to load left neighbor if its not in a cache */ -+ COPI_LOAD_LEFT = (1 << 3), -+ /* try to load right neighbor if its not in a cache */ -+ COPI_LOAD_RIGHT = (1 << 4), -+ /* shift insertion point to the left neighbor */ -+ COPI_GO_LEFT = (1 << 5), -+ /* shift insertion point to the right neighbor */ -+ COPI_GO_RIGHT = (1 << 6), -+ /* try to step back into original node if insertion into new node -+ fails after shifting data there. 
*/ -+ COPI_STEP_BACK = (1 << 7) -+} cop_insert_flag; -+ -+typedef enum { -+ SAFE_UNLINK, /* safe-link for unlink */ -+ SAFE_TRUNCATE /* safe-link for truncate */ -+} reiser4_safe_link_t; -+ -+/* this is to show on which list of atom jnode is */ -+typedef enum { -+ NOT_CAPTURED, -+ DIRTY_LIST, -+ CLEAN_LIST, -+ FQ_LIST, -+ WB_LIST, -+ OVRWR_LIST -+} atom_list; -+ -+/* __REISER4_FORWARD_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/fsdata.c linux-2.6.24/fs/reiser4/fsdata.c ---- linux-2.6.24.orig/fs/reiser4/fsdata.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/fsdata.c 2008-01-25 11:39:06.928205628 +0300 -@@ -0,0 +1,804 @@ -+/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "fsdata.h" -+#include "inode.h" -+ -+ -+/* cache or dir_cursors */ -+static struct kmem_cache *d_cursor_cache; -+ -+/* list of unused cursors */ -+static LIST_HEAD(cursor_cache); -+ -+/* number of cursors in list of ununsed cursors */ -+static unsigned long d_cursor_unused = 0; -+ -+/* spinlock protecting manipulations with dir_cursor's hash table and lists */ -+DEFINE_SPINLOCK(d_lock); -+ -+static reiser4_file_fsdata *create_fsdata(struct file *file); -+static int file_is_stateless(struct file *file); -+static void free_fsdata(reiser4_file_fsdata *fsdata); -+static void kill_cursor(dir_cursor *); -+ -+/** -+ * d_cursor_shrink - shrink callback for cache of dir_cursor-s -+ * @nr: number of objects to free -+ * @mask: GFP mask -+ * -+ * Shrinks d_cursor_cache. Scan LRU list of unused cursors, freeing requested -+ * number. Return number of still freeable cursors. 
-+ */ -+static int d_cursor_shrink(int nr, gfp_t mask) -+{ -+ if (nr != 0) { -+ dir_cursor *scan; -+ int killed; -+ -+ killed = 0; -+ spin_lock(&d_lock); -+ while (!list_empty(&cursor_cache)) { -+ scan = list_entry(cursor_cache.next, dir_cursor, alist); -+ assert("nikita-3567", scan->ref == 0); -+ kill_cursor(scan); -+ ++killed; -+ --nr; -+ if (nr == 0) -+ break; -+ } -+ spin_unlock(&d_lock); -+ } -+ return d_cursor_unused; -+} -+ -+/* -+ * actually, d_cursors are "priceless", because there is no way to -+ * recover information stored in them. On the other hand, we don't -+ * want to consume all kernel memory by them. As a compromise, just -+ * assign higher "seeks" value to d_cursor cache, so that it will be -+ * shrunk only if system is really tight on memory. -+ */ -+static struct shrinker d_cursor_shrinker = { -+ .shrink = d_cursor_shrink, -+ .seeks = DEFAULT_SEEKS << 3, -+}; -+ -+/** -+ * reiser4_init_d_cursor - create d_cursor cache -+ * -+ * Initializes slab cache of d_cursors. It is part of reiser4 module -+ * initialization. -+ */ -+int reiser4_init_d_cursor(void) -+{ -+ d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0, -+ SLAB_HWCACHE_ALIGN, NULL); -+ if (d_cursor_cache == NULL) -+ return RETERR(-ENOMEM); -+ -+ register_shrinker(&d_cursor_shrinker); -+ return 0; -+} -+ -+/** -+ * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker -+ * -+ * This is called on reiser4 module unloading or system shutdown. 
-+ */ -+void reiser4_done_d_cursor(void) -+{ -+ unregister_shrinker(&d_cursor_shrinker); -+ -+ destroy_reiser4_cache(&d_cursor_cache); -+} -+ -+#define D_CURSOR_TABLE_SIZE (256) -+ -+static inline unsigned long -+d_cursor_hash(d_cursor_hash_table *table, const struct d_cursor_key *key) -+{ -+ assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE)); -+ return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1); -+} -+ -+static inline int d_cursor_eq(const struct d_cursor_key *k1, -+ const struct d_cursor_key *k2) -+{ -+ return k1->cid == k2->cid && k1->oid == k2->oid; -+} -+ -+/* -+ * define functions to manipulate reiser4 super block's hash table of -+ * dir_cursors -+ */ -+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get()) -+#define KFREE(ptr, size) kfree(ptr) -+TYPE_SAFE_HASH_DEFINE(d_cursor, -+ dir_cursor, -+ struct d_cursor_key, -+ key, hash, d_cursor_hash, d_cursor_eq); -+#undef KFREE -+#undef KMALLOC -+ -+/** -+ * reiser4_init_super_d_info - initialize per-super-block d_cursor resources -+ * @super: super block to initialize -+ * -+ * Initializes per-super-block d_cursor's hash table and radix tree. It is part -+ * of mount. -+ */ -+int reiser4_init_super_d_info(struct super_block *super) -+{ -+ struct d_cursor_info *p; -+ -+ p = &get_super_private(super)->d_info; -+ -+ INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get()); -+ return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE); -+} -+ -+/** -+ * reiser4_done_super_d_info - release per-super-block d_cursor resources -+ * @super: super block being umounted -+ * -+ * It is called on umount. Kills all directory cursors attached to suoer block. 
-+ */ -+void reiser4_done_super_d_info(struct super_block *super) -+{ -+ struct d_cursor_info *d_info; -+ dir_cursor *cursor, *next; -+ -+ d_info = &get_super_private(super)->d_info; -+ for_all_in_htable(&d_info->table, d_cursor, cursor, next) -+ kill_cursor(cursor); -+ -+ BUG_ON(d_info->tree.rnode != NULL); -+ d_cursor_hash_done(&d_info->table); -+} -+ -+/** -+ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it -+ * @cursor: cursor to free -+ * -+ * Removes reiser4_file_fsdata attached to @cursor from readdir list of -+ * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from from -+ * indices, hash table, list of unused cursors and frees it. -+ */ -+static void kill_cursor(dir_cursor *cursor) -+{ -+ unsigned long index; -+ -+ assert("nikita-3566", cursor->ref == 0); -+ assert("nikita-3572", cursor->fsdata != NULL); -+ -+ index = (unsigned long)cursor->key.oid; -+ list_del_init(&cursor->fsdata->dir.linkage); -+ free_fsdata(cursor->fsdata); -+ cursor->fsdata = NULL; -+ -+ if (list_empty_careful(&cursor->list)) -+ /* this is last cursor for a file. Kill radix-tree entry */ -+ radix_tree_delete(&cursor->info->tree, index); -+ else { -+ void **slot; -+ -+ /* -+ * there are other cursors for the same oid. -+ */ -+ -+ /* -+ * if radix tree point to the cursor being removed, re-target -+ * radix tree slot to the next cursor in the (non-empty as was -+ * checked above) element of the circular list of all cursors -+ * for this oid. 
-+ */ -+ slot = radix_tree_lookup_slot(&cursor->info->tree, index); -+ assert("nikita-3571", *slot != NULL); -+ if (*slot == cursor) -+ *slot = list_entry(cursor->list.next, dir_cursor, list); -+ /* remove cursor from circular list */ -+ list_del_init(&cursor->list); -+ } -+ /* remove cursor from the list of unused cursors */ -+ list_del_init(&cursor->alist); -+ /* remove cursor from the hash table */ -+ d_cursor_hash_remove(&cursor->info->table, cursor); -+ /* and free it */ -+ kmem_cache_free(d_cursor_cache, cursor); -+ --d_cursor_unused; -+} -+ -+/* possible actions that can be performed on all cursors for the given file */ -+enum cursor_action { -+ /* -+ * load all detached state: this is called when stat-data is loaded -+ * from the disk to recover information about all pending readdirs -+ */ -+ CURSOR_LOAD, -+ /* -+ * detach all state from inode, leaving it in the cache. This is called -+ * when inode is removed form the memory by memory pressure -+ */ -+ CURSOR_DISPOSE, -+ /* -+ * detach cursors from the inode, and free them. This is called when -+ * inode is destroyed -+ */ -+ CURSOR_KILL -+}; -+ -+/* -+ * return d_cursor data for the file system @inode is in. -+ */ -+static inline struct d_cursor_info *d_info(struct inode *inode) -+{ -+ return &get_super_private(inode->i_sb)->d_info; -+} -+ -+/* -+ * lookup d_cursor in the per-super-block radix tree. -+ */ -+static inline dir_cursor *lookup(struct d_cursor_info * info, -+ unsigned long index) -+{ -+ return (dir_cursor *) radix_tree_lookup(&info->tree, index); -+} -+ -+/* -+ * attach @cursor to the radix tree. There may be multiple cursors for the -+ * same oid, they are chained into circular list. 
-+ */ -+static void bind_cursor(dir_cursor * cursor, unsigned long index) -+{ -+ dir_cursor *head; -+ -+ head = lookup(cursor->info, index); -+ if (head == NULL) { -+ /* this is the first cursor for this index */ -+ INIT_LIST_HEAD(&cursor->list); -+ radix_tree_insert(&cursor->info->tree, index, cursor); -+ } else { -+ /* some cursor already exists. Chain ours */ -+ list_add(&cursor->list, &head->list); -+ } -+} -+ -+/* -+ * detach fsdata (if detachable) from file descriptor, and put cursor on the -+ * "unused" list. Called when file descriptor is not longer in active use. -+ */ -+static void clean_fsdata(struct file *file) -+{ -+ dir_cursor *cursor; -+ reiser4_file_fsdata *fsdata; -+ -+ assert("nikita-3570", file_is_stateless(file)); -+ -+ fsdata = (reiser4_file_fsdata *) file->private_data; -+ if (fsdata != NULL) { -+ cursor = fsdata->cursor; -+ if (cursor != NULL) { -+ spin_lock(&d_lock); -+ --cursor->ref; -+ if (cursor->ref == 0) { -+ list_add_tail(&cursor->alist, &cursor_cache); -+ ++d_cursor_unused; -+ } -+ spin_unlock(&d_lock); -+ file->private_data = NULL; -+ } -+ } -+} -+ -+/* -+ * global counter used to generate "client ids". These ids are encoded into -+ * high bits of fpos. -+ */ -+static __u32 cid_counter = 0; -+#define CID_SHIFT (20) -+#define CID_MASK (0xfffffull) -+ -+static void free_file_fsdata_nolock(struct file *); -+ -+/** -+ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table -+ * @cursor: -+ * @file: -+ * @inode: -+ * -+ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to -+ * reiser4 super block's hash table and radix tree. -+ add detachable readdir -+ * state to the @f -+ */ -+static int insert_cursor(dir_cursor *cursor, struct file *file, -+ struct inode *inode) -+{ -+ int result; -+ reiser4_file_fsdata *fsdata; -+ -+ memset(cursor, 0, sizeof *cursor); -+ -+ /* this is either first call to readdir, or rewind. Anyway, create new -+ * cursor. 
*/ -+ fsdata = create_fsdata(NULL); -+ if (fsdata != NULL) { -+ result = radix_tree_preload(reiser4_ctx_gfp_mask_get()); -+ if (result == 0) { -+ struct d_cursor_info *info; -+ oid_t oid; -+ -+ info = d_info(inode); -+ oid = get_inode_oid(inode); -+ /* cid occupies higher 12 bits of f->f_pos. Don't -+ * allow it to become negative: this confuses -+ * nfsd_readdir() */ -+ cursor->key.cid = (++cid_counter) & 0x7ff; -+ cursor->key.oid = oid; -+ cursor->fsdata = fsdata; -+ cursor->info = info; -+ cursor->ref = 1; -+ -+ spin_lock_inode(inode); -+ /* install cursor as @f's private_data, discarding old -+ * one if necessary */ -+#if REISER4_DEBUG -+ if (file->private_data) -+ warning("", "file has fsdata already"); -+#endif -+ clean_fsdata(file); -+ free_file_fsdata_nolock(file); -+ file->private_data = fsdata; -+ fsdata->cursor = cursor; -+ spin_unlock_inode(inode); -+ spin_lock(&d_lock); -+ /* insert cursor into hash table */ -+ d_cursor_hash_insert(&info->table, cursor); -+ /* and chain it into radix-tree */ -+ bind_cursor(cursor, (unsigned long)oid); -+ spin_unlock(&d_lock); -+ radix_tree_preload_end(); -+ file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT; -+ } -+ } else -+ result = RETERR(-ENOMEM); -+ return result; -+} -+ -+/** -+ * process_cursors - do action on each cursor attached to inode -+ * @inode: -+ * @act: action to do -+ * -+ * Finds all cursors of @inode in reiser4's super block radix tree of cursors -+ * and performs action specified by @act on each of cursors. 
-+ */ -+static void process_cursors(struct inode *inode, enum cursor_action act) -+{ -+ oid_t oid; -+ dir_cursor *start; -+ struct list_head *head; -+ reiser4_context *ctx; -+ struct d_cursor_info *info; -+ -+ /* this can be called by -+ * -+ * kswapd->...->prune_icache->..reiser4_destroy_inode -+ * -+ * without reiser4_context -+ */ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ warning("vs-23", "failed to init context"); -+ return; -+ } -+ -+ assert("nikita-3558", inode != NULL); -+ -+ info = d_info(inode); -+ oid = get_inode_oid(inode); -+ spin_lock_inode(inode); -+ head = get_readdir_list(inode); -+ spin_lock(&d_lock); -+ /* find any cursor for this oid: reference to it is hanging of radix -+ * tree */ -+ start = lookup(info, (unsigned long)oid); -+ if (start != NULL) { -+ dir_cursor *scan; -+ reiser4_file_fsdata *fsdata; -+ -+ /* process circular list of cursors for this oid */ -+ scan = start; -+ do { -+ dir_cursor *next; -+ -+ next = list_entry(scan->list.next, dir_cursor, list); -+ fsdata = scan->fsdata; -+ assert("nikita-3557", fsdata != NULL); -+ if (scan->key.oid == oid) { -+ switch (act) { -+ case CURSOR_DISPOSE: -+ list_del_init(&fsdata->dir.linkage); -+ break; -+ case CURSOR_LOAD: -+ list_add(&fsdata->dir.linkage, head); -+ break; -+ case CURSOR_KILL: -+ kill_cursor(scan); -+ break; -+ } -+ } -+ if (scan == next) -+ /* last cursor was just killed */ -+ break; -+ scan = next; -+ } while (scan != start); -+ } -+ spin_unlock(&d_lock); -+ /* check that we killed 'em all */ -+ assert("nikita-3568", -+ ergo(act == CURSOR_KILL, -+ list_empty_careful(get_readdir_list(inode)))); -+ assert("nikita-3569", -+ ergo(act == CURSOR_KILL, lookup(info, oid) == NULL)); -+ spin_unlock_inode(inode); -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_dispose_cursors - removes cursors from inode's list -+ * @inode: inode to dispose cursors of -+ * -+ * For each of cursors corresponding to @inode - removes reiser4_file_fsdata -+ * attached to 
cursor from inode's readdir list. This is called when inode is -+ * removed from the memory by memory pressure. -+ */ -+void reiser4_dispose_cursors(struct inode *inode) -+{ -+ process_cursors(inode, CURSOR_DISPOSE); -+} -+ -+/** -+ * reiser4_load_cursors - attach cursors to inode -+ * @inode: inode to load cursors to -+ * -+ * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata -+ * attached to cursor to inode's readdir list. This is done when inode is -+ * loaded into memory. -+ */ -+void reiser4_load_cursors(struct inode *inode) -+{ -+ process_cursors(inode, CURSOR_LOAD); -+} -+ -+/** -+ * reiser4_kill_cursors - kill all inode cursors -+ * @inode: inode to kill cursors of -+ * -+ * Frees all cursors for this inode. This is called when inode is destroyed. -+ */ -+void reiser4_kill_cursors(struct inode *inode) -+{ -+ process_cursors(inode, CURSOR_KILL); -+} -+ -+/** -+ * file_is_stateless - -+ * @file: -+ * -+ * true, if file descriptor @f is created by NFS server by "demand" to serve -+ * one file system operation. This means that there may be "detached state" -+ * for underlying inode. -+ */ -+static int file_is_stateless(struct file *file) -+{ -+ return reiser4_get_dentry_fsdata(file->f_dentry)->stateless; -+} -+ -+/** -+ * reiser4_get_dir_fpos - -+ * @dir: -+ * -+ * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but -+ * in the case of stateless directory operation (readdir-over-nfs), client id -+ * was encoded in the high bits of cookie and should me masked off. -+ */ -+loff_t reiser4_get_dir_fpos(struct file *dir) -+{ -+ if (file_is_stateless(dir)) -+ return dir->f_pos & CID_MASK; -+ else -+ return dir->f_pos; -+} -+ -+/** -+ * reiser4_attach_fsdata - try to attach fsdata -+ * @file: -+ * @inode: -+ * -+ * Finds or creates cursor for readdir-over-nfs. 
-+ */ -+int reiser4_attach_fsdata(struct file *file, struct inode *inode) -+{ -+ loff_t pos; -+ int result; -+ dir_cursor *cursor; -+ -+ /* -+ * we are serialized by inode->i_mutex -+ */ -+ if (!file_is_stateless(file)) -+ return 0; -+ -+ pos = file->f_pos; -+ result = 0; -+ if (pos == 0) { -+ /* -+ * first call to readdir (or rewind to the beginning of -+ * directory) -+ */ -+ cursor = kmem_cache_alloc(d_cursor_cache, -+ reiser4_ctx_gfp_mask_get()); -+ if (cursor != NULL) -+ result = insert_cursor(cursor, file, inode); -+ else -+ result = RETERR(-ENOMEM); -+ } else { -+ /* try to find existing cursor */ -+ struct d_cursor_key key; -+ -+ key.cid = pos >> CID_SHIFT; -+ key.oid = get_inode_oid(inode); -+ spin_lock(&d_lock); -+ cursor = d_cursor_hash_find(&d_info(inode)->table, &key); -+ if (cursor != NULL) { -+ /* cursor was found */ -+ if (cursor->ref == 0) { -+ /* move it from unused list */ -+ list_del_init(&cursor->alist); -+ --d_cursor_unused; -+ } -+ ++cursor->ref; -+ } -+ spin_unlock(&d_lock); -+ if (cursor != NULL) { -+ spin_lock_inode(inode); -+ assert("nikita-3556", cursor->fsdata->back == NULL); -+ clean_fsdata(file); -+ free_file_fsdata_nolock(file); -+ file->private_data = cursor->fsdata; -+ spin_unlock_inode(inode); -+ } -+ } -+ return result; -+} -+ -+/** -+ * reiser4_detach_fsdata - ??? -+ * @file: -+ * -+ * detach fsdata, if necessary -+ */ -+void reiser4_detach_fsdata(struct file *file) -+{ -+ struct inode *inode; -+ -+ if (!file_is_stateless(file)) -+ return; -+ -+ inode = file->f_dentry->d_inode; -+ spin_lock_inode(inode); -+ clean_fsdata(file); -+ spin_unlock_inode(inode); -+} -+ -+/* slab for reiser4_dentry_fsdata */ -+static struct kmem_cache *dentry_fsdata_cache; -+ -+/** -+ * reiser4_init_dentry_fsdata - create cache of dentry_fsdata -+ * -+ * Initializes slab cache of structures attached to denty->d_fsdata. It is -+ * part of reiser4 module initialization. 
-+ */ -+int reiser4_init_dentry_fsdata(void) -+{ -+ dentry_fsdata_cache = kmem_cache_create("dentry_fsdata", -+ sizeof(struct reiser4_dentry_fsdata), -+ 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, -+ NULL); -+ if (dentry_fsdata_cache == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+/** -+ * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void reiser4_done_dentry_fsdata(void) -+{ -+ destroy_reiser4_cache(&dentry_fsdata_cache); -+} -+ -+/** -+ * reiser4_get_dentry_fsdata - get fs-specific dentry data -+ * @dentry: queried dentry -+ * -+ * Allocates if necessary and returns per-dentry data that we attach to each -+ * dentry. -+ */ -+struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry) -+{ -+ assert("nikita-1365", dentry != NULL); -+ -+ if (dentry->d_fsdata == NULL) { -+ dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache, -+ reiser4_ctx_gfp_mask_get()); -+ if (dentry->d_fsdata == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ memset(dentry->d_fsdata, 0, -+ sizeof(struct reiser4_dentry_fsdata)); -+ } -+ return dentry->d_fsdata; -+} -+ -+/** -+ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata -+ * @dentry: dentry to free fsdata of -+ * -+ * Detaches and frees fs-specific dentry data -+ */ -+void reiser4_free_dentry_fsdata(struct dentry *dentry) -+{ -+ if (dentry->d_fsdata != NULL) { -+ kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata); -+ dentry->d_fsdata = NULL; -+ } -+} -+ -+/* slab for reiser4_file_fsdata */ -+static struct kmem_cache *file_fsdata_cache; -+ -+/** -+ * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata -+ * -+ * Initializes slab cache of structures attached to file->private_data. It is -+ * part of reiser4 module initialization. 
-+ */ -+int reiser4_init_file_fsdata(void) -+{ -+ file_fsdata_cache = kmem_cache_create("file_fsdata", -+ sizeof(reiser4_file_fsdata), -+ 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, NULL); -+ if (file_fsdata_cache == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+/** -+ * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void reiser4_done_file_fsdata(void) -+{ -+ destroy_reiser4_cache(&file_fsdata_cache); -+} -+ -+/** -+ * create_fsdata - allocate and initialize reiser4_file_fsdata -+ * @file: what to create file_fsdata for, may be NULL -+ * -+ * Allocates and initializes reiser4_file_fsdata structure. -+ */ -+static reiser4_file_fsdata *create_fsdata(struct file *file) -+{ -+ reiser4_file_fsdata *fsdata; -+ -+ fsdata = kmem_cache_alloc(file_fsdata_cache, -+ reiser4_ctx_gfp_mask_get()); -+ if (fsdata != NULL) { -+ memset(fsdata, 0, sizeof *fsdata); -+ fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024; -+ fsdata->back = file; -+ INIT_LIST_HEAD(&fsdata->dir.linkage); -+ } -+ return fsdata; -+} -+ -+/** -+ * free_fsdata - free reiser4_file_fsdata -+ * @fsdata: object to free -+ * -+ * Dual to create_fsdata(). Free reiser4_file_fsdata. -+ */ -+static void free_fsdata(reiser4_file_fsdata *fsdata) -+{ -+ BUG_ON(fsdata == NULL); -+ kmem_cache_free(file_fsdata_cache, fsdata); -+} -+ -+/** -+ * reiser4_get_file_fsdata - get fs-specific file data -+ * @file: queried file -+ * -+ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches -+ * to @file. 
-+ */ -+reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file) -+{ -+ assert("nikita-1603", file != NULL); -+ -+ if (file->private_data == NULL) { -+ reiser4_file_fsdata *fsdata; -+ struct inode *inode; -+ -+ fsdata = create_fsdata(file); -+ if (fsdata == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ inode = file->f_dentry->d_inode; -+ spin_lock_inode(inode); -+ if (file->private_data == NULL) { -+ file->private_data = fsdata; -+ fsdata = NULL; -+ } -+ spin_unlock_inode(inode); -+ if (fsdata != NULL) -+ /* other thread initialized ->fsdata */ -+ kmem_cache_free(file_fsdata_cache, fsdata); -+ } -+ assert("nikita-2665", file->private_data != NULL); -+ return file->private_data; -+} -+ -+/** -+ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata -+ * @file: -+ * -+ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from -+ * readdir list, frees if it is not linked to d_cursor object. -+ */ -+static void free_file_fsdata_nolock(struct file *file) -+{ -+ reiser4_file_fsdata *fsdata; -+ -+ assert("", spin_inode_is_locked(file->f_dentry->d_inode)); -+ fsdata = file->private_data; -+ if (fsdata != NULL) { -+ list_del_init(&fsdata->dir.linkage); -+ if (fsdata->cursor == NULL) -+ free_fsdata(fsdata); -+ } -+ file->private_data = NULL; -+} -+ -+/** -+ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata -+ * @file: -+ * -+ * Spinlocks inode and calls free_file_fsdata_nolock to do the work. 
-+ */ -+void reiser4_free_file_fsdata(struct file *file) -+{ -+ spin_lock_inode(file->f_dentry->d_inode); -+ free_file_fsdata_nolock(file); -+ spin_unlock_inode(file->f_dentry->d_inode); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/fsdata.h linux-2.6.24/fs/reiser4/fsdata.h ---- linux-2.6.24.orig/fs/reiser4/fsdata.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/fsdata.h 2008-01-25 11:39:06.928205628 +0300 -@@ -0,0 +1,205 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#if !defined( __REISER4_FSDATA_H__ ) -+#define __REISER4_FSDATA_H__ -+ -+#include "debug.h" -+#include "kassign.h" -+#include "seal.h" -+#include "type_safe_hash.h" -+#include "plugin/file/file.h" -+#include "readahead.h" -+ -+/* -+ * comment about reiser4_dentry_fsdata -+ * -+ * -+ */ -+ -+/* -+ * locking: fields of per file descriptor readdir_pos and ->f_pos are -+ * protected by ->i_mutex on inode. Under this lock following invariant -+ * holds: -+ * -+ * file descriptor is "looking" at the entry_no-th directory entry from -+ * the beginning of directory. This entry has key dir_entry_key and is -+ * pos-th entry with duplicate-key sequence. -+ * -+ */ -+ -+/* logical position within directory */ -+struct dir_pos { -+ /* key of directory entry (actually, part of a key sufficient to -+ identify directory entry) */ -+ de_id dir_entry_key; -+ /* ordinal number of directory entry among all entries with the same -+ key. (Starting from 0.) 
*/
	unsigned pos;
};

/*
 * readdir position of a file descriptor: the same place in the directory
 * expressed as an f_pos value, as a logical (key, ordinal) position and as
 * an entry number.
 */
struct readdir_pos {
	/* f_pos corresponding to this readdir position */
	__u64 fpos;
	/* logical position within directory */
	struct dir_pos position;
	/* logical number of directory entry within
	   directory */
	__u64 entry_no;
};

/*
 * this is used to speed up lookups for directory entry: on initial call to
 * ->lookup() seal and coord of directory entry (if found, that is) are stored
 * in struct dentry and reused later to avoid tree traversals.
 */
struct de_location {
	/* seal covering directory entry */
	seal_t entry_seal;
	/* coord of directory entry */
	coord_t entry_coord;
	/* ordinal number of directory entry among all entries with the same
	   key. (Starting from 0.) Note: declared int here, while
	   dir_pos.pos is unsigned. */
	int pos;
};

/**
 * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
 *
 * This is allocated dynamically and released in d_op->d_release()
 *
 * Currently it only contains cached location (hint) of directory entry, but
 * it is expected that other information will be accumulated here.
 */
struct reiser4_dentry_fsdata {
	/*
	 * here will go fields filled by ->lookup() to speedup next
	 * create/unlink, like blocknr of znode with stat-data, or key of
	 * stat-data.
	 */
	struct de_location dec;
	int stateless;		/* created through reiser4_decode_fh, needs special
				 * treatment in readdir.
*/
};

/* slab cache set-up/tear-down and accessors for reiser4_dentry_fsdata */
extern int reiser4_init_dentry_fsdata(void);
extern void reiser4_done_dentry_fsdata(void);
extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
extern void reiser4_free_dentry_fsdata(struct dentry *dentry);

/**
 * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
 *
 * This is allocated dynamically and released in inode->i_fop->release
 */
typedef struct reiser4_file_fsdata {
	/*
	 * pointer back to the struct file which this reiser4_file_fsdata is
	 * part of
	 */
	struct file *back;
	/*
	 * detached cursor for stateless readdir. While this is non-NULL,
	 * detaching the fsdata from a file does not free it.
	 */
	struct dir_cursor *cursor;
	/*
	 * We need both directory and regular file parts here, because there
	 * are file system objects that are files and directories.
	 */
	struct {
		/*
		 * position in directory. It is updated each time directory is
		 * modified
		 */
		struct readdir_pos readdir;
		/* head of this list is reiser4_inode->lists.readdir_list */
		struct list_head linkage;
	} dir;
	/* hints to speed up operations with regular files: read and write. */
	struct {
		hint_t hint;
	} reg;
	/* readahead state; its max_window_size is set when the fsdata is
	   created */
	struct reiser4_file_ra_state ra1;

} reiser4_file_fsdata;

/* slab cache set-up/tear-down and accessors for reiser4_file_fsdata */
extern int reiser4_init_file_fsdata(void);
extern void reiser4_done_file_fsdata(void);
extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
extern void reiser4_free_file_fsdata(struct file *);

/*
 * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
 * used to address problem reiser4 has with readdir accesses via NFS. See
 * plugin/file_ops_readdir.c for more details.
 */
/* key under which a cursor is looked up in the d_cursor hash table */
struct d_cursor_key{
	/* cursor id; lookups derive it from the file position
	   (f_pos >> CID_SHIFT) */
	__u16 cid;
	/* object id of the directory inode */
	__u64 oid;
};

/*
 * define structures d_cursor_hash_table d_cursor_hash_link which are used to
 * maintain hash table of dir_cursor-s in reiser4's super block
 */
typedef struct dir_cursor dir_cursor;
TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);

struct dir_cursor {
	/* number of users; a cursor with ref == 0 sits on the unused list
	   (see ->alist) */
	int ref;
	/* readdir state carried by this cursor; it is installed as
	   file->private_data when the cursor is attached to a file */
	reiser4_file_fsdata *fsdata;

	/* link to reiser4 super block hash table of cursors */
	d_cursor_hash_link hash;

	/*
	 * this is to link cursors to reiser4 super block's radix tree of
	 * cursors if there are more than one cursor of the same objectid
	 */
	struct list_head list;
	struct d_cursor_key key;
	struct d_cursor_info *info;
	/* list of unused cursors */
	struct list_head alist;
};

extern int reiser4_init_d_cursor(void);
extern void reiser4_done_d_cursor(void);

extern int reiser4_init_super_d_info(struct super_block *);
extern void reiser4_done_super_d_info(struct super_block *);

extern loff_t reiser4_get_dir_fpos(struct file *);
extern int reiser4_attach_fsdata(struct file *, struct inode *);
extern void reiser4_detach_fsdata(struct file *);

/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
   more details */
void reiser4_dispose_cursors(struct inode *inode);
void reiser4_load_cursors(struct inode *inode);
void reiser4_kill_cursors(struct inode *inode);
void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
			     int offset, int adj);

/*
 * this structure is embedded to reiser4_super_info_data. It maintains d_cursors
 * (detached readdir state). See plugin/file_ops_readdir.c for more details.
-+ */ -+struct d_cursor_info { -+ d_cursor_hash_table table; -+ struct radix_tree_root tree; -+}; -+ -+/* spinlock protecting readdir cursors */ -+extern spinlock_t d_lock; -+ -+/* __REISER4_FSDATA_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/init_super.c linux-2.6.24/fs/reiser4/init_super.c ---- linux-2.6.24.orig/fs/reiser4/init_super.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/init_super.c 2008-01-25 11:39:06.932206658 +0300 -@@ -0,0 +1,751 @@ -+/* Copyright by Hans Reiser, 2003 */ -+ -+#include "super.h" -+#include "inode.h" -+#include "plugin/plugin_set.h" -+ -+#include -+ -+/** -+ * init_fs_info - allocate reiser4 specific super block -+ * @super: super block of filesystem -+ * -+ * Allocates and initialize reiser4_super_info_data, attaches it to -+ * super->s_fs_info, initializes structures maintaining d_cursor-s. -+ */ -+int reiser4_init_fs_info(struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = kzalloc(sizeof(reiser4_super_info_data), -+ reiser4_ctx_gfp_mask_get()); -+ if (!sbinfo) -+ return RETERR(-ENOMEM); -+ -+ super->s_fs_info = sbinfo; -+ super->s_op = NULL; -+ -+ ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes)); -+ ON_DEBUG(spin_lock_init(&sbinfo->all_guard)); -+ -+ mutex_init(&sbinfo->delete_mutex); -+ spin_lock_init(&(sbinfo->guard)); -+ -+ /* initialize per-super-block d_cursor resources */ -+ reiser4_init_super_d_info(super); -+ -+ return 0; -+} -+ -+/** -+ * reiser4_done_fs_info - free reiser4 specific super block -+ * @super: super block of filesystem -+ * -+ * Performs some sanity checks, releases structures maintaining d_cursor-s, -+ * frees reiser4_super_info_data. 
 */
void reiser4_done_fs_info(struct super_block *super)
{
	assert("zam-990", super->s_fs_info != NULL);

	/* release per-super-block d_cursor resources */
	reiser4_done_super_d_info(super);

	/* make sure that there are no jnodes left and that the current
	   context has no atom */
	assert("", list_empty(&get_super_private(super)->all_jnodes));
	assert("", get_current_context()->trans->atom == NULL);
	reiser4_check_block_counters(super);
	/* finally release the private super block itself and clear the
	   pointer to it */
	kfree(super->s_fs_info);
	super->s_fs_info = NULL;
}

/* type of option parseable by parse_option() */
typedef enum {
	/* value of option is arbitrary string */
	OPT_STRING,

	/*
	 * option specifies bit in a bitmask. When option is set - bit in
	 * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
	 * dont_load_bitmap, atomic_write.
	 */
	OPT_BIT,

	/*
	 * value of option should conform to sscanf() format. Examples are
	 * tmgr.atom_max_size=N, tmgr.atom_max_age=N
	 */
	OPT_FORMAT,

	/*
	 * option can take one of predefined values. Example is onerror=panic or
	 * onerror=remount-ro
	 */
	OPT_ONEOF,
} opt_type_t;

/* compiled out: per-bit descriptions for a bitmask option */
#if 0
struct opt_bitmask_bit {
	const char *bit_name;
	int bit_nr;
};
#endif

/* description of option parseable by parse_option() */
struct opt_desc {
	/* option name.

	   parsed portion of string has a form "name=value".
-+ */ -+ const char *name; -+ /* type of option */ -+ opt_type_t type; -+ union { -+ /* where to store value of string option (type == OPT_STRING) */ -+ char **string; -+ /* description of bits for bit option (type == OPT_BIT) */ -+ struct { -+ int nr; -+ void *addr; -+ } bit; -+ /* description of format and targets for format option (type -+ == OPT_FORMAT) */ -+ struct { -+ const char *format; -+ int nr_args; -+ void *arg1; -+ void *arg2; -+ void *arg3; -+ void *arg4; -+ } f; -+ struct { -+ int *result; -+ const char *list[10]; -+ } oneof; -+ struct { -+ void *addr; -+ int nr_bits; -+ //struct opt_bitmask_bit *bits; -+ } bitmask; -+ } u; -+}; -+ -+/** -+ * parse_option - parse one option -+ * @opt_strin: starting point of parsing -+ * @opt: option description -+ * -+ * foo=bar, -+ * ^ ^ ^ -+ * | | +-- replaced to '\0' -+ * | +-- val_start -+ * +-- opt_string -+ * Figures out option type and handles option correspondingly. -+ */ -+static int parse_option(char *opt_string, struct opt_desc *opt) -+{ -+ char *val_start; -+ int result; -+ const char *err_msg; -+ -+ /* NOTE-NIKITA think about using lib/cmdline.c functions here. 
*/ -+ -+ val_start = strchr(opt_string, '='); -+ if (val_start != NULL) { -+ *val_start = '\0'; -+ ++val_start; -+ } -+ -+ err_msg = NULL; -+ result = 0; -+ switch (opt->type) { -+ case OPT_STRING: -+ if (val_start == NULL) { -+ err_msg = "String arg missing"; -+ result = RETERR(-EINVAL); -+ } else -+ *opt->u.string = val_start; -+ break; -+ case OPT_BIT: -+ if (val_start != NULL) -+ err_msg = "Value ignored"; -+ else -+ set_bit(opt->u.bit.nr, opt->u.bit.addr); -+ break; -+ case OPT_FORMAT: -+ if (val_start == NULL) { -+ err_msg = "Formatted arg missing"; -+ result = RETERR(-EINVAL); -+ break; -+ } -+ if (sscanf(val_start, opt->u.f.format, -+ opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3, -+ opt->u.f.arg4) != opt->u.f.nr_args) { -+ err_msg = "Wrong conversion"; -+ result = RETERR(-EINVAL); -+ } -+ break; -+ case OPT_ONEOF: -+ { -+ int i = 0; -+ -+ if (val_start == NULL) { -+ err_msg = "Value is missing"; -+ result = RETERR(-EINVAL); -+ break; -+ } -+ err_msg = "Wrong option value"; -+ result = RETERR(-EINVAL); -+ while (opt->u.oneof.list[i]) { -+ if (!strcmp(opt->u.oneof.list[i], val_start)) { -+ result = 0; -+ err_msg = NULL; -+ *opt->u.oneof.result = i; -+ break; -+ } -+ i++; -+ } -+ break; -+ } -+ default: -+ wrong_return_value("nikita-2100", "opt -> type"); -+ break; -+ } -+ if (err_msg != NULL) { -+ warning("nikita-2496", "%s when parsing option \"%s%s%s\"", -+ err_msg, opt->name, val_start ? "=" : "", -+ val_start ? : ""); -+ } -+ return result; -+} -+ -+/** -+ * parse_options - parse reiser4 mount options -+ * @opt_string: starting point -+ * @opts: array of option description -+ * @nr_opts: number of elements in @opts -+ * -+ * Parses comma separated list of reiser4 mount options. 
-+ */ -+static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts) -+{ -+ int result; -+ -+ result = 0; -+ while ((result == 0) && opt_string && *opt_string) { -+ int j; -+ char *next; -+ -+ next = strchr(opt_string, ','); -+ if (next != NULL) { -+ *next = '\0'; -+ ++next; -+ } -+ for (j = 0; j < nr_opts; ++j) { -+ if (!strncmp(opt_string, opts[j].name, -+ strlen(opts[j].name))) { -+ result = parse_option(opt_string, &opts[j]); -+ break; -+ } -+ } -+ if (j == nr_opts) { -+ warning("nikita-2307", "Unrecognized option: \"%s\"", -+ opt_string); -+ /* traditionally, -EINVAL is returned on wrong mount -+ option */ -+ result = RETERR(-EINVAL); -+ } -+ opt_string = next; -+ } -+ return result; -+} -+ -+#define NUM_OPT( label, fmt, addr ) \ -+ { \ -+ .name = ( label ), \ -+ .type = OPT_FORMAT, \ -+ .u = { \ -+ .f = { \ -+ .format = ( fmt ), \ -+ .nr_args = 1, \ -+ .arg1 = ( addr ), \ -+ .arg2 = NULL, \ -+ .arg3 = NULL, \ -+ .arg4 = NULL \ -+ } \ -+ } \ -+ } -+ -+#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field ) -+ -+#define BIT_OPT(label, bitnr) \ -+ { \ -+ .name = label, \ -+ .type = OPT_BIT, \ -+ .u = { \ -+ .bit = { \ -+ .nr = bitnr, \ -+ .addr = &sbinfo->fs_flags \ -+ } \ -+ } \ -+ } -+ -+#define MAX_NR_OPTIONS (30) -+ -+/** -+ * reiser4_init_super_data - initialize reiser4 private super block -+ * @super: super block to initialize -+ * @opt_string: list of reiser4 mount options -+ * -+ * Sets various reiser4 parameters to default values. Parses mount options and -+ * overwrites default settings. 
-+ */ -+int reiser4_init_super_data(struct super_block *super, char *opt_string) -+{ -+ int result; -+ struct opt_desc *opts, *p; -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ -+ /* initialize super, export, dentry operations */ -+ sbinfo->ops.super = reiser4_super_operations; -+ sbinfo->ops.export = reiser4_export_operations; -+ sbinfo->ops.dentry = reiser4_dentry_operations; -+ super->s_op = &sbinfo->ops.super; -+ super->s_export_op = &sbinfo->ops.export; -+ -+ /* initialize transaction manager parameters to default values */ -+ sbinfo->tmgr.atom_max_size = totalram_pages / 4; -+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ; -+ sbinfo->tmgr.atom_min_size = 256; -+ sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS; -+ -+ /* initialize cbk cache parameter */ -+ sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS; -+ -+ /* initialize flush parameters */ -+ sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD; -+ sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE; -+ sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD; -+ sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES; -+ -+ sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE; -+ -+ /* preliminary tree initializations */ -+ sbinfo->tree.super = super; -+ sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS; -+ sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS; -+ sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS; -+ sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS; -+ rwlock_init(&(sbinfo->tree.tree_lock)); -+ spin_lock_init(&(sbinfo->tree.epoch_lock)); -+ -+ /* initialize default readahead params */ -+ sbinfo->ra_params.max = num_physpages / 4; -+ sbinfo->ra_params.flags = 0; -+ -+ /* allocate memory for structure describing reiser4 mount options */ -+ opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS, -+ reiser4_ctx_gfp_mask_get()); -+ if (opts == NULL) -+ return RETERR(-ENOMEM); -+ -+ /* initialize structure describing reiser4 mount 
options */ -+ p = opts; -+ -+#if REISER4_DEBUG -+# define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) { \ -+ warning ("zam-1046", "opt array is overloaded"); break; \ -+ } -+#else -+# define OPT_ARRAY_CHECK noop -+#endif -+ -+#define PUSH_OPT(...) \ -+do { \ -+ struct opt_desc o = __VA_ARGS__; \ -+ OPT_ARRAY_CHECK; \ -+ *p ++ = o; \ -+} while (0) -+ -+#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format)) -+#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit)) -+ -+ /* -+ * tmgr.atom_max_size=N -+ * Atoms containing more than N blocks will be forced to commit. N is -+ * decimal. -+ */ -+ PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u"); -+ /* -+ * tmgr.atom_max_age=N -+ * Atoms older than N seconds will be forced to commit. N is decimal. -+ */ -+ PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u"); -+ /* -+ * tmgr.atom_min_size=N -+ * In committing an atom to free dirty pages, force the atom less than -+ * N in size to fuse with another one. -+ */ -+ PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u"); -+ /* -+ * tmgr.atom_max_flushers=N -+ * limit of concurrent flushers for one atom. 0 means no limit. -+ */ -+ PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u"); -+ /* -+ * tree.cbk_cache_slots=N -+ * Number of slots in the cbk cache. -+ */ -+ PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u"); -+ /* -+ * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty -+ * leaf-level blocks it will force them to be relocated. -+ */ -+ PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u"); -+ /* -+ * If flush finds can find a block allocation closer than at most -+ * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that -+ * position. -+ */ -+ PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u"); -+ /* -+ * If we have written this much or more blocks before encountering busy -+ * jnode in flush list - abort flushing hoping that next time we get -+ * called this jnode will be clean already, and we will save some -+ * seeks. 
-+ */ -+ PUSH_SB_FIELD_OPT(flush.written_threshold, "%u"); -+ /* The maximum number of nodes to scan left on a level during flush. */ -+ PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u"); -+ /* preferred IO size */ -+ PUSH_SB_FIELD_OPT(optimal_io_size, "%u"); -+ /* carry flags used for insertion of new nodes */ -+ PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u"); -+ /* carry flags used for insertion of new extents */ -+ PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u"); -+ /* carry flags used for paste operations */ -+ PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u"); -+ /* carry flags used for insert operations */ -+ PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u"); -+ -+#ifdef CONFIG_REISER4_BADBLOCKS -+ /* -+ * Alternative master superblock location in case if it's original -+ * location is not writeable/accessable. This is offset in BYTES. -+ */ -+ PUSH_SB_FIELD_OPT(altsuper, "%lu"); -+#endif -+ -+ /* turn on BSD-style gid assignment */ -+ PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID); -+ /* turn on 32 bit times */ -+ PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES); -+ /* -+ * Don't load all bitmap blocks at mount time, it is useful for -+ * machines with tiny RAM and large disks. -+ */ -+ PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP); -+ /* disable transaction commits during write() */ -+ PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE); -+ /* disable use of write barriers in the reiser4 log writer. 
*/ -+ PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER); -+ -+ PUSH_OPT( -+ { -+ /* -+ * tree traversal readahead parameters: -+ * -o readahead:MAXNUM:FLAGS -+ * MAXNUM - max number fo nodes to request readahead for: -1UL -+ * will set it to max_sane_readahead() -+ * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS, -+ * CONTINUE_ON_PRESENT -+ */ -+ .name = "readahead", -+ .type = OPT_FORMAT, -+ .u = { -+ .f = { -+ .format = "%u:%u", -+ .nr_args = 2, -+ .arg1 = &sbinfo->ra_params.max, -+ .arg2 = &sbinfo->ra_params.flags, -+ .arg3 = NULL, -+ .arg4 = NULL -+ } -+ } -+ } -+ ); -+ -+ /* What to do in case of fs error */ -+ PUSH_OPT( -+ { -+ .name = "onerror", -+ .type = OPT_ONEOF, -+ .u = { -+ .oneof = { -+ .result = &sbinfo->onerror, -+ .list = { -+ "panic", "remount-ro", NULL -+ }, -+ } -+ } -+ } -+ ); -+ -+ /* modify default settings to values set by mount options */ -+ result = parse_options(opt_string, opts, p - opts); -+ kfree(opts); -+ if (result != 0) -+ return result; -+ -+ /* correct settings to sanity values */ -+ sbinfo->tmgr.atom_max_age *= HZ; -+ if (sbinfo->tmgr.atom_max_age <= 0) -+ /* overflow */ -+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE; -+ -+ /* round optimal io size up to 512 bytes */ -+ sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS; -+ sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS; -+ if (sbinfo->optimal_io_size == 0) { -+ warning("nikita-2497", "optimal_io_size is too small"); -+ return RETERR(-EINVAL); -+ } -+ return result; -+} -+ -+/** -+ * reiser4_init_read_super - read reiser4 master super block -+ * @super: super block to fill -+ * @silent: if 0 - print warnings -+ * -+ * Reads reiser4 master super block either from predefined location or from -+ * location specified by altsuper mount option, initializes disk format plugin. 
-+ */ -+int reiser4_init_read_super(struct super_block *super, int silent) -+{ -+ struct buffer_head *super_bh; -+ struct reiser4_master_sb *master_sb; -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ unsigned long blocksize; -+ -+ read_super_block: -+#ifdef CONFIG_REISER4_BADBLOCKS -+ if (sbinfo->altsuper) -+ /* -+ * read reiser4 master super block at position specified by -+ * mount option -+ */ -+ super_bh = sb_bread(super, -+ (sector_t)(sbinfo->altsuper / super->s_blocksize)); -+ else -+#endif -+ /* read reiser4 master super block at 16-th 4096 block */ -+ super_bh = sb_bread(super, -+ (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize)); -+ if (!super_bh) -+ return RETERR(-EIO); -+ -+ master_sb = (struct reiser4_master_sb *)super_bh->b_data; -+ /* check reiser4 magic string */ -+ if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING, -+ sizeof(REISER4_SUPER_MAGIC_STRING))) { -+ /* reiser4 master super block contains filesystem blocksize */ -+ blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize)); -+ -+ if (blocksize != PAGE_CACHE_SIZE) { -+ /* -+ * currenly reiser4's blocksize must be equal to -+ * pagesize -+ */ -+ if (!silent) -+ warning("nikita-2609", -+ "%s: wrong block size %ld\n", super->s_id, -+ blocksize); -+ brelse(super_bh); -+ return RETERR(-EINVAL); -+ } -+ if (blocksize != super->s_blocksize) { -+ /* -+ * filesystem uses different blocksize. 
Reread master -+ * super block with correct blocksize -+ */ -+ brelse(super_bh); -+ if (!sb_set_blocksize(super, (int)blocksize)) -+ return RETERR(-EINVAL); -+ goto read_super_block; -+ } -+ -+ sbinfo->df_plug = -+ disk_format_plugin_by_id( -+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id))); -+ if (sbinfo->df_plug == NULL) { -+ if (!silent) -+ warning("nikita-26091", -+ "%s: unknown disk format plugin %d\n", -+ super->s_id, -+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id))); -+ brelse(super_bh); -+ return RETERR(-EINVAL); -+ } -+ sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap)); -+ brelse(super_bh); -+ return 0; -+ } -+ -+ /* there is no reiser4 on the device */ -+ if (!silent) -+ warning("nikita-2608", -+ "%s: wrong master super block magic", super->s_id); -+ brelse(super_bh); -+ return RETERR(-EINVAL); -+} -+ -+static struct { -+ reiser4_plugin_type type; -+ reiser4_plugin_id id; -+} default_plugins[PSET_LAST] = { -+ [PSET_FILE] = { -+ .type = REISER4_FILE_PLUGIN_TYPE, -+ .id = UNIX_FILE_PLUGIN_ID -+ }, -+ [PSET_DIR] = { -+ .type = REISER4_DIR_PLUGIN_TYPE, -+ .id = HASHED_DIR_PLUGIN_ID -+ }, -+ [PSET_HASH] = { -+ .type = REISER4_HASH_PLUGIN_TYPE, -+ .id = R5_HASH_ID -+ }, -+ [PSET_FIBRATION] = { -+ .type = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_DOT_O -+ }, -+ [PSET_PERM] = { -+ .type = REISER4_PERM_PLUGIN_TYPE, -+ .id = NULL_PERM_ID -+ }, -+ [PSET_FORMATTING] = { -+ .type = REISER4_FORMATTING_PLUGIN_TYPE, -+ .id = SMALL_FILE_FORMATTING_ID -+ }, -+ [PSET_SD] = { -+ .type = REISER4_ITEM_PLUGIN_TYPE, -+ .id = STATIC_STAT_DATA_ID -+ }, -+ [PSET_DIR_ITEM] = { -+ .type = REISER4_ITEM_PLUGIN_TYPE, -+ .id = COMPOUND_DIR_ID -+ }, -+ [PSET_CIPHER] = { -+ .type = REISER4_CIPHER_PLUGIN_TYPE, -+ .id = NONE_CIPHER_ID -+ }, -+ [PSET_DIGEST] = { -+ .type = REISER4_DIGEST_PLUGIN_TYPE, -+ .id = SHA256_32_DIGEST_ID -+ }, -+ [PSET_COMPRESSION] = { -+ .type = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .id = LZO1_COMPRESSION_ID -+ }, -+ 
[PSET_COMPRESSION_MODE] = { -+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = CONVX_COMPRESSION_MODE_ID -+ }, -+ [PSET_CLUSTER] = { -+ .type = REISER4_CLUSTER_PLUGIN_TYPE, -+ .id = CLUSTER_64K_ID -+ }, -+ [PSET_CREATE] = { -+ .type = REISER4_FILE_PLUGIN_TYPE, -+ .id = UNIX_FILE_PLUGIN_ID -+ } -+}; -+ -+/* access to default plugin table */ -+reiser4_plugin *get_default_plugin(pset_member memb) -+{ -+ return plugin_by_id(default_plugins[memb].type, -+ default_plugins[memb].id); -+} -+ -+/** -+ * reiser4_init_root_inode - obtain inode of root directory -+ * @super: super block of filesystem -+ * -+ * Obtains inode of root directory (reading it from disk), initializes plugin -+ * set it was not initialized. -+ */ -+int reiser4_init_root_inode(struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ struct inode *inode; -+ int result = 0; -+ -+ inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0); -+ if (IS_ERR(inode)) -+ return RETERR(PTR_ERR(inode)); -+ -+ super->s_root = d_alloc_root(inode); -+ if (!super->s_root) { -+ iput(inode); -+ return RETERR(-ENOMEM); -+ } -+ -+ super->s_root->d_op = &sbinfo->ops.dentry; -+ -+ if (!is_inode_loaded(inode)) { -+ pset_member memb; -+ plugin_set *pset; -+ -+ pset = reiser4_inode_data(inode)->pset; -+ for (memb = 0; memb < PSET_LAST; ++memb) { -+ -+ if (aset_get(pset, memb) != NULL) -+ continue; -+ -+ result = grab_plugin_pset(inode, NULL, memb); -+ if (result != 0) -+ break; -+ -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+ } -+ -+ if (result == 0) { -+ if (REISER4_DEBUG) { -+ for (memb = 0; memb < PSET_LAST; ++memb) -+ assert("nikita-3500", -+ aset_get(pset, memb) != NULL); -+ } -+ } else -+ warning("nikita-3448", "Cannot set plugins of root: %i", -+ result); -+ reiser4_iget_complete(inode); -+ -+ /* As the default pset kept in the root dir may has been changed -+ (length is unknown), call update_sd. 
*/ -+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) { -+ result = reiser4_grab_space( -+ inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+ -+ if (result == 0) -+ result = reiser4_update_sd(inode); -+ -+ all_grabbed2free(); -+ } -+ } -+ -+ super->s_maxbytes = MAX_LFS_FILESIZE; -+ return result; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/inode.c linux-2.6.24/fs/reiser4/inode.c ---- linux-2.6.24.orig/fs/reiser4/inode.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/inode.c 2008-01-25 11:39:06.932206658 +0300 -@@ -0,0 +1,709 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Inode specific operations. */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "kassign.h" -+#include "coord.h" -+#include "seal.h" -+#include "dscale.h" -+#include "plugin/item/item.h" -+#include "plugin/security/perm.h" -+#include "plugin/plugin.h" -+#include "plugin/object.h" -+#include "znode.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include /* for struct super_block, address_space */ -+ -+/* return reiser4 internal tree which inode belongs to */ -+/* Audited by: green(2002.06.17) */ -+reiser4_tree *reiser4_tree_by_inode(const struct inode *inode /* inode queried */ ) -+{ -+ assert("nikita-256", inode != NULL); -+ assert("nikita-257", inode->i_sb != NULL); -+ return reiser4_get_tree(inode->i_sb); -+} -+ -+/* return reiser4-specific inode flags */ -+static inline unsigned long *inode_flags(const struct inode *const inode) -+{ -+ assert("nikita-2842", inode != NULL); -+ return &reiser4_inode_data(inode)->flags; -+} -+ -+/* set reiser4-specific flag @f in @inode */ -+void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f) -+{ -+ 
assert("nikita-2248", inode != NULL); -+ set_bit((int)f, inode_flags(inode)); -+} -+ -+/* clear reiser4-specific flag @f in @inode */ -+void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f) -+{ -+ assert("nikita-2250", inode != NULL); -+ clear_bit((int)f, inode_flags(inode)); -+} -+ -+/* true if reiser4-specific flag @f is set in @inode */ -+int reiser4_inode_get_flag(const struct inode *inode, -+ reiser4_file_plugin_flags f) -+{ -+ assert("nikita-2251", inode != NULL); -+ return test_bit((int)f, inode_flags(inode)); -+} -+ -+/* convert oid to inode number */ -+ino_t oid_to_ino(oid_t oid) -+{ -+ return (ino_t) oid; -+} -+ -+/* convert oid to user visible inode number */ -+ino_t oid_to_uino(oid_t oid) -+{ -+ /* reiser4 object is uniquely identified by oid which is 64 bit -+ quantity. Kernel in-memory inode is indexed (in the hash table) by -+ 32 bit i_ino field, but this is not a problem, because there is a -+ way to further distinguish inodes with identical inode numbers -+ (find_actor supplied to iget()). -+ -+ But user space expects unique 32 bit inode number. Obviously this -+ is impossible. Work-around is to somehow hash oid into user visible -+ inode number. -+ */ -+ oid_t max_ino = (ino_t) ~ 0; -+ -+ if (REISER4_INO_IS_OID || (oid <= max_ino)) -+ return oid; -+ else -+ /* this is remotely similar to algorithm used to find next pid -+ to use for process: after wrap-around start from some -+ offset rather than from 0. Idea is that there are some long -+ living objects with which we don't want to collide. -+ */ -+ return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1)); -+} -+ -+/* check that "inode" is on reiser4 file-system */ -+int is_reiser4_inode(const struct inode *inode /* inode queried */ ) -+{ -+ return inode != NULL && is_reiser4_super(inode->i_sb); -+} -+ -+/* Maximal length of a name that can be stored in directory @inode. -+ -+ This is used in check during file creation and lookup. 
*/ -+int reiser4_max_filename_len(const struct inode *inode /* inode queried */ ) -+{ -+ assert("nikita-287", is_reiser4_inode(inode)); -+ assert("nikita-1710", inode_dir_item_plugin(inode)); -+ if (inode_dir_item_plugin(inode)->s.dir.max_name_len) -+ return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode); -+ else -+ return 255; -+} -+ -+#if REISER4_USE_COLLISION_LIMIT -+/* Maximal number of hash collisions for this directory. */ -+int max_hash_collisions(const struct inode *dir /* inode queried */ ) -+{ -+ assert("nikita-1711", dir != NULL); -+ return reiser4_inode_data(dir)->plugin.max_collisions; -+} -+#endif /* REISER4_USE_COLLISION_LIMIT */ -+ -+/* Install file, inode, and address_space operation on @inode, depending on -+ its mode. */ -+int setup_inode_ops(struct inode *inode /* inode to intialize */ , -+ reiser4_object_create_data * data /* parameters to create -+ * object */ ) -+{ -+ reiser4_super_info_data *sinfo; -+ file_plugin *fplug; -+ dir_plugin *dplug; -+ -+ fplug = inode_file_plugin(inode); -+ dplug = inode_dir_plugin(inode); -+ -+ sinfo = get_super_private(inode->i_sb); -+ -+ switch (inode->i_mode & S_IFMT) { -+ case S_IFSOCK: -+ case S_IFBLK: -+ case S_IFCHR: -+ case S_IFIFO: -+ { -+ dev_t rdev; /* to keep gcc happy */ -+ -+ assert("vs-46", fplug != NULL); -+ /* ugly hack with rdev */ -+ if (data == NULL) { -+ rdev = inode->i_rdev; -+ inode->i_rdev = 0; -+ } else -+ rdev = data->rdev; -+ inode->i_blocks = 0; -+ assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID); -+ inode->i_op = file_plugins[fplug->h.id].inode_ops; -+ /* initialize inode->i_fop and inode->i_rdev for block and char -+ devices */ -+ init_special_inode(inode, inode->i_mode, rdev); -+ /* all address space operations are null */ -+ inode->i_mapping->a_ops = -+ file_plugins[fplug->h.id].as_ops; -+ break; -+ } -+ case S_IFLNK: -+ assert("vs-46", fplug != NULL); -+ assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID); -+ inode->i_op = file_plugins[fplug->h.id].inode_ops; -+ 
inode->i_fop = NULL; -+ /* all address space operations are null */ -+ inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops; -+ break; -+ case S_IFDIR: -+ assert("vs-46", dplug != NULL); -+ assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID || -+ dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID)); -+ inode->i_op = dir_plugins[dplug->h.id].inode_ops; -+ inode->i_fop = dir_plugins[dplug->h.id].file_ops; -+ inode->i_mapping->a_ops = dir_plugins[dplug->h.id].as_ops; -+ break; -+ case S_IFREG: -+ assert("vs-46", fplug != NULL); -+ assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID || -+ fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ inode->i_op = file_plugins[fplug->h.id].inode_ops; -+ inode->i_fop = file_plugins[fplug->h.id].file_ops; -+ inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops; -+ break; -+ default: -+ warning("nikita-291", "wrong file mode: %o for %llu", -+ inode->i_mode, -+ (unsigned long long)get_inode_oid(inode)); -+ reiser4_make_bad_inode(inode); -+ return RETERR(-EINVAL); -+ } -+ return 0; -+} -+ -+/* Initialize inode from disk data. Called with inode locked. -+ Return inode locked. 
*/ -+static int init_inode(struct inode *inode /* inode to intialise */ , -+ coord_t * coord /* coord of stat data */ ) -+{ -+ int result; -+ item_plugin *iplug; -+ void *body; -+ int length; -+ reiser4_inode *state; -+ -+ assert("nikita-292", coord != NULL); -+ assert("nikita-293", inode != NULL); -+ -+ coord_clear_iplug(coord); -+ result = zload(coord->node); -+ if (result) -+ return result; -+ iplug = item_plugin_by_coord(coord); -+ body = item_body_by_coord(coord); -+ length = item_length_by_coord(coord); -+ -+ assert("nikita-295", iplug != NULL); -+ assert("nikita-296", body != NULL); -+ assert("nikita-297", length > 0); -+ -+ /* inode is under I_LOCK now */ -+ -+ state = reiser4_inode_data(inode); -+ /* call stat-data plugin method to load sd content into inode */ -+ result = iplug->s.sd.init_inode(inode, body, length); -+ set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug)); -+ if (result == 0) { -+ result = setup_inode_ops(inode, NULL); -+ if (result == 0 && inode->i_sb->s_root && -+ inode->i_sb->s_root->d_inode) -+ result = finish_pset(inode); -+ } -+ zrelse(coord->node); -+ return result; -+} -+ -+/* read `inode' from the disk. This is what was previously in -+ reiserfs_read_inode2(). -+ -+ Must be called with inode locked. Return inode still locked. 
-+*/ -+static int read_inode(struct inode *inode /* inode to read from disk */ , -+ const reiser4_key * key /* key of stat data */ , -+ int silent) -+{ -+ int result; -+ lock_handle lh; -+ reiser4_inode *info; -+ coord_t coord; -+ -+ assert("nikita-298", inode != NULL); -+ assert("nikita-1945", !is_inode_loaded(inode)); -+ -+ info = reiser4_inode_data(inode); -+ assert("nikita-300", info->locality_id != 0); -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ /* locate stat-data in a tree and return znode locked */ -+ result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent); -+ assert("nikita-301", !is_inode_loaded(inode)); -+ if (result == 0) { -+ /* use stat-data plugin to load sd into inode. */ -+ result = init_inode(inode, &coord); -+ if (result == 0) { -+ /* initialize stat-data seal */ -+ spin_lock_inode(inode); -+ reiser4_seal_init(&info->sd_seal, &coord, key); -+ info->sd_coord = coord; -+ spin_unlock_inode(inode); -+ -+ /* call file plugin's method to initialize plugin -+ * specific part of inode */ -+ if (inode_file_plugin(inode)->init_inode_data) -+ inode_file_plugin(inode)->init_inode_data(inode, -+ NULL, -+ 0); -+ /* load detached directory cursors for stateless -+ * directory readers (NFS). */ -+ reiser4_load_cursors(inode); -+ -+ /* Check the opened inode for consistency. */ -+ result = -+ get_super_private(inode->i_sb)->df_plug-> -+ check_open(inode); -+ } -+ } -+ /* lookup_sd() doesn't release coord because we want znode -+ stay read-locked while stat-data fields are accessed in -+ init_inode() */ -+ done_lh(&lh); -+ -+ if (result != 0) -+ reiser4_make_bad_inode(inode); -+ return result; -+} -+ -+/* initialise new reiser4 inode being inserted into hash table. 
*/ -+static int init_locked_inode(struct inode *inode /* new inode */ , -+ void *opaque /* key of stat data passed to the -+ * iget5_locked as cookie */ ) -+{ -+ reiser4_key *key; -+ -+ assert("nikita-1995", inode != NULL); -+ assert("nikita-1996", opaque != NULL); -+ key = opaque; -+ set_inode_oid(inode, get_key_objectid(key)); -+ reiser4_inode_data(inode)->locality_id = get_key_locality(key); -+ return 0; -+} -+ -+/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked(). -+ -+ This function is called by iget5_locked() to distinguish reiser4 inodes -+ having the same inode numbers. Such inodes can only exist due to some error -+ condition. One of them should be bad. Inodes with identical inode numbers -+ (objectids) are distinguished by their packing locality. -+ -+*/ -+static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to -+ * check */ , -+ void *opaque /* "cookie" passed to -+ * iget5_locked(). This is stat data -+ * key */ ) -+{ -+ reiser4_key *key; -+ -+ key = opaque; -+ return -+ /* oid is unique, so first term is enough, actually. */ -+ get_inode_oid(inode) == get_key_objectid(key) && -+ /* -+ * also, locality should be checked, but locality is stored in -+ * the reiser4-specific part of the inode, and actor can be -+ * called against arbitrary inode that happened to be in this -+ * hash chain. Hence we first have to check that this is -+ * reiser4 inode at least. is_reiser4_inode() is probably too -+ * early to call, as inode may have ->i_op not yet -+ * initialised. -+ */ -+ is_reiser4_super(inode->i_sb) && -+ /* -+ * usually objectid is unique, but pseudo files use counter to -+ * generate objectid. All pseudo files are placed into special -+ * (otherwise unused) locality. 
-+ */ -+ reiser4_inode_data(inode)->locality_id == get_key_locality(key); -+} -+ -+/* hook for kmem_cache_create */ -+void loading_init_once(reiser4_inode * info) -+{ -+ mutex_init(&info->loading); -+} -+ -+/* for reiser4_alloc_inode */ -+void loading_alloc(reiser4_inode * info) -+{ -+ assert("vs-1717", !mutex_is_locked(&info->loading)); -+} -+ -+/* for reiser4_destroy */ -+void loading_destroy(reiser4_inode * info) -+{ -+ assert("vs-1717a", !mutex_is_locked(&info->loading)); -+} -+ -+static void loading_begin(reiser4_inode * info) -+{ -+ mutex_lock(&info->loading); -+} -+ -+static void loading_end(reiser4_inode * info) -+{ -+ mutex_unlock(&info->loading); -+} -+ -+/** -+ * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary -+ * @super: super block of filesystem -+ * @key: key of inode's stat-data -+ * @silent: -+ * -+ * This is our helper function a la iget(). This is be called by -+ * lookup_common() and reiser4_read_super(). Return inode locked or error -+ * encountered. -+ */ -+struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key, -+ int silent) -+{ -+ struct inode *inode; -+ int result; -+ reiser4_inode *info; -+ -+ assert("nikita-302", super != NULL); -+ assert("nikita-303", key != NULL); -+ -+ result = 0; -+ -+ /* call iget(). Our ->read_inode() is dummy, so this will either -+ find inode in cache or return uninitialised inode */ -+ inode = iget5_locked(super, -+ (unsigned long)get_key_objectid(key), -+ reiser4_inode_find_actor, -+ init_locked_inode, (reiser4_key *) key); -+ if (inode == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ if (is_bad_inode(inode)) { -+ warning("nikita-304", "Bad inode found"); -+ reiser4_print_key("key", key); -+ iput(inode); -+ return ERR_PTR(RETERR(-EIO)); -+ } -+ -+ info = reiser4_inode_data(inode); -+ -+ /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully -+ loaded and initialized inode from just allocated inode. 
If -+ REISER4_LOADED bit is not set, reiser4_iget() completes loading under -+ info->loading. The place in reiser4 which uses not initialized inode -+ is the reiser4 repacker, see repacker-related functions in -+ plugin/item/extent.c */ -+ if (!is_inode_loaded(inode)) { -+ loading_begin(info); -+ if (!is_inode_loaded(inode)) { -+ /* locking: iget5_locked returns locked inode */ -+ assert("nikita-1941", !is_inode_loaded(inode)); -+ assert("nikita-1949", -+ reiser4_inode_find_actor(inode, -+ (reiser4_key *) key)); -+ /* now, inode has objectid as ->i_ino and locality in -+ reiser4-specific part. This is enough for -+ read_inode() to read stat data from the disk */ -+ result = read_inode(inode, key, silent); -+ } else -+ loading_end(info); -+ } -+ -+ if (inode->i_state & I_NEW) -+ unlock_new_inode(inode); -+ -+ if (is_bad_inode(inode)) { -+ assert("vs-1717", result != 0); -+ loading_end(info); -+ iput(inode); -+ inode = ERR_PTR(result); -+ } else if (REISER4_DEBUG) { -+ reiser4_key found_key; -+ -+ assert("vs-1717", result == 0); -+ build_sd_key(inode, &found_key); -+ if (!keyeq(&found_key, key)) { -+ warning("nikita-305", "Wrong key in sd"); -+ reiser4_print_key("sought for", key); -+ reiser4_print_key("found", &found_key); -+ } -+ if (inode->i_nlink == 0) { -+ warning("nikita-3559", "Unlinked inode found: %llu\n", -+ (unsigned long long)get_inode_oid(inode)); -+ } -+ } -+ return inode; -+} -+ -+/* reiser4_iget() may return not fully initialized inode, this function should -+ * be called after one completes reiser4 inode initializing. 
*/ -+void reiser4_iget_complete(struct inode *inode) -+{ -+ assert("zam-988", is_reiser4_inode(inode)); -+ -+ if (!is_inode_loaded(inode)) { -+ reiser4_inode_set_flag(inode, REISER4_LOADED); -+ loading_end(reiser4_inode_data(inode)); -+ } -+} -+ -+void reiser4_make_bad_inode(struct inode *inode) -+{ -+ assert("nikita-1934", inode != NULL); -+ -+ /* clear LOADED bit */ -+ reiser4_inode_clr_flag(inode, REISER4_LOADED); -+ make_bad_inode(inode); -+ return; -+} -+ -+file_plugin *inode_file_plugin(const struct inode * inode) -+{ -+ assert("nikita-1997", inode != NULL); -+ return reiser4_inode_data(inode)->pset->file; -+} -+ -+dir_plugin *inode_dir_plugin(const struct inode * inode) -+{ -+ assert("nikita-1998", inode != NULL); -+ return reiser4_inode_data(inode)->pset->dir; -+} -+ -+formatting_plugin *inode_formatting_plugin(const struct inode * inode) -+{ -+ assert("nikita-2000", inode != NULL); -+ return reiser4_inode_data(inode)->pset->formatting; -+} -+ -+hash_plugin *inode_hash_plugin(const struct inode * inode) -+{ -+ assert("nikita-2001", inode != NULL); -+ return reiser4_inode_data(inode)->pset->hash; -+} -+ -+fibration_plugin *inode_fibration_plugin(const struct inode * inode) -+{ -+ assert("nikita-2001", inode != NULL); -+ return reiser4_inode_data(inode)->pset->fibration; -+} -+ -+cipher_plugin *inode_cipher_plugin(const struct inode * inode) -+{ -+ assert("edward-36", inode != NULL); -+ return reiser4_inode_data(inode)->pset->cipher; -+} -+ -+compression_plugin *inode_compression_plugin(const struct inode * inode) -+{ -+ assert("edward-37", inode != NULL); -+ return reiser4_inode_data(inode)->pset->compression; -+} -+ -+compression_mode_plugin *inode_compression_mode_plugin(const struct inode * -+ inode) -+{ -+ assert("edward-1330", inode != NULL); -+ return reiser4_inode_data(inode)->pset->compression_mode; -+} -+ -+cluster_plugin *inode_cluster_plugin(const struct inode * inode) -+{ -+ assert("edward-1328", inode != NULL); -+ return 
reiser4_inode_data(inode)->pset->cluster; -+} -+ -+file_plugin *inode_create_plugin(const struct inode * inode) -+{ -+ assert("edward-1329", inode != NULL); -+ return reiser4_inode_data(inode)->pset->create; -+} -+ -+digest_plugin *inode_digest_plugin(const struct inode * inode) -+{ -+ assert("edward-86", inode != NULL); -+ return reiser4_inode_data(inode)->pset->digest; -+} -+ -+item_plugin *inode_sd_plugin(const struct inode * inode) -+{ -+ assert("vs-534", inode != NULL); -+ return reiser4_inode_data(inode)->pset->sd; -+} -+ -+item_plugin *inode_dir_item_plugin(const struct inode * inode) -+{ -+ assert("vs-534", inode != NULL); -+ return reiser4_inode_data(inode)->pset->dir_item; -+} -+ -+file_plugin *child_create_plugin(const struct inode * inode) -+{ -+ assert("edward-1329", inode != NULL); -+ return reiser4_inode_data(inode)->hset->create; -+} -+ -+void inode_set_extension(struct inode *inode, sd_ext_bits ext) -+{ -+ reiser4_inode *state; -+ -+ assert("nikita-2716", inode != NULL); -+ assert("nikita-2717", ext < LAST_SD_EXTENSION); -+ assert("nikita-3491", spin_inode_is_locked(inode)); -+ -+ state = reiser4_inode_data(inode); -+ state->extmask |= 1 << ext; -+ /* force re-calculation of stat-data length on next call to -+ update_sd(). */ -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+} -+ -+void inode_clr_extension(struct inode *inode, sd_ext_bits ext) -+{ -+ reiser4_inode *state; -+ -+ assert("vpf-1926", inode != NULL); -+ assert("vpf-1927", ext < LAST_SD_EXTENSION); -+ assert("vpf-1928", spin_inode_is_locked(inode)); -+ -+ state = reiser4_inode_data(inode); -+ state->extmask &= ~(1 << ext); -+ /* force re-calculation of stat-data length on next call to -+ update_sd(). 
*/ -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+} -+ -+void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new) -+{ -+ assert("edward-1287", inode != NULL); -+ if (!dscale_fit(old, new)) -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+ return; -+} -+ -+void inode_check_scale(struct inode *inode, __u64 old, __u64 new) -+{ -+ assert("nikita-2875", inode != NULL); -+ spin_lock_inode(inode); -+ inode_check_scale_nolock(inode, old, new); -+ spin_unlock_inode(inode); -+} -+ -+/* -+ * initialize ->ordering field of inode. This field defines how file stat-data -+ * and body is ordered within a tree with respect to other objects within the -+ * same parent directory. -+ */ -+void -+init_inode_ordering(struct inode *inode, -+ reiser4_object_create_data * crd, int create) -+{ -+ reiser4_key key; -+ -+ if (create) { -+ struct inode *parent; -+ -+ parent = crd->parent; -+ assert("nikita-3224", inode_dir_plugin(parent) != NULL); -+ inode_dir_plugin(parent)->build_entry_key(parent, -+ &crd->dentry->d_name, -+ &key); -+ } else { -+ coord_t *coord; -+ -+ coord = &reiser4_inode_data(inode)->sd_coord; -+ coord_clear_iplug(coord); -+ /* safe to use ->sd_coord, because node is under long term -+ * lock */ -+ WITH_DATA(coord->node, item_key_by_coord(coord, &key)); -+ } -+ -+ set_inode_ordering(inode, get_key_ordering(&key)); -+} -+ -+znode *inode_get_vroot(struct inode *inode) -+{ -+ reiser4_block_nr blk; -+ znode *result; -+ -+ spin_lock_inode(inode); -+ blk = reiser4_inode_data(inode)->vroot; -+ spin_unlock_inode(inode); -+ if (!disk_addr_eq(&UBER_TREE_ADDR, &blk)) -+ result = zlook(reiser4_tree_by_inode(inode), &blk); -+ else -+ result = NULL; -+ return result; -+} -+ -+void inode_set_vroot(struct inode *inode, znode *vroot) -+{ -+ spin_lock_inode(inode); -+ reiser4_inode_data(inode)->vroot = *znode_get_block(vroot); -+ spin_unlock_inode(inode); -+} -+ -+#if REISER4_DEBUG -+ -+void reiser4_inode_invariant(const struct inode *inode) -+{ -+ 
assert("nikita-3077", spin_inode_is_locked(inode)); -+} -+ -+int inode_has_no_jnodes(reiser4_inode * r4_inode) -+{ -+ return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL && -+ r4_inode->nr_jnodes == 0; -+} -+ -+#endif -+ -+/* true if directory is empty (only contains dot and dotdot) */ -+/* FIXME: shouldn't it be dir plugin method? */ -+int is_dir_empty(const struct inode *dir) -+{ -+ assert("nikita-1976", dir != NULL); -+ -+ /* rely on our method to maintain directory i_size being equal to the -+ number of entries. */ -+ return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY); -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/inode.h linux-2.6.24/fs/reiser4/inode.h ---- linux-2.6.24.orig/fs/reiser4/inode.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/inode.h 2008-01-25 11:39:06.936207689 +0300 -@@ -0,0 +1,449 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Inode functions. */ -+ -+#if !defined( __REISER4_INODE_H__ ) -+#define __REISER4_INODE_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "seal.h" -+#include "plugin/plugin.h" -+#include "plugin/file/cryptcompress.h" -+#include "plugin/file/file.h" -+#include "plugin/dir/dir.h" -+#include "plugin/plugin_set.h" -+#include "plugin/security/perm.h" -+#include "vfs_ops.h" -+#include "jnode.h" -+#include "fsdata.h" -+ -+#include /* for __u?? , ino_t */ -+#include /* for struct super_block, struct -+ * rw_semaphore, etc */ -+#include -+#include -+ -+/* reiser4-specific inode flags. They are "transient" and are not -+ supposed to be stored on disk. 
Used to trace "state" of -+ inode -+*/ -+typedef enum { -+ /* this is light-weight inode, inheriting some state from its -+ parent */ -+ REISER4_LIGHT_WEIGHT = 0, -+ /* stat data wasn't yet created */ -+ REISER4_NO_SD = 1, -+ /* internal immutable flag. Currently is only used -+ to avoid race condition during file creation. -+ See comment in create_object(). */ -+ REISER4_IMMUTABLE = 2, -+ /* inode was read from storage */ -+ REISER4_LOADED = 3, -+ /* this bit is set for symlinks. inode->i_private points to target -+ name of symlink. */ -+ REISER4_GENERIC_PTR_USED = 4, -+ /* set if size of stat-data item for this inode is known. If this is -+ * set we can avoid recalculating size of stat-data on each update. */ -+ REISER4_SDLEN_KNOWN = 5, -+ /* reiser4_inode->crypt points to the crypto stat */ -+ REISER4_CRYPTO_STAT_LOADED = 6, -+ /* cryptcompress_inode_data points to the secret key */ -+ REISER4_SECRET_KEY_INSTALLED = 7, -+ /* File (possibly) has pages corresponding to the tail items, that -+ * were created by ->readpage. It is set by mmap_unix_file() and -+ * sendfile_unix_file(). This bit is inspected by write_unix_file and -+ * kill-hook of tail items. It is never cleared once set. This bit is -+ * modified and inspected under i_mutex. */ -+ REISER4_HAS_MMAP = 8, -+ REISER4_PART_MIXED = 9, -+ REISER4_PART_IN_CONV = 10, -+ /* This flag indicates that file plugin conversion is in progress */ -+ REISER4_FILE_CONV_IN_PROGRESS = 11 -+} reiser4_file_plugin_flags; -+ -+/* state associated with each inode. -+ reiser4 inode. -+ -+ NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes -+ be of the same size. File-system allocates inodes by itself through -+ s_op->allocate_inode() method. So, it is possible to adjust size of inode -+ at the time of its creation. 
-+ -+ Invariants involving parts of this data-type: -+ -+ [inode->eflushed] -+ -+*/ -+ -+typedef struct reiser4_inode reiser4_inode; -+/* return pointer to reiser4-specific part of inode */ -+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode -+ /* inode queried */ ); -+ -+#if BITS_PER_LONG == 64 -+ -+#define REISER4_INO_IS_OID (1) -+typedef struct {; -+} oid_hi_t; -+ -+/* BITS_PER_LONG == 64 */ -+#else -+ -+#define REISER4_INO_IS_OID (0) -+typedef __u32 oid_hi_t; -+ -+/* BITS_PER_LONG == 64 */ -+#endif -+ -+struct reiser4_inode { -+ /* spin lock protecting fields of this structure. */ -+ spinlock_t guard; -+ /* main plugin set that control the file -+ (see comments in plugin/plugin_set.c) */ -+ plugin_set *pset; -+ /* plugin set for inheritance -+ (see comments in plugin/plugin_set.c) */ -+ plugin_set *hset; -+ /* high 32 bits of object id */ -+ oid_hi_t oid_hi; -+ /* seal for stat-data */ -+ seal_t sd_seal; -+ /* locality id for this file */ -+ oid_t locality_id; -+#if REISER4_LARGE_KEY -+ __u64 ordering; -+#endif -+ /* coord of stat-data in sealed node */ -+ coord_t sd_coord; -+ /* bit-mask of stat-data extentions used by this file */ -+ __u64 extmask; -+ /* bitmask of non-default plugins for this inode */ -+ __u16 plugin_mask; -+ /* bitmask of set heir plugins for this inode. */ -+ __u16 heir_mask; -+ union { -+ struct list_head readdir_list; -+ struct list_head not_used; -+ } lists; -+ /* per-inode flags. Filled by values of reiser4_file_plugin_flags */ -+ unsigned long flags; -+ union { -+ /* fields specific to unix_file plugin */ -+ struct unix_file_info unix_file_info; -+ /* fields specific to cryptcompress file plugin */ -+ struct cryptcompress_info cryptcompress_info; -+ } file_plugin_data; -+ -+ /* this semaphore is to serialize readers and writers of @pset->file -+ * when file plugin conversion is enabled -+ */ -+ struct rw_semaphore conv_sem; -+ -+ /* tree of jnodes. 
Phantom jnodes (ones not attched to any atom) are -+ tagged in that tree by EFLUSH_TAG_ANONYMOUS */ -+ struct radix_tree_root jnodes_tree; -+#if REISER4_DEBUG -+ /* number of unformatted node jnodes of this file in jnode hash table */ -+ unsigned long nr_jnodes; -+#endif -+ -+ /* block number of virtual root for this object. See comment above -+ * fs/reiser4/search.c:handle_vroot() */ -+ reiser4_block_nr vroot; -+ struct mutex loading; -+}; -+ -+void loading_init_once(reiser4_inode *); -+void loading_alloc(reiser4_inode *); -+void loading_destroy(reiser4_inode *); -+ -+struct reiser4_inode_object { -+ /* private part */ -+ reiser4_inode p; -+ /* generic fields not specific to reiser4, but used by VFS */ -+ struct inode vfs_inode; -+}; -+ -+/* return pointer to the reiser4 specific portion of @inode */ -+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode -+ /* inode queried */ ) -+{ -+ assert("nikita-254", inode != NULL); -+ return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p; -+} -+ -+static inline struct inode *inode_by_reiser4_inode(const reiser4_inode * -+ r4_inode /* inode queried */ -+ ) -+{ -+ return &container_of(r4_inode, struct reiser4_inode_object, p)->vfs_inode; -+} -+ -+/* -+ * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct -+ * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64 -+ * bits. -+ * -+ * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part -+ * of inode, otherwise whole oid is stored in i_ino. -+ * -+ * Wrappers below ([sg]et_inode_oid()) are used to hide this difference. 
-+ */ -+ -+#define OID_HI_SHIFT (sizeof(ino_t) * 8) -+ -+#if REISER4_INO_IS_OID -+ -+static inline oid_t get_inode_oid(const struct inode *inode) -+{ -+ return inode->i_ino; -+} -+ -+static inline void set_inode_oid(struct inode *inode, oid_t oid) -+{ -+ inode->i_ino = oid; -+} -+ -+/* REISER4_INO_IS_OID */ -+#else -+ -+static inline oid_t get_inode_oid(const struct inode *inode) -+{ -+ return -+ ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) | -+ inode->i_ino; -+} -+ -+static inline void set_inode_oid(struct inode *inode, oid_t oid) -+{ -+ assert("nikita-2519", inode != NULL); -+ inode->i_ino = (ino_t) (oid); -+ reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT; -+ assert("nikita-2521", get_inode_oid(inode) == (oid)); -+} -+ -+/* REISER4_INO_IS_OID */ -+#endif -+ -+static inline oid_t get_inode_locality(const struct inode *inode) -+{ -+ return reiser4_inode_data(inode)->locality_id; -+} -+ -+#if REISER4_LARGE_KEY -+static inline __u64 get_inode_ordering(const struct inode *inode) -+{ -+ return reiser4_inode_data(inode)->ordering; -+} -+ -+static inline void set_inode_ordering(const struct inode *inode, __u64 ordering) -+{ -+ reiser4_inode_data(inode)->ordering = ordering; -+} -+ -+#else -+ -+#define get_inode_ordering(inode) (0) -+#define set_inode_ordering(inode, val) noop -+ -+#endif -+ -+/* return inode in which @uf_info is embedded */ -+static inline struct inode * -+unix_file_info_to_inode(const struct unix_file_info * uf_info) -+{ -+ return &container_of(uf_info, struct reiser4_inode_object, -+ p.file_plugin_data.unix_file_info)->vfs_inode; -+} -+ -+extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const)); -+extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const)); -+ -+extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode); -+ -+#if REISER4_DEBUG -+extern void reiser4_inode_invariant(const struct inode *inode); -+extern int inode_has_no_jnodes(reiser4_inode *); -+#else -+#define reiser4_inode_invariant(inode) noop 
-+#endif -+ -+static inline int spin_inode_is_locked(const struct inode *inode) -+{ -+ assert_spin_locked(&reiser4_inode_data(inode)->guard); -+ return 1; -+} -+ -+/** -+ * spin_lock_inode - lock reiser4_inode' embedded spinlock -+ * @inode: inode to lock -+ * -+ * In debug mode it checks that lower priority locks are not held and -+ * increments reiser4_context's lock counters on which lock ordering checking -+ * is based. -+ */ -+static inline void spin_lock_inode(struct inode *inode) -+{ -+ assert("", LOCK_CNT_NIL(spin_locked)); -+ /* check lock ordering */ -+ assert_spin_not_locked(&d_lock); -+ -+ spin_lock(&reiser4_inode_data(inode)->guard); -+ -+ LOCK_CNT_INC(spin_locked_inode); -+ LOCK_CNT_INC(spin_locked); -+ -+ reiser4_inode_invariant(inode); -+} -+ -+/** -+ * spin_unlock_inode - unlock reiser4_inode' embedded spinlock -+ * @inode: inode to unlock -+ * -+ * In debug mode it checks that spinlock is held and decrements -+ * reiser4_context's lock counters on which lock ordering checking is based. 
-+ */ -+static inline void spin_unlock_inode(struct inode *inode) -+{ -+ assert_spin_locked(&reiser4_inode_data(inode)->guard); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ reiser4_inode_invariant(inode); -+ -+ LOCK_CNT_DEC(spin_locked_inode); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&reiser4_inode_data(inode)->guard); -+} -+ -+extern znode *inode_get_vroot(struct inode *inode); -+extern void inode_set_vroot(struct inode *inode, znode * vroot); -+ -+extern int reiser4_max_filename_len(const struct inode *inode); -+extern int max_hash_collisions(const struct inode *dir); -+extern void reiser4_unlock_inode(struct inode *inode); -+extern int is_reiser4_inode(const struct inode *inode); -+extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *); -+extern struct inode *reiser4_iget(struct super_block *super, -+ const reiser4_key * key, int silent); -+extern void reiser4_iget_complete(struct inode *inode); -+extern void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f); -+extern void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f); -+extern int reiser4_inode_get_flag(const struct inode *inode, -+ reiser4_file_plugin_flags f); -+ -+/* has inode been initialized? 
*/ -+static inline int -+is_inode_loaded(const struct inode *inode /* inode queried */ ) -+{ -+ assert("nikita-1120", inode != NULL); -+ return reiser4_inode_get_flag(inode, REISER4_LOADED); -+} -+ -+extern file_plugin *inode_file_plugin(const struct inode *inode); -+extern dir_plugin *inode_dir_plugin(const struct inode *inode); -+extern formatting_plugin *inode_formatting_plugin(const struct inode *inode); -+extern hash_plugin *inode_hash_plugin(const struct inode *inode); -+extern fibration_plugin *inode_fibration_plugin(const struct inode *inode); -+extern cipher_plugin *inode_cipher_plugin(const struct inode *inode); -+extern digest_plugin *inode_digest_plugin(const struct inode *inode); -+extern compression_plugin *inode_compression_plugin(const struct inode *inode); -+extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode -+ *inode); -+extern cluster_plugin *inode_cluster_plugin(const struct inode *inode); -+extern file_plugin *inode_create_plugin(const struct inode *inode); -+extern item_plugin *inode_sd_plugin(const struct inode *inode); -+extern item_plugin *inode_dir_item_plugin(const struct inode *inode); -+extern file_plugin *child_create_plugin(const struct inode *inode); -+ -+extern void reiser4_make_bad_inode(struct inode *inode); -+ -+extern void inode_set_extension(struct inode *inode, sd_ext_bits ext); -+extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext); -+extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new); -+extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new); -+ -+#define INODE_SET_SIZE(i, value) \ -+({ \ -+ struct inode *__i; \ -+ typeof(value) __v; \ -+ \ -+ __i = (i); \ -+ __v = (value); \ -+ inode_check_scale(__i, __i->i_size, __v); \ -+ i_size_write(__i, __v); \ -+}) -+ -+/* -+ * update field @field in inode @i to contain value @value. 
-+ */ -+#define INODE_SET_FIELD(i, field, value) \ -+({ \ -+ struct inode *__i; \ -+ typeof(value) __v; \ -+ \ -+ __i = (i); \ -+ __v = (value); \ -+ inode_check_scale(__i, __i->field, __v); \ -+ __i->field = __v; \ -+}) -+ -+#define INODE_INC_FIELD(i, field) \ -+({ \ -+ struct inode *__i; \ -+ \ -+ __i = (i); \ -+ inode_check_scale(__i, __i->field, __i->field + 1); \ -+ ++ __i->field; \ -+}) -+ -+#define INODE_DEC_FIELD(i, field) \ -+({ \ -+ struct inode *__i; \ -+ \ -+ __i = (i); \ -+ inode_check_scale(__i, __i->field, __i->field - 1); \ -+ -- __i->field; \ -+}) -+ -+/* See comment before reiser4_readdir_common() for description. */ -+static inline struct list_head *get_readdir_list(const struct inode *inode) -+{ -+ return &reiser4_inode_data(inode)->lists.readdir_list; -+} -+ -+extern void init_inode_ordering(struct inode *inode, -+ reiser4_object_create_data * crd, int create); -+ -+static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode) -+{ -+ return &reiser4_inode_data(inode)->jnodes_tree; -+} -+ -+static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode -+ * r4_inode) -+{ -+ return &r4_inode->jnodes_tree; -+} -+ -+#if REISER4_DEBUG -+extern void print_inode(const char *prefix, const struct inode *i); -+#endif -+ -+int is_dir_empty(const struct inode *); -+ -+/* __REISER4_INODE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/ioctl.h linux-2.6.24/fs/reiser4/ioctl.h ---- linux-2.6.24.orig/fs/reiser4/ioctl.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/ioctl.h 2008-01-25 11:39:06.936207689 +0300 -@@ -0,0 +1,41 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#if !defined( __REISER4_IOCTL_H__ ) -+#define __REISER4_IOCTL_H__ -+ -+#include -+ -+/* -+ * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into -+ * extents and fix in this state. This is used by applications that rely on -+ * -+ * . files being block aligned, and -+ * -+ * . files never migrating on disk -+ * -+ * for example, boot loaders (LILO) need this. -+ * -+ * This ioctl should be used as -+ * -+ * result = ioctl(fd, REISER4_IOC_UNPACK); -+ * -+ * File behind fd descriptor will be converted to the extents (if necessary), -+ * and its stat-data will be updated so that it will never be converted back -+ * into tails again. -+ */ -+#define REISER4_IOC_UNPACK _IOW(0xCD,1,long) -+ -+/* __REISER4_IOCTL_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/jnode.c linux-2.6.24/fs/reiser4/jnode.c ---- linux-2.6.24.orig/fs/reiser4/jnode.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/jnode.c 2008-01-25 11:39:06.940208719 +0300 -@@ -0,0 +1,1924 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+/* Jnode manipulation functions. */ -+/* Jnode is entity used to track blocks with data and meta-data in reiser4. -+ -+ In particular, jnodes are used to track transactional information -+ associated with each block. 
Each znode contains jnode as ->zjnode field. -+ -+ Jnode stands for either Josh or Journal node. -+*/ -+ -+/* -+ * Taxonomy. -+ * -+ * Jnode represents block containing data or meta-data. There are jnodes -+ * for: -+ * -+ * unformatted blocks (jnodes proper). There are plans, however to -+ * have a handle per extent unit rather than per each unformatted -+ * block, because there are so many of them. -+ * -+ * For bitmaps. Each bitmap is actually represented by two jnodes--one -+ * for working and another for "commit" data, together forming bnode. -+ * -+ * For io-heads. These are used by log writer. -+ * -+ * For formatted nodes (znode). See comment at the top of znode.c for -+ * details specific to the formatted nodes (znodes). -+ * -+ * Node data. -+ * -+ * Jnode provides access to the data of node it represents. Data are -+ * stored in a page. Page is kept in a page cache. This means, that jnodes -+ * are highly interconnected with page cache and VM internals. -+ * -+ * jnode has a pointer to page (->pg) containing its data. Pointer to data -+ * themselves is cached in ->data field to avoid frequent calls to -+ * page_address(). -+ * -+ * jnode and page are attached to each other by jnode_attach_page(). This -+ * function places pointer to jnode in set_page_private(), sets PG_private -+ * flag and increments page counter. -+ * -+ * Opposite operation is performed by page_clear_jnode(). -+ * -+ * jnode->pg is protected by jnode spin lock, and page->private is -+ * protected by page lock. See comment at the top of page_cache.c for -+ * more. -+ * -+ * page can be detached from jnode for two reasons: -+ * -+ * . jnode is removed from a tree (file is truncated, of formatted -+ * node is removed by balancing). -+ * -+ * . during memory pressure, VM calls ->releasepage() method -+ * (reiser4_releasepage()) to evict page from memory. -+ * -+ * (there, of course, is also umount, but this is special case we are not -+ * concerned with here). 
-+ * -+ * To protect jnode page from eviction, one calls jload() function that -+ * "pins" page in memory (loading it if necessary), increments -+ * jnode->d_count, and kmap()s page. Page is unpinned through call to -+ * jrelse(). -+ * -+ * Jnode life cycle. -+ * -+ * jnode is created, placed in hash table, and, optionally, in per-inode -+ * radix tree. Page can be attached to jnode, pinned, released, etc. -+ * -+ * When jnode is captured into atom its reference counter is -+ * increased. While being part of an atom, jnode can be "early -+ * flushed". This means that as part of flush procedure, jnode is placed -+ * into "relocate set", and its page is submitted to the disk. After io -+ * completes, page can be detached, then loaded again, re-dirtied, etc. -+ * -+ * Thread acquired reference to jnode by calling jref() and releases it by -+ * jput(). When last reference is removed, jnode is still retained in -+ * memory (cached) if it has page attached, _unless_ it is scheduled for -+ * destruction (has JNODE_HEARD_BANSHEE bit set). -+ * -+ * Tree read-write lock was used as "existential" lock for jnodes. That is, -+ * jnode->x_count could be changed from 0 to 1 only under tree write lock, -+ * that is, tree lock protected unreferenced jnodes stored in the hash -+ * table, from recycling. -+ * -+ * This resulted in high contention on tree lock, because jref()/jput() is -+ * frequent operation. To ameliorate this problem, RCU is used: when jput() -+ * is just about to release last reference on jnode it sets JNODE_RIP bit -+ * on it, and then proceed with jnode destruction (removing jnode from hash -+ * table, cbk_cache, detaching page, etc.). All places that change jnode -+ * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and -+ * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by -+ * jnode_rip_check() function), and pretend that nothing was found in hash -+ * table if bit is set. 
-+ * -+ * jput defers actual return of jnode into slab cache to some later time -+ * (by call_rcu()), this guarantees that other threads can safely continue -+ * working with JNODE_RIP-ped jnode. -+ * -+ */ -+ -+#include "reiser4.h" -+#include "debug.h" -+#include "dformat.h" -+#include "jnode.h" -+#include "plugin/plugin_header.h" -+#include "plugin/plugin.h" -+#include "txnmgr.h" -+/*#include "jnode.h"*/ -+#include "znode.h" -+#include "tree.h" -+#include "tree_walk.h" -+#include "super.h" -+#include "inode.h" -+#include "page_cache.h" -+ -+#include /* UML needs this for PAGE_OFFSET */ -+#include -+#include -+#include -+#include -+#include /* for struct address_space */ -+#include /* for inode_lock */ -+ -+static struct kmem_cache *_jnode_slab = NULL; -+ -+static void jnode_set_type(jnode * node, jnode_type type); -+static int jdelete(jnode * node); -+static int jnode_try_drop(jnode * node); -+ -+#if REISER4_DEBUG -+static int jnode_invariant(const jnode * node, int tlocked, int jlocked); -+#endif -+ -+/* true if valid page is attached to jnode */ -+static inline int jnode_is_parsed(jnode * node) -+{ -+ return JF_ISSET(node, JNODE_PARSED); -+} -+ -+/* hash table support */ -+ -+/* compare two jnode keys for equality. Used by hash-table macros */ -+static inline int jnode_key_eq(const struct jnode_key * k1, -+ const struct jnode_key * k2) -+{ -+ assert("nikita-2350", k1 != NULL); -+ assert("nikita-2351", k2 != NULL); -+ -+ return (k1->index == k2->index && k1->objectid == k2->objectid); -+} -+ -+/* Hash jnode by its key (inode plus offset). Used by hash-table macros */ -+static inline __u32 jnode_key_hashfn(j_hash_table * table, -+ const struct jnode_key * key) -+{ -+ assert("nikita-2352", key != NULL); -+ assert("nikita-3346", IS_POW(table->_buckets)); -+ -+ /* yes, this is remarkable simply (where not stupid) hash function. 
*/
-+	return (key->objectid + key->index) & (table->_buckets - 1);
-+}
-+
-+/* The hash table definition */
-+#define KMALLOC(size) reiser4_vmalloc(size)
-+#define KFREE(ptr, size) vfree(ptr)
-+TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j,
-+		      jnode_key_hashfn, jnode_key_eq);
-+#undef KFREE
-+#undef KMALLOC
-+
-+/* call this to initialise jnode hash table of @tree. The result of
-+ * j_hash_init() is returned unchanged. */
-+int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
-+{
-+	assert("nikita-2359", tree != NULL);
-+	return j_hash_init(&tree->jhash_table, 16384);
-+}
-+
-+/* call this to destroy jnode hash table. This is called during umount.
-+ * Nothing is done when the table was never allocated (->_table is NULL).
-+ * Always returns 0. */
-+int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
-+{
-+	j_hash_table *jtable;
-+	jnode *node;
-+	jnode *next;
-+
-+	assert("nikita-2360", tree != NULL);
-+
-+	/*
-+	 * Scan hash table and free all jnodes. Every jnode still hashed at
-+	 * this point is expected to be unreferenced (->x_count == 0); each one
-+	 * is handed to jdrop().
-+	 */
-+	jtable = &tree->jhash_table;
-+	if (jtable->_table) {
-+		for_all_in_htable(jtable, j, node, next) {
-+			assert("nikita-2361", !atomic_read(&node->x_count));
-+			jdrop(node);
-+		}
-+
-+		j_hash_done(&tree->jhash_table);
-+	}
-+	return 0;
-+}
-+
-+/**
-+ * init_jnodes - create jnode cache
-+ *
-+ * Initializes slab cache jnodes. It is part of reiser4 module initialization.
-+ * Returns 0 on success, or -ENOMEM when the slab cache cannot be created.
-+ */
-+int init_jnodes(void)
-+{
-+	assert("umka-168", _jnode_slab == NULL);
-+
-+	_jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
-+					SLAB_HWCACHE_ALIGN |
-+					SLAB_RECLAIM_ACCOUNT, NULL);
-+	if (_jnode_slab == NULL)
-+		return RETERR(-ENOMEM);
-+
-+	return 0;
-+}
-+
-+/**
-+ * done_jnodes - delete jnode cache
-+ *
-+ * Destroys the jnode slab cache created by init_jnodes().
-+ * This is called on reiser4 module unloading or system shutdown.
-+ */
-+void done_jnodes(void)
-+{
-+	destroy_reiser4_cache(&_jnode_slab);
-+}
-+
-+/* Initialize a jnode.
*/ -+void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type) -+{ -+ assert("umka-175", node != NULL); -+ -+ memset(node, 0, sizeof(jnode)); -+ ON_DEBUG(node->magic = JMAGIC); -+ jnode_set_type(node, type); -+ atomic_set(&node->d_count, 0); -+ atomic_set(&node->x_count, 0); -+ spin_lock_init(&node->guard); -+ spin_lock_init(&node->load); -+ node->atom = NULL; -+ node->tree = tree; -+ INIT_LIST_HEAD(&node->capture_link); -+ -+ ASSIGN_NODE_LIST(node, NOT_CAPTURED); -+ -+ INIT_RCU_HEAD(&node->rcu); -+ -+#if REISER4_DEBUG -+ { -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(tree->super); -+ spin_lock_irq(&sbinfo->all_guard); -+ list_add(&node->jnodes, &sbinfo->all_jnodes); -+ spin_unlock_irq(&sbinfo->all_guard); -+ } -+#endif -+} -+ -+#if REISER4_DEBUG -+/* -+ * Remove jnode from ->all_jnodes list. -+ */ -+static void jnode_done(jnode * node, reiser4_tree * tree) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(tree->super); -+ -+ spin_lock_irq(&sbinfo->all_guard); -+ assert("nikita-2422", !list_empty(&node->jnodes)); -+ list_del_init(&node->jnodes); -+ spin_unlock_irq(&sbinfo->all_guard); -+} -+#endif -+ -+/* return already existing jnode of page */ -+jnode *jnode_by_page(struct page *pg) -+{ -+ assert("nikita-2066", pg != NULL); -+ assert("nikita-2400", PageLocked(pg)); -+ assert("nikita-2068", PagePrivate(pg)); -+ assert("nikita-2067", jprivate(pg) != NULL); -+ return jprivate(pg); -+} -+ -+/* exported functions to allocate/free jnode objects outside this file */ -+jnode *jalloc(void) -+{ -+ jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get()); -+ return jal; -+} -+ -+/* return jnode back to the slab allocator */ -+inline void jfree(jnode * node) -+{ -+ assert("zam-449", node != NULL); -+ -+ assert("nikita-2663", (list_empty_careful(&node->capture_link) && -+ NODE_LIST(node) == NOT_CAPTURED)); -+ assert("nikita-3222", list_empty(&node->jnodes)); -+ assert("nikita-3221", jnode_page(node) == 
NULL); -+ -+ /* not yet phash_jnode_destroy(node); */ -+ -+ kmem_cache_free(_jnode_slab, node); -+} -+ -+/* -+ * This function is supplied as RCU callback. It actually frees jnode when -+ * last reference to it is gone. -+ */ -+static void jnode_free_actor(struct rcu_head *head) -+{ -+ jnode *node; -+ jnode_type jtype; -+ -+ node = container_of(head, jnode, rcu); -+ jtype = jnode_get_type(node); -+ -+ ON_DEBUG(jnode_done(node, jnode_get_tree(node))); -+ -+ switch (jtype) { -+ case JNODE_IO_HEAD: -+ case JNODE_BITMAP: -+ case JNODE_UNFORMATTED_BLOCK: -+ jfree(node); -+ break; -+ case JNODE_FORMATTED_BLOCK: -+ zfree(JZNODE(node)); -+ break; -+ case JNODE_INODE: -+ default: -+ wrong_return_value("nikita-3197", "Wrong jnode type"); -+ } -+} -+ -+/* -+ * Free a jnode. Post a callback to be executed later through RCU when all -+ * references to @node are released. -+ */ -+static inline void jnode_free(jnode * node, jnode_type jtype) -+{ -+ if (jtype != JNODE_INODE) { -+ /*assert("nikita-3219", list_empty(&node->rcu.list)); */ -+ call_rcu(&node->rcu, jnode_free_actor); -+ } else -+ jnode_list_remove(node); -+} -+ -+/* allocate new unformatted jnode */ -+static jnode *jnew_unformatted(void) -+{ -+ jnode *jal; -+ -+ jal = jalloc(); -+ if (jal == NULL) -+ return NULL; -+ -+ jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK); -+ jal->key.j.mapping = NULL; -+ jal->key.j.index = (unsigned long)-1; -+ jal->key.j.objectid = 0; -+ return jal; -+} -+ -+/* look for jnode with given mapping and offset within hash table */ -+jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index) -+{ -+ struct jnode_key jkey; -+ jnode *node; -+ -+ assert("nikita-2353", tree != NULL); -+ -+ jkey.objectid = objectid; -+ jkey.index = index; -+ -+ /* -+ * hash table is _not_ protected by any lock during lookups. All we -+ * have to do is to disable preemption to keep RCU happy. 
-+ */ -+ -+ rcu_read_lock(); -+ node = j_hash_find(&tree->jhash_table, &jkey); -+ if (node != NULL) { -+ /* protect @node from recycling */ -+ jref(node); -+ assert("nikita-2955", jnode_invariant(node, 0, 0)); -+ node = jnode_rip_check(tree, node); -+ } -+ rcu_read_unlock(); -+ return node; -+} -+ -+/* per inode radix tree of jnodes is protected by tree's read write spin lock */ -+static jnode *jfind_nolock(struct address_space *mapping, unsigned long index) -+{ -+ assert("vs-1694", mapping->host != NULL); -+ -+ return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index); -+} -+ -+jnode *jfind(struct address_space * mapping, unsigned long index) -+{ -+ reiser4_tree *tree; -+ jnode *node; -+ -+ assert("vs-1694", mapping->host != NULL); -+ tree = reiser4_tree_by_inode(mapping->host); -+ -+ read_lock_tree(tree); -+ node = jfind_nolock(mapping, index); -+ if (node != NULL) -+ jref(node); -+ read_unlock_tree(tree); -+ return node; -+} -+ -+static void inode_attach_jnode(jnode * node) -+{ -+ struct inode *inode; -+ reiser4_inode *info; -+ struct radix_tree_root *rtree; -+ -+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); -+ assert("zam-1043", node->key.j.mapping != NULL); -+ inode = node->key.j.mapping->host; -+ info = reiser4_inode_data(inode); -+ rtree = jnode_tree_by_reiser4_inode(info); -+ if (rtree->rnode == NULL) { -+ /* prevent inode from being pruned when it has jnodes attached -+ to it */ -+ write_lock_irq(&inode->i_data.tree_lock); -+ inode->i_data.nrpages++; -+ write_unlock_irq(&inode->i_data.tree_lock); -+ } -+ assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0)); -+ check_me("zam-1045", -+ !radix_tree_insert(rtree, node->key.j.index, node)); -+ ON_DEBUG(info->nr_jnodes++); -+} -+ -+static void inode_detach_jnode(jnode * node) -+{ -+ struct inode *inode; -+ reiser4_inode *info; -+ struct radix_tree_root *rtree; -+ -+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); -+ assert("zam-1044", node->key.j.mapping != 
NULL); -+ inode = node->key.j.mapping->host; -+ info = reiser4_inode_data(inode); -+ rtree = jnode_tree_by_reiser4_inode(info); -+ -+ assert("zam-1051", info->nr_jnodes != 0); -+ assert("zam-1052", rtree->rnode != NULL); -+ ON_DEBUG(info->nr_jnodes--); -+ -+ /* delete jnode from inode's radix tree of jnodes */ -+ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index)); -+ if (rtree->rnode == NULL) { -+ /* inode can be pruned now */ -+ write_lock_irq(&inode->i_data.tree_lock); -+ inode->i_data.nrpages--; -+ write_unlock_irq(&inode->i_data.tree_lock); -+ } -+} -+ -+/* put jnode into hash table (where they can be found by flush who does not know -+ mapping) and to inode's tree of jnodes (where they can be found (hopefully -+ faster) in places where mapping is known). Currently it is used by -+ fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is -+ created */ -+static void -+hash_unformatted_jnode(jnode * node, struct address_space *mapping, -+ unsigned long index) -+{ -+ j_hash_table *jtable; -+ -+ assert("vs-1446", jnode_is_unformatted(node)); -+ assert("vs-1442", node->key.j.mapping == 0); -+ assert("vs-1443", node->key.j.objectid == 0); -+ assert("vs-1444", node->key.j.index == (unsigned long)-1); -+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock)); -+ -+ node->key.j.mapping = mapping; -+ node->key.j.objectid = get_inode_oid(mapping->host); -+ node->key.j.index = index; -+ -+ jtable = &jnode_get_tree(node)->jhash_table; -+ -+ /* race with some other thread inserting jnode into the hash table is -+ * impossible, because we keep the page lock. */ -+ /* -+ * following assertion no longer holds because of RCU: it is possible -+ * jnode is in the hash table, but with JNODE_RIP bit set. 
-+ */ -+ /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */ -+ j_hash_insert_rcu(jtable, node); -+ inode_attach_jnode(node); -+} -+ -+static void unhash_unformatted_node_nolock(jnode * node) -+{ -+ assert("vs-1683", node->key.j.mapping != NULL); -+ assert("vs-1684", -+ node->key.j.objectid == -+ get_inode_oid(node->key.j.mapping->host)); -+ -+ /* remove jnode from hash-table */ -+ j_hash_remove_rcu(&node->tree->jhash_table, node); -+ inode_detach_jnode(node); -+ node->key.j.mapping = NULL; -+ node->key.j.index = (unsigned long)-1; -+ node->key.j.objectid = 0; -+ -+} -+ -+/* remove jnode from hash table and from inode's tree of jnodes. This is used in -+ reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes -> -+ reiser4_uncapture_jnode */ -+void unhash_unformatted_jnode(jnode * node) -+{ -+ assert("vs-1445", jnode_is_unformatted(node)); -+ -+ write_lock_tree(node->tree); -+ unhash_unformatted_node_nolock(node); -+ write_unlock_tree(node->tree); -+} -+ -+/* -+ * search hash table for a jnode with given oid and index. If not found, -+ * allocate new jnode, insert it, and also insert into radix tree for the -+ * given inode/mapping. 
-+ *
-+ * Returns the jnode with a reference taken on it by jref(), or an ERR_PTR()
-+ * value: -ENOMEM when no jnode could be allocated, or the error reported by
-+ * radix_tree_preload(). @oid is not used by this function.
-+ */
-+static jnode *find_get_jnode(reiser4_tree * tree,
-+			     struct address_space *mapping,
-+			     oid_t oid, unsigned long index)
-+{
-+	jnode *result;
-+	jnode *shadow;
-+	int preload;
-+
-+	/* allocate optimistically, before the tree lock is taken */
-+	result = jnew_unformatted();
-+
-+	if (unlikely(result == NULL))
-+		return ERR_PTR(RETERR(-ENOMEM));
-+
-+	preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
-+	if (preload != 0) {
-+		/* @result was never hashed, so nobody else can hold it:
-+		 * give it back instead of leaking the new jnode */
-+		jnode_free(result, JNODE_UNFORMATTED_BLOCK);
-+		return ERR_PTR(preload);
-+	}
-+
-+	write_lock_tree(tree);
-+	shadow = jfind_nolock(mapping, index);
-+	if (likely(shadow == NULL)) {
-+		/* add new jnode to hash table and inode's radix tree of jnodes */
-+		jref(result);
-+		hash_unformatted_jnode(result, mapping, index);
-+	} else {
-+		/* jnode is found in inode's radix tree of jnodes */
-+		jref(shadow);
-+		jnode_free(result, JNODE_UNFORMATTED_BLOCK);
-+		assert("vs-1498", shadow->key.j.mapping == mapping);
-+		result = shadow;
-+	}
-+	write_unlock_tree(tree);
-+
-+	assert("nikita-2955",
-+	       ergo(result != NULL, jnode_invariant(result, 0, 0)));
-+	radix_tree_preload_end();
-+	return result;
-+}
-+
-+/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
-+   creates) jnode corresponding to page @pg. jnode is attached to page and
-+   inserted into jnode hash-table. */
-+static jnode *do_jget(reiser4_tree * tree, struct page *pg)
-+{
-+	/*
-+	 * There are two ways to create jnode: starting with pre-existing page
-+	 * and without page.
-+	 *
-+	 * When page already exists, jnode is created
-+	 * (jnode_of_page()->do_jget()) under page lock. This is done in
-+	 * ->writepage(), or when capturing anonymous page dirtied through
-+	 * mmap.
-+	 *
-+	 * Jnode without page is created by index_extent_jnode().
-+ * -+ */ -+ -+ jnode *result; -+ oid_t oid = get_inode_oid(pg->mapping->host); -+ -+ assert("umka-176", pg != NULL); -+ assert("nikita-2394", PageLocked(pg)); -+ -+ result = jprivate(pg); -+ if (likely(result != NULL)) -+ return jref(result); -+ -+ tree = reiser4_tree_by_page(pg); -+ -+ /* check hash-table first */ -+ result = jfind(pg->mapping, pg->index); -+ if (unlikely(result != NULL)) { -+ spin_lock_jnode(result); -+ jnode_attach_page(result, pg); -+ spin_unlock_jnode(result); -+ result->key.j.mapping = pg->mapping; -+ return result; -+ } -+ -+ /* since page is locked, jnode should be allocated with GFP_NOFS flag */ -+ reiser4_ctx_gfp_mask_force(GFP_NOFS); -+ result = find_get_jnode(tree, pg->mapping, oid, pg->index); -+ if (unlikely(IS_ERR(result))) -+ return result; -+ /* attach jnode to page */ -+ spin_lock_jnode(result); -+ jnode_attach_page(result, pg); -+ spin_unlock_jnode(result); -+ return result; -+} -+ -+/* -+ * return jnode for @pg, creating it if necessary. -+ */ -+jnode *jnode_of_page(struct page * pg) -+{ -+ jnode *result; -+ -+ assert("umka-176", pg != NULL); -+ assert("nikita-2394", PageLocked(pg)); -+ -+ result = do_jget(reiser4_tree_by_page(pg), pg); -+ -+ if (REISER4_DEBUG && !IS_ERR(result)) { -+ assert("nikita-3210", result == jprivate(pg)); -+ assert("nikita-2046", jnode_page(jprivate(pg)) == pg); -+ if (jnode_is_unformatted(jprivate(pg))) { -+ assert("nikita-2364", -+ jprivate(pg)->key.j.index == pg->index); -+ assert("nikita-2367", -+ jprivate(pg)->key.j.mapping == pg->mapping); -+ assert("nikita-2365", -+ jprivate(pg)->key.j.objectid == -+ get_inode_oid(pg->mapping->host)); -+ assert("vs-1200", -+ jprivate(pg)->key.j.objectid == -+ pg->mapping->host->i_ino); -+ assert("nikita-2356", -+ jnode_is_unformatted(jnode_by_page(pg))); -+ } -+ assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0)); -+ } -+ return result; -+} -+ -+/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the -+ * page.*/ -+void 
jnode_attach_page(jnode * node, struct page *pg) -+{ -+ assert("nikita-2060", node != NULL); -+ assert("nikita-2061", pg != NULL); -+ -+ assert("nikita-2050", jprivate(pg) == 0ul); -+ assert("nikita-2393", !PagePrivate(pg)); -+ assert("vs-1741", node->pg == NULL); -+ -+ assert("nikita-2396", PageLocked(pg)); -+ assert_spin_locked(&(node->guard)); -+ -+ page_cache_get(pg); -+ set_page_private(pg, (unsigned long)node); -+ node->pg = pg; -+ SetPagePrivate(pg); -+} -+ -+/* Dual to jnode_attach_page: break a binding between page and jnode */ -+void page_clear_jnode(struct page *page, jnode * node) -+{ -+ assert("nikita-2424", page != NULL); -+ assert("nikita-2425", PageLocked(page)); -+ assert("nikita-2426", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ assert("nikita-2428", PagePrivate(page)); -+ -+ assert("nikita-3551", !PageWriteback(page)); -+ -+ JF_CLR(node, JNODE_PARSED); -+ set_page_private(page, 0ul); -+ ClearPagePrivate(page); -+ node->pg = NULL; -+ page_cache_release(page); -+} -+ -+#if 0 -+/* it is only used in one place to handle error */ -+void -+page_detach_jnode(struct page *page, struct address_space *mapping, -+ unsigned long index) -+{ -+ assert("nikita-2395", page != NULL); -+ -+ lock_page(page); -+ if ((page->mapping == mapping) && (page->index == index) -+ && PagePrivate(page)) { -+ jnode *node; -+ -+ node = jprivate(page); -+ spin_lock_jnode(node); -+ page_clear_jnode(page, node); -+ spin_unlock_jnode(node); -+ } -+ unlock_page(page); -+} -+#endif /* 0 */ -+ -+/* return @node page locked. -+ -+ Locking ordering requires that one first takes page lock and afterwards -+ spin lock on node attached to this page. Sometimes it is necessary to go in -+ the opposite direction. This is done through standard trylock-and-release -+ loop. 
-+*/ -+static struct page *jnode_lock_page(jnode * node) -+{ -+ struct page *page; -+ -+ assert("nikita-2052", node != NULL); -+ assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode)); -+ -+ while (1) { -+ -+ spin_lock_jnode(node); -+ page = jnode_page(node); -+ if (page == NULL) { -+ break; -+ } -+ -+ /* no need to page_cache_get( page ) here, because page cannot -+ be evicted from memory without detaching it from jnode and -+ this requires spin lock on jnode that we already hold. -+ */ -+ if (!TestSetPageLocked(page)) { -+ /* We won a lock on jnode page, proceed. */ -+ break; -+ } -+ -+ /* Page is locked by someone else. */ -+ page_cache_get(page); -+ spin_unlock_jnode(node); -+ wait_on_page_locked(page); -+ /* it is possible that page was detached from jnode and -+ returned to the free pool, or re-assigned while we were -+ waiting on locked bit. This will be rechecked on the next -+ loop iteration. -+ */ -+ page_cache_release(page); -+ -+ /* try again */ -+ } -+ return page; -+} -+ -+/* -+ * is JNODE_PARSED bit is not set, call ->parse() method of jnode, to verify -+ * validness of jnode content. -+ */ -+static inline int jparse(jnode * node) -+{ -+ int result; -+ -+ assert("nikita-2466", node != NULL); -+ -+ spin_lock_jnode(node); -+ if (likely(!jnode_is_parsed(node))) { -+ result = jnode_ops(node)->parse(node); -+ if (likely(result == 0)) -+ JF_SET(node, JNODE_PARSED); -+ } else -+ result = 0; -+ spin_unlock_jnode(node); -+ return result; -+} -+ -+/* Lock a page attached to jnode, create and attach page to jnode if it had no -+ * one. 
*/ -+static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags) -+{ -+ struct page *page; -+ -+ spin_lock_jnode(node); -+ page = jnode_page(node); -+ -+ if (page == NULL) { -+ spin_unlock_jnode(node); -+ page = find_or_create_page(jnode_get_mapping(node), -+ jnode_get_index(node), gfp_flags); -+ if (page == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ } else { -+ if (!TestSetPageLocked(page)) { -+ spin_unlock_jnode(node); -+ return page; -+ } -+ page_cache_get(page); -+ spin_unlock_jnode(node); -+ lock_page(page); -+ assert("nikita-3134", page->mapping == jnode_get_mapping(node)); -+ } -+ -+ spin_lock_jnode(node); -+ if (!jnode_page(node)) -+ jnode_attach_page(node, page); -+ spin_unlock_jnode(node); -+ -+ page_cache_release(page); -+ assert("zam-894", jnode_page(node) == page); -+ return page; -+} -+ -+/* Start read operation for jnode's page if page is not up-to-date. */ -+static int jnode_start_read(jnode * node, struct page *page) -+{ -+ assert("zam-893", PageLocked(page)); -+ -+ if (PageUptodate(page)) { -+ unlock_page(page); -+ return 0; -+ } -+ return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get()); -+} -+ -+#if REISER4_DEBUG -+static void check_jload(jnode * node, struct page *page) -+{ -+ if (jnode_is_znode(node)) { -+ node40_header *nh; -+ znode *z; -+ -+ z = JZNODE(node); -+ if (znode_is_any_locked(z)) { -+ nh = (node40_header *) kmap(page); -+ /* this only works for node40-only file systems. For -+ * debugging. */ -+ assert("nikita-3253", -+ z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items))); -+ kunmap(page); -+ } -+ assert("nikita-3565", znode_invariant(z)); -+ } -+} -+#else -+#define check_jload(node, page) noop -+#endif -+ -+/* prefetch jnode to speed up next call to jload. Call this when you are going -+ * to call jload() shortly. This will bring appropriate portion of jnode into -+ * CPU cache. 
*/ -+void jload_prefetch(jnode * node) -+{ -+ prefetchw(&node->x_count); -+} -+ -+/* load jnode's data into memory */ -+int jload_gfp(jnode * node /* node to load */ , -+ gfp_t gfp_flags /* allocation flags */ , -+ int do_kmap /* true if page should be kmapped */ ) -+{ -+ struct page *page; -+ int result = 0; -+ int parsed; -+ -+ assert("nikita-3010", reiser4_schedulable()); -+ -+ prefetchw(&node->pg); -+ -+ /* taking d-reference implies taking x-reference. */ -+ jref(node); -+ -+ /* -+ * acquiring d-reference to @jnode and check for JNODE_PARSED bit -+ * should be atomic, otherwise there is a race against -+ * reiser4_releasepage(). -+ */ -+ spin_lock(&(node->load)); -+ add_d_ref(node); -+ parsed = jnode_is_parsed(node); -+ spin_unlock(&(node->load)); -+ -+ if (unlikely(!parsed)) { -+ page = jnode_get_page_locked(node, gfp_flags); -+ if (unlikely(IS_ERR(page))) { -+ result = PTR_ERR(page); -+ goto failed; -+ } -+ -+ result = jnode_start_read(node, page); -+ if (unlikely(result != 0)) -+ goto failed; -+ -+ wait_on_page_locked(page); -+ if (unlikely(!PageUptodate(page))) { -+ result = RETERR(-EIO); -+ goto failed; -+ } -+ -+ if (do_kmap) -+ node->data = kmap(page); -+ -+ result = jparse(node); -+ if (unlikely(result != 0)) { -+ if (do_kmap) -+ kunmap(page); -+ goto failed; -+ } -+ check_jload(node, page); -+ } else { -+ page = jnode_page(node); -+ check_jload(node, page); -+ if (do_kmap) -+ node->data = kmap(page); -+ } -+ -+ if (!is_writeout_mode()) -+ /* We do not mark pages active if jload is called as a part of -+ * jnode_flush() or reiser4_write_logs(). Both jnode_flush() -+ * and write_logs() add no value to cached data, there is no -+ * sense to mark pages as active when they go to disk, it just -+ * confuses vm scanning routines because clean page could be -+ * moved out from inactive list as a result of this -+ * mark_page_accessed() call. 
*/
-+		mark_page_accessed(page);
-+
-+	return 0;
-+
-+      failed:
-+	jrelse_tail(node);
-+	return result;
-+
-+}
-+
-+/* start asynchronous reading for given jnode's page. Returns the error from
-+ * jnode_get_page_locked() if the page cannot be obtained, otherwise the
-+ * result of jnode_start_read(). */
-+int jstartio(jnode * node)
-+{
-+	struct page *page;
-+
-+	page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
-+	if (IS_ERR(page))
-+		return PTR_ERR(page);
-+
-+	return jnode_start_read(node, page);
-+}
-+
-+/* Initialize a node by calling appropriate plugin instead of reading
-+ * node from disk as in jload().
-+ *
-+ * Takes a reference (jref()) and a d-reference (add_d_ref()) on @node. On
-+ * success (0 is returned) both are kept and the page stays kmap()ed with
-+ * node->data pointing at it. On failure both references are dropped, the
-+ * page is not left mapped, and the error from jnode_get_page_locked() or
-+ * from the plugin ->init() method is returned. */
-+int jinit_new(jnode * node, gfp_t gfp_flags)
-+{
-+	struct page *page;
-+	int result;
-+
-+	jref(node);
-+	add_d_ref(node);
-+
-+	page = jnode_get_page_locked(node, gfp_flags);
-+	if (IS_ERR(page)) {
-+		result = PTR_ERR(page);
-+		goto failed;
-+	}
-+
-+	SetPageUptodate(page);
-+	unlock_page(page);
-+
-+	node->data = kmap(page);
-+
-+	if (!jnode_is_parsed(node)) {
-+		jnode_plugin *jplug = jnode_ops(node);
-+		spin_lock_jnode(node);
-+		result = jplug->init(node);
-+		spin_unlock_jnode(node);
-+		if (result) {
-+			/* undo the kmap() above; nothing else unmaps the page
-+			 * on this path */
-+			kunmap(page);
-+			goto failed;
-+		}
-+		JF_SET(node, JNODE_PARSED);
-+	}
-+
-+	return 0;
-+
-+      failed:
-+	/*
-+	 * Only references are dropped here. jrelse() must not be used: it
-+	 * kunmap()s the attached page itself, which would unmap the page a
-+	 * second time after ->init() failure and would unmap a page this
-+	 * function never mapped when no page could be obtained.
-+	 */
-+	jrelse_tail(node);
-+	return result;
-+}
-+
-+/* release a reference to jnode acquired by jload(), decrement ->d_count */
-+void jrelse_tail(jnode * node /* jnode to release references to */ )
-+{
-+	assert("nikita-489", atomic_read(&node->d_count) > 0);
-+	atomic_dec(&node->d_count);
-+	/* release reference acquired in jload_gfp() or jinit_new() */
-+	jput(node);
-+	/* NOTE(review): @node is still inspected after jput(); this presumably
-+	 * relies on the caller holding a reference of its own - confirm */
-+	if (jnode_is_unformatted(node) || jnode_is_znode(node))
-+		LOCK_CNT_DEC(d_refs);
-+}
-+
-+/* drop reference to node data. When last reference is dropped, data are
-+   unloaded.
*/ -+void jrelse(jnode * node /* jnode to release references to */ ) -+{ -+ struct page *page; -+ -+ assert("nikita-487", node != NULL); -+ assert_spin_not_locked(&(node->guard)); -+ -+ page = jnode_page(node); -+ if (likely(page != NULL)) { -+ /* -+ * it is safe not to lock jnode here, because at this point -+ * @node->d_count is greater than zero (if jrelse() is used -+ * correctly, that is). JNODE_PARSED may be not set yet, if, -+ * for example, we got here as a result of error handling path -+ * in jload(). Anyway, page cannot be detached by -+ * reiser4_releasepage(). truncate will invalidate page -+ * regardless, but this should not be a problem. -+ */ -+ kunmap(page); -+ } -+ jrelse_tail(node); -+} -+ -+/* called from jput() to wait for io completion */ -+static void jnode_finish_io(jnode * node) -+{ -+ struct page *page; -+ -+ assert("nikita-2922", node != NULL); -+ -+ spin_lock_jnode(node); -+ page = jnode_page(node); -+ if (page != NULL) { -+ page_cache_get(page); -+ spin_unlock_jnode(node); -+ wait_on_page_writeback(page); -+ page_cache_release(page); -+ } else -+ spin_unlock_jnode(node); -+} -+ -+/* -+ * This is called by jput() when last reference to jnode is released. This is -+ * separate function, because we want fast path of jput() to be inline and, -+ * therefore, small. -+ */ -+void jput_final(jnode * node) -+{ -+ int r_i_p; -+ -+ /* A fast check for keeping node in cache. We always keep node in cache -+ * if its page is present and node was not marked for deletion */ -+ if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) { -+ rcu_read_unlock(); -+ return; -+ } -+ r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP); -+ /* -+ * if r_i_p is true, we were first to set JNODE_RIP on this node. In -+ * this case it is safe to access node after unlock. -+ */ -+ rcu_read_unlock(); -+ if (r_i_p) { -+ jnode_finish_io(node); -+ if (JF_ISSET(node, JNODE_HEARD_BANSHEE)) -+ /* node is removed from the tree. 
*/ -+ jdelete(node); -+ else -+ jnode_try_drop(node); -+ } -+ /* if !r_i_p some other thread is already killing it */ -+} -+ -+int jwait_io(jnode * node, int rw) -+{ -+ struct page *page; -+ int result; -+ -+ assert("zam-447", node != NULL); -+ assert("zam-448", jnode_page(node) != NULL); -+ -+ page = jnode_page(node); -+ -+ result = 0; -+ if (rw == READ) { -+ wait_on_page_locked(page); -+ } else { -+ assert("nikita-2227", rw == WRITE); -+ wait_on_page_writeback(page); -+ } -+ if (PageError(page)) -+ result = RETERR(-EIO); -+ -+ return result; -+} -+ -+/* -+ * jnode types and plugins. -+ * -+ * jnode by itself is a "base type". There are several different jnode -+ * flavors, called "jnode types" (see jnode_type for a list). Sometimes code -+ * has to do different things based on jnode type. In the standard reiser4 way -+ * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin). -+ * -+ * Functions below deal with jnode types and define methods of jnode plugin. -+ * -+ */ -+ -+/* set jnode type. This is done during jnode initialization. */ -+static void jnode_set_type(jnode * node, jnode_type type) -+{ -+ static unsigned long type_to_mask[] = { -+ [JNODE_UNFORMATTED_BLOCK] = 1, -+ [JNODE_FORMATTED_BLOCK] = 0, -+ [JNODE_BITMAP] = 2, -+ [JNODE_IO_HEAD] = 6, -+ [JNODE_INODE] = 4 -+ }; -+ -+ assert("zam-647", type < LAST_JNODE_TYPE); -+ assert("nikita-2815", !jnode_is_loaded(node)); -+ assert("nikita-3386", node->state == 0); -+ -+ node->state |= (type_to_mask[type] << JNODE_TYPE_1); -+} -+ -+/* ->init() method of jnode plugin for jnodes that don't require plugin -+ * specific initialization. */ -+static int init_noinit(jnode * node UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* ->parse() method of jnode plugin for jnodes that don't require plugin -+ * specific pasring. 
*/ -+static int parse_noparse(jnode * node UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* ->mapping() method for unformatted jnode */ -+struct address_space *mapping_jnode(const jnode * node) -+{ -+ struct address_space *map; -+ -+ assert("nikita-2713", node != NULL); -+ -+ /* mapping is stored in jnode */ -+ -+ map = node->key.j.mapping; -+ assert("nikita-2714", map != NULL); -+ assert("nikita-2897", is_reiser4_inode(map->host)); -+ assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid); -+ return map; -+} -+ -+/* ->index() method for unformatted jnodes */ -+unsigned long index_jnode(const jnode * node) -+{ -+ /* index is stored in jnode */ -+ return node->key.j.index; -+} -+ -+/* ->remove() method for unformatted jnodes */ -+static inline void remove_jnode(jnode * node, reiser4_tree * tree) -+{ -+ /* remove jnode from hash table and radix tree */ -+ if (node->key.j.mapping) -+ unhash_unformatted_node_nolock(node); -+} -+ -+/* ->mapping() method for znodes */ -+static struct address_space *mapping_znode(const jnode * node) -+{ -+ /* all znodes belong to fake inode */ -+ return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping; -+} -+ -+/* ->index() method for znodes */ -+static unsigned long index_znode(const jnode * node) -+{ -+ unsigned long addr; -+ assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode)); -+ -+ /* index of znode is just its address (shifted) */ -+ addr = (unsigned long)node; -+ return (addr - PAGE_OFFSET) >> znode_shift_order; -+} -+ -+/* ->mapping() method for bitmap jnode */ -+static struct address_space *mapping_bitmap(const jnode * node) -+{ -+ /* all bitmap blocks belong to special bitmap inode */ -+ return get_super_private(jnode_get_tree(node)->super)->bitmap-> -+ i_mapping; -+} -+ -+/* ->index() method for jnodes that are indexed by address */ -+static unsigned long index_is_address(const jnode * node) -+{ -+ unsigned long ind; -+ -+ ind = (unsigned long)node; -+ return ind - PAGE_OFFSET; -+} -+ -+/* 
resolve race with jput */ -+jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node) -+{ -+ /* -+ * This is used as part of RCU-based jnode handling. -+ * -+ * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work -+ * with unreferenced jnodes (ones with ->x_count == 0). Hash table is -+ * not protected during this, so concurrent thread may execute -+ * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be -+ * freed in jput_final(). To avoid such races, jput_final() sets -+ * JNODE_RIP on jnode (under tree lock). All places that work with -+ * unreferenced jnodes call this function. It checks for JNODE_RIP bit -+ * (first without taking tree lock), and if this bit is set, released -+ * reference acquired by the current thread and returns NULL. -+ * -+ * As a result, if jnode is being concurrently freed, NULL is returned -+ * and caller should pretend that jnode wasn't found in the first -+ * place. -+ * -+ * Otherwise it's safe to release "rcu-read-lock" and continue with -+ * jnode. 
-+ */ -+ if (unlikely(JF_ISSET(node, JNODE_RIP))) { -+ read_lock_tree(tree); -+ if (JF_ISSET(node, JNODE_RIP)) { -+ dec_x_ref(node); -+ node = NULL; -+ } -+ read_unlock_tree(tree); -+ } -+ return node; -+} -+ -+reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key) -+{ -+ struct inode *inode; -+ item_plugin *iplug; -+ loff_t off; -+ -+ assert("nikita-3092", node != NULL); -+ assert("nikita-3093", key != NULL); -+ assert("nikita-3094", jnode_is_unformatted(node)); -+ -+ off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT; -+ inode = mapping_jnode(node)->host; -+ -+ if (node->parent_item_id != 0) -+ iplug = item_plugin_by_id(node->parent_item_id); -+ else -+ iplug = NULL; -+ -+ if (iplug != NULL && iplug->f.key_by_offset) -+ iplug->f.key_by_offset(inode, off, key); -+ else { -+ file_plugin *fplug; -+ -+ fplug = inode_file_plugin(inode); -+ assert("zam-1007", fplug != NULL); -+ assert("zam-1008", fplug->key_by_inode != NULL); -+ -+ fplug->key_by_inode(inode, off, key); -+ } -+ -+ return key; -+} -+ -+/* ->parse() method for formatted nodes */ -+static int parse_znode(jnode * node) -+{ -+ return zparse(JZNODE(node)); -+} -+ -+/* ->delete() method for formatted nodes */ -+static void delete_znode(jnode * node, reiser4_tree * tree) -+{ -+ znode *z; -+ -+ assert_rw_write_locked(&(tree->tree_lock)); -+ assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ -+ z = JZNODE(node); -+ assert("vs-899", z->c_count == 0); -+ -+ /* delete znode from sibling list. */ -+ sibling_list_remove(z); -+ -+ znode_remove(z, tree); -+} -+ -+/* ->remove() method for formatted nodes */ -+static int remove_znode(jnode * node, reiser4_tree * tree) -+{ -+ znode *z; -+ -+ assert_rw_write_locked(&(tree->tree_lock)); -+ z = JZNODE(node); -+ -+ if (z->c_count == 0) { -+ /* detach znode from sibling list. */ -+ sibling_list_drop(z); -+ /* this is called with tree spin-lock held, so call -+ znode_remove() directly (rather than znode_lock_remove()). 
*/ -+ znode_remove(z, tree); -+ return 0; -+ } -+ return RETERR(-EBUSY); -+} -+ -+/* ->init() method for formatted nodes */ -+static int init_znode(jnode * node) -+{ -+ znode *z; -+ -+ z = JZNODE(node); -+ /* call node plugin to do actual initialization */ -+ return z->nplug->init(z); -+} -+ -+/* ->clone() method for formatted nodes */ -+static jnode *clone_formatted(jnode * node) -+{ -+ znode *clone; -+ -+ assert("vs-1430", jnode_is_znode(node)); -+ clone = zalloc(reiser4_ctx_gfp_mask_get()); -+ if (clone == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ zinit(clone, NULL, current_tree); -+ jnode_set_block(ZJNODE(clone), jnode_get_block(node)); -+ /* ZJNODE(clone)->key.z is not initialized */ -+ clone->level = JZNODE(node)->level; -+ -+ return ZJNODE(clone); -+} -+ -+/* jplug->clone for unformatted nodes */ -+static jnode *clone_unformatted(jnode * node) -+{ -+ jnode *clone; -+ -+ assert("vs-1431", jnode_is_unformatted(node)); -+ clone = jalloc(); -+ if (clone == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ -+ jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK); -+ jnode_set_block(clone, jnode_get_block(node)); -+ -+ return clone; -+ -+} -+ -+/* -+ * Setup jnode plugin methods for various jnode types. 
-+ */ -+jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = { -+ [JNODE_UNFORMATTED_BLOCK] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_UNFORMATTED_BLOCK, -+ .pops = NULL, -+ .label = "unformatted", -+ .desc = "unformatted node", -+ .linkage = {NULL, NULL} -+ }, -+ .init = init_noinit, -+ .parse = parse_noparse, -+ .mapping = mapping_jnode, -+ .index = index_jnode, -+ .clone = clone_unformatted -+ }, -+ [JNODE_FORMATTED_BLOCK] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_FORMATTED_BLOCK, -+ .pops = NULL, -+ .label = "formatted", -+ .desc = "formatted tree node", -+ .linkage = {NULL, NULL} -+ }, -+ .init = init_znode, -+ .parse = parse_znode, -+ .mapping = mapping_znode, -+ .index = index_znode, -+ .clone = clone_formatted -+ }, -+ [JNODE_BITMAP] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_BITMAP, -+ .pops = NULL, -+ .label = "bitmap", -+ .desc = "bitmap node", -+ .linkage = {NULL, NULL} -+ }, -+ .init = init_noinit, -+ .parse = parse_noparse, -+ .mapping = mapping_bitmap, -+ .index = index_is_address, -+ .clone = NULL -+ }, -+ [JNODE_IO_HEAD] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_IO_HEAD, -+ .pops = NULL, -+ .label = "io head", -+ .desc = "io head", -+ .linkage = {NULL, NULL} -+ }, -+ .init = init_noinit, -+ .parse = parse_noparse, -+ .mapping = mapping_bitmap, -+ .index = index_is_address, -+ .clone = NULL -+ }, -+ [JNODE_INODE] = { -+ .h = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .id = JNODE_INODE, -+ .pops = NULL, -+ .label = "inode", -+ .desc = "inode's builtin jnode", -+ .linkage = {NULL, NULL} -+ }, -+ .init = NULL, -+ .parse = NULL, -+ .mapping = NULL, -+ .index = NULL, -+ .clone = NULL -+ } -+}; -+ -+/* -+ * jnode destruction. -+ * -+ * Thread may use a jnode after it acquired a reference to it. References are -+ * counted in ->x_count field. Reference protects jnode from being -+ * recycled. 
This is different from protecting jnode data (that are stored in -+ * jnode page) from being evicted from memory. Data are protected by jload() -+ * and released by jrelse(). -+ * -+ * If thread already possesses a reference to the jnode it can acquire another -+ * one through jref(). Initial reference is obtained (usually) by locating -+ * jnode in some indexing structure that depends on jnode type: formatted -+ * nodes are kept in global hash table, where they are indexed by block -+ * number, and also in the cbk cache. Unformatted jnodes are also kept in hash -+ * table, which is indexed by oid and offset within file, and in per-inode -+ * radix tree. -+ * -+ * Reference to jnode is released by jput(). If last reference is released, -+ * jput_final() is called. This function determines whether jnode has to be -+ * deleted (this happens when corresponding node is removed from the file -+ * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it -+ * should be just "removed" (deleted from memory). -+ * -+ * Jnode destruction is signally delicate dance because of locking and RCU. -+ */ -+ -+/* -+ * Returns true if jnode cannot be removed right now, that is, if it is still -+ * referenced (->x_count > 0) or is a formatted node with children in memory -+ * (->c_count > 0). This check is called under tree lock. If it returns false, -+ * jnode is irrevocably committed to be deleted/removed. -+ */ -+static inline int jnode_is_busy(const jnode * node, jnode_type jtype) -+{ -+ /* if other thread managed to acquire a reference to this jnode, don't -+ * free it. */ -+ if (atomic_read(&node->x_count) > 0) -+ return 1; -+ /* also, don't free znode that has children in memory */ -+ if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0) -+ return 1; -+ return 0; -+} -+ -+/* -+ * this is called as part of removing jnode. Based on jnode type, call -+ * corresponding function that removes jnode from indices and returns it back -+ * to the appropriate slab (through RCU). 
-+ */ -+static inline void -+jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree) -+{ -+ switch (jtype) { -+ case JNODE_UNFORMATTED_BLOCK: -+ remove_jnode(node, tree); -+ break; -+ case JNODE_IO_HEAD: -+ case JNODE_BITMAP: -+ break; -+ case JNODE_INODE: -+ break; -+ case JNODE_FORMATTED_BLOCK: -+ remove_znode(node, tree); -+ break; -+ default: -+ wrong_return_value("nikita-3196", "Wrong jnode type"); -+ } -+} -+ -+/* -+ * this is called as part of deleting jnode. Based on jnode type, call -+ * corresponding function that removes jnode from indices and returns it back -+ * to the appropriate slab (through RCU). -+ * -+ * This differs from jnode_remove() only for formatted nodes---for them -+ * sibling list handling is different for removal and deletion. -+ */ -+static inline void -+jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG) -+{ -+ switch (jtype) { -+ case JNODE_UNFORMATTED_BLOCK: -+ remove_jnode(node, tree); -+ break; -+ case JNODE_IO_HEAD: -+ case JNODE_BITMAP: -+ break; -+ case JNODE_FORMATTED_BLOCK: -+ delete_znode(node, tree); -+ break; -+ case JNODE_INODE: -+ default: -+ wrong_return_value("nikita-3195", "Wrong jnode type"); -+ } -+} -+ -+#if REISER4_DEBUG -+/* -+ * remove jnode from the debugging list of all jnodes hanging off super-block. -+ */ -+void jnode_list_remove(jnode * node) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(jnode_get_tree(node)->super); -+ -+ spin_lock_irq(&sbinfo->all_guard); -+ assert("nikita-2422", !list_empty(&node->jnodes)); -+ list_del_init(&node->jnodes); -+ spin_unlock_irq(&sbinfo->all_guard); -+} -+#endif -+ -+/* -+ * this is called by jput_final() to remove jnode when last reference to it is -+ * released. 
-+ */ -+static int jnode_try_drop(jnode * node) -+{ -+ int result; -+ reiser4_tree *tree; -+ jnode_type jtype; -+ -+ assert("nikita-2491", node != NULL); -+ assert("nikita-2583", JF_ISSET(node, JNODE_RIP)); -+ -+ tree = jnode_get_tree(node); -+ jtype = jnode_get_type(node); -+ -+ spin_lock_jnode(node); -+ write_lock_tree(tree); -+ /* -+ * if jnode has a page---leave it alone. Memory pressure will -+ * eventually kill page and jnode. -+ */ -+ if (jnode_page(node) != NULL) { -+ write_unlock_tree(tree); -+ spin_unlock_jnode(node); -+ JF_CLR(node, JNODE_RIP); -+ return RETERR(-EBUSY); -+ } -+ -+ /* re-check ->x_count under tree lock. */ -+ result = jnode_is_busy(node, jtype); -+ if (result == 0) { -+ assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ assert("jmacd-511/b", atomic_read(&node->d_count) == 0); -+ -+ spin_unlock_jnode(node); -+ /* no page and no references---despatch him. */ -+ jnode_remove(node, jtype, tree); -+ write_unlock_tree(tree); -+ jnode_free(node, jtype); -+ } else { -+ /* busy check failed: reference was acquired by concurrent -+ * thread. */ -+ write_unlock_tree(tree); -+ spin_unlock_jnode(node); -+ JF_CLR(node, JNODE_RIP); -+ } -+ return result; -+} -+ -+/* jdelete() -- Delete jnode from the tree and file system */ -+static int jdelete(jnode * node /* jnode to finish with */ ) -+{ -+ struct page *page; -+ int result; -+ reiser4_tree *tree; -+ jnode_type jtype; -+ -+ assert("nikita-467", node != NULL); -+ assert("nikita-2531", JF_ISSET(node, JNODE_RIP)); -+ -+ jtype = jnode_get_type(node); -+ -+ page = jnode_lock_page(node); -+ assert_spin_locked(&(node->guard)); -+ -+ tree = jnode_get_tree(node); -+ -+ write_lock_tree(tree); -+ /* re-check ->x_count under tree lock. 
*/ -+ result = jnode_is_busy(node, jtype); -+ if (likely(!result)) { -+ assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ assert("jmacd-511", atomic_read(&node->d_count) == 0); -+ -+ /* detach page */ -+ if (page != NULL) { -+ /* -+ * FIXME this is racy against jnode_extent_write(). -+ */ -+ page_clear_jnode(page, node); -+ } -+ spin_unlock_jnode(node); -+ /* goodbye */ -+ jnode_delete(node, jtype, tree); -+ write_unlock_tree(tree); -+ jnode_free(node, jtype); -+ /* @node is no longer valid pointer */ -+ if (page != NULL) -+ reiser4_drop_page(page); -+ } else { -+ /* busy check failed: reference was acquired by concurrent -+ * thread. */ -+ JF_CLR(node, JNODE_RIP); -+ write_unlock_tree(tree); -+ spin_unlock_jnode(node); -+ if (page != NULL) -+ unlock_page(page); -+ } -+ return result; -+} -+ -+/* drop jnode on the floor. -+ -+ Return value: -+ -+ -EBUSY: failed to drop jnode, because there are still references to it -+ -+ 0: successfully dropped jnode -+ -+*/ -+static int jdrop_in_tree(jnode * node, reiser4_tree * tree) -+{ -+ struct page *page; -+ jnode_type jtype; -+ int result; -+ -+ assert("zam-602", node != NULL); -+ assert_rw_not_read_locked(&(tree->tree_lock)); -+ assert_rw_not_write_locked(&(tree->tree_lock)); -+ assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ -+ jtype = jnode_get_type(node); -+ -+ page = jnode_lock_page(node); -+ assert_spin_locked(&(node->guard)); -+ -+ write_lock_tree(tree); -+ -+ /* re-check ->x_count under tree lock. 
*/ -+ result = jnode_is_busy(node, jtype); -+ if (!result) { -+ assert("nikita-2488", page == jnode_page(node)); -+ assert("nikita-2533", atomic_read(&node->d_count) == 0); -+ if (page != NULL) { -+ assert("nikita-2126", !PageDirty(page)); -+ assert("nikita-2127", PageUptodate(page)); -+ assert("nikita-2181", PageLocked(page)); -+ page_clear_jnode(page, node); -+ } -+ spin_unlock_jnode(node); -+ jnode_remove(node, jtype, tree); -+ write_unlock_tree(tree); -+ jnode_free(node, jtype); -+ if (page != NULL) { -+ reiser4_drop_page(page); -+ } -+ } else { -+ /* busy check failed: reference was acquired by concurrent -+ * thread. */ -+ JF_CLR(node, JNODE_RIP); -+ write_unlock_tree(tree); -+ spin_unlock_jnode(node); -+ if (page != NULL) -+ unlock_page(page); -+ } -+ return result; -+} -+ -+/* This function frees jnode "if possible". In particular, [dcx]_count has to -+ be 0 (where applicable). */ -+void jdrop(jnode * node) -+{ -+ jdrop_in_tree(node, jnode_get_tree(node)); -+} -+ -+/* IO head jnode implementation; The io heads are simple j-nodes with limited -+ functionality (these j-nodes are not in any hash table) just for reading -+ from and writing to disk. 
*/ -+ -+/* -+ * Allocate an io head jnode for @block and take a reference on it. Returns -+ * NULL when jalloc() fails; in that case no reference is taken. -+ */ -+jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) -+{ -+ jnode *jal = jalloc(); -+ -+ if (jal != NULL) { -+ jnode_init(jal, current_tree, JNODE_IO_HEAD); -+ jnode_set_block(jal, block); -+ /* jref() dereferences its argument, so only call it on success */ -+ jref(jal); -+ } -+ -+ return jal; -+} -+ -+/* release an io head: put the reference, then jdrop() the jnode */ -+void reiser4_drop_io_head(jnode * node) -+{ -+ assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD); -+ -+ jput(node); -+ jdrop(node); -+} -+ -+/* protect jnode data from reiser4_releasepage() */ -+void pin_jnode_data(jnode * node) -+{ -+ assert("zam-671", jnode_page(node) != NULL); -+ page_cache_get(jnode_page(node)); -+} -+ -+/* make jnode data free-able again */ -+void unpin_jnode_data(jnode * node) -+{ -+ assert("zam-672", jnode_page(node) != NULL); -+ page_cache_release(jnode_page(node)); -+} -+ -+struct address_space *jnode_get_mapping(const jnode * node) -+{ -+ assert("nikita-3162", node != NULL); -+ return jnode_ops(node)->mapping(node); -+} -+ -+#if REISER4_DEBUG -+/* debugging aid: jnode invariant */ -+int jnode_invariant_f(const jnode * node, char const **msg) -+{ -+#define _ergo(ant, con) \ -+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con))) -+#define _check(exp) ((*msg) = #exp, (exp)) -+ -+ return _check(node != NULL) && -+ /* [jnode-queued] */ -+ /* only relocated node can be queued, except that when znode -+ * is being deleted, its JNODE_RELOC bit is cleared */ -+ _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED), -+ JF_ISSET(node, JNODE_RELOC) || -+ JF_ISSET(node, JNODE_HEARD_BANSHEE)) && -+ _check(node->jnodes.prev != NULL) && -+ _check(node->jnodes.next != NULL) && -+ /* [jnode-dirty] invariant */ -+ /* dirty inode is part of atom */ -+ _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) && -+ /* [jnode-oid] invariant */ -+ /* for unformatted node ->objectid and ->mapping fields are -+ * consistent */ -+ _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL, -+ node->key.j.objectid == -+ get_inode_oid(node->key.j.mapping->host)) && -+ /* [jnode-atom-valid] invariant */ -+ /* 
node atom has valid state */ -+ _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) && -+ /* [jnode-page-binding] invariant */ -+ /* if node points to page, it points back to node */ -+ _ergo(node->pg != NULL, jprivate(node->pg) == node) && -+ /* [jnode-refs] invariant */ -+ /* only referenced jnode can be loaded */ -+ _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count)); -+ -+} -+ -+static const char *jnode_type_name(jnode_type type) -+{ -+ switch (type) { -+ case JNODE_UNFORMATTED_BLOCK: -+ return "unformatted"; -+ case JNODE_FORMATTED_BLOCK: -+ return "formatted"; -+ case JNODE_BITMAP: -+ return "bitmap"; -+ case JNODE_IO_HEAD: -+ return "io head"; -+ case JNODE_INODE: -+ return "inode"; -+ case LAST_JNODE_TYPE: -+ return "last"; -+ default:{ -+ static char unknown[30]; -+ -+ sprintf(unknown, "unknown %i", type); -+ return unknown; -+ } -+ } -+} -+ -+#define jnode_state_name( node, flag ) \ -+ ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" ) -+ -+/* debugging aid: output human readable information about @node */ -+static void info_jnode(const char *prefix /* prefix to print */ , -+ const jnode * node /* node to print */ ) -+{ -+ assert("umka-068", prefix != NULL); -+ -+ if (node == NULL) { -+ printk("%s: null\n", prefix); -+ return; -+ } -+ -+ printk -+ ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i," -+ " block: %s, d_count: %d, x_count: %d, " -+ "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node, -+ node->state, -+ jnode_state_name(node, JNODE_PARSED), -+ jnode_state_name(node, JNODE_HEARD_BANSHEE), -+ jnode_state_name(node, JNODE_LEFT_CONNECTED), -+ jnode_state_name(node, JNODE_RIGHT_CONNECTED), -+ jnode_state_name(node, JNODE_ORPHAN), -+ jnode_state_name(node, JNODE_CREATED), -+ jnode_state_name(node, JNODE_RELOC), -+ jnode_state_name(node, JNODE_OVRWR), -+ jnode_state_name(node, JNODE_DIRTY), -+ jnode_state_name(node, JNODE_IS_DYING), -+ jnode_state_name(node, JNODE_RIP), -+ 
jnode_state_name(node, JNODE_MISSED_IN_CAPTURE), -+ jnode_state_name(node, JNODE_WRITEBACK), -+ jnode_state_name(node, JNODE_NEW), -+ jnode_state_name(node, JNODE_DKSET), -+ jnode_state_name(node, JNODE_REPACK), -+ jnode_state_name(node, JNODE_CLUSTER_PAGE), -+ jnode_get_level(node), sprint_address(jnode_get_block(node)), -+ atomic_read(&node->d_count), atomic_read(&node->x_count), -+ jnode_page(node), node->atom, 0, 0, -+ jnode_type_name(jnode_get_type(node))); -+ if (jnode_is_unformatted(node)) { -+ printk("inode: %llu, index: %lu, ", -+ node->key.j.objectid, node->key.j.index); -+ } -+} -+ -+/* debugging aid: check jnode invariant (jnode_invariant_f()); if it doesn't -+ * hold, print the node through info_jnode() and issue a warning naming the -+ * failed condition. Non-zero @tlocked / @jlocked suppress the locking done -+ * here: tree read lock is taken only when !@tlocked, and jnode spin lock only -+ * when neither flag is set. Returns result of jnode_invariant_f(). */ -+static int jnode_invariant(const jnode * node, int tlocked, int jlocked) -+{ -+ char const *failed_msg; -+ int result; -+ reiser4_tree *tree; -+ -+ tree = jnode_get_tree(node); -+ -+ assert("umka-063312", node != NULL); -+ assert("umka-064321", tree != NULL); -+ -+ if (!jlocked && !tlocked) -+ spin_lock_jnode((jnode *) node); -+ if (!tlocked) -+ read_lock_tree(jnode_get_tree(node)); -+ result = jnode_invariant_f(node, &failed_msg); -+ if (!result) { -+ info_jnode("corrupted node", node); -+ warning("jmacd-555", "Condition %s failed", failed_msg); -+ } -+ if (!tlocked) -+ read_unlock_tree(jnode_get_tree(node)); -+ if (!jlocked && !tlocked) -+ spin_unlock_jnode((jnode *) node); -+ return result; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/jnode.h linux-2.6.24/fs/reiser4/jnode.h ---- linux-2.6.24.orig/fs/reiser4/jnode.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/jnode.h 2008-01-25 11:39:06.940208719 +0300 -@@ -0,0 +1,702 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Declaration of jnode. See jnode.c for details. 
*/ -+ -+#ifndef __JNODE_H__ -+#define __JNODE_H__ -+ -+#include "forward.h" -+#include "type_safe_hash.h" -+#include "txnmgr.h" -+#include "key.h" -+#include "debug.h" -+#include "dformat.h" -+#include "page_cache.h" -+#include "context.h" -+ -+#include "plugin/plugin.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* declare hash table of jnodes (jnodes proper, that is, unformatted -+ nodes) */ -+TYPE_SAFE_HASH_DECLARE(j, jnode); -+ -+/* declare hash table of znodes */ -+TYPE_SAFE_HASH_DECLARE(z, znode); -+ -+struct jnode_key { -+ __u64 objectid; -+ unsigned long index; -+ struct address_space *mapping; -+}; -+ -+/* -+ Jnode is the "base class" of other nodes in reiser4. It is also happens to -+ be exactly the node we use for unformatted tree nodes. -+ -+ Jnode provides following basic functionality: -+ -+ . reference counting and indexing. -+ -+ . integration with page cache. Jnode has ->pg reference to which page can -+ be attached. -+ -+ . interface to transaction manager. It is jnode that is kept in transaction -+ manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this -+ means, there should be special type of jnode for inode.) -+ -+ Locking: -+ -+ Spin lock: the following fields are protected by the per-jnode spin lock: -+ -+ ->state -+ ->atom -+ ->capture_link -+ -+ Following fields are protected by the global tree lock: -+ -+ ->link -+ ->key.z (content of ->key.z is only changed in znode_rehash()) -+ ->key.j -+ -+ Atomic counters -+ -+ ->x_count -+ ->d_count -+ -+ ->pg, and ->data are protected by spin lock for unused jnode and are -+ immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable() -+ is false). -+ -+ ->tree is immutable after creation -+ -+ Unclear -+ -+ ->blocknr: should be under jnode spin-lock, but current interface is based -+ on passing of block address. 
-+ -+ If you ever need to spin lock two nodes at once, do this in "natural" -+ memory order: lock znode with lower address first. (See lock_two_nodes().) -+ -+ Invariants involving this data-type: -+ -+ [jnode-dirty] -+ [jnode-refs] -+ [jnode-oid] -+ [jnode-queued] -+ [jnode-atom-valid] -+ [jnode-page-binding] -+*/ -+ -+struct jnode { -+#if REISER4_DEBUG -+#define JMAGIC 0x52654973 /* "ReIs" */ -+ int magic; -+#endif -+ /* FIRST CACHE LINE (16 bytes): data used by jload */ -+ -+ /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */ -+ /* 0 */ unsigned long state; -+ -+ /* lock, protecting jnode's fields. */ -+ /* 4 */ spinlock_t load; -+ -+ /* counter of references to jnode itself. Increased on jref(). -+ Decreased on jput(). -+ */ -+ /* 8 */ atomic_t x_count; -+ -+ /* counter of references to jnode's data. Pin data page(s) in -+ memory while this is greater than 0. Increased on jload(). -+ Decreased on jrelse(). -+ */ -+ /* 12 */ atomic_t d_count; -+ -+ /* SECOND CACHE LINE: data used by hash table lookups */ -+ -+ /* 16 */ union { -+ /* znodes are hashed by block number */ -+ reiser4_block_nr z; -+ /* unformatted nodes are hashed by mapping plus offset */ -+ struct jnode_key j; -+ } key; -+ -+ /* THIRD CACHE LINE */ -+ -+ /* 32 */ union { -+ /* pointers to maintain hash-table */ -+ z_hash_link z; -+ j_hash_link j; -+ } link; -+ -+ /* pointer to jnode page. */ -+ /* 36 */ struct page *pg; -+ /* pointer to node itself. 
This is page_address(node->pg) when page is -+ attached to the jnode -+ */ -+ /* 40 */ void *data; -+ -+ /* 44 */ reiser4_tree *tree; -+ -+ /* FOURTH CACHE LINE: atom related fields */ -+ -+ /* 48 */ spinlock_t guard; -+ -+ /* atom the block is in, if any */ -+ /* 52 */ txn_atom *atom; -+ -+ /* capture list */ -+ /* 56 */ struct list_head capture_link; -+ -+ /* FIFTH CACHE LINE */ -+ -+ /* 64 */ struct rcu_head rcu; -+ /* crosses cache line */ -+ -+ /* SIXTH CACHE LINE */ -+ -+ /* the real blocknr (where io is going to/from) */ -+ /* 80 */ reiser4_block_nr blocknr; -+ /* Parent item type, unformatted and CRC need it for offset => key conversion. */ -+ /* NOTE: this parent_item_id looks like jnode type. */ -+ /* 88 */ reiser4_plugin_id parent_item_id; -+ /* 92 */ -+#if REISER4_DEBUG -+ /* list of all jnodes for debugging purposes. */ -+ struct list_head jnodes; -+ /* how many times this jnode was written in one transaction */ -+ int written; -+ /* this indicates which atom's list the jnode is on */ -+ atom_list list; -+#endif -+} __attribute__ ((aligned(16))); -+ -+/* -+ * jnode types. Enumeration of existing jnode types. -+ */ -+typedef enum { -+ JNODE_UNFORMATTED_BLOCK, /* unformatted block */ -+ JNODE_FORMATTED_BLOCK, /* formatted block, znode */ -+ JNODE_BITMAP, /* bitmap */ -+ JNODE_IO_HEAD, /* jnode representing a block in the -+ * wandering log */ -+ JNODE_INODE, /* jnode embedded into inode */ -+ LAST_JNODE_TYPE -+} jnode_type; -+ -+/* jnode states */ -+typedef enum { -+ /* jnode's page is loaded and data checked */ -+ JNODE_PARSED = 0, -+ /* node was deleted, not all locks on it were released. This -+ node is empty and is going to be removed from the tree -+ shortly. 
*/ -+ JNODE_HEARD_BANSHEE = 1, -+ /* left sibling pointer is valid */ -+ JNODE_LEFT_CONNECTED = 2, -+ /* right sibling pointer is valid */ -+ JNODE_RIGHT_CONNECTED = 3, -+ -+ /* znode was just created and doesn't yet have a pointer from -+ its parent */ -+ JNODE_ORPHAN = 4, -+ -+ /* this node was created by its transaction and has not been assigned -+ a block address. */ -+ JNODE_CREATED = 5, -+ -+ /* this node is currently relocated */ -+ JNODE_RELOC = 6, -+ /* this node is currently wandered */ -+ JNODE_OVRWR = 7, -+ -+ /* this znode has been modified */ -+ JNODE_DIRTY = 8, -+ -+ /* znode lock is being invalidated */ -+ JNODE_IS_DYING = 9, -+ -+ /* THIS PLACE IS INTENTIONALLY LEFT BLANK */ -+ -+ /* jnode is queued for flushing. */ -+ JNODE_FLUSH_QUEUED = 12, -+ -+ /* In the following bits jnode type is encoded. */ -+ JNODE_TYPE_1 = 13, -+ JNODE_TYPE_2 = 14, -+ JNODE_TYPE_3 = 15, -+ -+ /* jnode is being destroyed */ -+ JNODE_RIP = 16, -+ -+ /* znode was not captured during locking (it might so be because -+ ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */ -+ JNODE_MISSED_IN_CAPTURE = 17, -+ -+ /* write is in progress */ -+ JNODE_WRITEBACK = 18, -+ -+ /* FIXME: now it is used by crypto-compress plugin only */ -+ JNODE_NEW = 19, -+ -+ /* delimiting keys are already set for this znode. */ -+ JNODE_DKSET = 20, -+ -+ /* when this bit is set page and jnode can not be disconnected */ -+ JNODE_WRITE_PREPARED = 21, -+ -+ JNODE_CLUSTER_PAGE = 22, -+ /* Jnode is marked for repacking, that means the reiser4 flush and the -+ * block allocator should process this node special way */ -+ JNODE_REPACK = 23, -+ /* node should be converted by flush in squalloc phase */ -+ JNODE_CONVERTIBLE = 24, -+ /* -+ * When jnode is dirtied for the first time in given transaction, -+ * do_jnode_make_dirty() checks whether this jnode can possible became -+ * member of overwrite set. If so, this bit is set, and one block is -+ * reserved in the ->flush_reserved space of atom. 
-+ * -+ * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when -+ * -+ * (1) flush decides that we want this block to go into relocate -+ * set after all. -+ * -+ * (2) wandering log is allocated (by log writer) -+ * -+ * (3) extent is allocated -+ * -+ */ -+ JNODE_FLUSH_RESERVED = 29 -+} reiser4_jnode_state; -+ -+/* Macros for accessing the jnode state. */ -+ -+static inline void JF_CLR(jnode * j, int f) -+{ -+ assert("unknown-1", j->magic == JMAGIC); -+ clear_bit(f, &j->state); -+} -+static inline int JF_ISSET(const jnode * j, int f) -+{ -+ assert("unknown-2", j->magic == JMAGIC); -+ return test_bit(f, &((jnode *) j)->state); -+} -+static inline void JF_SET(jnode * j, int f) -+{ -+ assert("unknown-3", j->magic == JMAGIC); -+ set_bit(f, &j->state); -+} -+ -+static inline int JF_TEST_AND_SET(jnode * j, int f) -+{ -+ assert("unknown-4", j->magic == JMAGIC); -+ return test_and_set_bit(f, &j->state); -+} -+ -+static inline void spin_lock_jnode(jnode *node) -+{ -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(rw_locked_tree) && -+ LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_LT(spin_locked_jnode, 2))); -+ -+ spin_lock(&(node->guard)); -+ -+ LOCK_CNT_INC(spin_locked_jnode); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void spin_unlock_jnode(jnode *node) -+{ -+ assert_spin_locked(&(node->guard)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_jnode); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&(node->guard)); -+} -+ -+static inline int jnode_is_in_deleteset(const jnode * node) -+{ -+ return JF_ISSET(node, JNODE_RELOC); -+} -+ -+extern int init_jnodes(void); -+extern void done_jnodes(void); -+ -+/* Jnode routines */ -+extern jnode *jalloc(void); -+extern void jfree(jnode * node) NONNULL; -+extern jnode *jclone(jnode *); -+extern jnode 
*jlookup(reiser4_tree * tree, -+ oid_t objectid, unsigned long ind) NONNULL; -+extern jnode *jfind(struct address_space *, unsigned long index) NONNULL; -+extern jnode *jnode_by_page(struct page *pg) NONNULL; -+extern jnode *jnode_of_page(struct page *pg) NONNULL; -+void jnode_attach_page(jnode * node, struct page *pg); -+ -+void unhash_unformatted_jnode(jnode *); -+extern jnode *page_next_jnode(jnode * node) NONNULL; -+extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL; -+extern void jnode_make_dirty(jnode * node) NONNULL; -+extern void jnode_make_clean(jnode * node) NONNULL; -+extern void jnode_make_wander_nolock(jnode * node) NONNULL; -+extern void jnode_make_wander(jnode *) NONNULL; -+extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL; -+extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL; -+extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL; -+ -+/** -+ * jnode_get_block -+ * @node: jnode to query -+ * -+ */ -+static inline const reiser4_block_nr *jnode_get_block(const jnode *node) -+{ -+ assert("nikita-528", node != NULL); -+ -+ return &node->blocknr; -+} -+ -+/** -+ * jnode_set_block -+ * @node: jnode to update -+ * @blocknr: new block nr -+ */ -+static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr) -+{ -+ assert("nikita-2020", node != NULL); -+ assert("umka-055", blocknr != NULL); -+ node->blocknr = *blocknr; -+} -+ -+ -+/* block number for IO. Usually this is the same as jnode_get_block(), unless -+ * jnode was emergency flushed---then block number chosen by eflush is -+ * used. */ -+static inline const reiser4_block_nr *jnode_get_io_block(jnode * node) -+{ -+ assert("nikita-2768", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ -+ return jnode_get_block(node); -+} -+ -+/* Jnode flush interface. 
*/ -+extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos); -+extern flush_queue_t *reiser4_pos_fq(flush_pos_t * pos); -+ -+/* FIXME-VS: these are used in plugin/item/extent.c */ -+ -+/* does extent_get_block have to be called */ -+#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED) -+#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED) -+ -+/* the node should be converted during flush squalloc phase */ -+#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE) -+#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE) -+ -+/* Macros to convert from jnode to znode, znode to jnode. These are macros -+ because C doesn't allow overloading of const prototypes. */ -+#define ZJNODE(x) (& (x) -> zjnode) -+#define JZNODE(x) \ -+({ \ -+ typeof (x) __tmp_x; \ -+ \ -+ __tmp_x = (x); \ -+ assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \ -+ (znode*) __tmp_x; \ -+}) -+ -+extern int jnodes_tree_init(reiser4_tree * tree); -+extern int jnodes_tree_done(reiser4_tree * tree); -+ -+#if REISER4_DEBUG -+ -+extern int znode_is_any_locked(const znode * node); -+extern void jnode_list_remove(jnode * node); -+ -+#else -+ -+#define jnode_list_remove(node) noop -+ -+#endif -+ -+int znode_is_root(const znode * node) NONNULL; -+ -+/* bump reference counter on @node */ -+static inline void add_x_ref(jnode * node /* node to increase x_count of */ ) -+{ -+ assert("nikita-1911", node != NULL); -+ -+ atomic_inc(&node->x_count); -+ LOCK_CNT_INC(x_refs); -+} -+ -+static inline void dec_x_ref(jnode * node) -+{ -+ assert("nikita-3215", node != NULL); -+ assert("nikita-3216", atomic_read(&node->x_count) > 0); -+ -+ atomic_dec(&node->x_count); -+ assert("nikita-3217", LOCK_CNT_GTZ(x_refs)); -+ LOCK_CNT_DEC(x_refs); -+} -+ -+/* jref() - increase counter of references to jnode/znode (x_count) */ -+static inline jnode *jref(jnode * node) -+{ -+ assert("jmacd-508", (node != NULL) && !IS_ERR(node)); -+ add_x_ref(node); -+ return node; -+} -+ -+/* get the page of 
jnode */ -+static inline struct page *jnode_page(const jnode * node) -+{ -+ return node->pg; -+} -+ -+/* return pointer to jnode data */ -+static inline char *jdata(const jnode * node) -+{ -+ assert("nikita-1415", node != NULL); -+ assert("nikita-3198", jnode_page(node) != NULL); -+ return node->data; -+} -+ -+static inline int jnode_is_loaded(const jnode * node) -+{ -+ assert("zam-506", node != NULL); -+ return atomic_read(&node->d_count) > 0; -+} -+ -+extern void page_clear_jnode(struct page *page, jnode * node) NONNULL; -+ -+static inline void jnode_set_reloc(jnode * node) -+{ -+ assert("nikita-2431", node != NULL); -+ assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR)); -+ JF_SET(node, JNODE_RELOC); -+} -+ -+/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */ -+ -+extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL; -+ -+static inline int jload(jnode *node) -+{ -+ return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1); -+} -+ -+extern int jinit_new(jnode *, gfp_t) NONNULL; -+extern int jstartio(jnode *) NONNULL; -+ -+extern void jdrop(jnode *) NONNULL; -+extern int jwait_io(jnode *, int rw) NONNULL; -+ -+void jload_prefetch(jnode *); -+ -+extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL; -+extern void reiser4_drop_io_head(jnode * node) NONNULL; -+ -+static inline reiser4_tree *jnode_get_tree(const jnode * node) -+{ -+ assert("nikita-2691", node != NULL); -+ return node->tree; -+} -+ -+extern void pin_jnode_data(jnode *); -+extern void unpin_jnode_data(jnode *); -+ -+static inline jnode_type jnode_get_type(const jnode * node) -+{ -+ static const unsigned long state_mask = -+ (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3); -+ -+ static jnode_type mask_to_type[] = { -+ /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */ -+ -+ /* 000 */ -+ [0] = JNODE_FORMATTED_BLOCK, -+ /* 001 */ -+ [1] = JNODE_UNFORMATTED_BLOCK, -+ /* 010 */ -+ [2] = JNODE_BITMAP, -+ /* 011 */ -+ [3] = LAST_JNODE_TYPE, 
/*invalid */ -+ /* 100 */ -+ [4] = JNODE_INODE, -+ /* 101 */ -+ [5] = LAST_JNODE_TYPE, -+ /* 110 */ -+ [6] = JNODE_IO_HEAD, -+ /* 111 */ -+ [7] = LAST_JNODE_TYPE, /* invalid */ -+ }; -+ -+ return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1]; -+} -+ -+/* returns true if node is a znode */ -+static inline int jnode_is_znode(const jnode * node) -+{ -+ return jnode_get_type(node) == JNODE_FORMATTED_BLOCK; -+} -+ -+static inline int jnode_is_flushprepped(jnode * node) -+{ -+ assert("jmacd-78212", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) || -+ JF_ISSET(node, JNODE_OVRWR); -+} -+ -+/* Return true if @node has already been processed by the squeeze and allocate -+ process. This implies the block address has been finalized for the -+ duration of this atom (or it is clean and will remain in place). If this -+ returns true you may use the block number as a hint. */ -+static inline int jnode_check_flushprepped(jnode * node) -+{ -+ int result; -+ -+ /* It must be clean or relocated or wandered. New allocations are set to relocate. 
*/ -+ spin_lock_jnode(node); -+ result = jnode_is_flushprepped(node); -+ spin_unlock_jnode(node); -+ return result; -+} -+ -+/* returns true if node is unformatted */ -+static inline int jnode_is_unformatted(const jnode * node) -+{ -+ assert("jmacd-0123", node != NULL); -+ return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK; -+} -+ -+/* returns true if node represents a cluster cache page */ -+static inline int jnode_is_cluster_page(const jnode * node) -+{ -+ assert("edward-50", node != NULL); -+ return (JF_ISSET(node, JNODE_CLUSTER_PAGE)); -+} -+ -+/* returns true is node is builtin inode's jnode */ -+static inline int jnode_is_inode(const jnode * node) -+{ -+ assert("vs-1240", node != NULL); -+ return jnode_get_type(node) == JNODE_INODE; -+} -+ -+static inline jnode_plugin *jnode_ops_of(const jnode_type type) -+{ -+ assert("nikita-2367", type < LAST_JNODE_TYPE); -+ return jnode_plugin_by_id((reiser4_plugin_id) type); -+} -+ -+static inline jnode_plugin *jnode_ops(const jnode * node) -+{ -+ assert("nikita-2366", node != NULL); -+ -+ return jnode_ops_of(jnode_get_type(node)); -+} -+ -+/* Get the index of a block. */ -+static inline unsigned long jnode_get_index(jnode * node) -+{ -+ return jnode_ops(node)->index(node); -+} -+ -+/* return true if "node" is the root */ -+static inline int jnode_is_root(const jnode * node) -+{ -+ return jnode_is_znode(node) && znode_is_root(JZNODE(node)); -+} -+ -+extern struct address_space *mapping_jnode(const jnode * node); -+extern unsigned long index_jnode(const jnode * node); -+ -+static inline void jput(jnode * node); -+extern void jput_final(jnode * node); -+ -+/* bump data counter on @node */ -+static inline void add_d_ref(jnode * node /* node to increase d_count of */ ) -+{ -+ assert("nikita-1962", node != NULL); -+ -+ atomic_inc(&node->d_count); -+ if (jnode_is_unformatted(node) || jnode_is_znode(node)) -+ LOCK_CNT_INC(d_refs); -+} -+ -+/* jput() - decrement x_count reference counter on znode. 
-+ -+ Count may drop to 0, jnode stays in cache until memory pressure causes the -+ eviction of its page. The c_count variable also ensures that children are -+ pressured out of memory before the parent. The jnode remains hashed as -+ long as the VM allows its page to stay in memory. -+*/ -+static inline void jput(jnode * node) -+{ -+ assert("jmacd-509", node != NULL); -+ assert("jmacd-510", atomic_read(&node->x_count) > 0); -+ assert("zam-926", reiser4_schedulable()); -+ LOCK_CNT_DEC(x_refs); -+ -+ rcu_read_lock(); -+ /* -+ * we don't need any kind of lock here--jput_final() uses RCU. -+ */ -+ if (unlikely(atomic_dec_and_test(&node->x_count))) { -+ jput_final(node); -+ } else -+ rcu_read_unlock(); -+ assert("nikita-3473", reiser4_schedulable()); -+} -+ -+extern void jrelse(jnode * node); -+extern void jrelse_tail(jnode * node); -+ -+extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node); -+ -+/* resolve race with jput */ -+static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node) -+{ -+ if (unlikely(JF_ISSET(node, JNODE_RIP))) -+ node = jnode_rip_sync(tree, node); -+ return node; -+} -+ -+extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key); -+ -+#if REISER4_DEBUG -+extern int jnode_invariant_f(const jnode *node, char const **msg); -+#endif -+ -+extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE]; -+ -+/* __JNODE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/kassign.c linux-2.6.24/fs/reiser4/kassign.c ---- linux-2.6.24.orig/fs/reiser4/kassign.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/kassign.c 2008-01-25 11:55:43.900543447 +0300 -@@ -0,0 +1,677 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Key assignment policy implementation */ -+ -+/* -+ * In reiser4 every piece of file system data and meta-data has a key. Keys -+ * are used to store information in and retrieve it from reiser4 internal -+ * tree. In addition to this, keys define _ordering_ of all file system -+ * information: things having close keys are placed into the same or -+ * neighboring (in the tree order) nodes of the tree. As our block allocator -+ * tries to respect tree order (see flush.c), keys also define order in which -+ * things are laid out on the disk, and hence, affect performance directly. -+ * -+ * Obviously, assignment of keys to data and meta-data should be consistent -+ * across whole file system. Algorithm that calculates a key for a given piece -+ * of data or meta-data is referred to as "key assignment". -+ * -+ * Key assignment is too expensive to be implemented as a plugin (that is, -+ * with an ability to support different key assignment schemas in the same -+ * compiled kernel image). As a compromise, all key-assignment functions and -+ * data-structures are collected in this single file, so that modifications to -+ * key assignment algorithm can be localized. Additional changes may be -+ * required in key.[ch]. -+ * -+ * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one -+ * may guess, there is "Plan B" too. -+ * -+ */ -+ -+/* -+ * Additional complication with key assignment implementation is a requirement -+ * to support different key length. 
-+ */ -+ -+/* -+ * KEY ASSIGNMENT: PLAN A, LONG KEYS. -+ * -+ * DIRECTORY ITEMS -+ * -+ * | 60 | 4 | 7 |1| 56 | 64 | 64 | -+ * +--------------+---+---+-+-------------+------------------+-----------------+ -+ * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash | -+ * +--------------+---+---+-+-------------+------------------+-----------------+ -+ * | | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * dirid objectid of directory this item is for -+ * -+ * F fibration, see fs/reiser4/plugin/fibration.[ch] -+ * -+ * H 1 if last 8 bytes of the key contain hash, -+ * 0 if last 8 bytes of the key contain prefix-3 -+ * -+ * prefix-1 first 7 characters of file name. -+ * Padded by zeroes if name is not long enough. -+ * -+ * prefix-2 next 8 characters of the file name. -+ * -+ * prefix-3 next 8 characters of the file name. -+ * -+ * hash hash of the rest of file name (i.e., portion of file -+ * name not included into prefix-1 and prefix-2). -+ * -+ * File names not longer than 23 (== 7 + 8 + 8) characters are completely -+ * encoded in the key. Such file names are called "short". They are -+ * distinguished by H bit set to 0 in the key. -+ * -+ * Other file names are "long". For long name, H bit is 1, and first 15 (== 7 -+ * + 8) characters are encoded in prefix-1 and prefix-2 portions of the -+ * key. Last 8 bytes of the key are occupied by hash of the remaining -+ * characters of the name. -+ * -+ * This key assignment achieves the following important goals: -+ * -+ * (1) directory entries are sorted in approximately lexicographical -+ * order. -+ * -+ * (2) collisions (when multiple directory items have the same key), while -+ * principally unavoidable in a tree with fixed length keys, are rare.
-+ * -+ * STAT DATA -+ * -+ * | 60 | 4 | 64 | 4 | 60 | 64 | -+ * +--------------+---+-----------------+---+--------------+-----------------+ -+ * | locality id | 1 | ordering | 0 | objectid | 0 | -+ * +--------------+---+-----------------+---+--------------+-----------------+ -+ * | | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * locality id object id of a directory where first name was created for -+ * the object -+ * -+ * ordering copy of second 8-byte portion of the key of directory -+ * entry for the first name of this object. Ordering has a form -+ * { -+ * fibration :7; -+ * h :1; -+ * prefix1 :56; -+ * } -+ * see description of key for directory entry above. -+ * -+ * objectid object id for this object -+ * -+ * This key assignment policy is designed to keep stat-data in the same order -+ * as corresponding directory items, thus speeding up readdir/stat types of -+ * workload. -+ * -+ * FILE BODY -+ * -+ * | 60 | 4 | 64 | 4 | 60 | 64 | -+ * +--------------+---+-----------------+---+--------------+-----------------+ -+ * | locality id | 4 | ordering | 0 | objectid | offset | -+ * +--------------+---+-----------------+---+--------------+-----------------+ -+ * | | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * locality id object id of a directory where first name was created for -+ * the object -+ * -+ * ordering the same as in the key of stat-data for this object -+ * -+ * objectid object id for this object -+ * -+ * offset logical offset from the beginning of this file. -+ * Measured in bytes. -+ * -+ * -+ * KEY ASSIGNMENT: PLAN A, SHORT KEYS. 
-+ * -+ * DIRECTORY ITEMS -+ * -+ * | 60 | 4 | 7 |1| 56 | 64 | -+ * +--------------+---+---+-+-------------+-----------------+ -+ * | dirid | 0 | F |H| prefix-1 | prefix-2/hash | -+ * +--------------+---+---+-+-------------+-----------------+ -+ * | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * dirid objectid of directory this item is for -+ * -+ * F fibration, see fs/reiser4/plugin/fibration.[ch] -+ * -+ * H 1 if last 8 bytes of the key contain hash, -+ * 0 if last 8 bytes of the key contain prefix-2 -+ * -+ * prefix-1 first 7 characters of file name. -+ * Padded by zeroes if name is not long enough. -+ * -+ * prefix-2 next 8 characters of the file name. -+ * -+ * hash hash of the rest of file name (i.e., portion of file -+ * name not included into prefix-1). -+ * -+ * File names not longer than 15 (== 7 + 8) characters are completely encoded -+ * in the key. Such file names are called "short". They are distinguished by H -+ * bit set to 0 in the key. -+ * -+ * Other file names are "long". For long name, H bit is 1, and first 7 -+ * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the -+ * key are occupied by hash of the remaining characters of the name.
-+ * -+ * STAT DATA -+ * -+ * | 60 | 4 | 4 | 60 | 64 | -+ * +--------------+---+---+--------------+-----------------+ -+ * | locality id | 1 | 0 | objectid | 0 | -+ * +--------------+---+---+--------------+-----------------+ -+ * | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * locality id object id of a directory where first name was created for -+ * the object -+ * -+ * objectid object id for this object -+ * -+ * FILE BODY -+ * -+ * | 60 | 4 | 4 | 60 | 64 | -+ * +--------------+---+---+--------------+-----------------+ -+ * | locality id | 4 | 0 | objectid | offset | -+ * +--------------+---+---+--------------+-----------------+ -+ * | | | | -+ * | 8 bytes | 8 bytes | 8 bytes | -+ * -+ * locality id object id of a directory where first name was created for -+ * the object -+ * -+ * objectid object id for this object -+ * -+ * offset logical offset from the beginning of this file. -+ * Measured in bytes. -+ * -+ * -+ */ -+ -+#include "debug.h" -+#include "key.h" -+#include "kassign.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "super.h" -+#include "dscale.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block, etc */ -+ -+/* bitmask for H bit (see comment at the beginning of this file */ -+static const __u64 longname_mark = 0x0100000000000000ull; -+/* bitmask for F and H portions of the key. */ -+static const __u64 fibration_mask = 0xff00000000000000ull; -+ -+/* return true if name is not completely encoded in @key */ -+int is_longname_key(const reiser4_key * key) -+{ -+ __u64 highpart; -+ -+ assert("nikita-2863", key != NULL); -+ if (get_key_type(key) != KEY_FILE_NAME_MINOR) -+ reiser4_print_key("oops", key); -+ assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR); -+ -+ if (REISER4_LARGE_KEY) -+ highpart = get_key_ordering(key); -+ else -+ highpart = get_key_objectid(key); -+ -+ return (highpart & longname_mark) ? 
1 : 0; -+} -+ -+/* return true if @name is too long to be completely encoded in the key */ -+int is_longname(const char *name UNUSED_ARG, int len) -+{ -+ if (REISER4_LARGE_KEY) -+ return len > 23; -+ else -+ return len > 15; -+} -+ -+/* code ascii string into __u64. -+ -+ Put characters of @name into result (@str) one after another starting -+ from @start_idx-th highest (arithmetically) byte. This produces -+ endian-safe encoding. memcpy(2) will not do. -+ -+*/ -+static __u64 pack_string(const char *name /* string to encode */ , -+ int start_idx /* highest byte in result from -+ * which to start encoding */ ) -+{ -+ unsigned i; -+ __u64 str; -+ -+ str = 0; -+ for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) { -+ str <<= 8; -+ str |= (unsigned char)name[i]; -+ } -+ str <<= (sizeof str - i - start_idx) << 3; -+ return str; -+} -+ -+/* opposite to pack_string(). Takes value produced by pack_string(), restores -+ * string encoded in it and stores result in @buf */ -+char * reiser4_unpack_string(__u64 value, char *buf) -+{ -+ do { -+ *buf = value >> (64 - 8); -+ if (*buf) -+ ++buf; -+ value <<= 8; -+ } while (value != 0); -+ *buf = 0; -+ return buf; -+} -+ -+/* obtain name encoded in @key and store it in @buf */ -+char *extract_name_from_key(const reiser4_key * key, char *buf) -+{ -+ char *c; -+ -+ assert("nikita-2868", !is_longname_key(key)); -+ -+ c = buf; -+ if (REISER4_LARGE_KEY) { -+ c = reiser4_unpack_string(get_key_ordering(key) & -+ ~fibration_mask, c); -+ c = reiser4_unpack_string(get_key_fulloid(key), c); -+ } else -+ c = reiser4_unpack_string(get_key_fulloid(key) & -+ ~fibration_mask, c); -+ reiser4_unpack_string(get_key_offset(key), c); -+ return buf; -+} -+ -+/** -+ * complete_entry_key - calculate entry key by name -+ * @dir: directory where entry is (or will be) in -+ * @name: name to calculate key of -+ * @len: lenth of name -+ * @result: place to store result in -+ * -+ * Sets fields of entry key @result which depend on file name. 
-+ * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering, -+ * objectid and offset. Otherwise, objectid and offset are set. -+ */ -+void complete_entry_key(const struct inode *dir, const char *name, -+ int len, reiser4_key *result) -+{ -+#if REISER4_LARGE_KEY -+ __u64 ordering; -+ __u64 objectid; -+ __u64 offset; -+ -+ assert("nikita-1139", dir != NULL); -+ assert("nikita-1142", result != NULL); -+ assert("nikita-2867", strlen(name) == len); -+ -+ /* -+ * key allocation algorithm for directory entries in case of large -+ * keys: -+ * -+ * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7 -+ * characters into ordering field of key, next 8 charactes (if any) -+ * into objectid field of key and next 8 ones (of any) into offset -+ * field of key -+ * -+ * If file name is longer than 23 characters, put first 7 characters -+ * into key's ordering, next 8 to objectid and hash of remaining -+ * characters into offset field. -+ * -+ * To distinguish above cases, in latter set up unused high bit in -+ * ordering field. -+ */ -+ -+ /* [0-6] characters to ordering */ -+ ordering = pack_string(name, 1); -+ if (len > 7) { -+ /* [7-14] characters to objectid */ -+ objectid = pack_string(name + 7, 0); -+ if (len > 15) { -+ if (len <= 23) { -+ /* [15-23] characters to offset */ -+ offset = pack_string(name + 15, 0); -+ } else { -+ /* note in a key the fact that offset contains hash. */ -+ ordering |= longname_mark; -+ -+ /* offset is the hash of the file name's tail. 
*/ -+ offset = inode_hash_plugin(dir)->hash(name + 15, -+ len - 15); -+ } -+ } else { -+ offset = 0ull; -+ } -+ } else { -+ objectid = 0ull; -+ offset = 0ull; -+ } -+ -+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL); -+ ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len); -+ -+ set_key_ordering(result, ordering); -+ set_key_fulloid(result, objectid); -+ set_key_offset(result, offset); -+ return; -+ -+#else -+ __u64 objectid; -+ __u64 offset; -+ -+ assert("nikita-1139", dir != NULL); -+ assert("nikita-1142", result != NULL); -+ assert("nikita-2867", strlen(name) == len); -+ -+ /* -+ * key allocation algorithm for directory entries in case of not large -+ * keys: -+ * -+ * If name is not longer than 7 + 8 = 15 characters, put first 7 -+ * characters into objectid field of key, next 8 charactes (if any) -+ * into offset field of key -+ * -+ * If file name is longer than 15 characters, put first 7 characters -+ * into key's objectid, and hash of remaining characters into offset -+ * field. -+ * -+ * To distinguish above cases, in latter set up unused high bit in -+ * objectid field. -+ */ -+ -+ /* [0-6] characters to objectid */ -+ objectid = pack_string(name, 1); -+ if (len > 7) { -+ if (len <= 15) { -+ /* [7-14] characters to offset */ -+ offset = pack_string(name + 7, 0); -+ } else { -+ /* note in a key the fact that offset contains hash. */ -+ objectid |= longname_mark; -+ -+ /* offset is the hash of the file name. */ -+ offset = inode_hash_plugin(dir)->hash(name + 7, -+ len - 7); -+ } -+ } else -+ offset = 0ull; -+ -+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL); -+ objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len); -+ -+ set_key_fulloid(result, objectid); -+ set_key_offset(result, offset); -+ return; -+#endif /* ! REISER4_LARGE_KEY */ -+} -+ -+/* true, if @key is the key of "." 
*/ -+int is_dot_key(const reiser4_key * key /* key to check */ ) -+{ -+ assert("nikita-1717", key != NULL); -+ assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR); -+ return -+ (get_key_ordering(key) == 0ull) && -+ (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull); -+} -+ -+/* build key for stat-data. -+ -+ return key of stat-data of this object. This should became sd plugin -+ method in the future. For now, let it be here. -+ -+*/ -+reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ , -+ reiser4_key * result /* resulting key of @target -+ stat-data */ ) -+{ -+ assert("nikita-261", result != NULL); -+ -+ reiser4_key_init(result); -+ set_key_locality(result, reiser4_inode_data(target)->locality_id); -+ set_key_ordering(result, get_inode_ordering(target)); -+ set_key_objectid(result, get_inode_oid(target)); -+ set_key_type(result, KEY_SD_MINOR); -+ set_key_offset(result, (__u64) 0); -+ return result; -+} -+ -+/* encode part of key into &obj_key_id -+ -+ This encodes into @id part of @key sufficient to restore @key later, -+ given that latter is key of object (key of stat-data). -+ -+ See &obj_key_id -+*/ -+int build_obj_key_id(const reiser4_key * key /* key to encode */ , -+ obj_key_id * id /* id where key is encoded in */ ) -+{ -+ assert("nikita-1151", key != NULL); -+ assert("nikita-1152", id != NULL); -+ -+ memcpy(id, key, sizeof *id); -+ return 0; -+} -+ -+/* encode reference to @obj in @id. -+ -+ This is like build_obj_key_id() above, but takes inode as parameter. */ -+int build_inode_key_id(const struct inode *obj /* object to build key of */ , -+ obj_key_id * id /* result */ ) -+{ -+ reiser4_key sdkey; -+ -+ assert("nikita-1166", obj != NULL); -+ assert("nikita-1167", id != NULL); -+ -+ build_sd_key(obj, &sdkey); -+ build_obj_key_id(&sdkey, id); -+ return 0; -+} -+ -+/* decode @id back into @key -+ -+ Restore key of object stat-data from @id. This is dual to -+ build_obj_key_id() above. 
-+*/ -+int extract_key_from_id(const obj_key_id * id /* object key id to extract key -+ * from */ , -+ reiser4_key * key /* result */ ) -+{ -+ assert("nikita-1153", id != NULL); -+ assert("nikita-1154", key != NULL); -+ -+ reiser4_key_init(key); -+ memcpy(key, id, sizeof *id); -+ return 0; -+} -+ -+/* extract objectid of directory from key of directory entry within said -+ directory. -+ */ -+oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of -+ * directory -+ * entry */ ) -+{ -+ assert("nikita-1314", de_key != NULL); -+ return get_key_locality(de_key); -+} -+ -+/* encode into @id key of directory entry. -+ -+ Encode into @id information sufficient to later distinguish directory -+ entries within the same directory. This is not whole key, because all -+ directory entries within directory item share locality which is equal -+ to objectid of their directory. -+ -+*/ -+int build_de_id(const struct inode *dir /* inode of directory */ , -+ const struct qstr *name /* name to be given to @obj by -+ * directory entry being -+ * constructed */ , -+ de_id * id /* short key of directory entry */ ) -+{ -+ reiser4_key key; -+ -+ assert("nikita-1290", dir != NULL); -+ assert("nikita-1292", id != NULL); -+ -+ /* NOTE-NIKITA this is suboptimal. */ -+ inode_dir_plugin(dir)->build_entry_key(dir, name, &key); -+ return build_de_id_by_key(&key, id); -+} -+ -+/* encode into @id key of directory entry. -+ -+ Encode into @id information sufficient to later distinguish directory -+ entries within the same directory. This is not whole key, because all -+ directory entries within directory item share locality which is equal -+ to objectid of their directory. -+ -+*/ -+int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory -+ * entry */ , -+ de_id * id /* short key of directory entry */ ) -+{ -+ memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id); -+ return 0; -+} -+ -+/* restore from @id key of directory entry. 
-+ -+ Function dual to build_de_id(): given @id and locality, build full -+ key of directory entry within directory item. -+ -+*/ -+int extract_key_from_de_id(const oid_t locality /* locality of directory -+ * entry */ , -+ const de_id * id /* directory entry id */ , -+ reiser4_key * key /* result */ ) -+{ -+ /* no need to initialise key here: all fields are overwritten */ -+ memcpy(((__u64 *) key) + 1, id, sizeof *id); -+ set_key_locality(key, locality); -+ set_key_type(key, KEY_FILE_NAME_MINOR); -+ return 0; -+} -+ -+/* compare two &de_id's */ -+cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ , -+ const de_id * id2 /* second &de_id to compare */ ) -+{ -+ /* NOTE-NIKITA ugly implementation */ -+ reiser4_key k1; -+ reiser4_key k2; -+ -+ extract_key_from_de_id((oid_t) 0, id1, &k1); -+ extract_key_from_de_id((oid_t) 0, id2, &k2); -+ return keycmp(&k1, &k2); -+} -+ -+/* compare &de_id with key */ -+cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ , -+ const reiser4_key * key /* key to compare */ ) -+{ -+ cmp_t result; -+ reiser4_key *k1; -+ -+ k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]); -+ result = KEY_DIFF_EL(k1, key, 1); -+ if (result == EQUAL_TO) { -+ result = KEY_DIFF_EL(k1, key, 2); -+ if (REISER4_LARGE_KEY && result == EQUAL_TO) { -+ result = KEY_DIFF_EL(k1, key, 3); -+ } -+ } -+ return result; -+} -+ -+/* -+ * return number of bytes necessary to encode @inode identity. -+ */ -+int inode_onwire_size(const struct inode *inode) -+{ -+ int result; -+ -+ result = dscale_bytes_to_write(get_inode_oid(inode)); -+ result += dscale_bytes_to_write(get_inode_locality(inode)); -+ -+ /* -+ * ordering is large (it usually has highest bits set), so it makes -+ * little sense to dscale it. 
-+ */ -+ if (REISER4_LARGE_KEY) -+ result += sizeof(get_inode_ordering(inode)); -+ return result; -+} -+ -+/* -+ * encode @inode identity at @start -+ */ -+char *build_inode_onwire(const struct inode *inode, char *start) -+{ -+ start += dscale_write(start, get_inode_locality(inode)); -+ start += dscale_write(start, get_inode_oid(inode)); -+ -+ if (REISER4_LARGE_KEY) { -+ put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start); -+ start += sizeof(get_inode_ordering(inode)); -+ } -+ return start; -+} -+ -+/* -+ * extract key that was previously encoded by build_inode_onwire() at @addr -+ */ -+char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id) -+{ -+ __u64 val; -+ -+ addr += dscale_read(addr, &val); -+ val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR; -+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality); -+ addr += dscale_read(addr, &val); -+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid); -+#if REISER4_LARGE_KEY -+ memcpy(&key_id->ordering, addr, sizeof key_id->ordering); -+ addr += sizeof key_id->ordering; -+#endif -+ return addr; -+} -+ -+/* -+ * skip a key that was previously encoded by build_inode_onwire() at @addr -+ * FIXME: handle IO errors. -+ */ -+char * locate_obj_key_id_onwire(char * addr) -+{ -+ /* locality */ -+ addr += dscale_bytes_to_read(addr); -+ /* objectid */ -+ addr += dscale_bytes_to_read(addr); -+#if REISER4_LARGE_KEY -+ addr += sizeof ((obj_key_id *)0)->ordering; -+#endif -+ return addr; -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/kassign.h linux-2.6.24/fs/reiser4/kassign.h ---- linux-2.6.24.orig/fs/reiser4/kassign.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/kassign.h 2008-01-25 11:55:43.900543447 +0300 -@@ -0,0 +1,111 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Key assignment policy interface. See kassign.c for details. */ -+ -+#if !defined( __KASSIGN_H__ ) -+#define __KASSIGN_H__ -+ -+#include "forward.h" -+#include "key.h" -+#include "dformat.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block, etc */ -+#include /* for struct qstr */ -+ -+/* key assignment functions */ -+ -+/* Information from which key of file stat-data can be uniquely -+ restored. This depends on key assignment policy for -+ stat-data. Currently it's enough to store object id and locality id -+ (60+60==120) bits, because minor packing locality and offset of -+ stat-data key are always known constants: KEY_SD_MINOR and 0 -+ respectively. For simplicity 4 bits are wasted in each id, and just -+ two 64 bit integers are stored. -+ -+ This field has to be byte-aligned, because we don't want to waste -+ space in directory entries. There is another side of a coin of -+ course: we waste CPU and bus bandwidth in stead, by copying data back -+ and forth. -+ -+ Next optimization: &obj_key_id is mainly used to address stat data from -+ directory entries. Under the assumption that majority of files only have -+ only name (one hard link) from *the* parent directory it seems reasonable -+ to only store objectid of stat data and take its locality from key of -+ directory item. -+ -+ This requires some flag to be added to the &obj_key_id to distinguish -+ between these two cases. Remaining bits in flag byte are then asking to be -+ used to store file type. 
-+ -+ This optimization requires changes in directory item handling code. -+ -+*/ -+typedef struct obj_key_id { -+ d8 locality[sizeof(__u64)]; -+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)]; -+ ) -+ d8 objectid[sizeof(__u64)]; -+} -+obj_key_id; -+ -+/* Information sufficient to uniquely identify directory entry within -+ compressed directory item. -+ -+ For alignment issues see &obj_key_id above. -+*/ -+typedef struct de_id { -+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];) -+ d8 objectid[sizeof(__u64)]; -+ d8 offset[sizeof(__u64)]; -+} -+de_id; -+ -+extern int inode_onwire_size(const struct inode *obj); -+extern char *build_inode_onwire(const struct inode *obj, char *area); -+extern char *locate_obj_key_id_onwire(char *area); -+extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id); -+ -+extern int build_inode_key_id(const struct inode *obj, obj_key_id * id); -+extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key); -+extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id); -+extern oid_t extract_dir_id_from_key(const reiser4_key * de_key); -+extern int build_de_id(const struct inode *dir, const struct qstr *name, -+ de_id * id); -+extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id); -+extern int extract_key_from_de_id(const oid_t locality, const de_id * id, -+ reiser4_key * key); -+extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2); -+extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key); -+ -+extern int build_readdir_key_common(struct file *dir, reiser4_key * result); -+extern void build_entry_key_common(const struct inode *dir, -+ const struct qstr *name, -+ reiser4_key * result); -+extern void build_entry_key_stable_entry(const struct inode *dir, -+ const struct qstr *name, -+ reiser4_key * result); -+extern int is_dot_key(const reiser4_key * key); -+extern reiser4_key *build_sd_key(const struct inode *target, -+ reiser4_key * result); -+ -+extern int 
is_longname_key(const reiser4_key * key); -+extern int is_longname(const char *name, int len); -+extern char *extract_name_from_key(const reiser4_key * key, char *buf); -+extern char *reiser4_unpack_string(__u64 value, char *buf); -+extern void complete_entry_key(const struct inode *dir, const char *name, -+ int len, reiser4_key *result); -+ -+/* __KASSIGN_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/Kconfig linux-2.6.24/fs/reiser4/Kconfig ---- linux-2.6.24.orig/fs/reiser4/Kconfig 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/Kconfig 2008-01-25 11:39:06.944209750 +0300 -@@ -0,0 +1,34 @@ -+config REISER4_FS -+ tristate "Reiser4 (EXPERIMENTAL)" -+ depends on EXPERIMENTAL -+ select ZLIB_INFLATE -+ select ZLIB_DEFLATE -+ select LZO_COMPRESS -+ select LZO_DECOMPRESS -+ select CRYPTO -+ help -+ Reiser4 is a filesystem that performs all filesystem operations -+ as atomic transactions, which means that it either performs a -+ write, or it does not, and in the event of a crash it does not -+ partially perform it or corrupt it. -+ -+ It stores files in dancing trees, which are like balanced trees but -+ faster. It packs small files together so that they share blocks -+ without wasting space. This means you can use it to store really -+ small files. It also means that it saves you disk space. It avoids -+ hassling you with anachronisms like having a maximum number of -+ inodes, and wasting space if you use less than that number. -+ -+ Reiser4 is a distinct filesystem type from reiserfs (V3). -+ It's therefore not possible to use reiserfs file systems -+ with reiser4. -+ -+ To learn more about reiser4, go to http://www.namesys.com -+ -+config REISER4_DEBUG -+ bool "Enable reiser4 debug mode" -+ depends on REISER4_FS -+ help -+ Don't use this unless you are debugging reiser4. 
-+ -+ If unsure, say N. -diff -urN linux-2.6.24.orig/fs/reiser4/key.c linux-2.6.24/fs/reiser4/key.c ---- linux-2.6.24.orig/fs/reiser4/key.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/key.c 2008-01-25 11:39:06.944209750 +0300 -@@ -0,0 +1,137 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Key manipulations. */ -+ -+#include "debug.h" -+#include "key.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include /* for __u?? */ -+ -+/* Minimal possible key: all components are zero. It is presumed that this is -+ independent of key scheme. */ -+static const reiser4_key MINIMAL_KEY = { -+ .el = { -+ 0ull, -+ ON_LARGE_KEY(0ull,) -+ 0ull, -+ 0ull -+ } -+}; -+ -+/* Maximal possible key: all components are ~0. It is presumed that this is -+ independent of key scheme. */ -+static const reiser4_key MAXIMAL_KEY = { -+ .el = { -+ __constant_cpu_to_le64(~0ull), -+ ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),) -+ __constant_cpu_to_le64(~0ull), -+ __constant_cpu_to_le64(~0ull) -+ } -+}; -+ -+/* Initialize key. */ -+void reiser4_key_init(reiser4_key * key /* key to init */ ) -+{ -+ assert("nikita-1169", key != NULL); -+ memset(key, 0, sizeof *key); -+} -+ -+/* minimal possible key in the tree. Return pointer to the static storage. */ -+const reiser4_key *reiser4_min_key(void) -+{ -+ return &MINIMAL_KEY; -+} -+ -+/* maximum possible key in the tree. Return pointer to the static storage. 
*/ -+const reiser4_key *reiser4_max_key(void) -+{ -+ return &MAXIMAL_KEY; -+} -+ -+#if REISER4_DEBUG -+/* debugging aid: print symbolic name of key type */ -+static const char *type_name(unsigned int key_type /* key type */ ) -+{ -+ switch (key_type) { -+ case KEY_FILE_NAME_MINOR: -+ return "file name"; -+ case KEY_SD_MINOR: -+ return "stat data"; -+ case KEY_ATTR_NAME_MINOR: -+ return "attr name"; -+ case KEY_ATTR_BODY_MINOR: -+ return "attr body"; -+ case KEY_BODY_MINOR: -+ return "file body"; -+ default: -+ return "unknown"; -+ } -+} -+ -+/* debugging aid: print human readable information about key */ -+void reiser4_print_key(const char *prefix /* prefix to print */ , -+ const reiser4_key * key /* key to print */ ) -+{ -+ /* turn bold on */ -+ /* printf ("\033[1m"); */ -+ if (key == NULL) -+ printk("%s: null key\n", prefix); -+ else { -+ if (REISER4_LARGE_KEY) -+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix, -+ get_key_locality(key), -+ get_key_type(key), -+ get_key_ordering(key), -+ get_key_band(key), -+ get_key_objectid(key), get_key_offset(key)); -+ else -+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix, -+ get_key_locality(key), -+ get_key_type(key), -+ get_key_band(key), -+ get_key_objectid(key), get_key_offset(key)); -+ /* -+ * if this is a key of directory entry, try to decode part of -+ * a name stored in the key, and output it. -+ */ -+ if (get_key_type(key) == KEY_FILE_NAME_MINOR) { -+ char buf[DE_NAME_BUF_LEN]; -+ char *c; -+ -+ c = buf; -+ c = reiser4_unpack_string(get_key_ordering(key), c); -+ reiser4_unpack_string(get_key_fulloid(key), c); -+ printk("[%s", buf); -+ if (is_longname_key(key)) -+ /* -+ * only part of the name is stored in the key. -+ */ -+ printk("...]\n"); -+ else { -+ /* -+ * whole name is stored in the key. 
-+ */ -+ reiser4_unpack_string(get_key_offset(key), buf); -+ printk("%s]\n", buf); -+ } -+ } else { -+ printk("[%s]\n", type_name(get_key_type(key))); -+ } -+ } -+ /* turn bold off */ -+ /* printf ("\033[m\017"); */ -+} -+ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/key.h linux-2.6.24/fs/reiser4/key.h ---- linux-2.6.24.orig/fs/reiser4/key.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/key.h 2008-01-25 11:39:06.944209750 +0300 -@@ -0,0 +1,384 @@ -+/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Declarations of key-related data-structures and operations on keys. */ -+ -+#if !defined( __REISER4_KEY_H__ ) -+#define __REISER4_KEY_H__ -+ -+#include "dformat.h" -+#include "forward.h" -+#include "debug.h" -+ -+#include /* for __u?? */ -+ -+/* Operations on keys in reiser4 tree */ -+ -+/* No access to any of these fields shall be done except via a -+ wrapping macro/function, and that wrapping macro/function shall -+ convert to little endian order. Compare keys will consider cpu byte order. */ -+ -+/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below -+ which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files -+ within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong -+ approach, and whether there should be no difference at all. For current usage patterns this choice is probably the -+ right one. 
*/ -+ -+/* possible values for minor packing locality (4 bits required) */ -+typedef enum { -+ /* file name */ -+ KEY_FILE_NAME_MINOR = 0, -+ /* stat-data */ -+ KEY_SD_MINOR = 1, -+ /* file attribute name */ -+ KEY_ATTR_NAME_MINOR = 2, -+ /* file attribute value */ -+ KEY_ATTR_BODY_MINOR = 3, -+ /* file body (tail or extent) */ -+ KEY_BODY_MINOR = 4, -+} key_minor_locality; -+ -+/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key. -+ Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space, -+ and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to -+ segregate extents from tails, it is better to give them distinct minor packing localities rather than changing -+ block_alloc.c to check the node type when deciding where to allocate the node. -+ -+ The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it -+ should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our -+ current implementation tails have a different minor packing locality from extents, and no files have both extents and -+ tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now.... -+*/ -+ -+/* Arbitrary major packing localities can be assigned to objects using -+ the reiser4(filenameA/..packing<=some_number) system call. -+ -+ In reiser4, the creat() syscall creates a directory -+ -+ whose default flow (that which is referred to if the directory is -+ read as a file) is the traditional unix file body. -+ -+ whose directory plugin is the 'filedir' -+ -+ whose major packing locality is that of the parent of the object created. -+ -+ The static_stat item is a particular commonly used directory -+ compression (the one for normal unix files). 
-+ -+ The filedir plugin checks to see if the static_stat item exists. -+ There is a unique key for static_stat. If yes, then it uses the -+ static_stat item for all of the values that it contains. The -+ static_stat item contains a flag for each stat it contains which -+ indicates whether one should look outside the static_stat item for its -+ contents. -+*/ -+ -+/* offset of fields in reiser4_key. Value of each element of this enum -+ is index within key (thought as array of __u64's) where this field -+ is. */ -+typedef enum { -+ /* major "locale", aka dirid. Sits in 1st element */ -+ KEY_LOCALITY_INDEX = 0, -+ /* minor "locale", aka item type. Sits in 1st element */ -+ KEY_TYPE_INDEX = 0, -+ ON_LARGE_KEY(KEY_ORDERING_INDEX,) -+ /* "object band". Sits in 2nd element */ -+ KEY_BAND_INDEX, -+ /* objectid. Sits in 2nd element */ -+ KEY_OBJECTID_INDEX = KEY_BAND_INDEX, -+ /* full objectid. Sits in 2nd element */ -+ KEY_FULLOID_INDEX = KEY_BAND_INDEX, -+ /* Offset. Sits in 3rd element */ -+ KEY_OFFSET_INDEX, -+ /* Name hash. Sits in 3rd element */ -+ KEY_HASH_INDEX = KEY_OFFSET_INDEX, -+ KEY_CACHELINE_END = KEY_OFFSET_INDEX, -+ KEY_LAST_INDEX -+} reiser4_key_field_index; -+ -+/* key in reiser4 internal "balanced" tree. It is just array of three -+ 64bit integers in disk byte order (little-endian by default). This -+ array is actually indexed by reiser4_key_field. Each __u64 within -+ this array is called "element". Logical key component encoded within -+ elements are called "fields". -+ -+ We declare this as union with second component dummy to suppress -+ inconvenient array<->pointer casts implied in C. */ -+union reiser4_key { -+ __le64 el[KEY_LAST_INDEX]; -+ int pad; -+}; -+ -+/* bitmasks showing where within reiser4_key particular key is stored. 
*/ -+/* major locality occupies higher 60 bits of the first element */ -+#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull -+ -+/* minor locality occupies lower 4 bits of the first element */ -+#define KEY_TYPE_MASK 0xfull -+ -+/* controversial band occupies higher 4 bits of the 2nd element */ -+#define KEY_BAND_MASK 0xf000000000000000ull -+ -+/* objectid occupies lower 60 bits of the 2nd element */ -+#define KEY_OBJECTID_MASK 0x0fffffffffffffffull -+ -+/* full 64bit objectid*/ -+#define KEY_FULLOID_MASK 0xffffffffffffffffull -+ -+/* offset is just 3rd L.M.Nt itself */ -+#define KEY_OFFSET_MASK 0xffffffffffffffffull -+ -+/* ordering is whole second element */ -+#define KEY_ORDERING_MASK 0xffffffffffffffffull -+ -+/* how many bits key element should be shifted to left to get particular field */ -+typedef enum { -+ KEY_LOCALITY_SHIFT = 4, -+ KEY_TYPE_SHIFT = 0, -+ KEY_BAND_SHIFT = 60, -+ KEY_OBJECTID_SHIFT = 0, -+ KEY_FULLOID_SHIFT = 0, -+ KEY_OFFSET_SHIFT = 0, -+ KEY_ORDERING_SHIFT = 0, -+} reiser4_key_field_shift; -+ -+static inline __u64 -+get_key_el(const reiser4_key * key, reiser4_key_field_index off) -+{ -+ assert("nikita-753", key != NULL); -+ assert("nikita-754", off < KEY_LAST_INDEX); -+ return le64_to_cpu(get_unaligned(&key->el[off])); -+} -+ -+static inline void -+set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value) -+{ -+ assert("nikita-755", key != NULL); -+ assert("nikita-756", off < KEY_LAST_INDEX); -+ put_unaligned(cpu_to_le64(value), &key->el[off]); -+} -+ -+/* macro to define getter and setter functions for field F with type T */ -+#define DEFINE_KEY_FIELD( L, U, T ) \ -+static inline T get_key_ ## L ( const reiser4_key *key ) \ -+{ \ -+ assert( "nikita-750", key != NULL ); \ -+ return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \ -+ KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \ -+} \ -+ \ -+static inline void set_key_ ## L ( reiser4_key *key, T loc ) \ -+{ \ -+ __u64 el; \ -+ \ -+ assert( "nikita-752", key != NULL ); \ -+ \ 
-+ el = get_key_el( key, KEY_ ## U ## _INDEX ); \ -+ /* clear field bits in the key */ \ -+ el &= ~KEY_ ## U ## _MASK; \ -+ /* actually it should be \ -+ \ -+ el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \ -+ \ -+ but we trust user to never pass values that wouldn't fit \ -+ into field. Clearing extra bits is one operation, but this \ -+ function is time-critical. \ -+ But check this in assertion. */ \ -+ assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \ -+ ~KEY_ ## U ## _MASK ) == 0 ); \ -+ el |= ( loc << KEY_ ## U ## _SHIFT ); \ -+ set_key_el( key, KEY_ ## U ## _INDEX, el ); \ -+} -+ -+typedef __u64 oid_t; -+ -+/* define get_key_locality(), set_key_locality() */ -+DEFINE_KEY_FIELD(locality, LOCALITY, oid_t); -+/* define get_key_type(), set_key_type() */ -+DEFINE_KEY_FIELD(type, TYPE, key_minor_locality); -+/* define get_key_band(), set_key_band() */ -+DEFINE_KEY_FIELD(band, BAND, __u64); -+/* define get_key_objectid(), set_key_objectid() */ -+DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t); -+/* define get_key_fulloid(), set_key_fulloid() */ -+DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t); -+/* define get_key_offset(), set_key_offset() */ -+DEFINE_KEY_FIELD(offset, OFFSET, __u64); -+#if (REISER4_LARGE_KEY) -+/* define get_key_ordering(), set_key_ordering() */ -+DEFINE_KEY_FIELD(ordering, ORDERING, __u64); -+#else -+static inline __u64 get_key_ordering(const reiser4_key * key) -+{ -+ return 0; -+} -+ -+static inline void set_key_ordering(reiser4_key * key, __u64 val) -+{ -+} -+#endif -+ -+/* key comparison result */ -+typedef enum { LESS_THAN = -1, /* if first key is less than second */ -+ EQUAL_TO = 0, /* if keys are equal */ -+ GREATER_THAN = +1 /* if first key is greater than second */ -+} cmp_t; -+ -+void reiser4_key_init(reiser4_key * key); -+ -+/* minimal possible key in the tree. Return pointer to the static storage. 
*/ -+extern const reiser4_key *reiser4_min_key(void); -+extern const reiser4_key *reiser4_max_key(void); -+ -+/* helper macro for keycmp() */ -+#define KEY_DIFF(k1, k2, field) \ -+({ \ -+ typeof (get_key_ ## field (k1)) f1; \ -+ typeof (get_key_ ## field (k2)) f2; \ -+ \ -+ f1 = get_key_ ## field (k1); \ -+ f2 = get_key_ ## field (k2); \ -+ \ -+ (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \ -+}) -+ -+/* helper macro for keycmp() */ -+#define KEY_DIFF_EL(k1, k2, off) \ -+({ \ -+ __u64 e1; \ -+ __u64 e2; \ -+ \ -+ e1 = get_key_el(k1, off); \ -+ e2 = get_key_el(k2, off); \ -+ \ -+ (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \ -+}) -+ -+/* compare `k1' and `k2'. This function is a heart of "key allocation -+ policy". All you need to implement new policy is to add yet another -+ clause here. */ -+static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2 /* second key to compare */ ) -+{ -+ cmp_t result; -+ -+ /* -+ * This function is the heart of reiser4 tree-routines. Key comparison -+ * is among most heavily used operations in the file system. -+ */ -+ -+ assert("nikita-439", k1 != NULL); -+ assert("nikita-440", k2 != NULL); -+ -+ /* there is no actual branch here: condition is compile time constant -+ * and constant folding and propagation ensures that only one branch -+ * is actually compiled in. */ -+ -+ if (REISER4_PLANA_KEY_ALLOCATION) { -+ /* if physical order of fields in a key is identical -+ with logical order, we can implement key comparison -+ as three 64bit comparisons. */ -+ /* logical order of fields in plan-a: -+ locality->type->objectid->offset. 
*/ -+ /* compare locality and type at once */ -+ result = KEY_DIFF_EL(k1, k2, 0); -+ if (result == EQUAL_TO) { -+ /* compare objectid (and band if it's there) */ -+ result = KEY_DIFF_EL(k1, k2, 1); -+ /* compare offset */ -+ if (result == EQUAL_TO) { -+ result = KEY_DIFF_EL(k1, k2, 2); -+ if (REISER4_LARGE_KEY && result == EQUAL_TO) { -+ result = KEY_DIFF_EL(k1, k2, 3); -+ } -+ } -+ } -+ } else if (REISER4_3_5_KEY_ALLOCATION) { -+ result = KEY_DIFF(k1, k2, locality); -+ if (result == EQUAL_TO) { -+ result = KEY_DIFF(k1, k2, objectid); -+ if (result == EQUAL_TO) { -+ result = KEY_DIFF(k1, k2, type); -+ if (result == EQUAL_TO) -+ result = KEY_DIFF(k1, k2, offset); -+ } -+ } -+ } else -+ impossible("nikita-441", "Unknown key allocation scheme!"); -+ return result; -+} -+ -+/* true if @k1 equals @k2 */ -+static inline int keyeq(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2 /* second key to compare */ ) -+{ -+ assert("nikita-1879", k1 != NULL); -+ assert("nikita-1880", k2 != NULL); -+ return !memcmp(k1, k2, sizeof *k1); -+} -+ -+/* true if @k1 is less than @k2 */ -+static inline int keylt(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2 /* second key to compare */ ) -+{ -+ assert("nikita-1952", k1 != NULL); -+ assert("nikita-1953", k2 != NULL); -+ return keycmp(k1, k2) == LESS_THAN; -+} -+ -+/* true if @k1 is less than or equal to @k2 */ -+static inline int keyle(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2 /* second key to compare */ ) -+{ -+ assert("nikita-1954", k1 != NULL); -+ assert("nikita-1955", k2 != NULL); -+ return keycmp(k1, k2) != GREATER_THAN; -+} -+ -+/* true if @k1 is greater than @k2 */ -+static inline int keygt(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2 /* second key to compare */ ) -+{ -+ assert("nikita-1959", k1 != NULL); -+ assert("nikita-1960", k2 != NULL); -+ return keycmp(k1, k2) == GREATER_THAN; -+} -+ -+/* true if @k1 
is greater than or equal to @k2 */ -+static inline int keyge(const reiser4_key * k1 /* first key to compare */ , -+ const reiser4_key * k2 /* second key to compare */ ) -+{ -+ assert("nikita-1956", k1 != NULL); -+ assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched -+ * November 3: Laika */ -+ return keycmp(k1, k2) != LESS_THAN; -+} -+ -+static inline void prefetchkey(reiser4_key * key) -+{ -+ prefetch(key); -+ prefetch(&key->el[KEY_CACHELINE_END]); -+} -+ -+/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) = -+ 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */ -+/* size of a buffer suitable to hold human readable key representation */ -+#define KEY_BUF_LEN (80) -+ -+#if REISER4_DEBUG -+extern void reiser4_print_key(const char *prefix, const reiser4_key * key); -+#else -+#define reiser4_print_key(p,k) noop -+#endif -+ -+/* __FS_REISERFS_KEY_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/ktxnmgrd.c linux-2.6.24/fs/reiser4/ktxnmgrd.c ---- linux-2.6.24.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/ktxnmgrd.c 2008-01-25 11:39:06.944209750 +0300 -@@ -0,0 +1,214 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* Transaction manager daemon. */ -+ -+/* -+ * ktxnmgrd is a kernel daemon responsible for committing transactions. It is -+ * needed/important for the following reasons: -+ * -+ * 1. in reiser4 atom is not committed immediately when last transaction -+ * handle closes, unless atom is either too old or too large (see -+ * atom_should_commit()). This is done to avoid committing too frequently. -+ * because: -+ * -+ * 2. sometimes we don't want to commit atom when closing last transaction -+ * handle even if it is old and fat enough. 
For example, because we are at -+ * this point under directory semaphore, and committing would stall all -+ * accesses to this directory. -+ * -+ * ktxnmgrd binds its time sleeping on condition variable. When is awakes -+ * either due to (tunable) timeout or because it was explicitly woken up by -+ * call to ktxnmgrd_kick(), it scans list of all atoms and commits ones -+ * eligible. -+ * -+ */ -+ -+#include "debug.h" -+#include "txnmgr.h" -+#include "tree.h" -+#include "ktxnmgrd.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include /* for struct task_struct */ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static int scan_mgr(struct super_block *); -+ -+/* -+ * change current->comm so that ps, top, and friends will see changed -+ * state. This serves no useful purpose whatsoever, but also costs nothing. May -+ * be it will make lonely system administrator feeling less alone at 3 A.M. -+ */ -+#define set_comm( state ) \ -+ snprintf( current -> comm, sizeof( current -> comm ), \ -+ "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) ) -+ -+/** -+ * ktxnmgrd - kernel txnmgr daemon -+ * @arg: pointer to super block -+ * -+ * The background transaction manager daemon, started as a kernel thread during -+ * reiser4 initialization. -+ */ -+static int ktxnmgrd(void *arg) -+{ -+ struct super_block *super; -+ ktxnmgrd_context *ctx; -+ txn_mgr *mgr; -+ int done = 0; -+ -+ super = arg; -+ mgr = &get_super_private(super)->tmgr; -+ -+ /* -+ * do_fork() just copies task_struct into the new thread. ->fs_context -+ * shouldn't be copied of course. This shouldn't be a problem for the -+ * rest of the code though. 
-+ */ -+ current->journal_info = NULL; -+ ctx = mgr->daemon; -+ while (1) { -+ try_to_freeze(); -+ set_comm("wait"); -+ { -+ DEFINE_WAIT(__wait); -+ -+ prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE); -+ if (kthread_should_stop()) { -+ done = 1; -+ } else -+ schedule_timeout(ctx->timeout); -+ finish_wait(&ctx->wait, &__wait); -+ } -+ if (done) -+ break; -+ set_comm("run"); -+ spin_lock(&ctx->guard); -+ /* -+ * wait timed out or ktxnmgrd was woken up by explicit request -+ * to commit something. Scan list of atoms in txnmgr and look -+ * for too old atoms. -+ */ -+ do { -+ ctx->rescan = 0; -+ scan_mgr(super); -+ spin_lock(&ctx->guard); -+ if (ctx->rescan) { -+ /* -+ * the list could be modified while ctx -+ * spinlock was released, we have to repeat -+ * scanning from the beginning -+ */ -+ break; -+ } -+ } while (ctx->rescan); -+ spin_unlock(&ctx->guard); -+ } -+ return 0; -+} -+ -+#undef set_comm -+ -+/** -+ * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon -+ * @super: pointer to super block -+ * -+ * Allocates and initializes ktxnmgrd_context, attaches it to transaction -+ * manager. Starts kernel txnmgr daemon. This is called on mount. 
-+ */ -+int reiser4_init_ktxnmgrd(struct super_block *super) -+{ -+ txn_mgr *mgr; -+ ktxnmgrd_context *ctx; -+ -+ mgr = &get_super_private(super)->tmgr; -+ -+ assert("zam-1014", mgr->daemon == NULL); -+ -+ ctx = kzalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get()); -+ if (!ctx) -+ return RETERR(-ENOMEM); -+ -+ assert("nikita-2442", ctx != NULL); -+ -+ init_waitqueue_head(&ctx->wait); -+ -+ /*kcond_init(&ctx->startup);*/ -+ spin_lock_init(&ctx->guard); -+ ctx->timeout = REISER4_TXNMGR_TIMEOUT; -+ ctx->rescan = 1; -+ mgr->daemon = ctx; -+ -+ ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd"); -+ if (IS_ERR(ctx->tsk)) { -+ int ret = PTR_ERR(ctx->tsk); -+ mgr->daemon = NULL; -+ kfree(ctx); -+ return RETERR(ret); -+ } -+ return 0; -+} -+ -+void ktxnmgrd_kick(txn_mgr *mgr) -+{ -+ assert("nikita-3234", mgr != NULL); -+ assert("nikita-3235", mgr->daemon != NULL); -+ wake_up(&mgr->daemon->wait); -+} -+ -+int is_current_ktxnmgrd(void) -+{ -+ return (get_current_super_private()->tmgr.daemon->tsk == current); -+} -+ -+/** -+ * scan_mgr - commit atoms which are to be committed -+ * @super: super block to commit atoms of -+ * -+ * Commits old atoms. -+ */ -+static int scan_mgr(struct super_block *super) -+{ -+ int ret; -+ reiser4_context ctx; -+ -+ init_stack_context(&ctx, super); -+ -+ ret = commit_some_atoms(&get_super_private(super)->tmgr); -+ -+ reiser4_exit_context(&ctx); -+ return ret; -+} -+ -+/** -+ * reiser4_done_ktxnmgrd - stop kernel thread and frees ktxnmgrd context -+ * @mgr: -+ * -+ * This is called on umount. 
Stops ktxnmgrd and free t -+ */ -+void reiser4_done_ktxnmgrd(struct super_block *super) -+{ -+ txn_mgr *mgr; -+ -+ mgr = &get_super_private(super)->tmgr; -+ assert("zam-1012", mgr->daemon != NULL); -+ -+ kthread_stop(mgr->daemon->tsk); -+ kfree(mgr->daemon); -+ mgr->daemon = NULL; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/ktxnmgrd.h linux-2.6.24/fs/reiser4/ktxnmgrd.h ---- linux-2.6.24.orig/fs/reiser4/ktxnmgrd.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/ktxnmgrd.h 2008-01-25 11:39:06.944209750 +0300 -@@ -0,0 +1,52 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Transaction manager daemon. See ktxnmgrd.c for comments. */ -+ -+#ifndef __KTXNMGRD_H__ -+#define __KTXNMGRD_H__ -+ -+#include "txnmgr.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include /* for struct task_struct */ -+ -+/* in this structure all data necessary to start up, shut down and communicate -+ * with ktxnmgrd are kept. */ -+struct ktxnmgrd_context { -+ /* wait queue head on which ktxnmgrd sleeps */ -+ wait_queue_head_t wait; -+ /* spin lock protecting all fields of this structure */ -+ spinlock_t guard; -+ /* timeout of sleeping on ->wait */ -+ signed long timeout; -+ /* kernel thread running ktxnmgrd */ -+ struct task_struct *tsk; -+ /* list of all file systems served by this ktxnmgrd */ -+ struct list_head queue; -+ /* should ktxnmgrd repeat scanning of atoms? */ -+ unsigned int rescan:1; -+}; -+ -+extern int reiser4_init_ktxnmgrd(struct super_block *); -+extern void reiser4_done_ktxnmgrd(struct super_block *); -+ -+extern void ktxnmgrd_kick(txn_mgr * mgr); -+extern int is_current_ktxnmgrd(void); -+ -+/* __KTXNMGRD_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/lock.c linux-2.6.24/fs/reiser4/lock.c ---- linux-2.6.24.orig/fs/reiser4/lock.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/lock.c 2008-01-25 11:39:06.948210780 +0300 -@@ -0,0 +1,1232 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Traditional deadlock avoidance is achieved by acquiring all locks in a single -+ order. V4 balances the tree from the bottom up, and searches the tree from -+ the top down, and that is really the way we want it, so tradition won't work -+ for us. -+ -+ Instead we have two lock orderings, a high priority lock ordering, and a low -+ priority lock ordering. Each node in the tree has a lock in its znode. -+ -+ Suppose we have a set of processes which lock (R/W) tree nodes. Each process -+ has a set (maybe empty) of already locked nodes ("process locked set"). Each -+ process may have a pending lock request to a node locked by another process. -+ Note: we lock and unlock, but do not transfer locks: it is possible -+ transferring locks instead would save some bus locking.... -+ -+ Deadlock occurs when we have a loop constructed from process locked sets and -+ lock request vectors. -+ -+ NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in -+ memory is extended with "znodes" with which we connect nodes with their left -+ and right neighbors using sibling pointers stored in the znodes. When we -+ perform balancing operations we often go from left to right and from right to -+ left. 
-+ -+ +-P1-+ +-P3-+ -+ |+--+| V1 |+--+| -+ ||N1|| -------> ||N3|| -+ |+--+| |+--+| -+ +----+ +----+ -+ ^ | -+ |V2 |V3 -+ | v -+ +---------P2---------+ -+ |+--+ +--+| -+ ||N2| -------- |N4|| -+ |+--+ +--+| -+ +--------------------+ -+ -+ We solve this by ensuring that only low priority processes lock in top to -+ bottom order and from right to left, and high priority processes lock from -+ bottom to top and left to right. -+ -+ ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and -+ kill those damn busy loops. -+ ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom -+ stage) cannot be ordered that way. There are no rules what nodes can belong -+ to the atom and what nodes cannot. We cannot define what is right or left -+ direction, what is top or bottom. We can take immediate parent or side -+ neighbor of one node, but nobody guarantees that, say, left neighbor node is -+ not a far right neighbor for other nodes from the same atom. It breaks -+ deadlock avoidance rules and hi-low priority locking cannot be applied for -+ atom locks. -+ -+ How does it help to avoid deadlocks ? -+ -+ Suppose we have a deadlock with n processes. Processes from one priority -+ class never deadlock because they take locks in one consistent -+ order. -+ -+ So, any possible deadlock loop must have low priority as well as high -+ priority processes. There are no other lock priority levels except low and -+ high. We know that any deadlock loop contains at least one node locked by a -+ low priority process and requested by a high priority process. If this -+ situation is caught and resolved it is sufficient to avoid deadlocks. -+ -+ V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION. -+ -+ The deadlock prevention algorithm is based on comparing -+ priorities of node owners (processes which keep znode locked) and -+ requesters (processes which want to acquire a lock on znode). 
We -+ implement a scheme where low-priority owners yield locks to -+ high-priority requesters. We created a signal passing system that -+ is used to ask low-priority processes to yield one or more locked -+ znodes. -+ -+ The condition when a znode needs to change its owners is described by the -+ following formula: -+ -+ ############################################# -+ # # -+ # (number of high-priority requesters) > 0 # -+ # AND # -+ # (numbers of high-priority owners) == 0 # -+ # # -+ ############################################# -+ -+ Note that a low-priority process delays node releasing if another -+ high-priority process owns this node. So, slightly more strictly speaking, -+ to have a deadlock capable cycle you must have a loop in which a high -+ priority process is waiting on a low priority process to yield a node, which -+ is slightly different from saying a high priority process is waiting on a -+ node owned by a low priority process. -+ -+ It is enough to avoid deadlocks if we prevent any low-priority process from -+ falling asleep if its locked set contains a node which satisfies the -+ deadlock condition. -+ -+ That condition is implicitly or explicitly checked in all places where new -+ high-priority requests may be added or removed from node request queue or -+ high-priority process takes or releases a lock on node. The main -+ goal of these checks is to never lose the moment when node becomes "has -+ wrong owners" and send "must-yield-this-lock" signals to its low-pri owners -+ at that time. -+ -+ The information about received signals is stored in the per-process -+ structure (lock stack) and analyzed before a low-priority process goes to -+ sleep but after a "fast" attempt to lock a node fails. Any signal wakes -+ sleeping process up and forces him to re-check lock status and received -+ signal info. If "must-yield-this-lock" signals were received the locking -+ primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code. 
-+ -+ V4 LOCKING DRAWBACKS -+ -+ If we have already balanced on one level, and we are propagating our changes -+ upward to a higher level, it could be very messy to surrender all locks on -+ the lower level because we put so much computational work into it, and -+ reverting them to their state before they were locked might be very complex. -+ We also don't want to acquire all locks before performing balancing because -+ that would either be almost as much work as the balancing, or it would be -+ too conservative and lock too much. We want balancing to be done only at -+ high priority. Yet, we might want to go to the left one node and use some -+ of its empty space... So we make one attempt at getting the node to the left -+ using try_lock, and if it fails we do without it, because we didn't really -+ need it, it was only a nice to have. -+ -+ LOCK STRUCTURES DESCRIPTION -+ -+ The following data structures are used in the reiser4 locking -+ implementation: -+ -+ All fields related to long-term locking are stored in znode->lock. -+ -+ The lock stack is a per thread object. It owns all znodes locked by the -+ thread. One znode may be locked by several threads in case of read lock or -+ one znode may be write locked by one thread several times. The special link -+ objects (lock handles) support n<->m relation between znodes and lock -+ owners. -+ -+ -+ -+ +---------+ +---------+ -+ | LS1 | | LS2 | -+ +---------+ +---------+ -+ ^ ^ -+ |---------------+ +----------+ -+ v v v v -+ +---------+ +---------+ +---------+ +---------+ -+ | LH1 | | LH2 | | LH3 | | LH4 | -+ +---------+ +---------+ +---------+ +---------+ -+ ^ ^ ^ ^ -+ | +------------+ | -+ v v v -+ +---------+ +---------+ +---------+ -+ | Z1 | | Z2 | | Z3 | -+ +---------+ +---------+ +---------+ -+ -+ Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. 
The -+ picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and -+ LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode -+ Z1 is locked by only one thread, znode has only one lock handle LH1 on its -+ list, similar situation is for Z3 which is locked by the thread 2 only. Z2 -+ is locked (for read) twice by different threads and two lock handles are on -+ its list. Each lock handle represents a single relation of a locking of a -+ znode by a thread. Locking of a znode is an establishing of a locking -+ relation between the lock stack and the znode by adding of a new lock handle -+ to a list of lock handles, the lock stack. The lock stack links all lock -+ handles for all znodes locked by the lock stack. The znode list groups all -+ lock handles for all locks stacks which locked the znode. -+ -+ Yet another relation may exist between znode and lock owners. If lock -+ procedure cannot immediately take lock on an object it adds the lock owner -+ on special `requestors' list belongs to znode. That list represents a -+ queue of pending lock requests. Because one lock owner may request only -+ only one lock object at a time, it is a 1->n relation between lock objects -+ and a lock owner implemented as it is described above. Full information -+ (priority, pointers to lock and link objects) about each lock request is -+ stored in lock owner structure in `request' field. -+ -+ SHORT_TERM LOCKING -+ -+ This is a list of primitive operations over lock stacks / lock handles / -+ znodes and locking descriptions for them. -+ -+ 1. locking / unlocking which is done by two list insertion/deletion, one -+ to/from znode's list of lock handles, another one is to/from lock stack's -+ list of lock handles. The first insertion is protected by -+ znode->lock.guard spinlock. The list owned by the lock stack can be -+ modified only by thread who owns the lock stack and nobody else can -+ modify/read it. 
There is nothing to be protected by a spinlock or -+ something else. -+ -+ 2. adding/removing a lock request to/from znode requesters list. The rule is -+ that znode->lock.guard spinlock should be taken for this. -+ -+ 3. we can traverse list of lock handles and use references to lock stacks who -+ locked given znode if znode->lock.guard spinlock is taken. -+ -+ 4. If a lock stack is associated with a znode as a lock requestor or lock -+ owner its existence is guaranteed by znode->lock.guard spinlock. Some its -+ (lock stack's) fields should be protected from being accessed in parallel -+ by two or more threads. Please look at lock_stack structure definition -+ for the info how those fields are protected. */ -+ -+/* Znode lock and capturing intertwining. */ -+/* In current implementation we capture formatted nodes before locking -+ them. Take a look on longterm lock znode, reiser4_try_capture() request -+ precedes locking requests. The longterm_lock_znode function unconditionally -+ captures znode before even checking of locking conditions. -+ -+ Another variant is to capture znode after locking it. It was not tested, but -+ at least one deadlock condition is supposed to be there. One thread has -+ locked a znode (Node-1) and calls reiser4_try_capture() for it. -+ reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state. -+ Second thread is a flushing thread, its current atom is the atom Node-1 -+ belongs to. Second thread wants to lock Node-1 and sleeps because Node-1 -+ is locked by the first thread. The described situation is a deadlock. 
*/ -+ -+#include "debug.h" -+#include "txnmgr.h" -+#include "znode.h" -+#include "jnode.h" -+#include "tree.h" -+#include "plugin/node/node.h" -+#include "super.h" -+ -+#include -+ -+#if REISER4_DEBUG -+static int request_is_deadlock_safe(znode *, znode_lock_mode, -+ znode_lock_request); -+#endif -+ -+/* Returns a lock owner associated with current thread */ -+lock_stack *get_current_lock_stack(void) -+{ -+ return &get_current_context()->stack; -+} -+ -+/* Wakes up all low priority owners informing them about possible deadlock */ -+static void wake_up_all_lopri_owners(znode * node) -+{ -+ lock_handle *handle; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ list_for_each_entry(handle, &node->lock.owners, owners_link) { -+ assert("nikita-1832", handle->node == node); -+ /* count this signal in owner->nr_signaled */ -+ if (!handle->signaled) { -+ handle->signaled = 1; -+ atomic_inc(&handle->owner->nr_signaled); -+ /* Wake up a single process */ -+ reiser4_wake_up(handle->owner); -+ } -+ } -+} -+ -+/* Adds a lock to a lock owner, which means creating a link to the lock and -+ putting the link into the two lists all links are on (the doubly linked list -+ that forms the lock_stack, and the doubly linked list of links attached -+ to a lock. 
-+*/ -+static inline void -+link_object(lock_handle * handle, lock_stack * owner, znode * node) -+{ -+ assert("jmacd-810", handle->owner == NULL); -+ assert_spin_locked(&(node->lock.guard)); -+ -+ handle->owner = owner; -+ handle->node = node; -+ -+ assert("reiser4-4", -+ ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0)); -+ -+ /* add lock handle to the end of lock_stack's list of locks */ -+ list_add_tail(&handle->locks_link, &owner->locks); -+ ON_DEBUG(owner->nr_locks++); -+ reiser4_ctx_gfp_mask_set(); -+ -+ /* add lock handle to the head of znode's list of owners */ -+ list_add(&handle->owners_link, &node->lock.owners); -+ handle->signaled = 0; -+} -+ -+/* Breaks a relation between a lock and its owner */ -+static inline void unlink_object(lock_handle * handle) -+{ -+ assert("zam-354", handle->owner != NULL); -+ assert("nikita-1608", handle->node != NULL); -+ assert_spin_locked(&(handle->node->lock.guard)); -+ assert("nikita-1829", handle->owner == get_current_lock_stack()); -+ assert("reiser4-5", handle->owner->nr_locks > 0); -+ -+ /* remove lock handle from lock_stack's list of locks */ -+ list_del(&handle->locks_link); -+ ON_DEBUG(handle->owner->nr_locks--); -+ reiser4_ctx_gfp_mask_set(); -+ assert("reiser4-6", -+ ergo(list_empty_careful(&handle->owner->locks), -+ handle->owner->nr_locks == 0)); -+ /* remove lock handle from znode's list of owners */ -+ list_del(&handle->owners_link); -+ /* indicates that lock handle is free now */ -+ handle->node = NULL; -+#if REISER4_DEBUG -+ INIT_LIST_HEAD(&handle->locks_link); -+ INIT_LIST_HEAD(&handle->owners_link); -+ handle->owner = NULL; -+#endif -+} -+ -+/* Actually locks an object knowing that we are able to do this */ -+static void lock_object(lock_stack * owner) -+{ -+ struct lock_request *request; -+ znode *node; -+ -+ request = &owner->request; -+ node = request->node; -+ assert_spin_locked(&(node->lock.guard)); -+ if (request->mode == ZNODE_READ_LOCK) { -+ node->lock.nr_readers++; -+ } else { -+ /* 
check that we don't switched from read to write lock */ -+ assert("nikita-1840", node->lock.nr_readers <= 0); -+ /* We allow recursive locking; a node can be locked several -+ times for write by same process */ -+ node->lock.nr_readers--; -+ } -+ -+ link_object(request->handle, owner, node); -+ -+ if (owner->curpri) { -+ node->lock.nr_hipri_owners++; -+ } -+} -+ -+/* Check for recursive write locking */ -+static int recursive(lock_stack * owner) -+{ -+ int ret; -+ znode *node; -+ lock_handle *lh; -+ -+ node = owner->request.node; -+ -+ /* Owners list is not empty for a locked node */ -+ assert("zam-314", !list_empty_careful(&node->lock.owners)); -+ assert("nikita-1841", owner == get_current_lock_stack()); -+ assert_spin_locked(&(node->lock.guard)); -+ -+ lh = list_entry(node->lock.owners.next, lock_handle, owners_link); -+ ret = (lh->owner == owner); -+ -+ /* Recursive read locking should be done usual way */ -+ assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK); -+ /* mixing of read/write locks is not allowed */ -+ assert("zam-341", !ret || znode_is_wlocked(node)); -+ -+ return ret; -+} -+ -+#if REISER4_DEBUG -+/* Returns true if the lock is held by the calling thread. */ -+int znode_is_any_locked(const znode * node) -+{ -+ lock_handle *handle; -+ lock_stack *stack; -+ int ret; -+ -+ if (!znode_is_locked(node)) { -+ return 0; -+ } -+ -+ stack = get_current_lock_stack(); -+ -+ spin_lock_stack(stack); -+ -+ ret = 0; -+ -+ list_for_each_entry(handle, &stack->locks, locks_link) { -+ if (handle->node == node) { -+ ret = 1; -+ break; -+ } -+ } -+ -+ spin_unlock_stack(stack); -+ -+ return ret; -+} -+ -+#endif -+ -+/* Returns true if a write lock is held by the calling thread. 
*/ -+int znode_is_write_locked(const znode * node) -+{ -+ lock_stack *stack; -+ lock_handle *handle; -+ -+ assert("jmacd-8765", node != NULL); -+ -+ if (!znode_is_wlocked(node)) { -+ return 0; -+ } -+ -+ stack = get_current_lock_stack(); -+ -+ /* -+ * When znode is write locked, all owner handles point to the same lock -+ * stack. Get pointer to lock stack from the first lock handle from -+ * znode's owner list -+ */ -+ handle = list_entry(node->lock.owners.next, lock_handle, owners_link); -+ -+ return (handle->owner == stack); -+} -+ -+/* This "deadlock" condition is the essential part of reiser4 locking -+ implementation. This condition is checked explicitly by calling -+ check_deadlock_condition() or implicitly in all places where znode lock -+ state (set of owners and request queue) is changed. Locking code is -+ designed to use this condition to trigger procedure of passing object from -+ low priority owner(s) to high priority one(s). -+ -+ The procedure results in passing an event (setting lock_handle->signaled -+ flag) and counting this event in nr_signaled field of owner's lock stack -+ object and wakeup owner's process. -+*/ -+static inline int check_deadlock_condition(znode * node) -+{ -+ assert_spin_locked(&(node->lock.guard)); -+ return node->lock.nr_hipri_requests > 0 -+ && node->lock.nr_hipri_owners == 0; -+} -+ -+static int check_livelock_condition(znode * node, znode_lock_mode mode) -+{ -+ zlock * lock = &node->lock; -+ -+ return mode == ZNODE_READ_LOCK && -+ lock -> nr_readers >= 0 && lock->nr_hipri_write_requests > 0; -+} -+ -+/* checks lock/request compatibility */ -+static int can_lock_object(lock_stack * owner) -+{ -+ znode *node = owner->request.node; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ -+ /* See if the node is disconnected. 
*/ -+ if (unlikely(ZF_ISSET(node, JNODE_IS_DYING))) -+ return RETERR(-EINVAL); -+ -+ /* Do not ever try to take a lock if we are going in low priority -+ direction and a node have a high priority request without high -+ priority owners. */ -+ if (unlikely(!owner->curpri && check_deadlock_condition(node))) -+ return RETERR(-E_REPEAT); -+ if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode))) -+ return RETERR(-E_REPEAT); -+ if (unlikely(!is_lock_compatible(node, owner->request.mode))) -+ return RETERR(-E_REPEAT); -+ return 0; -+} -+ -+/* Setting of a high priority to the process. It clears "signaled" flags -+ because znode locked by high-priority process can't satisfy our "deadlock -+ condition". */ -+static void set_high_priority(lock_stack * owner) -+{ -+ assert("nikita-1846", owner == get_current_lock_stack()); -+ /* Do nothing if current priority is already high */ -+ if (!owner->curpri) { -+ /* We don't need locking for owner->locks list, because, this -+ * function is only called with the lock stack of the current -+ * thread, and no other thread can play with owner->locks list -+ * and/or change ->node pointers of lock handles in this list. -+ * -+ * (Interrupts also are not involved.) -+ */ -+ lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link); -+ while (&owner->locks != &item->locks_link) { -+ znode *node = item->node; -+ -+ spin_lock_zlock(&node->lock); -+ -+ node->lock.nr_hipri_owners++; -+ -+ /* we can safely set signaled to zero, because -+ previous statement (nr_hipri_owners ++) guarantees -+ that signaled will be never set again. */ -+ item->signaled = 0; -+ spin_unlock_zlock(&node->lock); -+ -+ item = list_entry(item->locks_link.next, lock_handle, locks_link); -+ } -+ owner->curpri = 1; -+ atomic_set(&owner->nr_signaled, 0); -+ } -+} -+ -+/* Sets a low priority to the process. 
*/ -+static void set_low_priority(lock_stack * owner) -+{ -+ assert("nikita-3075", owner == get_current_lock_stack()); -+ /* Do nothing if current priority is already low */ -+ if (owner->curpri) { -+ /* scan all locks (lock handles) held by @owner, which is -+ actually current thread, and check whether we are reaching -+ deadlock possibility anywhere. -+ */ -+ lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link); -+ while (&owner->locks != &handle->locks_link) { -+ znode *node = handle->node; -+ spin_lock_zlock(&node->lock); -+ /* this thread just was hipri owner of @node, so -+ nr_hipri_owners has to be greater than zero. */ -+ assert("nikita-1835", node->lock.nr_hipri_owners > 0); -+ node->lock.nr_hipri_owners--; -+ /* If we have deadlock condition, adjust a nr_signaled -+ field. It is enough to set "signaled" flag only for -+ current process, other low-pri owners will be -+ signaled and waken up after current process unlocks -+ this object and any high-priority requestor takes -+ control. 
*/ -+ if (check_deadlock_condition(node) -+ && !handle->signaled) { -+ handle->signaled = 1; -+ atomic_inc(&owner->nr_signaled); -+ } -+ spin_unlock_zlock(&node->lock); -+ handle = list_entry(handle->locks_link.next, lock_handle, locks_link); -+ } -+ owner->curpri = 0; -+ } -+} -+ -+static void remove_lock_request(lock_stack * requestor) -+{ -+ zlock * lock = &requestor->request.node->lock; -+ -+ if (requestor->curpri) { -+ assert("nikita-1838", lock->nr_hipri_requests > 0); -+ lock->nr_hipri_requests--; -+ if (requestor->request.mode == ZNODE_WRITE_LOCK) -+ lock->nr_hipri_write_requests --; -+ } -+ list_del(&requestor->requestors_link); -+} -+ -+static void invalidate_all_lock_requests(znode * node) -+{ -+ lock_stack *requestor, *tmp; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ -+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) { -+ remove_lock_request(requestor); -+ requestor->request.ret_code = -EINVAL; -+ reiser4_wake_up(requestor); -+ requestor->request.mode = ZNODE_NO_LOCK; -+ } -+} -+ -+static void dispatch_lock_requests(znode * node) -+{ -+ lock_stack *requestor, *tmp; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ -+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) { -+ if (znode_is_write_locked(node)) -+ break; -+ if (!can_lock_object(requestor)) { -+ lock_object(requestor); -+ remove_lock_request(requestor); -+ requestor->request.ret_code = 0; -+ reiser4_wake_up(requestor); -+ requestor->request.mode = ZNODE_NO_LOCK; -+ } -+ } -+} -+ -+/* release long-term lock, acquired by longterm_lock_znode() */ -+void longterm_unlock_znode(lock_handle * handle) -+{ -+ znode *node = handle->node; -+ lock_stack *oldowner = handle->owner; -+ int hipri; -+ int readers; -+ int rdelta; -+ int youdie; -+ -+ /* -+ * this is time-critical and highly optimized code. Modify carefully. 
-+ */ -+ -+ assert("jmacd-1021", handle != NULL); -+ assert("jmacd-1022", handle->owner != NULL); -+ assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode)); -+ -+ assert("zam-130", oldowner == get_current_lock_stack()); -+ -+ LOCK_CNT_DEC(long_term_locked_znode); -+ -+ /* -+ * to minimize amount of operations performed under lock, pre-compute -+ * all variables used within critical section. This makes code -+ * obscure. -+ */ -+ -+ /* was this lock of hi or lo priority */ -+ hipri = oldowner->curpri ? 1 : 0; -+ /* number of readers */ -+ readers = node->lock.nr_readers; -+ /* +1 if write lock, -1 if read lock */ -+ rdelta = (readers > 0) ? -1 : +1; -+ /* true if node is to die and write lock is released */ -+ youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0); -+ -+ spin_lock_zlock(&node->lock); -+ -+ assert("zam-101", znode_is_locked(node)); -+ -+ /* Adjust a number of high priority owners of this lock */ -+ assert("nikita-1836", node->lock.nr_hipri_owners >= hipri); -+ node->lock.nr_hipri_owners -= hipri; -+ -+ /* Handle znode deallocation on last write-lock release. */ -+ if (znode_is_wlocked_once(node)) { -+ if (youdie) { -+ forget_znode(handle); -+ assert("nikita-2191", znode_invariant(node)); -+ zput(node); -+ return; -+ } -+ } -+ -+ if (handle->signaled) -+ atomic_dec(&oldowner->nr_signaled); -+ -+ /* Unlocking means owner<->object link deletion */ -+ unlink_object(handle); -+ -+ /* This is enough to be sure whether an object is completely -+ unlocked. */ -+ node->lock.nr_readers += rdelta; -+ -+ /* If the node is locked it must have an owners list. Likewise, if -+ the node is unlocked it must have an empty owners list. 
*/ -+ assert("zam-319", equi(znode_is_locked(node), -+ !list_empty_careful(&node->lock.owners))); -+ -+#if REISER4_DEBUG -+ if (!znode_is_locked(node)) -+ ++node->times_locked; -+#endif -+ -+ /* If there are pending lock requests we wake up a requestor */ -+ if (!znode_is_wlocked(node)) -+ dispatch_lock_requests(node); -+ if (check_deadlock_condition(node)) -+ wake_up_all_lopri_owners(node); -+ spin_unlock_zlock(&node->lock); -+ -+ /* minus one reference from handle->node */ -+ assert("nikita-2190", znode_invariant(node)); -+ ON_DEBUG(check_lock_data()); -+ ON_DEBUG(check_lock_node_data(node)); -+ zput(node); -+} -+ -+/* final portion of longterm-lock */ -+static int -+lock_tail(lock_stack * owner, int ok, znode_lock_mode mode) -+{ -+ znode *node = owner->request.node; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ -+ /* If we broke with (ok == 0) it means we can_lock, now do it. */ -+ if (ok == 0) { -+ lock_object(owner); -+ owner->request.mode = 0; -+ /* count a reference from lockhandle->node -+ -+ znode was already referenced at the entry to this function, -+ hence taking spin-lock here is not necessary (see comment -+ in the zref()). -+ */ -+ zref(node); -+ -+ LOCK_CNT_INC(long_term_locked_znode); -+ } -+ spin_unlock_zlock(&node->lock); -+ ON_DEBUG(check_lock_data()); -+ ON_DEBUG(check_lock_node_data(node)); -+ return ok; -+} -+ -+/* -+ * version of longterm_znode_lock() optimized for the most common case: read -+ * lock without any special flags. This is the kind of lock that any tree -+ * traversal takes on the root node of the tree, which is very frequent. 
-+ */ -+static int longterm_lock_tryfast(lock_stack * owner) -+{ -+ int result; -+ znode *node; -+ zlock *lock; -+ -+ node = owner->request.node; -+ lock = &node->lock; -+ -+ assert("nikita-3340", reiser4_schedulable()); -+ assert("nikita-3341", request_is_deadlock_safe(node, -+ ZNODE_READ_LOCK, -+ ZNODE_LOCK_LOPRI)); -+ spin_lock_zlock(lock); -+ result = can_lock_object(owner); -+ spin_unlock_zlock(lock); -+ -+ if (likely(result != -EINVAL)) { -+ spin_lock_znode(node); -+ result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0); -+ spin_unlock_znode(node); -+ spin_lock_zlock(lock); -+ if (unlikely(result != 0)) { -+ owner->request.mode = 0; -+ } else { -+ result = can_lock_object(owner); -+ if (unlikely(result == -E_REPEAT)) { -+ /* fall back to longterm_lock_znode() */ -+ spin_unlock_zlock(lock); -+ return 1; -+ } -+ } -+ return lock_tail(owner, result, ZNODE_READ_LOCK); -+ } else -+ return 1; -+} -+ -+/* locks given lock object */ -+int longterm_lock_znode( -+ /* local link object (allocated by lock owner thread, usually on its own -+ * stack) */ -+ lock_handle * handle, -+ /* znode we want to lock. */ -+ znode * node, -+ /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */ -+ znode_lock_mode mode, -+ /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */ -+ znode_lock_request request) { -+ int ret; -+ int hipri = (request & ZNODE_LOCK_HIPRI) != 0; -+ int non_blocking = 0; -+ int has_atom; -+ txn_capture cap_flags; -+ zlock *lock; -+ txn_handle *txnh; -+ tree_level level; -+ -+ /* Get current process context */ -+ lock_stack *owner = get_current_lock_stack(); -+ -+ /* Check that the lock handle is initialized and isn't already being -+ * used. 
*/ -+ assert("jmacd-808", handle->owner == NULL); -+ assert("nikita-3026", reiser4_schedulable()); -+ assert("nikita-3219", request_is_deadlock_safe(node, mode, request)); -+ assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0); -+ /* long term locks are not allowed in the VM contexts (->writepage(), -+ * prune_{d,i}cache()). -+ * -+ * FIXME this doesn't work due to unused-dentry-with-unlinked-inode -+ * bug caused by d_splice_alias() only working for directories. -+ */ -+ assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0)); -+ assert ("zam-1055", mode != ZNODE_NO_LOCK); -+ -+ cap_flags = 0; -+ if (request & ZNODE_LOCK_NONBLOCK) { -+ cap_flags |= TXN_CAPTURE_NONBLOCKING; -+ non_blocking = 1; -+ } -+ -+ if (request & ZNODE_LOCK_DONT_FUSE) -+ cap_flags |= TXN_CAPTURE_DONT_FUSE; -+ -+ /* If we are changing our process priority we must adjust a number -+ of high priority owners for each znode that we already lock */ -+ if (hipri) { -+ set_high_priority(owner); -+ } else { -+ set_low_priority(owner); -+ } -+ -+ level = znode_get_level(node); -+ -+ /* Fill request structure with our values. */ -+ owner->request.mode = mode; -+ owner->request.handle = handle; -+ owner->request.node = node; -+ -+ txnh = get_current_context()->trans; -+ lock = &node->lock; -+ -+ if (mode == ZNODE_READ_LOCK && request == 0) { -+ ret = longterm_lock_tryfast(owner); -+ if (ret <= 0) -+ return ret; -+ } -+ -+ has_atom = (txnh->atom != NULL); -+ -+ /* Synchronize on node's zlock guard lock. */ -+ spin_lock_zlock(lock); -+ -+ if (znode_is_locked(node) && -+ mode == ZNODE_WRITE_LOCK && recursive(owner)) -+ return lock_tail(owner, 0, mode); -+ -+ for (;;) { -+ /* Check the lock's availability: if it is unavaiable we get -+ E_REPEAT, 0 indicates "can_lock", otherwise the node is -+ invalid. */ -+ ret = can_lock_object(owner); -+ -+ if (unlikely(ret == -EINVAL)) { -+ /* @node is dying. Leave it alone. 
*/ -+ break; -+ } -+ -+ if (unlikely(ret == -E_REPEAT && non_blocking)) { -+ /* either locking of @node by the current thread will -+ * lead to the deadlock, or lock modes are -+ * incompatible. */ -+ break; -+ } -+ -+ assert("nikita-1844", (ret == 0) -+ || ((ret == -E_REPEAT) && !non_blocking)); -+ /* If we can get the lock... Try to capture first before -+ taking the lock. */ -+ -+ /* first handle commonest case where node and txnh are already -+ * in the same atom. */ -+ /* safe to do without taking locks, because: -+ * -+ * 1. read of aligned word is atomic with respect to writes to -+ * this word -+ * -+ * 2. false negatives are handled in reiser4_try_capture(). -+ * -+ * 3. false positives are impossible. -+ * -+ * PROOF: left as an exercise to the curious reader. -+ * -+ * Just kidding. Here is one: -+ * -+ * At the time T0 txnh->atom is stored in txnh_atom. -+ * -+ * At the time T1 node->atom is stored in node_atom. -+ * -+ * At the time T2 we observe that -+ * -+ * txnh_atom != NULL && node_atom == txnh_atom. -+ * -+ * Imagine that at this moment we acquire node and txnh spin -+ * lock in this order. Suppose that under spin lock we have -+ * -+ * node->atom != txnh->atom, (S1) -+ * -+ * at the time T3. -+ * -+ * txnh->atom != NULL still, because txnh is open by the -+ * current thread. -+ * -+ * Suppose node->atom == NULL, that is, node was un-captured -+ * between T1, and T3. But un-capturing of formatted node is -+ * always preceded by the call to reiser4_invalidate_lock(), -+ * which marks znode as JNODE_IS_DYING under zlock spin -+ * lock. Contradiction, because can_lock_object() above checks -+ * for JNODE_IS_DYING. Hence, node->atom != NULL at T3. -+ * -+ * Suppose that node->atom != node_atom, that is, atom, node -+ * belongs to was fused into another atom: node_atom was fused -+ * into node->atom. 
Atom of txnh was equal to node_atom at T2, -+ * which means that under spin lock, txnh->atom == node->atom, -+ * because txnh->atom can only follow fusion -+ * chain. Contradicts S1. -+ * -+ * The same for hypothesis txnh->atom != txnh_atom. Hence, -+ * node->atom == node_atom == txnh_atom == txnh->atom. Again -+ * contradicts S1. Hence S1 is false. QED. -+ * -+ */ -+ -+ if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) { -+ ; -+ } else { -+ /* -+ * unlock zlock spin lock here. It is possible for -+ * longterm_unlock_znode() to sneak in here, but there -+ * is no harm: reiser4_invalidate_lock() will mark znode -+ * as JNODE_IS_DYING and this will be noted by -+ * can_lock_object() below. -+ */ -+ spin_unlock_zlock(lock); -+ spin_lock_znode(node); -+ ret = reiser4_try_capture(ZJNODE(node), mode, cap_flags); -+ spin_unlock_znode(node); -+ spin_lock_zlock(lock); -+ if (unlikely(ret != 0)) { -+ /* In the failure case, the txnmgr releases -+ the znode's lock (or in some cases, it was -+ released a while ago). There's no need to -+ reacquire it so we should return here, -+ avoid releasing the lock. */ -+ owner->request.mode = 0; -+ break; -+ } -+ -+ /* Check the lock's availability again -- this is -+ because under some circumstances the capture code -+ has to release and reacquire the znode spinlock. */ -+ ret = can_lock_object(owner); -+ } -+ -+ /* This time, a return of (ret == 0) means we can lock, so we -+ should break out of the loop. */ -+ if (likely(ret != -E_REPEAT || non_blocking)) -+ break; -+ -+ /* Lock is unavailable, we have to wait. 
*/ -+ ret = reiser4_prepare_to_sleep(owner); -+ if (unlikely(ret != 0)) -+ break; -+ -+ assert_spin_locked(&(node->lock.guard)); -+ if (hipri) { -+ /* If we are going in high priority direction then -+ increase high priority requests counter for the -+ node */ -+ lock->nr_hipri_requests++; -+ if (mode == ZNODE_WRITE_LOCK) -+ lock->nr_hipri_write_requests ++; -+ /* If there are no high priority owners for a node, -+ then immediately wake up low priority owners, so -+ they can detect possible deadlock */ -+ if (lock->nr_hipri_owners == 0) -+ wake_up_all_lopri_owners(node); -+ } -+ list_add_tail(&owner->requestors_link, &lock->requestors); -+ -+ /* Ok, here we have prepared a lock request, so unlock -+ a znode ... */ -+ spin_unlock_zlock(lock); -+ /* ... and sleep */ -+ reiser4_go_to_sleep(owner); -+ if (owner->request.mode == ZNODE_NO_LOCK) -+ goto request_is_done; -+ spin_lock_zlock(lock); -+ if (owner->request.mode == ZNODE_NO_LOCK) { -+ spin_unlock_zlock(lock); -+ request_is_done: -+ if (owner->request.ret_code == 0) { -+ LOCK_CNT_INC(long_term_locked_znode); -+ zref(node); -+ } -+ return owner->request.ret_code; -+ } -+ remove_lock_request(owner); -+ } -+ -+ return lock_tail(owner, ret, mode); -+} -+ -+/* lock object invalidation means changing of lock object state to `INVALID' -+ and waiting for all other processes to cancel theirs lock requests. */ -+void reiser4_invalidate_lock(lock_handle * handle /* path to lock -+ * owner and lock -+ * object is being -+ * invalidated. 
*/ ) -+{ -+ znode *node = handle->node; -+ lock_stack *owner = handle->owner; -+ -+ assert("zam-325", owner == get_current_lock_stack()); -+ assert("zam-103", znode_is_write_locked(node)); -+ assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED)); -+ assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED)); -+ assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ assert("nikita-3097", znode_is_wlocked_once(node)); -+ assert_spin_locked(&(node->lock.guard)); -+ -+ if (handle->signaled) -+ atomic_dec(&owner->nr_signaled); -+ -+ ZF_SET(node, JNODE_IS_DYING); -+ unlink_object(handle); -+ node->lock.nr_readers = 0; -+ -+ invalidate_all_lock_requests(node); -+ spin_unlock_zlock(&node->lock); -+} -+ -+/* Initializes lock_stack. */ -+void init_lock_stack(lock_stack * owner /* pointer to -+ * allocated -+ * structure. */ ) -+{ -+ INIT_LIST_HEAD(&owner->locks); -+ INIT_LIST_HEAD(&owner->requestors_link); -+ spin_lock_init(&owner->sguard); -+ owner->curpri = 1; -+ init_waitqueue_head(&owner->wait); -+} -+ -+/* Initializes lock object. */ -+void reiser4_init_lock(zlock * lock /* pointer on allocated -+ * uninitialized lock object -+ * structure. */ ) -+{ -+ memset(lock, 0, sizeof(zlock)); -+ spin_lock_init(&lock->guard); -+ INIT_LIST_HEAD(&lock->requestors); -+ INIT_LIST_HEAD(&lock->owners); -+} -+ -+/* Transfer a lock handle (presumably so that variables can be moved between stack and -+ heap locations). */ -+static void -+move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old) -+{ -+ znode *node = old->node; -+ lock_stack *owner = old->owner; -+ int signaled; -+ -+ /* locks_list, modified by link_object() is not protected by -+ anything. This is valid because only current thread ever modifies -+ locks_list of its lock_stack. 
-+ */ -+ assert("nikita-1827", owner == get_current_lock_stack()); -+ assert("nikita-1831", new->owner == NULL); -+ -+ spin_lock_zlock(&node->lock); -+ -+ signaled = old->signaled; -+ if (unlink_old) { -+ unlink_object(old); -+ } else { -+ if (node->lock.nr_readers > 0) { -+ node->lock.nr_readers += 1; -+ } else { -+ node->lock.nr_readers -= 1; -+ } -+ if (signaled) { -+ atomic_inc(&owner->nr_signaled); -+ } -+ if (owner->curpri) { -+ node->lock.nr_hipri_owners += 1; -+ } -+ LOCK_CNT_INC(long_term_locked_znode); -+ -+ zref(node); -+ } -+ link_object(new, owner, node); -+ new->signaled = signaled; -+ -+ spin_unlock_zlock(&node->lock); -+} -+ -+void move_lh(lock_handle * new, lock_handle * old) -+{ -+ move_lh_internal(new, old, /*unlink_old */ 1); -+} -+ -+void copy_lh(lock_handle * new, lock_handle * old) -+{ -+ move_lh_internal(new, old, /*unlink_old */ 0); -+} -+ -+/* after getting -E_DEADLOCK we unlock znodes until this function returns false */ -+int reiser4_check_deadlock(void) -+{ -+ lock_stack *owner = get_current_lock_stack(); -+ return atomic_read(&owner->nr_signaled) != 0; -+} -+ -+/* Before going to sleep we re-check "release lock" requests which might come from threads with hi-pri lock -+ priorities. 
*/ -+int reiser4_prepare_to_sleep(lock_stack * owner) -+{ -+ assert("nikita-1847", owner == get_current_lock_stack()); -+ -+ /* We return -E_DEADLOCK if one or more "give me the lock" messages are -+ * counted in nr_signaled */ -+ if (unlikely(atomic_read(&owner->nr_signaled) != 0)) { -+ assert("zam-959", !owner->curpri); -+ return RETERR(-E_DEADLOCK); -+ } -+ return 0; -+} -+ -+/* Wakes up a single thread */ -+void __reiser4_wake_up(lock_stack * owner) -+{ -+ atomic_set(&owner->wakeup, 1); -+ wake_up(&owner->wait); -+} -+ -+/* Puts a thread to sleep */ -+void reiser4_go_to_sleep(lock_stack * owner) -+{ -+ /* Well, we might sleep here, so holding of any spinlocks is no-no */ -+ assert("nikita-3027", reiser4_schedulable()); -+ -+ wait_event(owner->wait, atomic_read(&owner->wakeup)); -+ atomic_set(&owner->wakeup, 0); -+} -+ -+int lock_stack_isclean(lock_stack * owner) -+{ -+ if (list_empty_careful(&owner->locks)) { -+ assert("zam-353", atomic_read(&owner->nr_signaled) == 0); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+#if REISER4_DEBUG -+ -+/* -+ * debugging functions -+ */ -+ -+static void list_check(struct list_head *head) -+{ -+ struct list_head *pos; -+ -+ list_for_each(pos, head) -+ assert("", (pos->prev != NULL && pos->next != NULL && -+ pos->prev->next == pos && pos->next->prev == pos)); -+} -+ -+/* check consistency of locking data-structures hanging of the @stack */ -+static void check_lock_stack(lock_stack * stack) -+{ -+ spin_lock_stack(stack); -+ /* check that stack->locks is not corrupted */ -+ list_check(&stack->locks); -+ spin_unlock_stack(stack); -+} -+ -+/* check consistency of locking data structures */ -+void check_lock_data(void) -+{ -+ check_lock_stack(&get_current_context()->stack); -+} -+ -+/* check consistency of locking data structures for @node */ -+void check_lock_node_data(znode * node) -+{ -+ spin_lock_zlock(&node->lock); -+ list_check(&node->lock.owners); -+ list_check(&node->lock.requestors); -+ spin_unlock_zlock(&node->lock); -+} -+ 
-+/* check that given lock request is dead lock safe. This check is, of course, -+ * not exhaustive. */ -+static int -+request_is_deadlock_safe(znode * node, znode_lock_mode mode, -+ znode_lock_request request) -+{ -+ lock_stack *owner; -+ -+ owner = get_current_lock_stack(); -+ /* -+ * check that hipri lock request is not issued when there are locked -+ * nodes at the higher levels. -+ */ -+ if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) && -+ znode_get_level(node) != 0) { -+ lock_handle *item; -+ -+ list_for_each_entry(item, &owner->locks, locks_link) { -+ znode *other; -+ -+ other = item->node; -+ -+ if (znode_get_level(other) == 0) -+ continue; -+ if (znode_get_level(other) > znode_get_level(node)) -+ return 0; -+ } -+ } -+ return 1; -+} -+ -+#endif -+ -+/* return pointer to static storage with name of lock_mode. For -+ debugging */ -+const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ ) -+{ -+ if (lock == ZNODE_READ_LOCK) -+ return "read"; -+ else if (lock == ZNODE_WRITE_LOCK) -+ return "write"; -+ else { -+ static char buf[30]; -+ -+ sprintf(buf, "unknown: %i", lock); -+ return buf; -+ } -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 79 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/lock.h linux-2.6.24/fs/reiser4/lock.h ---- linux-2.6.24.orig/fs/reiser4/lock.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/lock.h 2008-01-25 11:39:06.948210780 +0300 -@@ -0,0 +1,249 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Long term locking data structures. See lock.c for details. 
*/ -+ -+#ifndef __LOCK_H__ -+#define __LOCK_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/node/node.h" -+#include "txnmgr.h" -+#include "readahead.h" -+ -+#include -+#include -+#include /* for PAGE_CACHE_SIZE */ -+#include -+#include -+ -+/* Per-znode lock object */ -+struct zlock { -+ spinlock_t guard; -+ /* The number of readers if positive; the number of recursively taken -+ write locks if negative. Protected by zlock spin lock. */ -+ int nr_readers; -+ /* A number of processes (lock_stacks) that have this object -+ locked with high priority */ -+ unsigned nr_hipri_owners; -+ /* A number of attempts to lock znode in high priority direction */ -+ unsigned nr_hipri_requests; -+ /* A linked list of lock_handle objects that contains pointers -+ for all lock_stacks which have this lock object locked */ -+ unsigned nr_hipri_write_requests; -+ struct list_head owners; -+ /* A linked list of lock_stacks that wait for this lock */ -+ struct list_head requestors; -+}; -+ -+static inline void spin_lock_zlock(zlock *lock) -+{ -+ /* check that zlock is not locked */ -+ assert("", LOCK_CNT_NIL(spin_locked_zlock)); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", LOCK_CNT_NIL(spin_locked_stack)); -+ -+ spin_lock(&lock->guard); -+ -+ LOCK_CNT_INC(spin_locked_zlock); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void spin_unlock_zlock(zlock *lock) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_zlock); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&lock->guard); -+} -+ -+#define lock_is_locked(lock) ((lock)->nr_readers != 0) -+#define lock_is_rlocked(lock) ((lock)->nr_readers > 0) -+#define lock_is_wlocked(lock) ((lock)->nr_readers < 0) -+#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1) -+#define lock_can_be_rlocked(lock) ((lock)->nr_readers >=0) 
-+#define lock_mode_compatible(lock, mode) \ -+ (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \ -+ ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock))) -+ -+/* Since we have R/W znode locks we need additional bidirectional `link' -+ objects to implement n<->m relationship between lock owners and lock -+ objects. We call them `lock handles'. -+ -+ Locking: see lock.c/"SHORT-TERM LOCKING" -+*/ -+struct lock_handle { -+ /* This flag indicates that a signal to yield a lock was passed to -+ lock owner and counted in owner->nr_signalled -+ -+ Locking: this is accessed under spin lock on ->node. -+ */ -+ int signaled; -+ /* A link to owner of a lock */ -+ lock_stack *owner; -+ /* A link to znode locked */ -+ znode *node; -+ /* A list of all locks for a process */ -+ struct list_head locks_link; -+ /* A list of all owners for a znode */ -+ struct list_head owners_link; -+}; -+ -+struct lock_request { -+ /* A pointer to uninitialized link object */ -+ lock_handle *handle; -+ /* A pointer to the object we want to lock */ -+ znode *node; -+ /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */ -+ znode_lock_mode mode; -+ /* how dispatch_lock_requests() returns lock request result code */ -+ int ret_code; -+}; -+ -+/* A lock stack structure for accumulating locks owned by a process */ -+struct lock_stack { -+ /* A guard lock protecting a lock stack */ -+ spinlock_t sguard; -+ /* number of znodes which were requested by high priority processes */ -+ atomic_t nr_signaled; -+ /* Current priority of a process -+ -+ This is only accessed by the current thread and thus requires no -+ locking. -+ */ -+ int curpri; -+ /* A list of all locks owned by this process. Elements can be added to -+ * this list only by the current thread. ->node pointers in this list -+ * can be only changed by the current thread. 
*/ -+ struct list_head locks; -+ /* When lock_stack waits for the lock, it puts itself on double-linked -+ requestors list of that lock */ -+ struct list_head requestors_link; -+ /* Current lock request info. -+ -+ This is only accessed by the current thread and thus requires no -+ locking. -+ */ -+ struct lock_request request; -+ /* the following two fields are the lock stack's -+ * synchronization object to use with the standard linux/wait.h -+ * interface. See reiser4_go_to_sleep and __reiser4_wake_up for -+ * usage details. */ -+ wait_queue_head_t wait; -+ atomic_t wakeup; -+#if REISER4_DEBUG -+ int nr_locks; /* number of lock handles in the above list */ -+#endif -+}; -+ -+/* -+ User-visible znode locking functions -+*/ -+ -+extern int longterm_lock_znode(lock_handle * handle, -+ znode * node, -+ znode_lock_mode mode, -+ znode_lock_request request); -+ -+extern void longterm_unlock_znode(lock_handle * handle); -+ -+extern int reiser4_check_deadlock(void); -+ -+extern lock_stack *get_current_lock_stack(void); -+ -+extern void init_lock_stack(lock_stack * owner); -+extern void reiser4_init_lock(zlock * lock); -+ -+static inline void init_lh(lock_handle *lh) -+{ -+#if REISER4_DEBUG -+ memset(lh, 0, sizeof *lh); -+ INIT_LIST_HEAD(&lh->locks_link); -+ INIT_LIST_HEAD(&lh->owners_link); -+#else -+ lh->node = NULL; -+#endif -+} -+ -+static inline void done_lh(lock_handle *lh) -+{ -+ assert("zam-342", lh != NULL); -+ if (lh->node != NULL) -+ longterm_unlock_znode(lh); -+} -+ -+extern void move_lh(lock_handle * new, lock_handle * old); -+extern void copy_lh(lock_handle * new, lock_handle * old); -+ -+extern int reiser4_prepare_to_sleep(lock_stack * owner); -+extern void reiser4_go_to_sleep(lock_stack * owner); -+extern void __reiser4_wake_up(lock_stack * owner); -+ -+extern int lock_stack_isclean(lock_stack * owner); -+ -+/* zlock object state check macros: only used in assertions. Both forms imply that the -+ lock is held by the current thread. 
*/ -+extern int znode_is_write_locked(const znode *); -+extern void reiser4_invalidate_lock(lock_handle *); -+ -+/* lock ordering is: first take zlock spin lock, then lock stack spin lock */ -+#define spin_ordering_pred_stack(stack) \ -+ (LOCK_CNT_NIL(spin_locked_stack) && \ -+ LOCK_CNT_NIL(spin_locked_txnmgr) && \ -+ LOCK_CNT_NIL(spin_locked_inode) && \ -+ LOCK_CNT_NIL(rw_locked_cbk_cache) && \ -+ LOCK_CNT_NIL(spin_locked_super_eflush) ) -+ -+static inline void spin_lock_stack(lock_stack *stack) -+{ -+ assert("", spin_ordering_pred_stack(stack)); -+ spin_lock(&(stack->sguard)); -+ LOCK_CNT_INC(spin_locked_stack); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void spin_unlock_stack(lock_stack *stack) -+{ -+ assert_spin_locked(&(stack->sguard)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ LOCK_CNT_DEC(spin_locked_stack); -+ LOCK_CNT_DEC(spin_locked); -+ spin_unlock(&(stack->sguard)); -+} -+ -+static inline void reiser4_wake_up(lock_stack * owner) -+{ -+ spin_lock_stack(owner); -+ __reiser4_wake_up(owner); -+ spin_unlock_stack(owner); -+} -+ -+const char *lock_mode_name(znode_lock_mode lock); -+ -+#if REISER4_DEBUG -+extern void check_lock_data(void); -+extern void check_lock_node_data(znode * node); -+#else -+#define check_lock_data() noop -+#define check_lock_node_data() noop -+#endif -+ -+/* __LOCK_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/Makefile linux-2.6.24/fs/reiser4/Makefile ---- linux-2.6.24.orig/fs/reiser4/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/Makefile 2008-01-25 11:39:06.948210780 +0300 -@@ -0,0 +1,98 @@ -+# -+# reiser4/Makefile -+# -+ -+obj-$(CONFIG_REISER4_FS) += reiser4.o -+ -+reiser4-y := \ -+ debug.o \ -+ jnode.o \ -+ znode.o \ -+ key.o \ -+ pool.o \ -+ tree_mod.o \ -+ estimate.o \ -+ carry.o \ -+ carry_ops.o \ -+ lock.o \ -+ tree.o \ -+ context.o \ -+ tap.o \ -+ coord.o \ -+ block_alloc.o \ -+ txnmgr.o \ -+ kassign.o \ -+ flush.o \ -+ wander.o \ -+ eottl.o \ -+ search.o \ -+ page_cache.o \ -+ seal.o \ -+ dscale.o \ -+ flush_queue.o \ -+ ktxnmgrd.o \ -+ blocknrset.o \ -+ super.o \ -+ super_ops.o \ -+ fsdata.o \ -+ export_ops.o \ -+ oid.o \ -+ tree_walk.o \ -+ inode.o \ -+ vfs_ops.o \ -+ as_ops.o \ -+ entd.o\ -+ readahead.o \ -+ status_flags.o \ -+ init_super.o \ -+ safe_link.o \ -+ \ -+ plugin/plugin.o \ -+ plugin/plugin_set.o \ -+ plugin/node/node.o \ -+ plugin/object.o \ -+ plugin/cluster.o \ -+ plugin/inode_ops.o \ -+ plugin/inode_ops_rename.o \ -+ plugin/file_ops.o \ -+ plugin/file_ops_readdir.o \ -+ plugin/file_plugin_common.o \ -+ plugin/file/file.o \ -+ plugin/file/tail_conversion.o \ -+ plugin/file/file_conversion.o \ -+ plugin/file/symlink.o \ -+ plugin/file/cryptcompress.o \ -+ plugin/dir_plugin_common.o \ -+ plugin/dir/hashed_dir.o \ -+ plugin/dir/seekable_dir.o \ -+ plugin/node/node40.o \ -+ \ -+ plugin/crypto/cipher.o \ -+ plugin/crypto/digest.o \ -+ \ -+ plugin/compress/compress.o \ -+ plugin/compress/compress_mode.o \ -+ \ -+ plugin/item/static_stat.o \ -+ plugin/item/sde.o \ -+ plugin/item/cde.o \ -+ plugin/item/blackbox.o \ -+ plugin/item/internal.o \ -+ plugin/item/tail.o \ -+ plugin/item/ctail.o \ -+ plugin/item/extent.o \ -+ plugin/item/extent_item_ops.o \ 
-+ plugin/item/extent_file_ops.o \ -+ plugin/item/extent_flush_ops.o \ -+ \ -+ plugin/hash.o \ -+ plugin/fibration.o \ -+ plugin/tail_policy.o \ -+ plugin/item/item.o \ -+ \ -+ plugin/security/perm.o \ -+ plugin/space/bitmap.o \ -+ \ -+ plugin/disk_format/disk_format40.o \ -+ plugin/disk_format/disk_format.o -+ -diff -urN linux-2.6.24.orig/fs/reiser4/oid.c linux-2.6.24/fs/reiser4/oid.c ---- linux-2.6.24.orig/fs/reiser4/oid.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/oid.c 2008-01-25 11:39:06.952211810 +0300 -@@ -0,0 +1,141 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "debug.h" -+#include "super.h" -+#include "txnmgr.h" -+ -+/* we used to have oid allocation plugin. It was removed because it -+ was recognized as providing unneeded level of abstraction. If one -+ ever will find it useful - look at yet_unneeded_abstractions/oid -+*/ -+ -+/* -+ * initialize in-memory data for oid allocator at @super. @nr_files and @next -+ * are provided by disk format plugin that reads them from the disk during -+ * mount. -+ */ -+int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(super); -+ -+ sbinfo->next_to_use = next; -+ sbinfo->oids_in_use = nr_files; -+ return 0; -+} -+ -+/* -+ * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator -+ * runs out of oids. -+ */ -+oid_t oid_allocate(struct super_block * super) -+{ -+ reiser4_super_info_data *sbinfo; -+ oid_t oid; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) { -+ oid = sbinfo->next_to_use++; -+ sbinfo->oids_in_use++; -+ } else -+ oid = ABSOLUTE_MAX_OID; -+ spin_unlock_reiser4_super(sbinfo); -+ return oid; -+} -+ -+/* -+ * Tell oid allocator that @oid is now free. 
-+ */ -+int oid_release(struct super_block *super, oid_t oid UNUSED_ARG) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ sbinfo->oids_in_use--; -+ spin_unlock_reiser4_super(sbinfo); -+ return 0; -+} -+ -+/* -+ * return next @oid that would be allocated (i.e., returned by oid_allocate()) -+ * without actually allocating it. This is used by disk format plugin to save -+ * oid allocator state on the disk. -+ */ -+oid_t oid_next(const struct super_block * super) -+{ -+ reiser4_super_info_data *sbinfo; -+ oid_t oid; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ oid = sbinfo->next_to_use; -+ spin_unlock_reiser4_super(sbinfo); -+ return oid; -+} -+ -+/* -+ * returns number of currently used oids. This is used by statfs(2) to report -+ * number of "inodes" and by disk format plugin to save oid allocator state on -+ * the disk. -+ */ -+long oids_used(const struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo; -+ oid_t used; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ used = sbinfo->oids_in_use; -+ spin_unlock_reiser4_super(sbinfo); -+ if (used < (__u64) ((long)~0) >> 1) -+ return (long)used; -+ else -+ return (long)-1; -+} -+ -+/* -+ * Count oid as allocated in atom. This is done after call to oid_allocate() -+ * at the point when we are irrevocably committed to creation of the new file -+ * (i.e., when oid allocation cannot be any longer rolled back due to some -+ * error). -+ */ -+void oid_count_allocated(void) -+{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked(); -+ atom->nr_objects_created++; -+ spin_unlock_atom(atom); -+} -+ -+/* -+ * Count oid as free in atom. This is done after call to oid_release() at the -+ * point when we are irrevocably committed to the deletion of the file (i.e., -+ * when oid release cannot be any longer rolled back due to some error). 
-+ */ -+void oid_count_released(void) -+{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked(); -+ atom->nr_objects_deleted++; -+ spin_unlock_atom(atom); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/page_cache.c linux-2.6.24/fs/reiser4/page_cache.c ---- linux-2.6.24.orig/fs/reiser4/page_cache.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/page_cache.c 2008-01-25 11:54:46.665843146 +0300 -@@ -0,0 +1,714 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Memory pressure hooks. Fake inodes handling. */ -+ -+/* GLOSSARY -+ -+ . Formatted and unformatted nodes. -+ Elements of reiser4 balanced tree to store data and metadata. -+ Unformatted nodes are pointed to by extent pointers. Such nodes -+ are used to store data of large objects. Unlike unformatted nodes, -+ formatted ones have associated format described by node4X plugin. -+ -+ . Jnode (or journal node) -+ The in-memory header which is used to track formatted and unformatted -+ nodes, bitmap nodes, etc. In particular, jnodes are used to track -+ transactional information associated with each block(see reiser4/jnode.c -+ for details). -+ -+ . Znode -+ The in-memory header which is used to track formatted nodes. Contains -+ embedded jnode (see reiser4/znode.c for details). -+*/ -+ -+/* We store all file system meta data (and data, of course) in the page cache. -+ -+ What does this mean? In stead of using bread/brelse we create special -+ "fake" inode (one per super block) and store content of formatted nodes -+ into pages bound to this inode in the page cache. In newer kernels bread() -+ already uses inode attached to block device (bd_inode). Advantage of having -+ our own fake inode is that we can install appropriate methods in its -+ address_space operations. 
Such methods are called by VM on memory pressure -+ (or during background page flushing) and we can use them to react -+ appropriately. -+ -+ In initial version we only support one block per page. Support for multiple -+ blocks per page is complicated by relocation. -+ -+ To each page, used by reiser4, jnode is attached. jnode is analogous to -+ buffer head. Difference is that jnode is bound to the page permanently: -+ jnode cannot be removed from memory until its backing page is. -+ -+ jnode contain pointer to page (->pg field) and page contain pointer to -+ jnode in ->private field. Pointer from jnode to page is protected to by -+ jnode's spinlock and pointer from page to jnode is protected by page lock -+ (PG_locked bit). Lock ordering is: first take page lock, then jnode spin -+ lock. To go into reverse direction use jnode_lock_page() function that uses -+ standard try-lock-and-release device. -+ -+ Properties: -+ -+ 1. when jnode-to-page mapping is established (by jnode_attach_page()), page -+ reference counter is increased. -+ -+ 2. when jnode-to-page mapping is destroyed (by page_clear_jnode(), page -+ reference counter is decreased. -+ -+ 3. on jload() reference counter on jnode page is increased, page is -+ kmapped and `referenced'. -+ -+ 4. on jrelse() inverse operations are performed. -+ -+ 5. kmapping/kunmapping of unformatted pages is done by read/write methods. -+ -+ DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting -+ historically.] -+ -+ [In the following discussion, `lock' invariably means long term lock on -+ znode.] (What about page locks?) -+ -+ There is some special class of deadlock possibilities related to memory -+ pressure. Locks acquired by other reiser4 threads are accounted for in -+ deadlock prevention mechanism (lock.c), but when ->vm_writeback() is -+ invoked additional hidden arc is added to the locking graph: thread that -+ tries to allocate memory waits for ->vm_writeback() to finish. 
If this -+ thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock -+ prevention is useless. -+ -+ Another related problem is possibility for ->vm_writeback() to run out of -+ memory itself. This is not a problem for ext2 and friends, because their -+ ->vm_writeback() don't allocate much memory, but reiser4 flush is -+ definitely able to allocate huge amounts of memory. -+ -+ It seems that there is no reliable way to cope with the problems above. In -+ stead it was decided that ->vm_writeback() (as invoked in the kswapd -+ context) wouldn't perform any flushing itself, but rather should just wake -+ up some auxiliary thread dedicated for this purpose (or, the same thread -+ that does periodic commit of old atoms (ktxnmgrd.c)). -+ -+ Details: -+ -+ 1. Page is called `reclaimable' against particular reiser4 mount F if this -+ page can be ultimately released by try_to_free_pages() under presumptions -+ that: -+ -+ a. ->vm_writeback() for F is no-op, and -+ -+ b. none of the threads accessing F are making any progress, and -+ -+ c. other reiser4 mounts obey the same memory reservation protocol as F -+ (described below). -+ -+ For example, clean un-pinned page, or page occupied by ext2 data are -+ reclaimable against any reiser4 mount. -+ -+ When there is more than one reiser4 mount in a system, condition (c) makes -+ reclaim-ability not easily verifiable beyond trivial cases mentioned above. -+ -+ THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE -+ -+ Fake inode is used to bound formatted nodes and each node is indexed within -+ fake inode by its block number. If block size of smaller than page size, it -+ may so happen that block mapped to the page with formatted node is occupied -+ by unformatted node or is unallocated. This lead to some complications, -+ because flushing whole page can lead to an incorrect overwrite of -+ unformatted node that is moreover, can be cached in some other place as -+ part of the file body. 
To avoid this, buffers for unformatted nodes are -+ never marked dirty. Also pages in the fake are never marked dirty. This -+ rules out usage of ->writepage() as memory pressure hook. In stead -+ ->releasepage() is used. -+ -+ Josh is concerned that page->buffer is going to die. This should not pose -+ significant problem though, because we need to add some data structures to -+ the page anyway (jnode) and all necessary book keeping can be put there. -+ -+*/ -+ -+/* Life cycle of pages/nodes. -+ -+ jnode contains reference to page and page contains reference back to -+ jnode. This reference is counted in page ->count. Thus, page bound to jnode -+ cannot be released back into free pool. -+ -+ 1. Formatted nodes. -+ -+ 1. formatted node is represented by znode. When new znode is created its -+ ->pg pointer is NULL initially. -+ -+ 2. when node content is loaded into znode (by call to zload()) for the -+ first time following happens (in call to ->read_node() or -+ ->allocate_node()): -+ -+ 1. new page is added to the page cache. -+ -+ 2. this page is attached to znode and its ->count is increased. -+ -+ 3. page is kmapped. -+ -+ 3. if more calls to zload() follow (without corresponding zrelses), page -+ counter is left intact and in its stead ->d_count is increased in znode. -+ -+ 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero -+ ->release_node() is called and page is kunmapped as result. -+ -+ 5. at some moment node can be captured by a transaction. Its ->x_count -+ is then increased by transaction manager. -+ -+ 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE -+ bit set) following will happen (also see comment at the top of znode.c): -+ -+ 1. when last lock is released, node will be uncaptured from -+ transaction. This released reference that transaction manager acquired -+ at the step 5. -+ -+ 2. 
when last reference is released, zput() detects that node is -+ actually deleted and calls ->delete_node() -+ operation. page_cache_delete_node() implementation detaches jnode from -+ page and releases page. -+ -+ 7. otherwise (node wasn't removed from the tree), last reference to -+ znode will be released after transaction manager committed transaction -+ node was in. This implies squallocing of this node (see -+ flush.c). Nothing special happens at this point. Znode is still in the -+ hash table and page is still attached to it. -+ -+ 8. znode is actually removed from the memory because of the memory -+ pressure, or during umount (znodes_tree_done()). Anyway, znode is -+ removed by the call to zdrop(). At this moment, page is detached from -+ znode and removed from the inode address space. -+ -+*/ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "super.h" -+#include "entd.h" -+#include "page_cache.h" -+#include "ktxnmgrd.h" -+ -+#include -+#include -+#include /* for struct page */ -+#include /* for struct page */ -+#include -+#include -+#include -+#include -+ -+static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp); -+ -+static struct address_space_operations formatted_fake_as_ops; -+ -+static const oid_t fake_ino = 0x1; -+static const oid_t bitmap_ino = 0x2; -+static const oid_t cc_ino = 0x3; -+ -+static void -+init_fake_inode(struct super_block *super, struct inode *fake, -+ struct inode **pfake) -+{ -+ assert("nikita-2168", fake->i_state & I_NEW); -+ fake->i_mapping->a_ops = &formatted_fake_as_ops; -+ *pfake = fake; -+ /* NOTE-NIKITA something else? 
*/ -+ unlock_new_inode(fake); -+} -+ -+/** -+ * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps -+ * @super: super block to init fake inode for -+ * -+ * Initializes fake inode to which formatted nodes are bound in the page cache -+ * and inode for bitmaps. -+ */ -+int reiser4_init_formatted_fake(struct super_block *super) -+{ -+ struct inode *fake; -+ struct inode *bitmap; -+ struct inode *cc; -+ reiser4_super_info_data *sinfo; -+ -+ assert("nikita-1703", super != NULL); -+ -+ sinfo = get_super_private_nocheck(super); -+ fake = iget_locked(super, oid_to_ino(fake_ino)); -+ -+ if (fake != NULL) { -+ init_fake_inode(super, fake, &sinfo->fake); -+ -+ bitmap = iget_locked(super, oid_to_ino(bitmap_ino)); -+ if (bitmap != NULL) { -+ init_fake_inode(super, bitmap, &sinfo->bitmap); -+ -+ cc = iget_locked(super, oid_to_ino(cc_ino)); -+ if (cc != NULL) { -+ init_fake_inode(super, cc, &sinfo->cc); -+ return 0; -+ } else { -+ iput(sinfo->fake); -+ iput(sinfo->bitmap); -+ sinfo->fake = NULL; -+ sinfo->bitmap = NULL; -+ } -+ } else { -+ iput(sinfo->fake); -+ sinfo->fake = NULL; -+ } -+ } -+ return RETERR(-ENOMEM); -+} -+ -+/** -+ * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps -+ * @super: super block to init fake inode for -+ * -+ * Releases inodes which were used as address spaces of bitmap and formatted -+ * nodes. 
-+ */ -+void reiser4_done_formatted_fake(struct super_block *super) -+{ -+ reiser4_super_info_data *sinfo; -+ -+ sinfo = get_super_private_nocheck(super); -+ -+ if (sinfo->fake != NULL) { -+ iput(sinfo->fake); -+ sinfo->fake = NULL; -+ } -+ -+ if (sinfo->bitmap != NULL) { -+ iput(sinfo->bitmap); -+ sinfo->bitmap = NULL; -+ } -+ -+ if (sinfo->cc != NULL) { -+ iput(sinfo->cc); -+ sinfo->cc = NULL; -+ } -+ return; -+} -+ -+void reiser4_wait_page_writeback(struct page *page) -+{ -+ assert("zam-783", PageLocked(page)); -+ -+ do { -+ unlock_page(page); -+ wait_on_page_writeback(page); -+ lock_page(page); -+ } while (PageWriteback(page)); -+} -+ -+/* return tree @page is in */ -+reiser4_tree *reiser4_tree_by_page(const struct page *page /* page to query */ ) -+{ -+ assert("nikita-2461", page != NULL); -+ return &get_super_private(page->mapping->host->i_sb)->tree; -+} -+ -+/* completion handler for single page bio-based read. -+ -+ mpage_end_io_read() would also do. But it's static. -+ -+*/ -+static void -+end_bio_single_page_read(struct bio *bio, int err UNUSED_ARG) -+{ -+ struct page *page; -+ -+ page = bio->bi_io_vec[0].bv_page; -+ -+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) { -+ SetPageUptodate(page); -+ } else { -+ ClearPageUptodate(page); -+ SetPageError(page); -+ } -+ unlock_page(page); -+ bio_put(bio); -+} -+ -+/* completion handler for single page bio-based write. -+ -+ mpage_end_io_write() would also do. But it's static. 
-+ -+*/ -+static void -+end_bio_single_page_write(struct bio *bio, int err UNUSED_ARG) -+{ -+ struct page *page; -+ -+ page = bio->bi_io_vec[0].bv_page; -+ -+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) -+ SetPageError(page); -+ end_page_writeback(page); -+ bio_put(bio); -+} -+ -+/* ->readpage() method for formatted nodes */ -+static int formatted_readpage(struct file *f UNUSED_ARG, -+ struct page *page /* page to read */ ) -+{ -+ assert("nikita-2412", PagePrivate(page) && jprivate(page)); -+ return reiser4_page_io(page, jprivate(page), READ, -+ reiser4_ctx_gfp_mask_get()); -+} -+ -+/** -+ * reiser4_page_io - submit single-page bio request -+ * @page: page to perform io for -+ * @node: jnode of page -+ * @rw: read or write -+ * @gfp: gfp mask for bio allocation -+ * -+ * Submits single page read or write. -+ */ -+int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp) -+{ -+ struct bio *bio; -+ int result; -+ -+ assert("nikita-2094", page != NULL); -+ assert("nikita-2226", PageLocked(page)); -+ assert("nikita-2634", node != NULL); -+ assert("nikita-2893", rw == READ || rw == WRITE); -+ -+ if (rw) { -+ if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) { -+ unlock_page(page); -+ return 0; -+ } -+ } -+ -+ bio = page_bio(page, node, rw, gfp); -+ if (!IS_ERR(bio)) { -+ if (rw == WRITE) { -+ set_page_writeback(page); -+ unlock_page(page); -+ } -+ reiser4_submit_bio(rw, bio); -+ result = 0; -+ } else { -+ unlock_page(page); -+ result = PTR_ERR(bio); -+ } -+ -+ return result; -+} -+ -+/* helper function to construct bio for page */ -+static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp) -+{ -+ struct bio *bio; -+ assert("nikita-2092", page != NULL); -+ assert("nikita-2633", node != NULL); -+ -+ /* Simple implementation in the assumption that blocksize == pagesize. -+ -+ We only have to submit one block, but submit_bh() will allocate bio -+ anyway, so lets use all the bells-and-whistles of bio code. 
-+ */ -+ -+ bio = bio_alloc(gfp, 1); -+ if (bio != NULL) { -+ int blksz; -+ struct super_block *super; -+ reiser4_block_nr blocknr; -+ -+ super = page->mapping->host->i_sb; -+ assert("nikita-2029", super != NULL); -+ blksz = super->s_blocksize; -+ assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE); -+ -+ spin_lock_jnode(node); -+ blocknr = *jnode_get_io_block(node); -+ spin_unlock_jnode(node); -+ -+ assert("nikita-2275", blocknr != (reiser4_block_nr) 0); -+ assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr)); -+ -+ bio->bi_bdev = super->s_bdev; -+ /* fill bio->bi_sector before calling bio_add_page(), because -+ * q->merge_bvec_fn may want to inspect it (see -+ * drivers/md/linear.c:linear_mergeable_bvec() for example. */ -+ bio->bi_sector = blocknr * (blksz >> 9); -+ -+ if (!bio_add_page(bio, page, blksz, 0)) { -+ warning("nikita-3452", -+ "Single page bio cannot be constructed"); -+ return ERR_PTR(RETERR(-EINVAL)); -+ } -+ -+ /* bio -> bi_idx is filled by bio_init() */ -+ bio->bi_end_io = (rw == READ) ? -+ end_bio_single_page_read : end_bio_single_page_write; -+ -+ return bio; -+ } else -+ return ERR_PTR(RETERR(-ENOMEM)); -+} -+ -+/* this function is internally called by jnode_make_dirty() */ -+int reiser4_set_page_dirty_internal(struct page *page) -+{ -+ struct address_space *mapping; -+ -+ mapping = page->mapping; -+ BUG_ON(mapping == NULL); -+ -+ if (!TestSetPageDirty(page)) { -+ if (mapping_cap_account_dirty(mapping)) -+ inc_zone_page_state(page, NR_FILE_DIRTY); -+ -+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); -+ } -+ -+ /* znode must be dirty ? 
*/ -+ if (mapping->host == reiser4_get_super_fake(mapping->host->i_sb)) -+ assert("", JF_ISSET(jprivate(page), JNODE_DIRTY)); -+ return 0; -+} -+ -+#if 0 -+static int can_hit_entd(reiser4_context *ctx, struct super_block *s) -+{ -+ if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic) -+ return 1; -+ if (ctx->super != s) -+ return 1; -+ if (get_super_private(s)->entd.tsk == current) -+ return 0; -+ if (!lock_stack_isclean(&ctx->stack)) -+ return 0; -+ if (ctx->trans->atom != NULL) -+ return 0; -+ return 1; -+} -+#endif -+ -+/** -+ * reiser4_writepage - writepage of struct address_space_operations -+ * @page: page to write -+ * @wbc: -+ * -+ * -+ */ -+/* Common memory pressure notification. */ -+int reiser4_writepage(struct page *page, -+ struct writeback_control *wbc) -+{ -+ struct super_block *s; -+ reiser4_context *ctx; -+ -+ assert("vs-828", PageLocked(page)); -+ -+ s = page->mapping->host->i_sb; -+ ctx = get_current_context_check(); -+ -+ //assert("", can_hit_entd(ctx, s)); -+ return write_page_by_ent(page, wbc); -+} -+ -+/* ->set_page_dirty() method of formatted address_space */ -+static int formatted_set_page_dirty(struct page *page) -+{ -+ assert("nikita-2173", page != NULL); -+ BUG(); -+ return __set_page_dirty_nobuffers(page); -+} -+ -+/* writepages method of address space operations in reiser4 is used to involve -+ into transactions pages which are dirtied via mmap. Only regular files can -+ have such pages. Fake inode is used to access formatted nodes via page -+ cache. As formatted nodes can never be mmaped, fake inode's writepages has -+ nothing to do */ -+static int -+writepages_fake(struct address_space *mapping, struct writeback_control *wbc) -+{ -+ return 0; -+} -+ -+/* address space operations for the fake inode */ -+static struct address_space_operations formatted_fake_as_ops = { -+ /* Perform a writeback of a single page as a memory-freeing -+ * operation. 
*/ -+ .writepage = reiser4_writepage, -+ /* this is called to read formatted node */ -+ .readpage = formatted_readpage, -+ /* ->sync_page() method of fake inode address space operations. Called -+ from wait_on_page() and lock_page(). -+ -+ This is most annoyingly misnomered method. Actually it is called -+ from wait_on_page_bit() and lock_page() and its purpose is to -+ actually start io by jabbing device drivers. -+ */ -+ .sync_page = block_sync_page, -+ /* Write back some dirty pages from this mapping. Called from sync. -+ called during sync (pdflush) */ -+ .writepages = writepages_fake, -+ /* Set a page dirty */ -+ .set_page_dirty = formatted_set_page_dirty, -+ /* used for read-ahead. Not applicable */ -+ .readpages = NULL, -+ .prepare_write = NULL, -+ .commit_write = NULL, -+ .bmap = NULL, -+ /* called just before page is being detached from inode mapping and -+ removed from memory. Called on truncate, cut/squeeze, and -+ umount. */ -+ .invalidatepage = reiser4_invalidatepage, -+ /* this is called by shrink_cache() so that file system can try to -+ release objects (jnodes, buffers, journal heads) attached to page -+ and, may be made page itself free-able. -+ */ -+ .releasepage = reiser4_releasepage, -+ .direct_IO = NULL -+}; -+ -+/* called just before page is released (no longer used by reiser4). Callers: -+ jdelete() and extent2tail(). 
*/ -+void reiser4_drop_page(struct page *page) -+{ -+ assert("nikita-2181", PageLocked(page)); -+ clear_page_dirty_for_io(page); -+ ClearPageUptodate(page); -+#if defined(PG_skipped) -+ ClearPageSkipped(page); -+#endif -+ unlock_page(page); -+} -+ -+#define JNODE_GANG_SIZE (16) -+ -+/* find all jnodes from range specified and invalidate them */ -+static int -+truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count) -+{ -+ reiser4_inode *info; -+ int truncated_jnodes; -+ reiser4_tree *tree; -+ unsigned long index; -+ unsigned long end; -+ -+ if (inode_file_plugin(inode) == -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) -+ /* -+ * No need to get rid of jnodes here: if the single jnode of -+ * page cluster did not have page, then it was found and killed -+ * before in -+ * truncate_complete_page_cluster()->jput()->jput_final(), -+ * otherwise it will be dropped by reiser4_invalidatepage() -+ */ -+ return 0; -+ truncated_jnodes = 0; -+ -+ info = reiser4_inode_data(inode); -+ tree = reiser4_tree_by_inode(inode); -+ -+ index = from; -+ end = from + count; -+ -+ while (1) { -+ jnode *gang[JNODE_GANG_SIZE]; -+ int taken; -+ int i; -+ jnode *node; -+ -+ assert("nikita-3466", index <= end); -+ -+ read_lock_tree(tree); -+ taken = -+ radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info), -+ (void **)gang, index, -+ JNODE_GANG_SIZE); -+ for (i = 0; i < taken; ++i) { -+ node = gang[i]; -+ if (index_jnode(node) < end) -+ jref(node); -+ else -+ gang[i] = NULL; -+ } -+ read_unlock_tree(tree); -+ -+ for (i = 0; i < taken; ++i) { -+ node = gang[i]; -+ if (node != NULL) { -+ index = max(index, index_jnode(node)); -+ spin_lock_jnode(node); -+ assert("edward-1457", node->pg == NULL); -+ /* this is always called after -+ truncate_inode_pages_range(). Therefore, here -+ jnode can not have page. 
New pages can not be -+ created because truncate_jnodes_range goes -+ under exclusive access on file obtained, -+ where as new page creation requires -+ non-exclusive access obtained */ -+ JF_SET(node, JNODE_HEARD_BANSHEE); -+ reiser4_uncapture_jnode(node); -+ unhash_unformatted_jnode(node); -+ truncated_jnodes++; -+ jput(node); -+ } else -+ break; -+ } -+ if (i != taken || taken == 0) -+ break; -+ } -+ return truncated_jnodes; -+} -+ -+/* Truncating files in reiser4: problems and solutions. -+ -+ VFS calls fs's truncate after it has called truncate_inode_pages() -+ to get rid of pages corresponding to part of file being truncated. -+ In reiser4 it may cause existence of unallocated extents which do -+ not have jnodes. Flush code does not expect that. Solution of this -+ problem is straightforward. As vfs's truncate is implemented using -+ setattr operation, it seems reasonable to have ->setattr() that -+ will cut file body. However, flush code also does not expect dirty -+ pages without parent items, so it is impossible to cut all items, -+ then truncate all pages in two steps. We resolve this problem by -+ cutting items one-by-one. Each such fine-grained step performed -+ under longterm znode lock calls at the end ->kill_hook() method of -+ a killed item to remove its binded pages and jnodes. -+ -+ The following function is a common part of mentioned kill hooks. -+ Also, this is called before tail-to-extent conversion (to not manage -+ few copies of the data). 
-+*/ -+void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from, -+ unsigned long count, int even_cows) -+{ -+ loff_t from_bytes, count_bytes; -+ -+ if (count == 0) -+ return; -+ from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT; -+ count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT; -+ -+ unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows); -+ truncate_inode_pages_range(mapping, from_bytes, -+ from_bytes + count_bytes - 1); -+ truncate_jnodes_range(mapping->host, from, count); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/page_cache.h linux-2.6.24/fs/reiser4/page_cache.h ---- linux-2.6.24.orig/fs/reiser4/page_cache.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/page_cache.h 2008-01-25 11:39:06.952211810 +0300 -@@ -0,0 +1,68 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+/* Memory pressure hooks. Fake inodes handling. See page_cache.c. 
*/ -+ -+#if !defined( __REISER4_PAGE_CACHE_H__ ) -+#define __REISER4_PAGE_CACHE_H__ -+ -+#include "forward.h" -+#include "context.h" /* for reiser4_ctx_gfp_mask_get() */ -+ -+#include /* for struct super_block, address_space */ -+#include /* for struct page */ -+#include /* for lock_page() */ -+#include /* for __vmalloc() */ -+ -+extern int reiser4_init_formatted_fake(struct super_block *); -+extern void reiser4_done_formatted_fake(struct super_block *); -+ -+extern reiser4_tree *reiser4_tree_by_page(const struct page *); -+ -+extern int reiser4_set_page_dirty_internal(struct page *); -+ -+#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio)) -+ -+extern void reiser4_wait_page_writeback(struct page *); -+static inline void lock_and_wait_page_writeback(struct page *page) -+{ -+ lock_page(page); -+ if (unlikely(PageWriteback(page))) -+ reiser4_wait_page_writeback(page); -+} -+ -+#define jprivate(page) ((jnode *)page_private(page)) -+ -+extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t); -+extern void reiser4_drop_page(struct page *); -+extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from, -+ unsigned long count, int even_cows); -+extern void capture_reiser4_inodes(struct super_block *, -+ struct writeback_control *); -+static inline void * reiser4_vmalloc (unsigned long size) -+{ -+ return __vmalloc(size, -+ reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM, -+ PAGE_KERNEL); -+} -+ -+#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY -+ -+#if REISER4_DEBUG -+extern void print_page(const char *prefix, struct page *page); -+#else -+#define print_page(prf, p) noop -+#endif -+ -+/* __REISER4_PAGE_CACHE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/cluster.c linux-2.6.24/fs/reiser4/plugin/cluster.c ---- linux-2.6.24.orig/fs/reiser4/plugin/cluster.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/cluster.c 2008-01-25 11:39:06.952211810 +0300 -@@ -0,0 +1,71 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Contains reiser4 cluster plugins (see -+ http://www.namesys.com/cryptcompress_design.html -+ "Concepts of clustering" for details). */ -+ -+#include "plugin_header.h" -+#include "plugin.h" -+#include "../inode.h" -+ -+static int change_cluster(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ assert("edward-1324", inode != NULL); -+ assert("edward-1325", plugin != NULL); -+ assert("edward-1326", is_reiser4_inode(inode)); -+ assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE); -+ -+ /* Can't change the cluster plugin for already existent regular files. */ -+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE)) -+ return RETERR(-EINVAL); -+ -+ /* If matches, nothing to change. 
*/ -+ if (inode_hash_plugin(inode) != NULL && -+ inode_hash_plugin(inode)->h.id == plugin->h.id) -+ return 0; -+ -+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, -+ PSET_CLUSTER, plugin); -+} -+ -+static reiser4_plugin_ops cluster_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = &change_cluster -+}; -+ -+#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \ -+ [CLUSTER_ ## ID ## _ID] = { \ -+ .h = { \ -+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \ -+ .id = CLUSTER_ ## ID ## _ID, \ -+ .pops = &cluster_plugin_ops, \ -+ .label = LABEL, \ -+ .desc = DESC, \ -+ .linkage = {NULL, NULL} \ -+ }, \ -+ .shift = SHIFT \ -+ } -+ -+cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = { -+ SUPPORT_CLUSTER(16, 64K, "64K", "Large"), -+ SUPPORT_CLUSTER(15, 32K, "32K", "Big"), -+ SUPPORT_CLUSTER(14, 16K, "16K", "Average"), -+ SUPPORT_CLUSTER(13, 8K, "8K", "Small"), -+ SUPPORT_CLUSTER(12, 4K, "4K", "Minimal") -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/cluster.h linux-2.6.24/fs/reiser4/plugin/cluster.h ---- linux-2.6.24.orig/fs/reiser4/plugin/cluster.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/cluster.h 2008-01-25 11:39:06.956212841 +0300 -@@ -0,0 +1,409 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* This file contains size/offset translators, modulators -+ and other helper functions. 
*/ -+ -+#if !defined( __FS_REISER4_CLUSTER_H__ ) -+#define __FS_REISER4_CLUSTER_H__ -+ -+#include "../inode.h" -+ -+static inline int inode_cluster_shift(struct inode *inode) -+{ -+ assert("edward-92", inode != NULL); -+ assert("edward-93", reiser4_inode_data(inode) != NULL); -+ -+ return inode_cluster_plugin(inode)->shift; -+} -+ -+static inline unsigned cluster_nrpages_shift(struct inode *inode) -+{ -+ return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT; -+} -+ -+/* cluster size in page units */ -+static inline unsigned cluster_nrpages(struct inode *inode) -+{ -+ return 1U << cluster_nrpages_shift(inode); -+} -+ -+static inline size_t inode_cluster_size(struct inode *inode) -+{ -+ assert("edward-96", inode != NULL); -+ -+ return 1U << inode_cluster_shift(inode); -+} -+ -+static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode) -+{ -+ return idx >> cluster_nrpages_shift(inode); -+} -+ -+static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode) -+{ -+ return idx << cluster_nrpages_shift(inode); -+} -+ -+static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode) -+{ -+ return clust_to_pg(pg_to_clust(idx, inode), inode); -+} -+ -+static inline pgoff_t off_to_pg(loff_t off) -+{ -+ return (off >> PAGE_CACHE_SHIFT); -+} -+ -+static inline loff_t pg_to_off(pgoff_t idx) -+{ -+ return ((loff_t) (idx) << PAGE_CACHE_SHIFT); -+} -+ -+static inline cloff_t off_to_clust(loff_t off, struct inode *inode) -+{ -+ return off >> inode_cluster_shift(inode); -+} -+ -+static inline loff_t clust_to_off(cloff_t idx, struct inode *inode) -+{ -+ return (loff_t) idx << inode_cluster_shift(inode); -+} -+ -+static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode) -+{ -+ return clust_to_off(off_to_clust(off, inode), inode); -+} -+ -+static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode) -+{ -+ return clust_to_pg(off_to_clust(off, inode), inode); -+} -+ -+static inline unsigned off_to_pgoff(loff_t off) -+{ -+ return off & 
(PAGE_CACHE_SIZE - 1); -+} -+ -+static inline unsigned off_to_cloff(loff_t off, struct inode *inode) -+{ -+ return off & ((loff_t) (inode_cluster_size(inode)) - 1); -+} -+ -+static inline pgoff_t offset_in_clust(struct page * page) -+{ -+ assert("edward-1488", page != NULL); -+ assert("edward-1489", page->mapping != NULL); -+ -+ return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1); -+} -+ -+static inline int first_page_in_cluster(struct page * page) -+{ -+ return offset_in_clust(page) == 0; -+} -+ -+static inline int last_page_in_cluster(struct page * page) -+{ -+ return offset_in_clust(page) == -+ cluster_nrpages(page->mapping->host) - 1; -+} -+ -+static inline unsigned -+pg_to_off_to_cloff(unsigned long idx, struct inode *inode) -+{ -+ return off_to_cloff(pg_to_off(idx), inode); -+} -+ -+/*********************** Size translators **************************/ -+ -+/* Translate linear size. -+ * New units are (1 << @blk_shift) times larger, then old ones. -+ * In other words, calculate number of logical blocks, occupied -+ * by @count elements -+ */ -+static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits) -+{ -+ return (count + (1UL << blkbits) - 1) >> blkbits; -+} -+ -+/* size in pages */ -+static inline pgoff_t size_in_pages(loff_t size) -+{ -+ return size_in_blocks(size, PAGE_CACHE_SHIFT); -+} -+ -+/* size in logical clusters */ -+static inline cloff_t size_in_lc(loff_t size, struct inode *inode) -+{ -+ return size_in_blocks(size, inode_cluster_shift(inode)); -+} -+ -+/* size in pages to the size in page clusters */ -+static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode) -+{ -+ return size_in_blocks(size, cluster_nrpages_shift(inode)); -+} -+ -+/*********************** Size modulators ***************************/ -+ -+/* -+ Modulate linear size by nominated block size and offset. -+ -+ The "finite" function (which is zero almost everywhere). 
-+ How much is a height of the figure at a position @pos, -+ when trying to construct rectangle of height (1 << @blkbits), -+ and square @size. -+ -+ ****** -+ ******* -+ ******* -+ ******* -+ ----------> pos -+*/ -+static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits) -+{ -+ unsigned end = size >> blkbits; -+ if (pos < end) -+ return 1U << blkbits; -+ if (unlikely(pos > end)) -+ return 0; -+ return size & ~(~0ull << blkbits); -+} -+ -+/* the same as above, but block size is page size */ -+static inline unsigned __mbp(loff_t size, pgoff_t pos) -+{ -+ return __mbb(size, pos, PAGE_CACHE_SHIFT); -+} -+ -+/* number of file's bytes in the nominated logical cluster */ -+static inline unsigned lbytes(cloff_t index, struct inode * inode) -+{ -+ return __mbb(i_size_read(inode), index, inode_cluster_shift(inode)); -+} -+ -+/* number of file's bytes in the nominated page */ -+static inline unsigned pbytes(pgoff_t index, struct inode * inode) -+{ -+ return __mbp(i_size_read(inode), index); -+} -+ -+/** -+ * number of pages occuped by @win->count bytes starting from -+ * @win->off at logical cluster defined by @win. This is exactly -+ * a number of pages to be modified and dirtied in any cluster operation. 
-+ */ -+static inline pgoff_t win_count_to_nrpages(struct reiser4_slide * win) -+{ -+ return ((win->off + win->count + -+ (1UL << PAGE_CACHE_SHIFT) - 1) >> PAGE_CACHE_SHIFT) - -+ off_to_pg(win->off); -+} -+ -+/* return true, if logical cluster is not occupied by the file */ -+static inline int new_logical_cluster(struct cluster_handle * clust, -+ struct inode *inode) -+{ -+ return clust_to_off(clust->index, inode) >= i_size_read(inode); -+} -+ -+/* return true, if pages @p1 and @p2 are of the same page cluster */ -+static inline int same_page_cluster(struct page * p1, struct page * p2) -+{ -+ assert("edward-1490", p1 != NULL); -+ assert("edward-1491", p2 != NULL); -+ assert("edward-1492", p1->mapping != NULL); -+ assert("edward-1493", p2->mapping != NULL); -+ -+ return (pg_to_clust(page_index(p1), p1->mapping->host) == -+ pg_to_clust(page_index(p2), p2->mapping->host)); -+} -+ -+static inline int cluster_is_complete(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ return clust->tc.lsize == inode_cluster_size(inode); -+} -+ -+static inline void reiser4_slide_init(struct reiser4_slide * win) -+{ -+ assert("edward-1084", win != NULL); -+ memset(win, 0, sizeof *win); -+} -+ -+static inline tfm_action -+cluster_get_tfm_act(struct tfm_cluster * tc) -+{ -+ assert("edward-1356", tc != NULL); -+ return tc->act; -+} -+ -+static inline void -+cluster_set_tfm_act(struct tfm_cluster * tc, tfm_action act) -+{ -+ assert("edward-1356", tc != NULL); -+ tc->act = act; -+} -+ -+static inline void cluster_init_act(struct cluster_handle * clust, -+ tfm_action act, -+ struct reiser4_slide * window) -+{ -+ assert("edward-84", clust != NULL); -+ memset(clust, 0, sizeof *clust); -+ cluster_set_tfm_act(&clust->tc, act); -+ clust->dstat = INVAL_DISK_CLUSTER; -+ clust->win = window; -+} -+ -+static inline void cluster_init_read(struct cluster_handle * clust, -+ struct reiser4_slide * window) -+{ -+ cluster_init_act (clust, TFMA_READ, window); -+} -+ -+static inline void 
cluster_init_write(struct cluster_handle * clust, -+ struct reiser4_slide * window) -+{ -+ cluster_init_act (clust, TFMA_WRITE, window); -+} -+ -+/* true if @p1 and @p2 are items of the same disk cluster */ -+static inline int same_disk_cluster(const coord_t * p1, const coord_t * p2) -+{ -+ /* drop this if you have other items to aggregate */ -+ assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID); -+ -+ return item_plugin_by_coord(p1)->b.mergeable(p1, p2); -+} -+ -+static inline int dclust_get_extension_dsize(hint_t * hint) -+{ -+ return hint->ext_coord.extension.ctail.dsize; -+} -+ -+static inline void dclust_set_extension_dsize(hint_t * hint, int dsize) -+{ -+ hint->ext_coord.extension.ctail.dsize = dsize; -+} -+ -+static inline int dclust_get_extension_shift(hint_t * hint) -+{ -+ return hint->ext_coord.extension.ctail.shift; -+} -+ -+static inline int dclust_get_extension_ncount(hint_t * hint) -+{ -+ return hint->ext_coord.extension.ctail.ncount; -+} -+ -+static inline void dclust_inc_extension_ncount(hint_t * hint) -+{ -+ hint->ext_coord.extension.ctail.ncount ++; -+} -+ -+static inline void dclust_init_extension(hint_t * hint) -+{ -+ memset(&hint->ext_coord.extension.ctail, 0, -+ sizeof(hint->ext_coord.extension.ctail)); -+} -+ -+static inline int hint_is_unprepped_dclust(hint_t * hint) -+{ -+ assert("edward-1451", hint_is_valid(hint)); -+ return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT; -+} -+ -+static inline void coord_set_between_clusters(coord_t * coord) -+{ -+#if REISER4_DEBUG -+ int result; -+ result = zload(coord->node); -+ assert("edward-1296", !result); -+#endif -+ if (!coord_is_between_items(coord)) { -+ coord->between = AFTER_ITEM; -+ coord->unit_pos = 0; -+ } -+#if REISER4_DEBUG -+ zrelse(coord->node); -+#endif -+} -+ -+int reiser4_inflate_cluster(struct cluster_handle *, struct inode *); -+int find_disk_cluster(struct cluster_handle *, struct inode *, int read, -+ znode_lock_mode mode); -+int checkout_logical_cluster(struct 
cluster_handle *, jnode *, struct inode *); -+int reiser4_deflate_cluster(struct cluster_handle *, struct inode *); -+void truncate_complete_page_cluster(struct inode *inode, cloff_t start, -+ int even_cows); -+void invalidate_hint_cluster(struct cluster_handle * clust); -+int get_disk_cluster_locked(struct cluster_handle * clust, struct inode * inode, -+ znode_lock_mode lock_mode); -+void reset_cluster_params(struct cluster_handle * clust); -+int set_cluster_by_page(struct cluster_handle * clust, struct page * page, -+ int count); -+int prepare_page_cluster(struct inode *inode, struct cluster_handle * clust, -+ rw_op rw); -+void __put_page_cluster(int from, int count, -+ struct page ** pages, struct inode * inode); -+void put_page_cluster(struct cluster_handle * clust, -+ struct inode * inode, rw_op rw); -+void put_cluster_handle(struct cluster_handle * clust); -+int grab_tfm_stream(struct inode *inode, struct tfm_cluster * tc, tfm_stream_id id); -+int tfm_cluster_is_uptodate(struct tfm_cluster * tc); -+void tfm_cluster_set_uptodate(struct tfm_cluster * tc); -+void tfm_cluster_clr_uptodate(struct tfm_cluster * tc); -+ -+/* move cluster handle to the target position -+ specified by the page of index @pgidx */ -+static inline void move_cluster_forward(struct cluster_handle * clust, -+ struct inode *inode, -+ pgoff_t pgidx) -+{ -+ assert("edward-1297", clust != NULL); -+ assert("edward-1298", inode != NULL); -+ -+ reset_cluster_params(clust); -+ if (clust->index_valid && -+ /* Hole in the indices. 
Hint became invalid and can not be -+ used by find_cluster_item() even if seal/node versions -+ will coincide */ -+ pg_to_clust(pgidx, inode) != clust->index + 1) { -+ reiser4_unset_hint(clust->hint); -+ invalidate_hint_cluster(clust); -+ } -+ clust->index = pg_to_clust(pgidx, inode); -+ clust->index_valid = 1; -+} -+ -+static inline int alloc_clust_pages(struct cluster_handle * clust, -+ struct inode *inode) -+{ -+ assert("edward-791", clust != NULL); -+ assert("edward-792", inode != NULL); -+ clust->pages = -+ kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode), -+ reiser4_ctx_gfp_mask_get()); -+ if (!clust->pages) -+ return -ENOMEM; -+ return 0; -+} -+ -+static inline void free_clust_pages(struct cluster_handle * clust) -+{ -+ kfree(clust->pages); -+} -+ -+#endif /* __FS_REISER4_CLUSTER_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.24/fs/reiser4/plugin/compress/compress.c ---- linux-2.6.24.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/compress/compress.c 2008-01-25 11:39:06.956212841 +0300 -@@ -0,0 +1,367 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* reiser4 compression transform plugins */ -+ -+#include "../../debug.h" -+#include "../../inode.h" -+#include "../plugin.h" -+ -+#include -+#include -+#include -+#include -+ -+static int change_compression(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ assert("edward-1316", inode != NULL); -+ assert("edward-1317", plugin != NULL); -+ assert("edward-1318", is_reiser4_inode(inode)); -+ assert("edward-1319", -+ plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE); -+ -+ /* cannot change compression plugin of already existing regular object */ -+ if 
(!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE)) -+ return RETERR(-EINVAL); -+ -+ /* If matches, nothing to change. */ -+ if (inode_hash_plugin(inode) != NULL && -+ inode_hash_plugin(inode)->h.id == plugin->h.id) -+ return 0; -+ -+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, -+ PSET_COMPRESSION, plugin); -+} -+ -+static reiser4_plugin_ops compression_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = &change_compression -+}; -+ -+/******************************************************************************/ -+/* gzip1 compression */ -+/******************************************************************************/ -+ -+#define GZIP1_DEF_LEVEL Z_BEST_SPEED -+#define GZIP1_DEF_WINBITS 15 -+#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL -+ -+static int gzip1_init(void) -+{ -+ int ret = -EINVAL; -+#if REISER4_ZLIB -+ ret = 0; -+#endif -+ if (ret == -EINVAL) -+ warning("edward-1337", "Zlib not compiled into kernel"); -+ return ret; -+} -+ -+static int gzip1_overrun(unsigned src_len UNUSED_ARG) -+{ -+ return 0; -+} -+ -+static coa_t gzip1_alloc(tfm_action act) -+{ -+ coa_t coa = NULL; -+#if REISER4_ZLIB -+ int ret = 0; -+ switch (act) { -+ case TFMA_WRITE: /* compress */ -+ coa = reiser4_vmalloc(zlib_deflate_workspacesize()); -+ if (!coa) { -+ ret = -ENOMEM; -+ break; -+ } -+ break; -+ case TFMA_READ: /* decompress */ -+ coa = reiser4_vmalloc(zlib_inflate_workspacesize()); -+ if (!coa) { -+ ret = -ENOMEM; -+ break; -+ } -+ break; -+ default: -+ impossible("edward-767", -+ "trying to alloc workspace for unknown tfm action"); -+ } -+ if (ret) { -+ warning("edward-768", -+ "alloc workspace for gzip1 (tfm action = %d) failed\n", -+ act); -+ return ERR_PTR(ret); -+ } -+#endif -+ return coa; -+} -+ -+static void gzip1_free(coa_t coa, tfm_action act) -+{ -+ assert("edward-769", coa != NULL); -+ -+ switch (act) { -+ case TFMA_WRITE: /* compress */ -+ vfree(coa); -+ break; -+ case TFMA_READ: /* 
decompress */ -+ vfree(coa); -+ break; -+ default: -+ impossible("edward-770", "unknown tfm action"); -+ } -+ return; -+} -+ -+static int gzip1_min_size_deflate(void) -+{ -+ return 64; -+} -+ -+static void -+gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len) -+{ -+#if REISER4_ZLIB -+ int ret = 0; -+ struct z_stream_s stream; -+ -+ assert("edward-842", coa != NULL); -+ assert("edward-875", src_len != 0); -+ -+ stream.workspace = coa; -+ ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED, -+ -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL, -+ Z_DEFAULT_STRATEGY); -+ if (ret != Z_OK) { -+ warning("edward-771", "zlib_deflateInit2 returned %d\n", ret); -+ goto rollback; -+ } -+ ret = zlib_deflateReset(&stream); -+ if (ret != Z_OK) { -+ warning("edward-772", "zlib_deflateReset returned %d\n", ret); -+ goto rollback; -+ } -+ stream.next_in = src_first; -+ stream.avail_in = src_len; -+ stream.next_out = dst_first; -+ stream.avail_out = *dst_len; -+ -+ ret = zlib_deflate(&stream, Z_FINISH); -+ if (ret != Z_STREAM_END) { -+ if (ret != Z_OK) -+ warning("edward-773", -+ "zlib_deflate returned %d\n", ret); -+ goto rollback; -+ } -+ *dst_len = stream.total_out; -+ return; -+ rollback: -+ *dst_len = src_len; -+#endif -+ return; -+} -+ -+static void -+gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len) -+{ -+#if REISER4_ZLIB -+ int ret = 0; -+ struct z_stream_s stream; -+ -+ assert("edward-843", coa != NULL); -+ assert("edward-876", src_len != 0); -+ -+ stream.workspace = coa; -+ ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS); -+ if (ret != Z_OK) { -+ warning("edward-774", "zlib_inflateInit2 returned %d\n", ret); -+ return; -+ } -+ ret = zlib_inflateReset(&stream); -+ if (ret != Z_OK) { -+ warning("edward-775", "zlib_inflateReset returned %d\n", ret); -+ return; -+ } -+ -+ stream.next_in = src_first; -+ stream.avail_in = src_len; -+ stream.next_out = dst_first; -+ 
stream.avail_out = *dst_len; -+ -+ ret = zlib_inflate(&stream, Z_SYNC_FLUSH); -+ /* -+ * Work around a bug in zlib, which sometimes wants to taste an extra -+ * byte when being used in the (undocumented) raw deflate mode. -+ * (From USAGI). -+ */ -+ if (ret == Z_OK && !stream.avail_in && stream.avail_out) { -+ u8 zerostuff = 0; -+ stream.next_in = &zerostuff; -+ stream.avail_in = 1; -+ ret = zlib_inflate(&stream, Z_FINISH); -+ } -+ if (ret != Z_STREAM_END) { -+ warning("edward-776", "zlib_inflate returned %d\n", ret); -+ return; -+ } -+ *dst_len = stream.total_out; -+#endif -+ return; -+} -+ -+/******************************************************************************/ -+/* lzo1 compression */ -+/******************************************************************************/ -+ -+static int lzo1_init(void) -+{ -+ return 0; -+} -+ -+static int lzo1_overrun(unsigned in_len) -+{ -+ return in_len / 64 + 16 + 3; -+} -+ -+static coa_t lzo1_alloc(tfm_action act) -+{ -+ int ret = 0; -+ coa_t coa = NULL; -+ -+ switch (act) { -+ case TFMA_WRITE: /* compress */ -+ coa = reiser4_vmalloc(LZO1X_1_MEM_COMPRESS); -+ if (!coa) { -+ ret = -ENOMEM; -+ break; -+ } -+ case TFMA_READ: /* decompress */ -+ break; -+ default: -+ impossible("edward-877", -+ "trying to alloc workspace for unknown tfm action"); -+ } -+ if (ret) { -+ warning("edward-878", -+ "alloc workspace for lzo1 (tfm action = %d) failed\n", -+ act); -+ return ERR_PTR(ret); -+ } -+ return coa; -+} -+ -+static void lzo1_free(coa_t coa, tfm_action act) -+{ -+ assert("edward-879", coa != NULL); -+ -+ switch (act) { -+ case TFMA_WRITE: /* compress */ -+ vfree(coa); -+ break; -+ case TFMA_READ: /* decompress */ -+ impossible("edward-1304", -+ "trying to free non-allocated workspace"); -+ default: -+ impossible("edward-880", "unknown tfm action"); -+ } -+ return; -+} -+ -+static int lzo1_min_size_deflate(void) -+{ -+ return 256; -+} -+ -+static void -+lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * 
dst_first, unsigned *dst_len) -+{ -+ int result; -+ -+ assert("edward-846", coa != NULL); -+ assert("edward-847", src_len != 0); -+ -+ result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa); -+ if (unlikely(result != LZO_E_OK)) { -+ warning("edward-849", "lzo1x_1_compress failed\n"); -+ goto out; -+ } -+ if (*dst_len >= src_len) { -+ //warning("edward-850", "lzo1x_1_compress: incompressible data\n"); -+ goto out; -+ } -+ return; -+ out: -+ *dst_len = src_len; -+ return; -+} -+ -+static void -+lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len) -+{ -+ int result; -+ -+ assert("edward-851", coa == NULL); -+ assert("edward-852", src_len != 0); -+ -+ result = lzo1x_decompress_safe(src_first, src_len, dst_first, dst_len); -+ if (result != LZO_E_OK) -+ warning("edward-853", "lzo1x_1_decompress failed\n"); -+ return; -+} -+ -+compression_plugin compression_plugins[LAST_COMPRESSION_ID] = { -+ [LZO1_COMPRESSION_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .id = LZO1_COMPRESSION_ID, -+ .pops = &compression_plugin_ops, -+ .label = "lzo1", -+ .desc = "lzo1 compression transform", -+ .linkage = {NULL, NULL} -+ }, -+ .init = lzo1_init, -+ .overrun = lzo1_overrun, -+ .alloc = lzo1_alloc, -+ .free = lzo1_free, -+ .min_size_deflate = lzo1_min_size_deflate, -+ .checksum = reiser4_adler32, -+ .compress = lzo1_compress, -+ .decompress = lzo1_decompress -+ }, -+ [GZIP1_COMPRESSION_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .id = GZIP1_COMPRESSION_ID, -+ .pops = &compression_plugin_ops, -+ .label = "gzip1", -+ .desc = "gzip1 compression transform", -+ .linkage = {NULL, NULL} -+ }, -+ .init = gzip1_init, -+ .overrun = gzip1_overrun, -+ .alloc = gzip1_alloc, -+ .free = gzip1_free, -+ .min_size_deflate = gzip1_min_size_deflate, -+ .checksum = reiser4_adler32, -+ .compress = gzip1_compress, -+ .decompress = gzip1_decompress -+ } -+}; -+ -+/* -+ Local variables: -+ 
c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.24/fs/reiser4/plugin/compress/compress.h ---- linux-2.6.24.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/compress/compress.h 2008-01-25 11:39:06.956212841 +0300 -@@ -0,0 +1,43 @@ -+#if !defined( __FS_REISER4_COMPRESS_H__ ) -+#define __FS_REISER4_COMPRESS_H__ -+ -+#include -+#include -+ -+/* transform direction */ -+typedef enum { -+ TFMA_READ, /* decrypt, decompress */ -+ TFMA_WRITE, /* encrypt, compress */ -+ TFMA_LAST -+} tfm_action; -+ -+/* supported compression algorithms */ -+typedef enum { -+ LZO1_COMPRESSION_ID, -+ GZIP1_COMPRESSION_ID, -+ LAST_COMPRESSION_ID, -+} reiser4_compression_id; -+ -+/* the same as pgoff, but units are page clusters */ -+typedef unsigned long cloff_t; -+ -+/* working data of a (de)compression algorithm */ -+typedef void *coa_t; -+ -+/* table for all supported (de)compression algorithms */ -+typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST]; -+ -+__u32 reiser4_adler32(char *data, __u32 len); -+ -+#endif /* __FS_REISER4_COMPRESS_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.24/fs/reiser4/plugin/compress/compress_mode.c ---- linux-2.6.24.orig/fs/reiser4/plugin/compress/compress_mode.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/compress/compress_mode.c 2008-01-25 11:39:06.956212841 +0300 -@@ -0,0 +1,162 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* This file contains Reiser4 compression mode plugins. 
-+ -+ Compression mode plugin is a set of handlers called by compressor -+ at flush time and represent some heuristics including the ones -+ which are to avoid compression of incompressible data, see -+ http://www.namesys.com/cryptcompress_design.html for more details. -+*/ -+#include "../../inode.h" -+#include "../plugin.h" -+ -+static int should_deflate_none(struct inode * inode, cloff_t index) -+{ -+ return 0; -+} -+ -+static int should_deflate_common(struct inode * inode, cloff_t index) -+{ -+ return compression_is_on(cryptcompress_inode_data(inode)); -+} -+ -+static int discard_hook_ultim(struct inode *inode, cloff_t index) -+{ -+ turn_off_compression(cryptcompress_inode_data(inode)); -+ return 0; -+} -+ -+static int discard_hook_lattd(struct inode *inode, cloff_t index) -+{ -+ struct cryptcompress_info * info = cryptcompress_inode_data(inode); -+ -+ assert("edward-1462", -+ get_lattice_factor(info) >= MIN_LATTICE_FACTOR && -+ get_lattice_factor(info) <= MAX_LATTICE_FACTOR); -+ -+ turn_off_compression(info); -+ if (get_lattice_factor(info) < MAX_LATTICE_FACTOR) -+ set_lattice_factor(info, get_lattice_factor(info) << 1); -+ return 0; -+} -+ -+static int accept_hook_lattd(struct inode *inode, cloff_t index) -+{ -+ turn_on_compression(cryptcompress_inode_data(inode)); -+ set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR); -+ return 0; -+} -+ -+/* Check on dynamic lattice, the adaptive compression modes which -+ defines the following behavior: -+ -+ Compression is on: try to compress everything and turn -+ it off, whenever cluster is incompressible. -+ -+ Compression is off: try to compress clusters of indexes -+ k * FACTOR (k = 0, 1, 2, ...) and turn it on, if some of -+ them is compressible. If incompressible, then increase FACTOR */ -+ -+/* check if @index belongs to one-dimensional lattice -+ of sparce factor @factor */ -+static int is_on_lattice(cloff_t index, int factor) -+{ -+ return (factor ? 
index % factor == 0: index == 0); -+} -+ -+static int should_deflate_lattd(struct inode * inode, cloff_t index) -+{ -+ return should_deflate_common(inode, index) || -+ is_on_lattice(index, -+ get_lattice_factor -+ (cryptcompress_inode_data(inode))); -+} -+ -+/* compression mode_plugins */ -+compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = { -+ [NONE_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = NONE_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "none", -+ .desc = "Compress nothing", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = should_deflate_none, -+ .accept_hook = NULL, -+ .discard_hook = NULL -+ }, -+ /* Check-on-dynamic-lattice adaptive compression mode */ -+ [LATTD_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = LATTD_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "lattd", -+ .desc = "Check on dynamic lattice", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = should_deflate_lattd, -+ .accept_hook = accept_hook_lattd, -+ .discard_hook = discard_hook_lattd -+ }, -+ /* Check-ultimately compression mode: -+ Turn off compression forever as soon as we meet -+ incompressible data */ -+ [ULTIM_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = ULTIM_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "ultim", -+ .desc = "Check ultimately", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = should_deflate_common, -+ .accept_hook = NULL, -+ .discard_hook = discard_hook_ultim -+ }, -+ /* Force-to-compress-everything compression mode */ -+ [FORCE_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = FORCE_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "force", -+ .desc = "Force to compress everything", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = NULL, -+ .accept_hook = NULL, -+ .discard_hook = NULL -+ }, -+ /* Convert-to-extent 
compression mode. -+ In this mode items will be converted to extents and management -+ will be passed to (classic) unix file plugin as soon as ->write() -+ detects that the first complete logical cluster (of index #0) is -+ incompressible. */ -+ [CONVX_COMPRESSION_MODE_ID] = { -+ .h = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .id = CONVX_COMPRESSION_MODE_ID, -+ .pops = NULL, -+ .label = "conv", -+ .desc = "Convert to extent", -+ .linkage = {NULL, NULL} -+ }, -+ .should_deflate = should_deflate_common, -+ .accept_hook = NULL, -+ .discard_hook = NULL -+ } -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.24/fs/reiser4/plugin/compress/Makefile ---- linux-2.6.24.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/compress/Makefile 2008-01-25 11:39:06.956212841 +0300 -@@ -0,0 +1,5 @@ -+obj-$(CONFIG_REISER4_FS) += compress_plugins.o -+ -+compress_plugins-objs := \ -+ compress.o \ -+ compress_mode.o -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.24/fs/reiser4/plugin/crypto/cipher.c ---- linux-2.6.24.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/crypto/cipher.c 2008-01-25 11:39:06.956212841 +0300 -@@ -0,0 +1,37 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, -+ licensing governed by reiser4/README */ -+/* Reiser4 cipher transform plugins */ -+ -+#include "../../debug.h" -+#include "../plugin.h" -+ -+cipher_plugin cipher_plugins[LAST_CIPHER_ID] = { -+ [NONE_CIPHER_ID] = { -+ .h = { -+ .type_id = REISER4_CIPHER_PLUGIN_TYPE, -+ .id = NONE_CIPHER_ID, -+ .pops = NULL, -+ .label = "none", -+ .desc = "no cipher transform", -+ .linkage = {NULL, NULL} -+ }, -+ .alloc = NULL, -+ .free = NULL, -+ .scale = NULL, -+ 
.align_stream = NULL, -+ .setkey = NULL, -+ .encrypt = NULL, -+ .decrypt = NULL -+ } -+}; -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.24/fs/reiser4/plugin/crypto/cipher.h ---- linux-2.6.24.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/crypto/cipher.h 2008-01-25 11:39:06.956212841 +0300 -@@ -0,0 +1,55 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* This file contains definitions for the objects operated -+ by reiser4 key manager, which is something like keyring -+ wrapped by appropriate reiser4 plugin */ -+ -+#if !defined( __FS_REISER4_CRYPT_H__ ) -+#define __FS_REISER4_CRYPT_H__ -+ -+#include -+ -+/* key info imported from user space */ -+struct reiser4_crypto_data { -+ int keysize; /* uninstantiated key size */ -+ __u8 * key; /* uninstantiated key */ -+ int keyid_size; /* size of passphrase */ -+ __u8 * keyid; /* passphrase */ -+}; -+ -+/* This object contains all needed infrastructure to implement -+ cipher transform. This is operated (allocating, inheriting, -+ validating, binding to host inode, etc..) by reiser4 key manager. -+ -+ This info can be allocated in two cases: -+ 1. importing a key from user space. -+ 2. reading inode from disk */ -+struct reiser4_crypto_info { -+ struct inode * host; -+ struct crypto_hash * digest; -+ struct crypto_blkcipher * cipher; -+#if 0 -+ cipher_key_plugin * kplug; /* key manager */ -+#endif -+ __u8 * keyid; /* key fingerprint, created by digest plugin, -+ using uninstantiated key and passphrase. 
-+ supposed to be stored in disk stat-data */ -+ int inst; /* this indicates if the cipher key is -+ instantiated (case 1 above) */ -+ int keysize; /* uninstantiated key size (bytes), supposed -+ to be stored in disk stat-data */ -+ int keyload_count; /* number of the objects which has this -+ crypto-stat attached */ -+}; -+ -+#endif /* __FS_REISER4_CRYPT_H__ */ -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.24/fs/reiser4/plugin/crypto/digest.c ---- linux-2.6.24.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/crypto/digest.c 2008-01-25 11:39:06.956212841 +0300 -@@ -0,0 +1,58 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */ -+/* EDWARD-FIXME-HANS: and it does what? a digest is a what? 
*/ -+#include "../../debug.h" -+#include "../plugin_header.h" -+#include "../plugin.h" -+#include "../file/cryptcompress.h" -+ -+#include -+ -+extern digest_plugin digest_plugins[LAST_DIGEST_ID]; -+ -+static struct crypto_hash * alloc_sha256 (void) -+{ -+#if REISER4_SHA256 -+ return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC); -+#else -+ warning("edward-1418", "sha256 unsupported"); -+ return ERR_PTR(-EINVAL); -+#endif -+} -+ -+static void free_sha256 (struct crypto_hash * tfm) -+{ -+#if REISER4_SHA256 -+ crypto_free_hash(tfm); -+#endif -+ return; -+} -+ -+/* digest plugins */ -+digest_plugin digest_plugins[LAST_DIGEST_ID] = { -+ [SHA256_32_DIGEST_ID] = { -+ .h = { -+ .type_id = REISER4_DIGEST_PLUGIN_TYPE, -+ .id = SHA256_32_DIGEST_ID, -+ .pops = NULL, -+ .label = "sha256_32", -+ .desc = "sha256_32 digest transform", -+ .linkage = {NULL, NULL} -+ }, -+ .fipsize = sizeof(__u32), -+ .alloc = alloc_sha256, -+ .free = free_sha256 -+ } -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.24/fs/reiser4/plugin/dir/dir.h ---- linux-2.6.24.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/dir/dir.h 2008-01-25 11:39:06.960213871 +0300 -@@ -0,0 +1,36 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* this file contains declarations of methods implementing directory plugins */ -+ -+#if !defined( __REISER4_DIR_H__ ) -+#define __REISER4_DIR_H__ -+ -+/*#include "../../key.h" -+ -+#include */ -+ -+/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */ -+ -+/* "hashed" directory methods of dir plugin */ -+void build_entry_key_hashed(const struct inode *, const struct qstr *, -+ reiser4_key *); -+ -+/* declarations of functions implementing 
SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */ -+ -+/* "seekable" directory methods of dir plugin */ -+void build_entry_key_seekable(const struct inode *, const struct qstr *, -+ reiser4_key *); -+ -+/* __REISER4_DIR_H__ */ -+#endif -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.24/fs/reiser4/plugin/dir/hashed_dir.c ---- linux-2.6.24.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/dir/hashed_dir.c 2008-01-25 11:39:06.960213871 +0300 -@@ -0,0 +1,81 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file -+ names to the files. */ -+ -+/* -+ * Hashed directory logically consists of persistent directory -+ * entries. Directory entry is a pair of a file name and a key of stat-data of -+ * a file that has this name in the given directory. -+ * -+ * Directory entries are stored in the tree in the form of directory -+ * items. Directory item should implement dir_entry_ops portion of item plugin -+ * interface (see plugin/item/item.h). Hashed directory interacts with -+ * directory item plugin exclusively through dir_entry_ops operations. -+ * -+ * Currently there are two implementations of directory items: "simple -+ * directory item" (plugin/item/sde.[ch]), and "compound directory item" -+ * (plugin/item/cde.[ch]) with the latter being the default. -+ * -+ * There is, however some delicate way through which directory code interferes -+ * with item plugin: key assignment policy. A key for a directory item is -+ * chosen by directory code, and as described in kassign.c, this key contains -+ * a portion of file name. 
Directory item uses this knowledge to avoid storing -+ * this portion of file name twice: in the key and in the directory item body. -+ * -+ */ -+ -+#include "../../inode.h" -+ -+void complete_entry_key(const struct inode *, const char *name, -+ int len, reiser4_key * result); -+ -+/* this is implementation of build_entry_key method of dir -+ plugin for HASHED_DIR_PLUGIN_ID -+ */ -+void build_entry_key_hashed(const struct inode *dir, /* directory where entry is -+ * (or will be) in.*/ -+ const struct qstr *qname, /* name of file referenced -+ * by this entry */ -+ reiser4_key * result /* resulting key of directory -+ * entry */ ) -+{ -+ const char *name; -+ int len; -+ -+ assert("nikita-1139", dir != NULL); -+ assert("nikita-1140", qname != NULL); -+ assert("nikita-1141", qname->name != NULL); -+ assert("nikita-1142", result != NULL); -+ -+ name = qname->name; -+ len = qname->len; -+ -+ assert("nikita-2867", strlen(name) == len); -+ -+ reiser4_key_init(result); -+ /* locality of directory entry's key is objectid of parent -+ directory */ -+ set_key_locality(result, get_inode_oid(dir)); -+ /* minor packing locality is constant */ -+ set_key_type(result, KEY_FILE_NAME_MINOR); -+ /* dot is special case---we always want it to be first entry in -+ a directory. Actually, we just want to have smallest -+ directory entry. 
-+ */ -+ if (len == 1 && name[0] == '.') -+ return; -+ -+ /* initialize part of entry key which depends on file name */ -+ complete_entry_key(dir, name, len, result); -+} -+ -+/* Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.24/fs/reiser4/plugin/dir/Makefile ---- linux-2.6.24.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/dir/Makefile 2008-01-25 11:39:06.960213871 +0300 -@@ -0,0 +1,5 @@ -+obj-$(CONFIG_REISER4_FS) += dir_plugins.o -+ -+dir_plugins-objs := \ -+ hashed_dir.o \ -+ seekable_dir.o -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.24/fs/reiser4/plugin/dir/seekable_dir.c ---- linux-2.6.24.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/dir/seekable_dir.c 2008-01-25 11:39:06.960213871 +0300 -@@ -0,0 +1,46 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "../../inode.h" -+ -+/* this is implementation of build_entry_key method of dir -+ plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID -+ This is for directories where we want repeatable and restartable readdir() -+ even in case 32bit user level struct dirent (readdir(3)). 
-+*/ -+void -+build_entry_key_seekable(const struct inode *dir, const struct qstr *name, -+ reiser4_key * result) -+{ -+ oid_t objectid; -+ -+ assert("nikita-2283", dir != NULL); -+ assert("nikita-2284", name != NULL); -+ assert("nikita-2285", name->name != NULL); -+ assert("nikita-2286", result != NULL); -+ -+ reiser4_key_init(result); -+ /* locality of directory entry's key is objectid of parent -+ directory */ -+ set_key_locality(result, get_inode_oid(dir)); -+ /* minor packing locality is constant */ -+ set_key_type(result, KEY_FILE_NAME_MINOR); -+ /* dot is special case---we always want it to be first entry in -+ a directory. Actually, we just want to have smallest -+ directory entry. -+ */ -+ if ((name->len == 1) && (name->name[0] == '.')) -+ return; -+ -+ /* objectid of key is 31 lowest bits of hash. */ -+ objectid = -+ inode_hash_plugin(dir)->hash(name->name, -+ (int)name->len) & 0x7fffffff; -+ -+ assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK)); -+ set_key_objectid(result, objectid); -+ -+ /* offset is always 0. 
*/ -+ set_key_offset(result, (__u64) 0); -+ return; -+} -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.24/fs/reiser4/plugin/dir_plugin_common.c ---- linux-2.6.24.orig/fs/reiser4/plugin/dir_plugin_common.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/dir_plugin_common.c 2008-01-25 11:39:06.964214902 +0300 -@@ -0,0 +1,872 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* this file contains typical implementations for most of methods of -+ directory plugin -+*/ -+ -+#include "../inode.h" -+ -+int reiser4_find_entry(struct inode *dir, struct dentry *name, -+ lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *); -+int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key); -+void check_light_weight(struct inode *inode, struct inode *parent); -+ -+/* this is common implementation of get_parent method of dir plugin -+ this is used by NFS kernel server to "climb" up directory tree to -+ check permissions -+ */ -+struct dentry *get_parent_common(struct inode *child) -+{ -+ struct super_block *s; -+ struct inode *parent; -+ struct dentry dotdot; -+ struct dentry *dentry; -+ reiser4_key key; -+ int result; -+ -+ /* -+ * lookup dotdot entry. -+ */ -+ -+ s = child->i_sb; -+ memset(&dotdot, 0, sizeof(dotdot)); -+ dotdot.d_name.name = ".."; -+ dotdot.d_name.len = 2; -+ dotdot.d_op = &get_super_private(s)->ops.dentry; -+ -+ result = reiser4_lookup_name(child, &dotdot, &key); -+ if (result != 0) -+ return ERR_PTR(result); -+ -+ parent = reiser4_iget(s, &key, 1); -+ if (!IS_ERR(parent)) { -+ /* -+ * FIXME-NIKITA dubious: attributes are inherited from @child -+ * to @parent. But: -+ * -+ * (*) this is the only this we can do -+ * -+ * (*) attributes of light-weight object are inherited -+ * from a parent through which object was looked up first, -+ * so it is ambiguous anyway. 
-+ * -+ */ -+ check_light_weight(parent, child); -+ reiser4_iget_complete(parent); -+ dentry = d_alloc_anon(parent); -+ if (dentry == NULL) { -+ iput(parent); -+ dentry = ERR_PTR(RETERR(-ENOMEM)); -+ } else -+ dentry->d_op = &get_super_private(s)->ops.dentry; -+ } else if (PTR_ERR(parent) == -ENOENT) -+ dentry = ERR_PTR(RETERR(-ESTALE)); -+ else -+ dentry = (void *)parent; -+ return dentry; -+} -+ -+/* this is common implementation of is_name_acceptable method of dir -+ plugin -+ */ -+int is_name_acceptable_common(const struct inode *inode, /* directory to check */ -+ const char *name UNUSED_ARG, /* name to check */ -+ int len /* @name's length */ ) -+{ -+ assert("nikita-733", inode != NULL); -+ assert("nikita-734", name != NULL); -+ assert("nikita-735", len > 0); -+ -+ return len <= reiser4_max_filename_len(inode); -+} -+ -+/* there is no common implementation of build_entry_key method of dir -+ plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or -+ plugin/dir/seekable.c:build_entry_key_seekable() for example -+*/ -+ -+/* this is common implementation of build_readdir_key method of dir -+ plugin -+ see reiser4_readdir_common for more details -+*/ -+int build_readdir_key_common(struct file *dir /* directory being read */ , -+ reiser4_key * result /* where to store key */ ) -+{ -+ reiser4_file_fsdata *fdata; -+ struct inode *inode; -+ -+ assert("nikita-1361", dir != NULL); -+ assert("nikita-1362", result != NULL); -+ assert("nikita-1363", dir->f_dentry != NULL); -+ inode = dir->f_dentry->d_inode; -+ assert("nikita-1373", inode != NULL); -+ -+ fdata = reiser4_get_file_fsdata(dir); -+ if (IS_ERR(fdata)) -+ return PTR_ERR(fdata); -+ assert("nikita-1364", fdata != NULL); -+ return extract_key_from_de_id(get_inode_oid(inode), -+ &fdata->dir.readdir.position. 
-+ dir_entry_key, result); -+ -+} -+ -+void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset, -+ int adj); -+ -+/* this is common implementation of add_entry method of dir plugin -+*/ -+int reiser4_add_entry_common(struct inode *object, /* directory to add new name -+ * in */ -+ struct dentry *where, /* new name */ -+ reiser4_object_create_data * data, /* parameters of -+ * new object */ -+ reiser4_dir_entry_desc * entry /* parameters of -+ * new directory -+ * entry */) -+{ -+ int result; -+ coord_t *coord; -+ lock_handle lh; -+ struct reiser4_dentry_fsdata *fsdata; -+ reiser4_block_nr reserve; -+ -+ assert("nikita-1114", object != NULL); -+ assert("nikita-1250", where != NULL); -+ -+ fsdata = reiser4_get_dentry_fsdata(where); -+ if (unlikely(IS_ERR(fsdata))) -+ return PTR_ERR(fsdata); -+ -+ reserve = inode_dir_plugin(object)->estimate.add_entry(object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ -+ init_lh(&lh); -+ coord = &fsdata->dec.entry_coord; -+ coord_clear_iplug(coord); -+ -+ /* check for this entry in a directory. This is plugin method. */ -+ result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK, -+ entry); -+ if (likely(result == -ENOENT)) { -+ /* add new entry. Just pass control to the directory -+ item plugin. 
*/ -+ assert("nikita-1709", inode_dir_item_plugin(object)); -+ assert("nikita-2230", coord->node == lh.node); -+ reiser4_seal_done(&fsdata->dec.entry_seal); -+ result = -+ inode_dir_item_plugin(object)->s.dir.add_entry(object, -+ coord, &lh, -+ where, -+ entry); -+ if (result == 0) { -+ reiser4_adjust_dir_file(object, where, -+ fsdata->dec.pos + 1, +1); -+ INODE_INC_FIELD(object, i_size); -+ } -+ } else if (result == 0) { -+ assert("nikita-2232", coord->node == lh.node); -+ result = RETERR(-EEXIST); -+ } -+ done_lh(&lh); -+ -+ return result; -+} -+ -+/** -+ * rem_entry - remove entry from directory item -+ * @dir: -+ * @dentry: -+ * @entry: -+ * @coord: -+ * @lh: -+ * -+ * Checks that coordinate @coord is set properly and calls item plugin -+ * method to cut entry. -+ */ -+static int -+rem_entry(struct inode *dir, struct dentry *dentry, -+ reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh) -+{ -+ item_plugin *iplug; -+ struct inode *child; -+ -+ iplug = inode_dir_item_plugin(dir); -+ child = dentry->d_inode; -+ assert("nikita-3399", child != NULL); -+ -+ /* check that we are really destroying an entry for @child */ -+ if (REISER4_DEBUG) { -+ int result; -+ reiser4_key key; -+ -+ result = iplug->s.dir.extract_key(coord, &key); -+ if (result != 0) -+ return result; -+ if (get_key_objectid(&key) != get_inode_oid(child)) { -+ warning("nikita-3397", -+ "rem_entry: %#llx != %#llx\n", -+ get_key_objectid(&key), -+ (unsigned long long)get_inode_oid(child)); -+ return RETERR(-EIO); -+ } -+ } -+ return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry); -+} -+ -+/** -+ * reiser4_rem_entry_common - remove entry from a directory -+ * @dir: directory to remove entry from -+ * @where: name that is being removed -+ * @entry: description of entry being removed -+ * -+ * This is common implementation of rem_entry method of dir plugin. 
-+ */ -+int reiser4_rem_entry_common(struct inode *dir, -+ struct dentry *dentry, -+ reiser4_dir_entry_desc *entry) -+{ -+ int result; -+ coord_t *coord; -+ lock_handle lh; -+ struct reiser4_dentry_fsdata *fsdata; -+ __u64 tograb; -+ -+ assert("nikita-1124", dir != NULL); -+ assert("nikita-1125", dentry != NULL); -+ -+ tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir); -+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED); -+ if (result != 0) -+ return RETERR(-ENOSPC); -+ -+ init_lh(&lh); -+ -+ /* check for this entry in a directory. This is plugin method. */ -+ result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry); -+ fsdata = reiser4_get_dentry_fsdata(dentry); -+ if (IS_ERR(fsdata)) { -+ done_lh(&lh); -+ return PTR_ERR(fsdata); -+ } -+ -+ coord = &fsdata->dec.entry_coord; -+ -+ assert("nikita-3404", -+ get_inode_oid(dentry->d_inode) != get_inode_oid(dir) || -+ dir->i_size <= 1); -+ -+ coord_clear_iplug(coord); -+ if (result == 0) { -+ /* remove entry. Just pass control to the directory item -+ plugin. */ -+ assert("vs-542", inode_dir_item_plugin(dir)); -+ reiser4_seal_done(&fsdata->dec.entry_seal); -+ reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1); -+ result = -+ WITH_COORD(coord, -+ rem_entry(dir, dentry, entry, coord, &lh)); -+ if (result == 0) { -+ if (dir->i_size >= 1) -+ INODE_DEC_FIELD(dir, i_size); -+ else { -+ warning("nikita-2509", "Dir %llu is runt", -+ (unsigned long long) -+ get_inode_oid(dir)); -+ result = RETERR(-EIO); -+ } -+ -+ assert("nikita-3405", dentry->d_inode->i_nlink != 1 || -+ dentry->d_inode->i_size != 2 || -+ inode_dir_plugin(dentry->d_inode) == NULL); -+ } -+ } -+ done_lh(&lh); -+ -+ return result; -+} -+ -+static reiser4_block_nr estimate_init(struct inode *parent, -+ struct inode *object); -+static int create_dot_dotdot(struct inode *object, struct inode *parent); -+ -+/* this is common implementation of init method of dir plugin -+ create "." and ".." 
entries -+*/ -+int reiser4_dir_init_common(struct inode *object, /* new directory */ -+ struct inode *parent, /* parent directory */ -+ reiser4_object_create_data * data /* info passed -+ * to us, this -+ * is filled by -+ * reiser4() -+ * syscall in -+ * particular */) -+{ -+ reiser4_block_nr reserve; -+ -+ assert("nikita-680", object != NULL); -+ assert("nikita-681", S_ISDIR(object->i_mode)); -+ assert("nikita-682", parent != NULL); -+ assert("nikita-684", data != NULL); -+ assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID); -+ assert("nikita-687", object->i_mode & S_IFDIR); -+ -+ reserve = estimate_init(parent, object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ -+ return create_dot_dotdot(object, parent); -+} -+ -+/* this is common implementation of done method of dir plugin -+ remove "." entry -+*/ -+int reiser4_dir_done_common(struct inode *object /* object being deleted */ ) -+{ -+ int result; -+ reiser4_block_nr reserve; -+ struct dentry goodby_dots; -+ reiser4_dir_entry_desc entry; -+ -+ assert("nikita-1449", object != NULL); -+ -+ if (reiser4_inode_get_flag(object, REISER4_NO_SD)) -+ return 0; -+ -+ /* of course, this can be rewritten to sweep everything in one -+ reiser4_cut_tree(). 
*/ -+ memset(&entry, 0, sizeof entry); -+ -+ /* FIXME: this done method is called from reiser4_delete_dir_common which -+ * reserved space already */ -+ reserve = inode_dir_plugin(object)->estimate.rem_entry(object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED)) -+ return RETERR(-ENOSPC); -+ -+ memset(&goodby_dots, 0, sizeof goodby_dots); -+ entry.obj = goodby_dots.d_inode = object; -+ goodby_dots.d_name.name = "."; -+ goodby_dots.d_name.len = 1; -+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry); -+ reiser4_free_dentry_fsdata(&goodby_dots); -+ if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT)) -+ /* only worth a warning -+ -+ "values of B will give rise to dom!\n" -+ -- v6src/s2/mv.c:89 -+ */ -+ warning("nikita-2252", "Cannot remove dot of %lli: %i", -+ (unsigned long long)get_inode_oid(object), result); -+ return 0; -+} -+ -+/* this is common implementation of attach method of dir plugin -+*/ -+int reiser4_attach_common(struct inode *child UNUSED_ARG, -+ struct inode *parent UNUSED_ARG) -+{ -+ assert("nikita-2647", child != NULL); -+ assert("nikita-2648", parent != NULL); -+ -+ return 0; -+} -+ -+/* this is common implementation of detach method of dir plugin -+ remove "..", decrease nlink on parent -+*/ -+int reiser4_detach_common(struct inode *object, struct inode *parent) -+{ -+ int result; -+ struct dentry goodby_dots; -+ reiser4_dir_entry_desc entry; -+ -+ assert("nikita-2885", object != NULL); -+ assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD)); -+ -+ memset(&entry, 0, sizeof entry); -+ -+ /* NOTE-NIKITA this only works if @parent is -the- parent of -+ @object, viz. object whose key is stored in dotdot -+ entry. Wouldn't work with hard-links on directories. 
*/ -+ memset(&goodby_dots, 0, sizeof goodby_dots); -+ entry.obj = goodby_dots.d_inode = parent; -+ goodby_dots.d_name.name = ".."; -+ goodby_dots.d_name.len = 2; -+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry); -+ reiser4_free_dentry_fsdata(&goodby_dots); -+ if (result == 0) { -+ /* the dot should be the only entry remaining at this time... */ -+ assert("nikita-3400", -+ object->i_size == 1 && object->i_nlink <= 2); -+#if 0 -+ /* and, together with the only name directory can have, they -+ * provides for the last 2 remaining references. If we get -+ * here as part of error handling during mkdir, @object -+ * possibly has no name yet, so its nlink == 1. If we get here -+ * from rename (targeting empty directory), it has no name -+ * already, so its nlink == 1. */ -+ assert("nikita-3401", -+ object->i_nlink == 2 || object->i_nlink == 1); -+#endif -+ -+ /* decrement nlink of directory removed ".." pointed -+ to */ -+ reiser4_del_nlink(parent, NULL, 0); -+ } -+ return result; -+} -+ -+/* this is common implementation of estimate.add_entry method of -+ dir plugin -+ estimation of adding entry which supposes that entry is inserting a -+ unit into item -+*/ -+reiser4_block_nr estimate_add_entry_common(const struct inode * inode) -+{ -+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode)); -+} -+ -+/* this is common implementation of estimate.rem_entry method of dir -+ plugin -+*/ -+reiser4_block_nr estimate_rem_entry_common(const struct inode * inode) -+{ -+ return estimate_one_item_removal(reiser4_tree_by_inode(inode)); -+} -+ -+/* this is common implementation of estimate.unlink method of dir -+ plugin -+*/ -+reiser4_block_nr -+dir_estimate_unlink_common(const struct inode * parent, -+ const struct inode * object) -+{ -+ reiser4_block_nr res; -+ -+ /* hashed_rem_entry(object) */ -+ res = inode_dir_plugin(object)->estimate.rem_entry(object); -+ /* del_nlink(parent) */ -+ res += 2 * inode_file_plugin(parent)->estimate.update(parent); -+ -+ 
return res; -+} -+ -+/* -+ * helper for inode_ops ->lookup() and dir plugin's ->get_parent() -+ * methods: if @inode is a light-weight file, setup its credentials -+ * that are not stored in the stat-data in this case -+ */ -+void check_light_weight(struct inode *inode, struct inode *parent) -+{ -+ if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) { -+ inode->i_uid = parent->i_uid; -+ inode->i_gid = parent->i_gid; -+ /* clear light-weight flag. If inode would be read by any -+ other name, [ug]id wouldn't change. */ -+ reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT); -+ } -+} -+ -+/* looks for name specified in @dentry in directory @parent and if name is -+ found - key of object found entry points to is stored in @entry->key */ -+int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup for -+ * name in */ -+ struct dentry *dentry, /* name to look for */ -+ reiser4_key * key /* place to store key */ ) -+{ -+ int result; -+ coord_t *coord; -+ lock_handle lh; -+ const char *name; -+ int len; -+ reiser4_dir_entry_desc entry; -+ struct reiser4_dentry_fsdata *fsdata; -+ -+ assert("nikita-1247", parent != NULL); -+ assert("nikita-1248", dentry != NULL); -+ assert("nikita-1123", dentry->d_name.name != NULL); -+ assert("vs-1486", -+ dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry); -+ -+ name = dentry->d_name.name; -+ len = dentry->d_name.len; -+ -+ if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len)) -+ /* some arbitrary error code to return */ -+ return RETERR(-ENAMETOOLONG); -+ -+ fsdata = reiser4_get_dentry_fsdata(dentry); -+ if (IS_ERR(fsdata)) -+ return PTR_ERR(fsdata); -+ -+ coord = &fsdata->dec.entry_coord; -+ coord_clear_iplug(coord); -+ init_lh(&lh); -+ -+ /* find entry in a directory. This is plugin method. */ -+ result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK, -+ &entry); -+ if (result == 0) { -+ /* entry was found, extract object key from it. 
*/ -+ result = -+ WITH_COORD(coord, -+ item_plugin_by_coord(coord)->s.dir. -+ extract_key(coord, key)); -+ } -+ done_lh(&lh); -+ return result; -+ -+} -+ -+/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */ -+static reiser4_block_nr -+estimate_init(struct inode *parent, struct inode *object) -+{ -+ reiser4_block_nr res = 0; -+ -+ assert("vpf-321", parent != NULL); -+ assert("vpf-322", object != NULL); -+ -+ /* hashed_add_entry(object) */ -+ res += inode_dir_plugin(object)->estimate.add_entry(object); -+ /* reiser4_add_nlink(object) */ -+ res += inode_file_plugin(object)->estimate.update(object); -+ /* hashed_add_entry(object) */ -+ res += inode_dir_plugin(object)->estimate.add_entry(object); -+ /* reiser4_add_nlink(parent) */ -+ res += inode_file_plugin(parent)->estimate.update(parent); -+ -+ return 0; -+} -+ -+/* helper function for reiser4_dir_init_common(). Create "." and ".." */ -+static int create_dot_dotdot(struct inode *object /* object to create dot and -+ * dotdot for */ , -+ struct inode *parent /* parent of @object */) -+{ -+ int result; -+ struct dentry dots_entry; -+ reiser4_dir_entry_desc entry; -+ -+ assert("nikita-688", object != NULL); -+ assert("nikita-689", S_ISDIR(object->i_mode)); -+ assert("nikita-691", parent != NULL); -+ -+ /* We store dot and dotdot as normal directory entries. This is -+ not necessary, because almost all information stored in them -+ is already in the stat-data of directory, the only thing -+ being missed is objectid of grand-parent directory that can -+ easily be added there as extension. -+ -+ But it is done the way it is done, because not storing dot -+ and dotdot will lead to the following complications: -+ -+ . special case handling in ->lookup(). -+ . addition of another extension to the sd. -+ . dependency on key allocation policy for stat data. 
-+ -+ */ -+ -+ memset(&entry, 0, sizeof entry); -+ memset(&dots_entry, 0, sizeof dots_entry); -+ entry.obj = dots_entry.d_inode = object; -+ dots_entry.d_name.name = "."; -+ dots_entry.d_name.len = 1; -+ result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry); -+ reiser4_free_dentry_fsdata(&dots_entry); -+ -+ if (result == 0) { -+ result = reiser4_add_nlink(object, object, 0); -+ if (result == 0) { -+ entry.obj = dots_entry.d_inode = parent; -+ dots_entry.d_name.name = ".."; -+ dots_entry.d_name.len = 2; -+ result = reiser4_add_entry_common(object, -+ &dots_entry, NULL, &entry); -+ reiser4_free_dentry_fsdata(&dots_entry); -+ /* if creation of ".." failed, iput() will delete -+ object with ".". */ -+ if (result == 0) { -+ result = reiser4_add_nlink(parent, object, 0); -+ if (result != 0) -+ /* -+ * if we failed to bump i_nlink, try -+ * to remove ".." -+ */ -+ reiser4_detach_common(object, parent); -+ } -+ } -+ } -+ -+ if (result != 0) { -+ /* -+ * in the case of error, at least update stat-data so that, -+ * ->i_nlink updates are not lingering. -+ */ -+ reiser4_update_sd(object); -+ reiser4_update_sd(parent); -+ } -+ -+ return result; -+} -+ -+/* -+ * return 0 iff @coord contains a directory entry for the file with the name -+ * @name. 
-+ */ -+static int -+check_item(const struct inode *dir, const coord_t * coord, const char *name) -+{ -+ item_plugin *iplug; -+ char buf[DE_NAME_BUF_LEN]; -+ -+ iplug = item_plugin_by_coord(coord); -+ if (iplug == NULL) { -+ warning("nikita-1135", "Cannot get item plugin"); -+ print_coord("coord", coord, 1); -+ return RETERR(-EIO); -+ } else if (item_id_by_coord(coord) != -+ item_id_by_plugin(inode_dir_item_plugin(dir))) { -+ /* item id of current item does not match to id of items a -+ directory is built of */ -+ warning("nikita-1136", "Wrong item plugin"); -+ print_coord("coord", coord, 1); -+ return RETERR(-EIO); -+ } -+ assert("nikita-1137", iplug->s.dir.extract_name); -+ -+ /* Compare name stored in this entry with name we are looking for. -+ -+ NOTE-NIKITA Here should go code for support of something like -+ unicode, code tables, etc. -+ */ -+ return !!strcmp(name, iplug->s.dir.extract_name(coord, buf)); -+} -+ -+static int -+check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name) -+{ -+ return WITH_COORD(coord, check_item(dir, coord, name->name)); -+} -+ -+/* -+ * argument package used by entry_actor to scan entries with identical keys. -+ */ -+struct entry_actor_args { -+ /* name we are looking for */ -+ const char *name; -+ /* key of directory entry. entry_actor() scans through sequence of -+ * items/units having the same key */ -+ reiser4_key *key; -+ /* how many entries with duplicate key was scanned so far. 
*/ -+ int non_uniq; -+#if REISER4_USE_COLLISION_LIMIT -+ /* scan limit */ -+ int max_non_uniq; -+#endif -+ /* return parameter: set to true, if ->name wasn't found */ -+ int not_found; -+ /* what type of lock to take when moving to the next node during -+ * scan */ -+ znode_lock_mode mode; -+ -+ /* last coord that was visited during scan */ -+ coord_t last_coord; -+ /* last node locked during scan */ -+ lock_handle last_lh; -+ /* inode of directory */ -+ const struct inode *inode; -+}; -+ -+/* Function called by reiser4_find_entry() to look for given name -+ in the directory. */ -+static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ , -+ coord_t * coord /* current coord */ , -+ lock_handle * lh /* current lock handle */ , -+ void *entry_actor_arg /* argument to scan */ ) -+{ -+ reiser4_key unit_key; -+ struct entry_actor_args *args; -+ -+ assert("nikita-1131", tree != NULL); -+ assert("nikita-1132", coord != NULL); -+ assert("nikita-1133", entry_actor_arg != NULL); -+ -+ args = entry_actor_arg; -+ ++args->non_uniq; -+#if REISER4_USE_COLLISION_LIMIT -+ if (args->non_uniq > args->max_non_uniq) { -+ args->not_found = 1; -+ /* hash collision overflow. */ -+ return RETERR(-EBUSY); -+ } -+#endif -+ -+ /* -+ * did we just reach the end of the sequence of items/units with -+ * identical keys? -+ */ -+ if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) { -+ assert("nikita-1791", -+ keylt(args->key, unit_key_by_coord(coord, &unit_key))); -+ args->not_found = 1; -+ args->last_coord.between = AFTER_UNIT; -+ return 0; -+ } -+ -+ coord_dup(&args->last_coord, coord); -+ /* -+ * did scan just moved to the next node? 
-+ */ -+ if (args->last_lh.node != lh->node) { -+ int lock_result; -+ -+ /* -+ * if so, lock new node with the mode requested by the caller -+ */ -+ done_lh(&args->last_lh); -+ assert("nikita-1896", znode_is_any_locked(lh->node)); -+ lock_result = longterm_lock_znode(&args->last_lh, lh->node, -+ args->mode, ZNODE_LOCK_HIPRI); -+ if (lock_result != 0) -+ return lock_result; -+ } -+ return check_item(args->inode, coord, args->name); -+} -+ -+/* Look for given @name within directory @dir. -+ -+ This is called during lookup, creation and removal of directory -+ entries and on reiser4_rename_common -+ -+ First calculate key that directory entry for @name would have. Search -+ for this key in the tree. If such key is found, scan all items with -+ the same key, checking name in each directory entry along the way. -+*/ -+int reiser4_find_entry(struct inode *dir, /* directory to scan */ -+ struct dentry *de, /* name to search for */ -+ lock_handle * lh, /* resulting lock handle */ -+ znode_lock_mode mode, /* required lock mode */ -+ reiser4_dir_entry_desc * entry /* parameters of found -+ directory entry */) -+{ -+ const struct qstr *name; -+ seal_t *seal; -+ coord_t *coord; -+ int result; -+ __u32 flags; -+ struct de_location *dec; -+ struct reiser4_dentry_fsdata *fsdata; -+ -+ assert("nikita-1130", lh != NULL); -+ assert("nikita-1128", dir != NULL); -+ -+ name = &de->d_name; -+ assert("nikita-1129", name != NULL); -+ -+ /* dentry private data don't require lock, because dentry -+ manipulations are protected by i_mutex on parent. -+ -+ This is not so for inodes, because there is no -the- parent in -+ inode case. 
-+ */ -+ fsdata = reiser4_get_dentry_fsdata(de); -+ if (IS_ERR(fsdata)) -+ return PTR_ERR(fsdata); -+ dec = &fsdata->dec; -+ -+ coord = &dec->entry_coord; -+ coord_clear_iplug(coord); -+ seal = &dec->entry_seal; -+ /* compose key of directory entry for @name */ -+ inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key); -+ -+ if (reiser4_seal_is_set(seal)) { -+ /* check seal */ -+ result = reiser4_seal_validate(seal, coord, &entry->key, -+ lh, mode, ZNODE_LOCK_LOPRI); -+ if (result == 0) { -+ /* key was found. Check that it is really item we are -+ looking for. */ -+ result = check_entry(dir, coord, name); -+ if (result == 0) -+ return 0; -+ } -+ } -+ flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0; -+ /* -+ * find place in the tree where directory item should be located. -+ */ -+ result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode, -+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, -+ flags, NULL /*ra_info */ ); -+ if (result == CBK_COORD_FOUND) { -+ struct entry_actor_args arg; -+ -+ /* fast path: no hash collisions */ -+ result = check_entry(dir, coord, name); -+ if (result == 0) { -+ reiser4_seal_init(seal, coord, &entry->key); -+ dec->pos = 0; -+ } else if (result > 0) { -+ /* Iterate through all units with the same keys. */ -+ arg.name = name->name; -+ arg.key = &entry->key; -+ arg.not_found = 0; -+ arg.non_uniq = 0; -+#if REISER4_USE_COLLISION_LIMIT -+ arg.max_non_uniq = max_hash_collisions(dir); -+ assert("nikita-2851", arg.max_non_uniq > 1); -+#endif -+ arg.mode = mode; -+ arg.inode = dir; -+ coord_init_zero(&arg.last_coord); -+ init_lh(&arg.last_lh); -+ -+ result = reiser4_iterate_tree -+ (reiser4_tree_by_inode(dir), -+ coord, lh, -+ entry_actor, &arg, mode, 1); -+ /* if end of the tree or extent was reached during -+ scanning. 
*/ -+ if (arg.not_found || (result == -E_NO_NEIGHBOR)) { -+ /* step back */ -+ done_lh(lh); -+ -+ result = zload(arg.last_coord.node); -+ if (result == 0) { -+ coord_clear_iplug(&arg.last_coord); -+ coord_dup(coord, &arg.last_coord); -+ move_lh(lh, &arg.last_lh); -+ result = RETERR(-ENOENT); -+ zrelse(arg.last_coord.node); -+ --arg.non_uniq; -+ } -+ } -+ -+ done_lh(&arg.last_lh); -+ if (result == 0) -+ reiser4_seal_init(seal, coord, &entry->key); -+ -+ if (result == 0 || result == -ENOENT) { -+ assert("nikita-2580", arg.non_uniq > 0); -+ dec->pos = arg.non_uniq - 1; -+ } -+ } -+ } else -+ dec->pos = -1; -+ return result; -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format40.c ---- linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format40.c 2008-01-25 11:39:06.964214902 +0300 -@@ -0,0 +1,655 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../key.h" -+#include "../node/node.h" -+#include "../space/space_allocator.h" -+#include "disk_format40.h" -+#include "../plugin.h" -+#include "../../txnmgr.h" -+#include "../../jnode.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../../wander.h" -+#include "../../inode.h" -+#include "../../ktxnmgrd.h" -+#include "../../status_flags.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block */ -+#include -+ -+/* reiser 4.0 default disk layout */ -+ -+/* Amount of free blocks needed to perform release_format40 when fs gets -+ mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header -+ & tx record. 
*/ -+#define RELEASE_RESERVED 4 -+ -+/* The greatest supported format40 version number */ -+#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION -+ -+/* This flag indicates that backup should be updated -+ (the update is performed by fsck) */ -+#define FORMAT40_UPDATE_BACKUP (1 << 31) -+ -+/* functions to access fields of format40_disk_super_block */ -+static __u64 get_format40_block_count(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->block_count)); -+} -+ -+static __u64 get_format40_free_blocks(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->free_blocks)); -+} -+ -+static __u64 get_format40_root_block(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->root_block)); -+} -+ -+static __u16 get_format40_tree_height(const format40_disk_super_block * sb) -+{ -+ return le16_to_cpu(get_unaligned(&sb->tree_height)); -+} -+ -+static __u64 get_format40_file_count(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->file_count)); -+} -+ -+static __u64 get_format40_oid(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->oid)); -+} -+ -+static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb) -+{ -+ return le32_to_cpu(get_unaligned(&sb->mkfs_id)); -+} -+ -+static __u64 get_format40_flags(const format40_disk_super_block * sb) -+{ -+ return le64_to_cpu(get_unaligned(&sb->flags)); -+} -+ -+static __u32 get_format40_version(const format40_disk_super_block * sb) -+{ -+ return le32_to_cpu(get_unaligned(&sb->version)) & -+ ~FORMAT40_UPDATE_BACKUP; -+} -+ -+static int update_backup_version(const format40_disk_super_block * sb) -+{ -+ return (le32_to_cpu(get_unaligned(&sb->version)) & -+ FORMAT40_UPDATE_BACKUP); -+} -+ -+static int update_disk_version(const format40_disk_super_block * sb) -+{ -+ return (get_format40_version(sb) < FORMAT40_VERSION); -+} -+ -+static int incomplete_compatibility(const 
format40_disk_super_block * sb) -+{ -+ return (get_format40_version(sb) > FORMAT40_VERSION); -+} -+ -+static format40_super_info *get_sb_info(struct super_block *super) -+{ -+ return &get_super_private(super)->u.format40; -+} -+ -+static int consult_diskmap(struct super_block *s) -+{ -+ format40_super_info *info; -+ journal_location *jloc; -+ -+ info = get_sb_info(s); -+ jloc = &get_super_private(s)->jloc; -+ /* Default format-specific locations, if there is nothing in -+ * diskmap */ -+ jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR; -+ jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR; -+ info->loc.super = FORMAT40_OFFSET / s->s_blocksize; -+#ifdef CONFIG_REISER4_BADBLOCKS -+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF, -+ &jloc->footer); -+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH, -+ &jloc->header); -+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER, -+ &info->loc.super); -+#endif -+ return 0; -+} -+ -+/* find any valid super block of disk_format40 (even if the first -+ super block is destroyed), will change block numbers of actual journal header/footer (jf/jh) -+ if needed */ -+static struct buffer_head *find_a_disk_format40_super_block(struct super_block -+ *s) -+{ -+ struct buffer_head *super_bh; -+ format40_disk_super_block *disk_sb; -+ format40_super_info *info; -+ -+ assert("umka-487", s != NULL); -+ -+ info = get_sb_info(s); -+ -+ super_bh = sb_bread(s, info->loc.super); -+ if (super_bh == NULL) -+ return ERR_PTR(RETERR(-EIO)); -+ -+ disk_sb = (format40_disk_super_block *) super_bh->b_data; -+ if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) { -+ brelse(super_bh); -+ return ERR_PTR(RETERR(-EINVAL)); -+ } -+ -+ reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count))); -+ reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) - -+ le64_to_cpu(get_unaligned(&disk_sb->free_blocks))); -+ reiser4_set_free_blocks(s, 
le64_to_cpu(get_unaligned(&disk_sb->free_blocks))); -+ -+ return super_bh; -+} -+ -+/* find the most recent version of super block. This is called after journal is -+ replayed */ -+static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG) -+{ -+ /* Here the most recent superblock copy has to be read. However, as -+ journal replay isn't complete, we are using -+ find_a_disk_format40_super_block() function. */ -+ return find_a_disk_format40_super_block(s); -+} -+ -+static int get_super_jnode(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ jnode *sb_jnode; -+ int ret; -+ -+ sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super); -+ -+ ret = jload(sb_jnode); -+ -+ if (ret) { -+ reiser4_drop_io_head(sb_jnode); -+ return ret; -+ } -+ -+ pin_jnode_data(sb_jnode); -+ jrelse(sb_jnode); -+ -+ sbinfo->u.format40.sb_jnode = sb_jnode; -+ -+ return 0; -+} -+ -+static void done_super_jnode(struct super_block *s) -+{ -+ jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode; -+ -+ if (sb_jnode) { -+ unpin_jnode_data(sb_jnode); -+ reiser4_drop_io_head(sb_jnode); -+ } -+} -+ -+typedef enum format40_init_stage { -+ NONE_DONE = 0, -+ CONSULT_DISKMAP, -+ FIND_A_SUPER, -+ INIT_JOURNAL_INFO, -+ INIT_STATUS, -+ JOURNAL_REPLAY, -+ READ_SUPER, -+ KEY_CHECK, -+ INIT_OID, -+ INIT_TREE, -+ JOURNAL_RECOVER, -+ INIT_SA, -+ INIT_JNODE, -+ ALL_DONE -+} format40_init_stage; -+ -+static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh) -+{ -+ format40_disk_super_block *sb_copy; -+ -+ sb_copy = kmalloc(sizeof(format40_disk_super_block), -+ reiser4_ctx_gfp_mask_get()); -+ if (sb_copy == NULL) -+ return ERR_PTR(RETERR(-ENOMEM)); -+ memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data), -+ sizeof(format40_disk_super_block)); -+ return sb_copy; -+} -+ -+static int check_key_format(const format40_disk_super_block *sb_copy) -+{ -+ if (!equi(REISER4_LARGE_KEY, -+ get_format40_flags(sb_copy) & (1 << 
FORMAT40_LARGE_KEYS))) { -+ warning("nikita-3228", "Key format mismatch. " -+ "Only %s keys are supported.", -+ REISER4_LARGE_KEY ? "large" : "small"); -+ return RETERR(-EINVAL); -+ } -+ return 0; -+} -+ -+/** -+ * try_init_format40 -+ * @super: -+ * @stage: -+ * -+ */ -+static int try_init_format40(struct super_block *super, -+ format40_init_stage *stage) -+{ -+ int result; -+ struct buffer_head *super_bh; -+ reiser4_super_info_data *sbinfo; -+ format40_disk_super_block *sb_copy; -+ tree_level height; -+ reiser4_block_nr root_block; -+ node_plugin *nplug; -+ -+ assert("vs-475", super != NULL); -+ assert("vs-474", get_super_private(super)); -+ -+ *stage = NONE_DONE; -+ -+ result = consult_diskmap(super); -+ if (result) -+ return result; -+ *stage = CONSULT_DISKMAP; -+ -+ super_bh = find_a_disk_format40_super_block(super); -+ if (IS_ERR(super_bh)) -+ return PTR_ERR(super_bh); -+ brelse(super_bh); -+ *stage = FIND_A_SUPER; -+ -+ /* ok, we are sure that filesystem format is a format40 format */ -+ -+ /* map jnodes for journal control blocks (header, footer) to disk */ -+ result = reiser4_init_journal_info(super); -+ if (result) -+ return result; -+ *stage = INIT_JOURNAL_INFO; -+ -+ /* ok, we are sure that filesystem format is a format40 format */ -+ /* Now check it's state */ -+ result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR); -+ if (result != 0 && result != -EINVAL) -+ /* -EINVAL means there is no magic, so probably just old -+ * fs. 
*/ -+ return result; -+ *stage = INIT_STATUS; -+ -+ result = reiser4_status_query(NULL, NULL); -+ if (result == REISER4_STATUS_MOUNT_WARN) -+ notice("vpf-1363", "Warning: mounting %s with errors.", -+ super->s_id); -+ if (result == REISER4_STATUS_MOUNT_RO) -+ notice("vpf-1364", "Warning: mounting %s with fatal errors," -+ " forcing read-only mount.", super->s_id); -+ result = reiser4_journal_replay(super); -+ if (result) -+ return result; -+ *stage = JOURNAL_REPLAY; -+ -+ super_bh = read_super_block(super); -+ if (IS_ERR(super_bh)) -+ return PTR_ERR(super_bh); -+ *stage = READ_SUPER; -+ -+ /* allocate and make a copy of format40_disk_super_block */ -+ sb_copy = copy_sb(super_bh); -+ brelse(super_bh); -+ -+ if (IS_ERR(sb_copy)) -+ return PTR_ERR(sb_copy); -+ printk("reiser4: %s: found disk format 4.0.%u.\n", -+ super->s_id, -+ get_format40_version(sb_copy)); -+ if (incomplete_compatibility(sb_copy)) -+ printk("reiser4: Warning: The last completely supported " -+ "version of disk format40 is %u. 
Some objects of " -+ "the semantic tree can be unaccessible.\n", -+ FORMAT40_VERSION); -+ /* make sure that key format of kernel and filesystem match */ -+ result = check_key_format(sb_copy); -+ if (result) { -+ kfree(sb_copy); -+ return result; -+ } -+ *stage = KEY_CHECK; -+ -+ result = oid_init_allocator(super, get_format40_file_count(sb_copy), -+ get_format40_oid(sb_copy)); -+ if (result) { -+ kfree(sb_copy); -+ return result; -+ } -+ *stage = INIT_OID; -+ -+ /* get things necessary to init reiser4_tree */ -+ root_block = get_format40_root_block(sb_copy); -+ height = get_format40_tree_height(sb_copy); -+ nplug = node_plugin_by_id(NODE40_ID); -+ -+ /* initialize reiser4_super_info_data */ -+ sbinfo = get_super_private(super); -+ assert("", sbinfo->tree.super == super); -+ /* init reiser4_tree for the filesystem */ -+ result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug); -+ if (result) { -+ kfree(sb_copy); -+ return result; -+ } -+ *stage = INIT_TREE; -+ -+ /* -+ * initialize reiser4_super_info_data with data from format40 super -+ * block -+ */ -+ sbinfo->default_uid = 0; -+ sbinfo->default_gid = 0; -+ sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy); -+ /* number of blocks in filesystem and reserved space */ -+ reiser4_set_block_count(super, get_format40_block_count(sb_copy)); -+ sbinfo->blocks_free = get_format40_free_blocks(sb_copy); -+ sbinfo->version = get_format40_version(sb_copy); -+ kfree(sb_copy); -+ -+ if (update_backup_version(sb_copy)) -+ printk("reiser4: Warning: metadata backup is not updated. 
" -+ "Please run 'fsck.reiser4 --fix' on %s.\n", -+ super->s_id); -+ -+ sbinfo->fsuid = 0; -+ sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories -+ * are not supported */ -+ sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in -+ * layout 40 are -+ * of one -+ * plugin */ -+ /* sbinfo->tmgr is initialized already */ -+ -+ /* recover sb data which were logged separately from sb block */ -+ -+ /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls -+ * oid_init_allocator() and reiser4_set_free_blocks() with new -+ * data. What's the reason to call them above? */ -+ result = reiser4_journal_recover_sb_data(super); -+ if (result != 0) -+ return result; -+ *stage = JOURNAL_RECOVER; -+ -+ /* -+ * Set number of used blocks. The number of used blocks is not stored -+ * neither in on-disk super block nor in the journal footer blocks. At -+ * this moment actual values of total blocks and free block counters -+ * are set in the reiser4 super block (in-memory structure) and we can -+ * calculate number of used blocks from them. 
-+ */ -+ reiser4_set_data_blocks(super, -+ reiser4_block_count(super) - -+ reiser4_free_blocks(super)); -+ -+#if REISER4_DEBUG -+ sbinfo->min_blocks_used = 16 /* reserved area */ + -+ 2 /* super blocks */ + -+ 2 /* journal footer and header */ ; -+#endif -+ -+ /* init disk space allocator */ -+ result = sa_init_allocator(reiser4_get_space_allocator(super), -+ super, NULL); -+ if (result) -+ return result; -+ *stage = INIT_SA; -+ -+ result = get_super_jnode(super); -+ if (result == 0) -+ *stage = ALL_DONE; -+ return result; -+} -+ -+/* plugin->u.format.get_ready */ -+int init_format_format40(struct super_block *s, void *data UNUSED_ARG) -+{ -+ int result; -+ format40_init_stage stage; -+ -+ result = try_init_format40(s, &stage); -+ switch (stage) { -+ case ALL_DONE: -+ assert("nikita-3458", result == 0); -+ break; -+ case INIT_JNODE: -+ done_super_jnode(s); -+ case INIT_SA: -+ sa_destroy_allocator(reiser4_get_space_allocator(s), s); -+ case JOURNAL_RECOVER: -+ case INIT_TREE: -+ reiser4_done_tree(&get_super_private(s)->tree); -+ case INIT_OID: -+ case KEY_CHECK: -+ case READ_SUPER: -+ case JOURNAL_REPLAY: -+ case INIT_STATUS: -+ reiser4_status_finish(); -+ case INIT_JOURNAL_INFO: -+ reiser4_done_journal_info(s); -+ case FIND_A_SUPER: -+ case CONSULT_DISKMAP: -+ case NONE_DONE: -+ break; -+ default: -+ impossible("nikita-3457", "init stage: %i", stage); -+ } -+ -+ if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED) -+ return RETERR(-ENOSPC); -+ -+ return result; -+} -+ -+static void pack_format40_super(const struct super_block *s, char *data) -+{ -+ format40_disk_super_block *super_data = -+ (format40_disk_super_block *) data; -+ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ -+ assert("zam-591", data != NULL); -+ -+ put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)), -+ &super_data->free_blocks); -+ -+ put_unaligned(cpu_to_le64(sbinfo->tree.root_block), -+ &super_data->root_block); -+ -+ put_unaligned(cpu_to_le64(oid_next(s)), -+ 
&super_data->oid); -+ -+ put_unaligned(cpu_to_le64(oids_used(s)), -+ &super_data->file_count); -+ -+ put_unaligned(cpu_to_le16(sbinfo->tree.height), -+ &super_data->tree_height); -+ -+ if (update_disk_version(super_data)) { -+ __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP; -+ -+ put_unaligned(cpu_to_le32(version), &super_data->version); -+ } -+} -+ -+/* plugin->u.format.log_super -+ return a jnode which should be added to transaction when the super block -+ gets logged */ -+jnode *log_super_format40(struct super_block *s) -+{ -+ jnode *sb_jnode; -+ -+ sb_jnode = get_super_private(s)->u.format40.sb_jnode; -+ -+ jload(sb_jnode); -+ -+ pack_format40_super(s, jdata(sb_jnode)); -+ -+ jrelse(sb_jnode); -+ -+ return sb_jnode; -+} -+ -+/* plugin->u.format.release */ -+int release_format40(struct super_block *s) -+{ -+ int ret; -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(s); -+ assert("zam-579", sbinfo != NULL); -+ -+ if (!rofs_super(s)) { -+ ret = reiser4_capture_super_block(s); -+ if (ret != 0) -+ warning("vs-898", -+ "reiser4_capture_super_block failed: %d", -+ ret); -+ -+ ret = txnmgr_force_commit_all(s, 1); -+ if (ret != 0) -+ warning("jmacd-74438", "txn_force failed: %d", ret); -+ -+ all_grabbed2free(); -+ } -+ -+ sa_destroy_allocator(&sbinfo->space_allocator, s); -+ reiser4_done_journal_info(s); -+ done_super_jnode(s); -+ -+ rcu_barrier(); -+ reiser4_done_tree(&sbinfo->tree); -+ /* call finish_rcu(), because some znode were "released" in -+ * reiser4_done_tree(). 
*/ -+ rcu_barrier(); -+ -+ return 0; -+} -+ -+#define FORMAT40_ROOT_LOCALITY 41 -+#define FORMAT40_ROOT_OBJECTID 42 -+ -+/* plugin->u.format.root_dir_key */ -+const reiser4_key *root_dir_key_format40(const struct super_block *super -+ UNUSED_ARG) -+{ -+ static const reiser4_key FORMAT40_ROOT_DIR_KEY = { -+ .el = { -+ __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR), -+#if REISER4_LARGE_KEY -+ ON_LARGE_KEY(0ull,) -+#endif -+ __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID), -+ 0ull -+ } -+ }; -+ -+ return &FORMAT40_ROOT_DIR_KEY; -+} -+ -+/* plugin->u.format.check_open. -+ Check the opened object for validness. For now it checks for the valid oid & -+ locality only, can be improved later and it its work may depend on the mount -+ options. */ -+int check_open_format40(const struct inode *object) -+{ -+ oid_t max, oid; -+ -+ max = oid_next(object->i_sb) - 1; -+ -+ /* Check the oid. */ -+ oid = get_inode_oid(object); -+ if (oid > max) { -+ warning("vpf-1360", "The object with the oid %llu " -+ "greater then the max used oid %llu found.", -+ (unsigned long long)oid, (unsigned long long)max); -+ -+ return RETERR(-EIO); -+ } -+ -+ /* Check the locality. */ -+ oid = reiser4_inode_data(object)->locality_id; -+ if (oid > max) { -+ warning("vpf-1361", "The object with the locality %llu " -+ "greater then the max used oid %llu found.", -+ (unsigned long long)oid, (unsigned long long)max); -+ -+ return RETERR(-EIO); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.format.version_update. -+ Perform all version update operations from the on-disk -+ format40_disk_super_block.version on disk to FORMAT40_VERSION. -+ */ -+int version_update_format40(struct super_block *super) { -+ txn_handle * trans; -+ lock_handle lh; -+ txn_atom *atom; -+ int ret; -+ -+ /* Nothing to do if RO mount or the on-disk version is not less. 
*/ -+ if (super->s_flags & MS_RDONLY) -+ return 0; -+ -+ if (get_super_private(super)->version >= FORMAT40_VERSION) -+ return 0; -+ -+ printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata " -+ "backup is left unchanged. Please run 'fsck.reiser4 --fix' " -+ "on %s to update it too.\n", FORMAT40_VERSION, super->s_id); -+ -+ /* Mark the uber znode dirty to call log_super on write_logs. */ -+ init_lh(&lh); -+ ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_HIPRI, &lh); -+ if (ret != 0) -+ return ret; -+ -+ znode_make_dirty(lh.node); -+ done_lh(&lh); -+ -+ /* Update the backup blocks. */ -+ -+ /* Force write_logs immediately. */ -+ trans = get_current_context()->trans; -+ atom = get_current_atom_locked(); -+ assert("vpf-1906", atom != NULL); -+ -+ spin_lock_txnh(trans); -+ return force_commit_atom(trans); -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format40.h ---- linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format40.h 2008-01-25 11:39:06.968215932 +0300 -@@ -0,0 +1,109 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* this file contains: -+ - definition of ondisk super block of standart disk layout for -+ reiser 4.0 (layout 40) -+ - definition of layout 40 specific portion of in-core super block -+ - declarations of functions implementing methods of layout plugin -+ for layout 40 -+ - declarations of functions used to get/set fields in layout 40 super block -+*/ -+ -+#ifndef __DISK_FORMAT40_H__ -+#define __DISK_FORMAT40_H__ -+ -+/* magic for default reiser4 layout */ -+#define FORMAT40_MAGIC "ReIsEr40FoRmAt" 
-+#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE) -+ -+#include "../../dformat.h" -+ -+#include /* for struct super_block */ -+ -+typedef enum { -+ FORMAT40_LARGE_KEYS -+} format40_flags; -+ -+/* ondisk super block for format 40. It is 512 bytes long */ -+typedef struct format40_disk_super_block { -+ /* 0 */ d64 block_count; -+ /* number of block in a filesystem */ -+ /* 8 */ d64 free_blocks; -+ /* number of free blocks */ -+ /* 16 */ d64 root_block; -+ /* filesystem tree root block */ -+ /* 24 */ d64 oid; -+ /* smallest free objectid */ -+ /* 32 */ d64 file_count; -+ /* number of files in a filesystem */ -+ /* 40 */ d64 flushes; -+ /* number of times super block was -+ flushed. Needed if format 40 -+ will have few super blocks */ -+ /* 48 */ d32 mkfs_id; -+ /* unique identifier of fs */ -+ /* 52 */ char magic[16]; -+ /* magic string ReIsEr40FoRmAt */ -+ /* 68 */ d16 tree_height; -+ /* height of filesystem tree */ -+ /* 70 */ d16 formatting_policy; -+ /* not used anymore */ -+ /* 72 */ d64 flags; -+ /* 80 */ d32 version; -+ /* on-disk format version number -+ initially assigned by mkfs as the greatest format40 -+ version number supported by reiser4progs and updated -+ in mount time in accordance with the greatest format40 -+ version number supported by kernel. -+ Is used by fsck to catch possible corruption and -+ for various compatibility issues */ -+ /* 84 */ char not_used[428]; -+} format40_disk_super_block; -+ -+/* format 40 specific part of reiser4_super_info_data */ -+typedef struct format40_super_info { -+/* format40_disk_super_block actual_sb; */ -+ jnode *sb_jnode; -+ struct { -+ reiser4_block_nr super; -+ } loc; -+} format40_super_info; -+ -+/* Defines for journal header and footer respectively. 
*/ -+#define FORMAT40_JOURNAL_HEADER_BLOCKNR \ -+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3) -+ -+#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \ -+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4) -+ -+#define FORMAT40_STATUS_BLOCKNR \ -+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5) -+ -+/* Diskmap declarations */ -+#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID)) -+#define FORMAT40_SUPER 1 -+#define FORMAT40_JH 2 -+#define FORMAT40_JF 3 -+ -+/* declarations of functions implementing methods of layout plugin for -+ format 40. The functions theirself are in disk_format40.c */ -+extern int init_format_format40(struct super_block *, void *data); -+extern const reiser4_key *root_dir_key_format40(const struct super_block *); -+extern int release_format40(struct super_block *s); -+extern jnode *log_super_format40(struct super_block *s); -+extern int check_open_format40(const struct inode *object); -+extern int version_update_format40(struct super_block *super); -+ -+/* __DISK_FORMAT40_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format.c ---- linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format.c 2008-01-25 11:39:06.968215932 +0300 -@@ -0,0 +1,38 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../debug.h" -+#include "../plugin_header.h" -+#include "disk_format40.h" -+#include "disk_format.h" -+#include "../plugin.h" -+ -+/* initialization of disk layout plugins */ -+disk_format_plugin format_plugins[LAST_FORMAT_ID] = { -+ [FORMAT40_ID] = { -+ .h = { -+ .type_id = REISER4_FORMAT_PLUGIN_TYPE, -+ .id = FORMAT40_ID, -+ .pops = NULL, -+ .label = "reiser40", -+ .desc = "standard disk layout for reiser40", -+ .linkage = {NULL, NULL} -+ }, -+ .init_format = init_format_format40, -+ .root_dir_key = root_dir_key_format40, -+ .release = release_format40, -+ .log_super = log_super_format40, -+ .check_open = check_open_format40, -+ .version_update = version_update_format40 -+ } -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format.h ---- linux-2.6.24.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/disk_format/disk_format.h 2008-01-25 11:39:06.968215932 +0300 -@@ -0,0 +1,27 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* identifiers for disk layouts, they are also used as indexes in array of disk -+ plugins */ -+ -+#if !defined( __REISER4_DISK_FORMAT_H__ ) -+#define __REISER4_DISK_FORMAT_H__ -+ -+typedef enum { -+ /* standard reiser4 disk layout plugin id */ -+ FORMAT40_ID, -+ LAST_FORMAT_ID -+} disk_format_id; -+ -+/* __REISER4_DISK_FORMAT_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.24/fs/reiser4/plugin/disk_format/Makefile ---- linux-2.6.24.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/disk_format/Makefile 2008-01-25 11:39:06.968215932 +0300 -@@ -0,0 +1,5 @@ -+obj-$(CONFIG_REISER4_FS) += df_plugins.o -+ -+df_plugins-objs := \ -+ disk_format40.o \ -+ disk_format.o -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/fibration.c linux-2.6.24/fs/reiser4/plugin/fibration.c ---- linux-2.6.24.orig/fs/reiser4/plugin/fibration.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/fibration.c 2008-01-25 11:39:06.968215932 +0300 -@@ -0,0 +1,175 @@ -+/* Copyright 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Directory fibrations */ -+ -+/* -+ * Suppose we have a 
directory tree with sources of some project. During -+ * compilation .o files are created within this tree. This makes access -+ * to the original source files less efficient, because source files are -+ * now "diluted" by object files: default directory plugin uses prefix -+ * of a file name as a part of the key for directory entry (and this -+ * part is also inherited by the key of file body). This means that -+ * foo.o will be located close to foo.c and foo.h in the tree. -+ * -+ * To avoid this effect directory plugin fill highest 7 (unused -+ * originally) bits of the second component of the directory entry key -+ * by bit-pattern depending on the file name (see -+ * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called -+ * "fibre". Fibre of the file name key is inherited by key of stat data -+ * and keys of file body (in the case of REISER4_LARGE_KEY). -+ * -+ * Fibre for a given file is chosen by per-directory fibration -+ * plugin. Names within given fibre are ordered lexicographically. -+ */ -+ -+#include "../debug.h" -+#include "plugin_header.h" -+#include "plugin.h" -+#include "../super.h" -+#include "../inode.h" -+ -+#include -+ -+static const int fibre_shift = 57; -+ -+#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift) -+ -+/* -+ * Trivial fibration: all files of directory are just ordered -+ * lexicographically. -+ */ -+static __u64 fibre_trivial(const struct inode *dir, const char *name, int len) -+{ -+ return FIBRE_NO(0); -+} -+ -+/* -+ * dot-o fibration: place .o files after all others. -+ */ -+static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len) -+{ -+ /* special treatment for .*\.o */ -+ if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.') -+ return FIBRE_NO(1); -+ else -+ return FIBRE_NO(0); -+} -+ -+/* -+ * ext.1 fibration: subdivide directory into 128 fibrations one for each -+ * 7bit extension character (file "foo.h" goes into fibre "h"), plus -+ * default fibre for the rest. 
-+ */ -+static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len) -+{ -+ if (len > 2 && name[len - 2] == '.') -+ return FIBRE_NO(name[len - 1]); -+ else -+ return FIBRE_NO(0); -+} -+ -+/* -+ * ext.3 fibration: try to separate files with different 3-character -+ * extensions from each other. -+ */ -+static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len) -+{ -+ if (len > 4 && name[len - 4] == '.') -+ return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]); -+ else -+ return FIBRE_NO(0); -+} -+ -+static int change_fibration(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ int result; -+ -+ assert("nikita-3503", inode != NULL); -+ assert("nikita-3504", plugin != NULL); -+ -+ assert("nikita-3505", is_reiser4_inode(inode)); -+ assert("nikita-3506", inode_dir_plugin(inode) != NULL); -+ assert("nikita-3507", -+ plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE); -+ -+ result = 0; -+ if (inode_fibration_plugin(inode) == NULL || -+ inode_fibration_plugin(inode)->h.id != plugin->h.id) { -+ if (is_dir_empty(inode) == 0) -+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset, -+ PSET_FIBRATION, plugin); -+ else -+ result = RETERR(-ENOTEMPTY); -+ -+ } -+ return result; -+} -+ -+static reiser4_plugin_ops fibration_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = change_fibration -+}; -+ -+/* fibration plugins */ -+fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = { -+ [FIBRATION_LEXICOGRAPHIC] = { -+ .h = { -+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_LEXICOGRAPHIC, -+ .pops = &fibration_plugin_ops, -+ .label = "lexicographic", -+ .desc = "no fibration", -+ .linkage = {NULL, NULL} -+ }, -+ .fibre = fibre_trivial -+ }, -+ [FIBRATION_DOT_O] = { -+ .h = { -+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_DOT_O, -+ .pops = &fibration_plugin_ops, -+ .label = "dot-o", -+ .desc = "fibrate .o files separately", -+ 
.linkage = {NULL, NULL} -+ }, -+ .fibre = fibre_dot_o -+ }, -+ [FIBRATION_EXT_1] = { -+ .h = { -+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_EXT_1, -+ .pops = &fibration_plugin_ops, -+ .label = "ext-1", -+ .desc = "fibrate file by single character extension", -+ .linkage = {NULL, NULL} -+ }, -+ .fibre = fibre_ext_1 -+ }, -+ [FIBRATION_EXT_3] = { -+ .h = { -+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE, -+ .id = FIBRATION_EXT_3, -+ .pops = &fibration_plugin_ops, -+ .label = "ext-3", -+ .desc = "fibrate file by three character extension", -+ .linkage = {NULL, NULL} -+ }, -+ .fibre = fibre_ext_3 -+ } -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/fibration.h linux-2.6.24/fs/reiser4/plugin/fibration.h ---- linux-2.6.24.orig/fs/reiser4/plugin/fibration.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/fibration.h 2008-01-25 11:39:06.968215932 +0300 -@@ -0,0 +1,37 @@ -+/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Fibration plugin used by hashed directory plugin to segment content -+ * of directory. See fs/reiser4/plugin/fibration.c for more on this. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ ) -+#define __FS_REISER4_PLUGIN_FIBRATION_H__ -+ -+#include "plugin_header.h" -+ -+typedef struct fibration_plugin { -+ /* generic fields */ -+ plugin_header h; -+ -+ __u64(*fibre) (const struct inode * dir, const char *name, int len); -+} fibration_plugin; -+ -+typedef enum { -+ FIBRATION_LEXICOGRAPHIC, -+ FIBRATION_DOT_O, -+ FIBRATION_EXT_1, -+ FIBRATION_EXT_3, -+ LAST_FIBRATION_ID -+} reiser4_fibration_id; -+ -+/* __FS_REISER4_PLUGIN_FIBRATION_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.24/fs/reiser4/plugin/file/cryptcompress.c ---- linux-2.6.24.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/file/cryptcompress.c 2008-01-25 11:40:16.690167725 +0300 -@@ -0,0 +1,3776 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ reiser4/README */ -+/* -+ * Written by Edward Shishkin. -+ * -+ * Implementations of inode/file/address_space operations -+ * specific for cryptcompress file plugin which manages -+ * regular files built of compressed and(or) encrypted bodies. -+ * See http://dev.namesys.com/CryptcompressPlugin for details. -+ */ -+ -+#include "../../inode.h" -+#include "../cluster.h" -+#include "../object.h" -+#include "../../tree_walk.h" -+#include "cryptcompress.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ Managing primary and secondary caches by Reiser4 -+ cryptcompress file plugin. Synchronization scheme. 
-+ -+ -+ +------------------+ -+ +------------------->| tfm stream | -+ | | (compressed data)| -+ flush | +------------------+ -+ +-----------------+ | -+ |(->)longterm lock| V -+--+ writepages() | | +-***-+ reiser4 +---+ -+ | | +--+ | *** | storage tree | | -+ | | | +-***-+ (primary cache)| | -+u | write() (secondary| cache) V / | \ | | -+s | ----> +----+ +----+ +----+ +----+ +-***** ******* **----+ ----> | d | -+e | | | |page cluster | | | **disk cluster** | | i | -+r | <---- +----+ +----+ +----+ +----+ +-***** **********----+ <---- | s | -+ | read() ^ ^ | | k | -+ | | (->)longterm lock| | page_io()| | -+ | | +------+ | | -+--+ readpages() | | +---+ -+ | V -+ | +------------------+ -+ +--------------------| tfm stream | -+ | (plain text) | -+ +------------------+ -+*/ -+ -+/* get cryptcompress specific portion of inode */ -+struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode) -+{ -+ return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info; -+} -+ -+/* plugin->u.file.init_inode_data */ -+void init_inode_data_cryptcompress(struct inode *inode, -+ reiser4_object_create_data * crd, -+ int create) -+{ -+ struct cryptcompress_info *data; -+ -+ data = cryptcompress_inode_data(inode); -+ assert("edward-685", data != NULL); -+ -+ memset(data, 0, sizeof(*data)); -+ -+ mutex_init(&data->checkin_mutex); -+ data->trunc_index = ULONG_MAX; -+ turn_on_compression(data); -+ set_lattice_factor(data, MIN_LATTICE_FACTOR); -+ init_inode_ordering(inode, crd, create); -+} -+ -+/* The following is a part of reiser4 cipher key manager -+ which is called when opening/creating a cryptcompress file */ -+ -+/* get/set cipher key info */ -+struct reiser4_crypto_info * inode_crypto_info (struct inode * inode) -+{ -+ assert("edward-90", inode != NULL); -+ assert("edward-91", reiser4_inode_data(inode) != NULL); -+ return cryptcompress_inode_data(inode)->crypt; -+} -+ -+static void set_inode_crypto_info (struct inode * inode, -+ struct reiser4_crypto_info * 
info) -+{ -+ cryptcompress_inode_data(inode)->crypt = info; -+} -+ -+/* allocate a cipher key info */ -+struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode) -+{ -+ struct reiser4_crypto_info *info; -+ int fipsize; -+ -+ info = kzalloc(sizeof(*info), reiser4_ctx_gfp_mask_get()); -+ if (!info) -+ return ERR_PTR(-ENOMEM); -+ -+ fipsize = inode_digest_plugin(inode)->fipsize; -+ info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get()); -+ if (!info->keyid) { -+ kfree(info); -+ return ERR_PTR(-ENOMEM); -+ } -+ info->host = inode; -+ return info; -+} -+ -+#if 0 -+/* allocate/free low-level info for cipher and digest -+ transforms */ -+static int alloc_crypto_tfms(struct reiser4_crypto_info * info) -+{ -+ struct crypto_blkcipher * ctfm = NULL; -+ struct crypto_hash * dtfm = NULL; -+ cipher_plugin * cplug = inode_cipher_plugin(info->host); -+ digest_plugin * dplug = inode_digest_plugin(info->host); -+ -+ if (cplug->alloc) { -+ ctfm = cplug->alloc(); -+ if (IS_ERR(ctfm)) { -+ warning("edward-1364", -+ "Can not allocate info for %s\n", -+ cplug->h.desc); -+ return RETERR(PTR_ERR(ctfm)); -+ } -+ } -+ info_set_cipher(info, ctfm); -+ if (dplug->alloc) { -+ dtfm = dplug->alloc(); -+ if (IS_ERR(dtfm)) { -+ warning("edward-1365", -+ "Can not allocate info for %s\n", -+ dplug->h.desc); -+ goto unhappy_with_digest; -+ } -+ } -+ info_set_digest(info, dtfm); -+ return 0; -+ unhappy_with_digest: -+ if (cplug->free) { -+ cplug->free(ctfm); -+ info_set_cipher(info, NULL); -+ } -+ return RETERR(PTR_ERR(dtfm)); -+} -+#endif -+ -+static void -+free_crypto_tfms(struct reiser4_crypto_info * info) -+{ -+ assert("edward-1366", info != NULL); -+ if (!info_get_cipher(info)) { -+ assert("edward-1601", !info_get_digest(info)); -+ return; -+ } -+ inode_cipher_plugin(info->host)->free(info_get_cipher(info)); -+ info_set_cipher(info, NULL); -+ inode_digest_plugin(info->host)->free(info_get_digest(info)); -+ info_set_digest(info, NULL); -+ return; -+} -+ -+#if 0 -+/* create a 
key fingerprint for disk stat-data */ -+static int create_keyid (struct reiser4_crypto_info * info, -+ struct reiser4_crypto_data * data) -+{ -+ int ret = -ENOMEM; -+ size_t blk, pad; -+ __u8 * dmem; -+ __u8 * cmem; -+ struct hash_desc ddesc; -+ struct blkcipher_desc cdesc; -+ struct scatterlist sg; -+ -+ assert("edward-1367", info != NULL); -+ assert("edward-1368", info->keyid != NULL); -+ -+ ddesc.tfm = info_get_digest(info); -+ ddesc.flags = 0; -+ cdesc.tfm = info_get_cipher(info); -+ cdesc.flags = 0; -+ -+ dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm), -+ reiser4_ctx_gfp_mask_get()); -+ if (!dmem) -+ goto exit1; -+ -+ blk = crypto_blkcipher_blocksize(cdesc.tfm); -+ -+ pad = data->keyid_size % blk; -+ pad = (pad ? blk - pad : 0); -+ -+ cmem = kmalloc((size_t)data->keyid_size + pad, -+ reiser4_ctx_gfp_mask_get()); -+ if (!cmem) -+ goto exit2; -+ memcpy(cmem, data->keyid, data->keyid_size); -+ memset(cmem + data->keyid_size, 0, pad); -+ -+ sg_init_one(&sg, cmem, data->keyid_size + pad); -+ -+ ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg, -+ data->keyid_size + pad); -+ if (ret) { -+ warning("edward-1369", -+ "encryption failed flags=%x\n", cdesc.flags); -+ goto exit3; -+ } -+ ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem); -+ if (ret) { -+ warning("edward-1602", -+ "digest failed flags=%x\n", ddesc.flags); -+ goto exit3; -+ } -+ memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize); -+ exit3: -+ kfree(cmem); -+ exit2: -+ kfree(dmem); -+ exit1: -+ return ret; -+} -+#endif -+ -+static void destroy_keyid(struct reiser4_crypto_info * info) -+{ -+ assert("edward-1370", info != NULL); -+ assert("edward-1371", info->keyid != NULL); -+ kfree(info->keyid); -+ return; -+} -+ -+static void __free_crypto_info (struct inode * inode) -+{ -+ struct reiser4_crypto_info * info = inode_crypto_info(inode); -+ assert("edward-1372", info != NULL); -+ -+ free_crypto_tfms(info); -+ destroy_keyid(info); -+ kfree(info); -+} -+ -+#if 0 -+static void 
instantiate_crypto_info(struct reiser4_crypto_info * info) -+{ -+ assert("edward-1373", info != NULL); -+ assert("edward-1374", info->inst == 0); -+ info->inst = 1; -+} -+#endif -+ -+static void uninstantiate_crypto_info(struct reiser4_crypto_info * info) -+{ -+ assert("edward-1375", info != NULL); -+ info->inst = 0; -+} -+ -+#if 0 -+static int is_crypto_info_instantiated(struct reiser4_crypto_info * info) -+{ -+ return info->inst; -+} -+ -+static int inode_has_cipher_key(struct inode * inode) -+{ -+ assert("edward-1376", inode != NULL); -+ return inode_crypto_info(inode) && -+ is_crypto_info_instantiated(inode_crypto_info(inode)); -+} -+#endif -+ -+static void free_crypto_info (struct inode * inode) -+{ -+ uninstantiate_crypto_info(inode_crypto_info(inode)); -+ __free_crypto_info(inode); -+} -+ -+static int need_cipher(struct inode * inode) -+{ -+ return inode_cipher_plugin(inode) != -+ cipher_plugin_by_id(NONE_CIPHER_ID); -+} -+ -+/* Parse @data which contains a (uninstantiated) cipher key imported -+ from user space, create a low-level cipher info and attach it to -+ the @object. 
If success, then info contains an instantiated key */ -+#if 0 -+struct reiser4_crypto_info * create_crypto_info(struct inode * object, -+ struct reiser4_crypto_data * data) -+{ -+ int ret; -+ struct reiser4_crypto_info * info; -+ -+ assert("edward-1377", data != NULL); -+ assert("edward-1378", need_cipher(object)); -+ -+ if (inode_file_plugin(object) != -+ file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID)) -+ return ERR_PTR(-EINVAL); -+ -+ info = reiser4_alloc_crypto_info(object); -+ if (IS_ERR(info)) -+ return info; -+ ret = alloc_crypto_tfms(info); -+ if (ret) -+ goto err; -+ /* instantiating a key */ -+ ret = crypto_blkcipher_setkey(info_get_cipher(info), -+ data->key, -+ data->keysize); -+ if (ret) { -+ warning("edward-1379", -+ "setkey failed flags=%x", -+ crypto_blkcipher_get_flags(info_get_cipher(info))); -+ goto err; -+ } -+ info->keysize = data->keysize; -+ ret = create_keyid(info, data); -+ if (ret) -+ goto err; -+ instantiate_crypto_info(info); -+ return info; -+ err: -+ __free_crypto_info(object); -+ return ERR_PTR(ret); -+} -+#endif -+ -+/* increment/decrement a load counter when -+ attaching/detaching the crypto-stat to any object */ -+static void load_crypto_info(struct reiser4_crypto_info * info) -+{ -+ assert("edward-1380", info != NULL); -+ inc_keyload_count(info); -+} -+ -+static void unload_crypto_info(struct inode * inode) -+{ -+ struct reiser4_crypto_info * info = inode_crypto_info(inode); -+ assert("edward-1381", info->keyload_count > 0); -+ -+ dec_keyload_count(inode_crypto_info(inode)); -+ if (info->keyload_count == 0) -+ /* final release */ -+ free_crypto_info(inode); -+} -+ -+/* attach/detach an existing crypto-stat */ -+void reiser4_attach_crypto_info(struct inode * inode, -+ struct reiser4_crypto_info * info) -+{ -+ assert("edward-1382", inode != NULL); -+ assert("edward-1383", info != NULL); -+ assert("edward-1384", inode_crypto_info(inode) == NULL); -+ -+ set_inode_crypto_info(inode, info); -+ load_crypto_info(info); -+} -+ -+/* returns 
true, if crypto stat can be attached to the @host */ -+#if REISER4_DEBUG -+static int host_allows_crypto_info(struct inode * host) -+{ -+ int ret; -+ file_plugin * fplug = inode_file_plugin(host); -+ -+ switch (fplug->h.id) { -+ case CRYPTCOMPRESS_FILE_PLUGIN_ID: -+ ret = 1; -+ break; -+ default: -+ ret = 0; -+ } -+ return ret; -+} -+#endif /* REISER4_DEBUG */ -+ -+static void reiser4_detach_crypto_info(struct inode * inode) -+{ -+ assert("edward-1385", inode != NULL); -+ assert("edward-1386", host_allows_crypto_info(inode)); -+ -+ if (inode_crypto_info(inode)) -+ unload_crypto_info(inode); -+ set_inode_crypto_info(inode, NULL); -+} -+ -+#if 0 -+ -+/* compare fingerprints of @child and @parent */ -+static int keyid_eq(struct reiser4_crypto_info * child, -+ struct reiser4_crypto_info * parent) -+{ -+ return !memcmp(child->keyid, -+ parent->keyid, -+ info_digest_plugin(parent)->fipsize); -+} -+ -+/* check if a crypto-stat (which is bound to @parent) can be inherited */ -+int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent) -+{ -+ if (!need_cipher(child)) -+ return 0; -+ /* the child is created */ -+ if (!inode_crypto_info(child)) -+ return 1; -+ /* the child is looked up */ -+ if (!inode_crypto_info(parent)) -+ return 0; -+ return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) && -+ inode_digest_plugin(child) == inode_digest_plugin(parent) && -+ inode_crypto_info(child)->keysize == -+ inode_crypto_info(parent)->keysize && -+ keyid_eq(inode_crypto_info(child), inode_crypto_info(parent))); -+} -+#endif -+ -+/* helper functions for ->create() method of the cryptcompress plugin */ -+static int inode_set_crypto(struct inode * object) -+{ -+ reiser4_inode * info; -+ if (!inode_crypto_info(object)) { -+ if (need_cipher(object)) -+ return RETERR(-EINVAL); -+ /* the file is not to be encrypted */ -+ return 0; -+ } -+ info = reiser4_inode_data(object); -+ info->extmask |= (1 << CRYPTO_STAT); -+ return 0; -+} -+ -+static int 
inode_init_compression(struct inode * object) -+{ -+ int result = 0; -+ assert("edward-1461", object != NULL); -+ if (inode_compression_plugin(object)->init) -+ result = inode_compression_plugin(object)->init(); -+ return result; -+} -+ -+static int inode_check_cluster(struct inode * object) -+{ -+ assert("edward-696", object != NULL); -+ -+ if (unlikely(inode_cluster_size(object) < PAGE_CACHE_SIZE)) { -+ warning("edward-1320", "Can not support '%s' " -+ "logical clusters (less then page size)", -+ inode_cluster_plugin(object)->h.label); -+ return RETERR(-EINVAL); -+ } -+ if (unlikely(inode_cluster_shift(object)) >= BITS_PER_BYTE*sizeof(int)){ -+ warning("edward-1463", "Can not support '%s' " -+ "logical clusters (too big for transform)", -+ inode_cluster_plugin(object)->h.label); -+ return RETERR(-EINVAL); -+ } -+ return 0; -+} -+ -+/* plugin->destroy_inode() */ -+void destroy_inode_cryptcompress(struct inode * inode) -+{ -+ assert("edward-1464", INODE_PGCOUNT(inode) == 0); -+ reiser4_detach_crypto_info(inode); -+ return; -+} -+ -+/* plugin->create_object(): -+. install plugins -+. attach crypto info if specified -+. attach compression info if specified -+. 
attach cluster info -+*/ -+int create_object_cryptcompress(struct inode *object, struct inode *parent, -+ reiser4_object_create_data * data) -+{ -+ int result; -+ reiser4_inode *info; -+ -+ assert("edward-23", object != NULL); -+ assert("edward-24", parent != NULL); -+ assert("edward-30", data != NULL); -+ assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD)); -+ assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID); -+ -+ info = reiser4_inode_data(object); -+ -+ assert("edward-29", info != NULL); -+ -+ /* set file bit */ -+ info->plugin_mask |= (1 << PSET_FILE); -+ -+ /* set crypto */ -+ result = inode_set_crypto(object); -+ if (result) -+ goto error; -+ /* set compression */ -+ result = inode_init_compression(object); -+ if (result) -+ goto error; -+ /* set cluster */ -+ result = inode_check_cluster(object); -+ if (result) -+ goto error; -+ -+ /* save everything in disk stat-data */ -+ result = write_sd_by_inode_common(object); -+ if (!result) -+ return 0; -+ error: -+ reiser4_detach_crypto_info(object); -+ return result; -+} -+ -+/* plugin->open() */ -+int open_cryptcompress(struct inode * inode, struct file * file) -+{ -+ return 0; -+} -+ -+/* returns a blocksize, the attribute of a cipher algorithm */ -+static unsigned int -+cipher_blocksize(struct inode * inode) -+{ -+ assert("edward-758", need_cipher(inode)); -+ assert("edward-1400", inode_crypto_info(inode) != NULL); -+ return crypto_blkcipher_blocksize -+ (info_get_cipher(inode_crypto_info(inode))); -+} -+ -+/* returns offset translated by scale factor of the crypto-algorithm */ -+static loff_t inode_scaled_offset (struct inode * inode, -+ const loff_t src_off /* input offset */) -+{ -+ assert("edward-97", inode != NULL); -+ -+ if (!need_cipher(inode) || -+ src_off == get_key_offset(reiser4_min_key()) || -+ src_off == get_key_offset(reiser4_max_key())) -+ return src_off; -+ -+ return inode_cipher_plugin(inode)->scale(inode, -+ cipher_blocksize(inode), -+ src_off); -+} -+ -+/* returns 
disk cluster size */ -+size_t inode_scaled_cluster_size(struct inode * inode) -+{ -+ assert("edward-110", inode != NULL); -+ -+ return inode_scaled_offset(inode, inode_cluster_size(inode)); -+} -+ -+/* set number of cluster pages */ -+static void set_cluster_nrpages(struct cluster_handle * clust, -+ struct inode *inode) -+{ -+ struct reiser4_slide * win; -+ -+ assert("edward-180", clust != NULL); -+ assert("edward-1040", inode != NULL); -+ -+ clust->old_nrpages = size_in_pages(lbytes(clust->index, inode)); -+ win = clust->win; -+ if (!win) { -+ clust->nr_pages = size_in_pages(lbytes(clust->index, inode)); -+ return; -+ } -+ assert("edward-1176", clust->op != LC_INVAL); -+ assert("edward-1064", win->off + win->count + win->delta != 0); -+ -+ if (win->stat == HOLE_WINDOW && -+ win->off == 0 && win->count == inode_cluster_size(inode)) { -+ /* special case: writing a "fake" logical cluster */ -+ clust->nr_pages = 0; -+ return; -+ } -+ clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta, -+ lbytes(clust->index, inode))); -+ return; -+} -+ -+/* plugin->key_by_inode() -+ build key of a disk cluster */ -+int key_by_inode_cryptcompress(struct inode *inode, loff_t off, -+ reiser4_key * key) -+{ -+ assert("edward-64", inode != 0); -+ -+ if (likely(off != get_key_offset(reiser4_max_key()))) -+ off = off_to_clust_to_off(off, inode); -+ if (inode_crypto_info(inode)) -+ off = inode_scaled_offset(inode, off); -+ -+ key_by_inode_and_offset_common(inode, 0, key); -+ set_key_offset(key, (__u64)off); -+ return 0; -+} -+ -+/* plugin->flow_by_inode() */ -+/* flow is used to read/write disk clusters */ -+int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf, -+ int user, /* 1: @buf is of user space, -+ 0: kernel space */ -+ loff_t size, /* @buf size */ -+ loff_t off, /* offset to start io from */ -+ rw_op op, /* READ or WRITE */ -+ flow_t * f /* resulting flow */) -+{ -+ assert("edward-436", f != NULL); -+ assert("edward-149", inode != NULL); -+ 
assert("edward-150", inode_file_plugin(inode) != NULL); -+ assert("edward-1465", user == 0); /* we use flow to read/write -+ disk clusters located in -+ kernel space */ -+ f->length = size; -+ memcpy(&f->data, &buf, sizeof(buf)); -+ f->user = user; -+ f->op = op; -+ -+ return key_by_inode_cryptcompress(inode, off, &f->key); -+} -+ -+static int -+cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key, -+ znode_lock_mode lock_mode) -+{ -+ coord_t *coord; -+ -+ assert("edward-704", hint != NULL); -+ assert("edward-1089", !hint_is_valid(hint)); -+ assert("edward-706", hint->lh.owner == NULL); -+ -+ coord = &hint->ext_coord.coord; -+ -+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode) -+ /* hint either not set or set by different operation */ -+ return RETERR(-E_REPEAT); -+ -+ if (get_key_offset(key) != hint->offset) -+ /* hint is set for different key */ -+ return RETERR(-E_REPEAT); -+ -+ assert("edward-707", reiser4_schedulable()); -+ -+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, -+ key, &hint->lh, lock_mode, -+ ZNODE_LOCK_LOPRI); -+} -+ -+/* reserve disk space when writing a logical cluster */ -+static int reserve4cluster(struct inode *inode, struct cluster_handle *clust) -+{ -+ int result = 0; -+ -+ assert("edward-965", reiser4_schedulable()); -+ assert("edward-439", inode != NULL); -+ assert("edward-440", clust != NULL); -+ assert("edward-441", clust->pages != NULL); -+ -+ if (clust->nr_pages == 0) { -+ assert("edward-1152", clust->win != NULL); -+ assert("edward-1153", clust->win->stat == HOLE_WINDOW); -+ /* don't reserve disk space for fake logical cluster */ -+ return 0; -+ } -+ assert("edward-442", jprivate(clust->pages[0]) != NULL); -+ -+ result = reiser4_grab_space_force(estimate_insert_cluster(inode) + -+ estimate_update_cluster(inode), -+ BA_CAN_COMMIT); -+ if (result) -+ return result; -+ clust->reserved = 1; -+ grabbed2cluster_reserved(estimate_insert_cluster(inode) + -+ estimate_update_cluster(inode)); -+#if 
REISER4_DEBUG -+ clust->reserved_prepped = estimate_update_cluster(inode); -+ clust->reserved_unprepped = estimate_insert_cluster(inode); -+#endif -+ /* there can be space grabbed by txnmgr_force_commit_all */ -+ return 0; -+} -+ -+/* free reserved disk space if writing a logical cluster fails */ -+static void free_reserved4cluster(struct inode *inode, -+ struct cluster_handle *ch, int count) -+{ -+ assert("edward-967", ch->reserved == 1); -+ -+ cluster_reserved2free(count); -+ ch->reserved = 0; -+} -+ -+/* The core search procedure of the cryptcompress plugin. -+ If returned value is not cbk_errored, then current znode is locked */ -+static int find_cluster_item(hint_t * hint, -+ const reiser4_key * key, /* key of the item we are -+ looking for */ -+ znode_lock_mode lock_mode /* which lock */ , -+ ra_info_t * ra_info, lookup_bias bias, __u32 flags) -+{ -+ int result; -+ reiser4_key ikey; -+ int went_right = 0; -+ coord_t *coord = &hint->ext_coord.coord; -+ coord_t orig = *coord; -+ -+ assert("edward-152", hint != NULL); -+ -+ if (!hint_is_valid(hint)) { -+ result = cryptcompress_hint_validate(hint, key, lock_mode); -+ if (result == -E_REPEAT) -+ goto traverse_tree; -+ else if (result) { -+ assert("edward-1216", 0); -+ return result; -+ } -+ hint_set_valid(hint); -+ } -+ assert("edward-709", znode_is_any_locked(coord->node)); -+ -+ /* In-place lookup is going here, it means we just need to -+ check if next item of the @coord match to the @keyhint) */ -+ -+ if (equal_to_rdk(coord->node, key)) { -+ result = goto_right_neighbor(coord, &hint->lh); -+ if (result == -E_NO_NEIGHBOR) { -+ assert("edward-1217", 0); -+ return RETERR(-EIO); -+ } -+ if (result) -+ return result; -+ assert("edward-1218", equal_to_ldk(coord->node, key)); -+ went_right = 1; -+ } else { -+ coord->item_pos++; -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ } -+ result = zload(coord->node); -+ if (result) -+ return result; -+ assert("edward-1219", !node_is_empty(coord->node)); -+ -+ if 
(!coord_is_existing_item(coord)) { -+ zrelse(coord->node); -+ goto not_found; -+ } -+ item_key_by_coord(coord, &ikey); -+ zrelse(coord->node); -+ if (!keyeq(key, &ikey)) -+ goto not_found; -+ /* Ok, item is found, update node counts */ -+ if (went_right) -+ dclust_inc_extension_ncount(hint); -+ return CBK_COORD_FOUND; -+ -+ not_found: -+ assert("edward-1220", coord->item_pos > 0); -+ //coord->item_pos--; -+ /* roll back */ -+ *coord = orig; -+ ON_DEBUG(coord_update_v(coord)); -+ return CBK_COORD_NOTFOUND; -+ -+ traverse_tree: -+ assert("edward-713", hint->lh.owner == NULL); -+ assert("edward-714", reiser4_schedulable()); -+ -+ reiser4_unset_hint(hint); -+ dclust_init_extension(hint); -+ coord_init_zero(coord); -+ result = coord_by_key(current_tree, key, coord, &hint->lh, -+ lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL, -+ CBK_UNIQUE | flags, ra_info); -+ if (cbk_errored(result)) -+ return result; -+ if(result == CBK_COORD_FOUND) -+ dclust_inc_extension_ncount(hint); -+ hint_set_valid(hint); -+ return result; -+} -+ -+/* This function is called by deflate[inflate] manager when -+ creating a transformed/plain stream to check if we should -+ create/cut some overhead. If this returns true, then @oh -+ contains the size of this overhead. 
-+ */ -+static int need_cut_or_align(struct inode * inode, -+ struct cluster_handle * ch, rw_op rw, int * oh) -+{ -+ struct tfm_cluster * tc = &ch->tc; -+ switch (rw) { -+ case WRITE_OP: /* estimate align */ -+ *oh = tc->len % cipher_blocksize(inode); -+ if (*oh != 0) -+ return 1; -+ break; -+ case READ_OP: /* estimate cut */ -+ *oh = *(tfm_output_data(ch) + tc->len - 1); -+ break; -+ default: -+ impossible("edward-1401", "bad option"); -+ } -+ return (tc->len != tc->lsize); -+} -+ -+/* create/cut an overhead of transformed/plain stream */ -+static void align_or_cut_overhead(struct inode * inode, -+ struct cluster_handle * ch, rw_op rw) -+{ -+ int oh; -+ cipher_plugin * cplug = inode_cipher_plugin(inode); -+ -+ assert("edward-1402", need_cipher(inode)); -+ -+ if (!need_cut_or_align(inode, ch, rw, &oh)) -+ return; -+ switch (rw) { -+ case WRITE_OP: /* do align */ -+ ch->tc.len += -+ cplug->align_stream(tfm_input_data(ch) + -+ ch->tc.len, ch->tc.len, -+ cipher_blocksize(inode)); -+ *(tfm_input_data(ch) + ch->tc.len - 1) = -+ cipher_blocksize(inode) - oh; -+ break; -+ case READ_OP: /* do cut */ -+ assert("edward-1403", oh <= cipher_blocksize(inode)); -+ ch->tc.len -= oh; -+ break; -+ default: -+ impossible("edward-1404", "bad option"); -+ } -+ return; -+} -+ -+static unsigned max_cipher_overhead(struct inode * inode) -+{ -+ if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream) -+ return 0; -+ return cipher_blocksize(inode); -+} -+ -+static int deflate_overhead(struct inode *inode) -+{ -+ return (inode_compression_plugin(inode)-> -+ checksum ? DC_CHECKSUM_SIZE : 0); -+} -+ -+static unsigned deflate_overrun(struct inode * inode, int ilen) -+{ -+ return coa_overrun(inode_compression_plugin(inode), ilen); -+} -+ -+/* Estimating compressibility of a logical cluster by various -+ policies represented by compression mode plugin. -+ If this returns false, then compressor won't be called for -+ the cluster of index @index. 
-+*/ -+static int should_compress(struct tfm_cluster * tc, cloff_t index, -+ struct inode *inode) -+{ -+ compression_plugin *cplug = inode_compression_plugin(inode); -+ compression_mode_plugin *mplug = inode_compression_mode_plugin(inode); -+ -+ assert("edward-1321", tc->len != 0); -+ assert("edward-1322", cplug != NULL); -+ assert("edward-1323", mplug != NULL); -+ -+ return /* estimate by size */ -+ (cplug->min_size_deflate ? -+ tc->len >= cplug->min_size_deflate() : -+ 1) && -+ /* estimate by compression mode plugin */ -+ (mplug->should_deflate ? -+ mplug->should_deflate(inode, index) : -+ 1); -+} -+ -+/* Evaluating results of compression transform. -+ Returns true, if we need to accept this results */ -+static int save_compressed(int size_before, int size_after, struct inode *inode) -+{ -+ return (size_after + deflate_overhead(inode) + -+ max_cipher_overhead(inode) < size_before); -+} -+ -+/* Guess result of the evaluation above */ -+static int need_inflate(struct cluster_handle * ch, struct inode * inode, -+ int encrypted /* is cluster encrypted */ ) -+{ -+ struct tfm_cluster * tc = &ch->tc; -+ -+ assert("edward-142", tc != 0); -+ assert("edward-143", inode != NULL); -+ -+ return tc->len < -+ (encrypted ? -+ inode_scaled_offset(inode, tc->lsize) : -+ tc->lsize); -+} -+ -+/* If results of compression were accepted, then we add -+ a checksum to catch possible disk cluster corruption. -+ The following is a format of the data stored in disk clusters: -+ -+ data This is (transformed) logical cluster. -+ cipher_overhead This is created by ->align() method -+ of cipher plugin. May be absent. -+ checksum (4) This is created by ->checksum method -+ of compression plugin to check -+ integrity. May be absent. 
-+ -+ Crypto overhead format: -+ -+ data -+ control_byte (1) contains aligned overhead size: -+ 1 <= overhead <= cipher_blksize -+*/ -+/* Append a checksum at the end of a transformed stream */ -+static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc) -+{ -+ __u32 checksum; -+ -+ assert("edward-1309", tc != NULL); -+ assert("edward-1310", tc->len > 0); -+ assert("edward-1311", cplug->checksum != NULL); -+ -+ checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len); -+ put_unaligned(cpu_to_le32(checksum), -+ (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len)); -+ tc->len += (int)DC_CHECKSUM_SIZE; -+} -+ -+/* Check a disk cluster checksum. -+ Returns 0 if checksum is correct, otherwise returns 1 */ -+static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc) -+{ -+ assert("edward-1312", tc != NULL); -+ assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE); -+ assert("edward-1314", cplug->checksum != NULL); -+ -+ if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM), -+ tc->len - (int)DC_CHECKSUM_SIZE) != -+ le32_to_cpu(get_unaligned((d32 *) -+ (tfm_stream_data(tc, INPUT_STREAM) -+ + tc->len - (int)DC_CHECKSUM_SIZE)))) { -+ warning("edward-156", -+ "Bad disk cluster checksum %d, (should be %d) Fsck?\n", -+ (int)le32_to_cpu -+ (get_unaligned((d32 *) -+ (tfm_stream_data(tc, INPUT_STREAM) + -+ tc->len - (int)DC_CHECKSUM_SIZE))), -+ (int)cplug->checksum -+ (tfm_stream_data(tc, INPUT_STREAM), -+ tc->len - (int)DC_CHECKSUM_SIZE)); -+ return 1; -+ } -+ tc->len -= (int)DC_CHECKSUM_SIZE; -+ return 0; -+} -+ -+/* get input/output stream for some transform action */ -+int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc, -+ tfm_stream_id id) -+{ -+ size_t size = inode_scaled_cluster_size(inode); -+ -+ assert("edward-901", tc != NULL); -+ assert("edward-1027", inode_compression_plugin(inode) != NULL); -+ -+ if (cluster_get_tfm_act(tc) == TFMA_WRITE) -+ size += deflate_overrun(inode, 
inode_cluster_size(inode)); -+ -+ if (!get_tfm_stream(tc, id) && id == INPUT_STREAM) -+ alternate_streams(tc); -+ if (!get_tfm_stream(tc, id)) -+ return alloc_tfm_stream(tc, size, id); -+ -+ assert("edward-902", tfm_stream_is_set(tc, id)); -+ -+ if (tfm_stream_size(tc, id) < size) -+ return realloc_tfm_stream(tc, size, id); -+ return 0; -+} -+ -+/* Common deflate manager */ -+int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode) -+{ -+ int result = 0; -+ int compressed = 0; -+ int encrypted = 0; -+ struct tfm_cluster * tc = &clust->tc; -+ compression_plugin * coplug; -+ -+ assert("edward-401", inode != NULL); -+ assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM)); -+ assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE); -+ assert("edward-498", !tfm_cluster_is_uptodate(tc)); -+ -+ coplug = inode_compression_plugin(inode); -+ if (should_compress(tc, clust->index, inode)) { -+ /* try to compress, discard bad results */ -+ __u32 dst_len; -+ compression_mode_plugin * mplug = -+ inode_compression_mode_plugin(inode); -+ assert("edward-602", coplug != NULL); -+ assert("edward-1423", coplug->compress != NULL); -+ -+ result = grab_coa(tc, coplug); -+ if (result) { -+ warning("edward-1424", -+ "alloc_coa failed with ret=%d, skipped compression", -+ result); -+ goto cipher; -+ } -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) { -+ warning("edward-1425", -+ "alloc stream failed with ret=%d, skipped compression", -+ result); -+ goto cipher; -+ } -+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM); -+ coplug->compress(get_coa(tc, coplug->h.id, tc->act), -+ tfm_input_data(clust), tc->len, -+ tfm_output_data(clust), &dst_len); -+ /* make sure we didn't overwrite extra bytes */ -+ assert("edward-603", -+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM)); -+ -+ /* evaluate results of compression transform */ -+ if (save_compressed(tc->len, dst_len, inode)) { -+ /* good result, accept */ -+ tc->len = dst_len; -+ if 
(mplug->accept_hook != NULL) { -+ result = mplug->accept_hook(inode, clust->index); -+ if (result) -+ warning("edward-1426", -+ "accept_hook failed with ret=%d", -+ result); -+ } -+ compressed = 1; -+ } -+ else { -+ /* bad result, discard */ -+#if 0 -+ if (cluster_is_complete(clust, inode)) -+ warning("edward-1496", -+ "incompressible cluster %lu (inode %llu)", -+ clust->index, -+ (unsigned long long)get_inode_oid(inode)); -+#endif -+ if (mplug->discard_hook != NULL && -+ cluster_is_complete(clust, inode)) { -+ result = mplug->discard_hook(inode, -+ clust->index); -+ if (result) -+ warning("edward-1427", -+ "discard_hook failed with ret=%d", -+ result); -+ } -+ } -+ } -+ cipher: -+ if (need_cipher(inode)) { -+ cipher_plugin * ciplug; -+ struct blkcipher_desc desc; -+ struct scatterlist src; -+ struct scatterlist dst; -+ -+ ciplug = inode_cipher_plugin(inode); -+ desc.tfm = info_get_cipher(inode_crypto_info(inode)); -+ desc.flags = 0; -+ if (compressed) -+ alternate_streams(tc); -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) -+ return result; -+ -+ align_or_cut_overhead(inode, clust, WRITE_OP); -+ sg_init_one(&src, tfm_input_data(clust), tc->len); -+ sg_init_one(&dst, tfm_output_data(clust), tc->len); -+ -+ result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len); -+ if (result) { -+ warning("edward-1405", -+ "encryption failed flags=%x\n", desc.flags); -+ return result; -+ } -+ encrypted = 1; -+ } -+ if (compressed && coplug->checksum != NULL) -+ dc_set_checksum(coplug, tc); -+ if (!compressed && !encrypted) -+ alternate_streams(tc); -+ return result; -+} -+ -+/* Common inflate manager. 
*/ -+int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode) -+{ -+ int result = 0; -+ int transformed = 0; -+ struct tfm_cluster * tc = &clust->tc; -+ compression_plugin * coplug; -+ -+ assert("edward-905", inode != NULL); -+ assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER); -+ assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM)); -+ assert("edward-1349", tc->act == TFMA_READ); -+ assert("edward-907", !tfm_cluster_is_uptodate(tc)); -+ -+ /* Handle a checksum (if any) */ -+ coplug = inode_compression_plugin(inode); -+ if (need_inflate(clust, inode, need_cipher(inode)) && -+ coplug->checksum != NULL) { -+ result = dc_check_checksum(coplug, tc); -+ if (unlikely(result)) { -+ warning("edward-1460", -+ "Inode %llu: disk cluster %lu looks corrupted", -+ (unsigned long long)get_inode_oid(inode), -+ clust->index); -+ return RETERR(-EIO); -+ } -+ } -+ if (need_cipher(inode)) { -+ cipher_plugin * ciplug; -+ struct blkcipher_desc desc; -+ struct scatterlist src; -+ struct scatterlist dst; -+ -+ ciplug = inode_cipher_plugin(inode); -+ desc.tfm = info_get_cipher(inode_crypto_info(inode)); -+ desc.flags = 0; -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) -+ return result; -+ assert("edward-909", tfm_cluster_is_set(tc)); -+ -+ sg_init_one(&src, tfm_input_data(clust), tc->len); -+ sg_init_one(&dst, tfm_output_data(clust), tc->len); -+ -+ result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len); -+ if (result) { -+ warning("edward-1600", "decrypt failed flags=%x\n", -+ desc.flags); -+ return result; -+ } -+ align_or_cut_overhead(inode, clust, READ_OP); -+ transformed = 1; -+ } -+ if (need_inflate(clust, inode, 0)) { -+ unsigned dst_len = inode_cluster_size(inode); -+ if(transformed) -+ alternate_streams(tc); -+ -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) -+ return result; -+ assert("edward-1305", coplug->decompress != NULL); -+ assert("edward-910", tfm_cluster_is_set(tc)); -+ -+ 
coplug->decompress(get_coa(tc, coplug->h.id, tc->act), -+ tfm_input_data(clust), tc->len, -+ tfm_output_data(clust), &dst_len); -+ /* check length */ -+ tc->len = dst_len; -+ assert("edward-157", dst_len == tc->lsize); -+ transformed = 1; -+ } -+ if (!transformed) -+ alternate_streams(tc); -+ return result; -+} -+ -+/* This is implementation of readpage method of struct -+ address_space_operations for cryptcompress plugin. */ -+int readpage_cryptcompress(struct file *file, struct page *page) -+{ -+ reiser4_context *ctx; -+ struct cluster_handle clust; -+ item_plugin *iplug; -+ int result; -+ -+ assert("edward-88", PageLocked(page)); -+ assert("vs-976", !PageUptodate(page)); -+ assert("edward-89", page->mapping && page->mapping->host); -+ -+ ctx = reiser4_init_context(page->mapping->host->i_sb); -+ if (IS_ERR(ctx)) { -+ unlock_page(page); -+ return PTR_ERR(ctx); -+ } -+ assert("edward-113", -+ ergo(file != NULL, -+ page->mapping == file->f_dentry->d_inode->i_mapping)); -+ -+ if (PageUptodate(page)) { -+ warning("edward-1338", "page is already uptodate\n"); -+ unlock_page(page); -+ reiser4_exit_context(ctx); -+ return 0; -+ } -+ cluster_init_read(&clust, NULL); -+ clust.file = file; -+ iplug = item_plugin_by_id(CTAIL_ID); -+ if (!iplug->s.file.readpage) { -+ unlock_page(page); -+ put_cluster_handle(&clust); -+ reiser4_exit_context(ctx); -+ return -EINVAL; -+ } -+ result = iplug->s.file.readpage(&clust, page); -+ -+ put_cluster_handle(&clust); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* number of pages to check in */ -+static int get_new_nrpages(struct cluster_handle * clust) -+{ -+ switch (clust->op) { -+ case LC_APPOV: -+ return clust->nr_pages; -+ case LC_TRUNC: -+ assert("edward-1179", clust->win != NULL); -+ return size_in_pages(clust->win->off + clust->win->count); -+ default: -+ impossible("edward-1180", "bad page cluster option"); -+ return 0; -+ } -+} -+ -+static void set_cluster_pages_dirty(struct cluster_handle * 
clust, -+ struct inode * inode) -+{ -+ int i; -+ struct page *pg; -+ int nrpages = get_new_nrpages(clust); -+ -+ for (i = 0; i < nrpages; i++) { -+ -+ pg = clust->pages[i]; -+ assert("edward-968", pg != NULL); -+ lock_page(pg); -+ assert("edward-1065", PageUptodate(pg)); -+ reiser4_set_page_dirty_internal(pg); -+ unlock_page(pg); -+ mark_page_accessed(pg); -+ } -+} -+ -+/* Grab a page cluster for read/write operations. -+ Attach a jnode for write operations (when preparing for modifications, which -+ are supposed to be committed). -+ -+ We allocate only one jnode per page cluster; this jnode is binded to the -+ first page of this cluster, so we have an extra-reference that will be put -+ as soon as jnode is evicted from memory), other references will be cleaned -+ up in flush time (assume that check in page cluster was successful). -+*/ -+int grab_page_cluster(struct inode * inode, -+ struct cluster_handle * clust, rw_op rw) -+{ -+ int i; -+ int result = 0; -+ jnode *node = NULL; -+ -+ assert("edward-182", clust != NULL); -+ assert("edward-183", clust->pages != NULL); -+ assert("edward-1466", clust->node == NULL); -+ assert("edward-1428", inode != NULL); -+ assert("edward-1429", inode->i_mapping != NULL); -+ assert("edward-184", clust->nr_pages <= cluster_nrpages(inode)); -+ -+ if (clust->nr_pages == 0) -+ return 0; -+ -+ for (i = 0; i < clust->nr_pages; i++) { -+ -+ assert("edward-1044", clust->pages[i] == NULL); -+ -+ clust->pages[i] = -+ find_or_create_page(inode->i_mapping, -+ clust_to_pg(clust->index, inode) + i, -+ reiser4_ctx_gfp_mask_get()); -+ if (!clust->pages[i]) { -+ result = RETERR(-ENOMEM); -+ break; -+ } -+ if (i == 0 && rw == WRITE_OP) { -+ node = jnode_of_page(clust->pages[i]); -+ if (IS_ERR(node)) { -+ result = PTR_ERR(node); -+ unlock_page(clust->pages[i]); -+ break; -+ } -+ JF_SET(node, JNODE_CLUSTER_PAGE); -+ assert("edward-920", jprivate(clust->pages[0])); -+ } -+ INODE_PGCOUNT_INC(inode); -+ unlock_page(clust->pages[i]); -+ } -+ if 
(unlikely(result)) { -+ while (i) { -+ put_cluster_page(clust->pages[--i]); -+ INODE_PGCOUNT_DEC(inode); -+ } -+ if (node && !IS_ERR(node)) -+ jput(node); -+ return result; -+ } -+ clust->node = node; -+ return 0; -+} -+ -+static void truncate_page_cluster_range(struct inode * inode, -+ struct page ** pages, -+ cloff_t index, -+ int from, int count, -+ int even_cows) -+{ -+ assert("edward-1467", count > 0); -+ reiser4_invalidate_pages(inode->i_mapping, -+ clust_to_pg(index, inode) + from, -+ count, even_cows); -+} -+ -+/* Put @count pages starting from @from offset */ -+void __put_page_cluster(int from, int count, -+ struct page ** pages, struct inode * inode) -+{ -+ int i; -+ assert("edward-1468", pages != NULL); -+ assert("edward-1469", inode != NULL); -+ assert("edward-1470", from >= 0 && count >= 0); -+ -+ for (i = 0; i < count; i++) { -+ assert("edward-1471", pages[from + i] != NULL); -+ assert("edward-1472", -+ pages[from + i]->index == pages[from]->index + i); -+ -+ put_cluster_page(pages[from + i]); -+ INODE_PGCOUNT_DEC(inode); -+ } -+} -+ -+/* -+ * This is dual to grab_page_cluster, -+ * however if @rw == WRITE_OP, then we call this function -+ * only if something is failed before checkin page cluster. 
-+ */ -+void put_page_cluster(struct cluster_handle * clust, -+ struct inode * inode, rw_op rw) -+{ -+ assert("edward-445", clust != NULL); -+ assert("edward-922", clust->pages != NULL); -+ assert("edward-446", -+ ergo(clust->nr_pages != 0, clust->pages[0] != NULL)); -+ -+ __put_page_cluster(0, clust->nr_pages, clust->pages, inode); -+ if (rw == WRITE_OP) { -+ if (unlikely(clust->node)) { -+ assert("edward-447", -+ clust->node == jprivate(clust->pages[0])); -+ jput(clust->node); -+ clust->node = NULL; -+ } -+ } -+} -+ -+#if REISER4_DEBUG -+int cryptcompress_inode_ok(struct inode *inode) -+{ -+ if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE))) -+ return 0; -+ if (!cluster_shift_ok(inode_cluster_shift(inode))) -+ return 0; -+ return 1; -+} -+ -+static int window_ok(struct reiser4_slide * win, struct inode *inode) -+{ -+ assert("edward-1115", win != NULL); -+ assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW)); -+ -+ return (win->off != inode_cluster_size(inode)) && -+ (win->off + win->count + win->delta <= inode_cluster_size(inode)); -+} -+ -+static int cluster_ok(struct cluster_handle * clust, struct inode *inode) -+{ -+ assert("edward-279", clust != NULL); -+ -+ if (!clust->pages) -+ return 0; -+ return (clust->win ? 
window_ok(clust->win, inode) : 1); -+} -+#if 0 -+static int pages_truncate_ok(struct inode *inode, pgoff_t start) -+{ -+ int found; -+ struct page * page; -+ -+ found = find_get_pages(inode->i_mapping, start, 1, &page); -+ if (found) -+ put_cluster_page(page); -+ return !found; -+} -+#else -+#define pages_truncate_ok(inode, start) 1 -+#endif -+ -+static int jnode_truncate_ok(struct inode *inode, cloff_t index) -+{ -+ jnode *node; -+ node = jlookup(current_tree, get_inode_oid(inode), -+ clust_to_pg(index, inode)); -+ if (likely(!node)) -+ return 1; -+ jput(node); -+ return 0; -+} -+ -+static int find_fake_appended(struct inode *inode, cloff_t * index); -+ -+static int body_truncate_ok(struct inode *inode, cloff_t aidx) -+{ -+ int result; -+ cloff_t raidx; -+ -+ result = find_fake_appended(inode, &raidx); -+ return !result && (aidx == raidx); -+} -+#endif -+ -+/* guess next window stat */ -+static inline window_stat next_window_stat(struct reiser4_slide * win) -+{ -+ assert("edward-1130", win != NULL); -+ return ((win->stat == HOLE_WINDOW && win->delta == 0) ? 
-+ HOLE_WINDOW : DATA_WINDOW); -+} -+ -+/* guess and set next cluster index and window params */ -+static void move_update_window(struct inode * inode, -+ struct cluster_handle * clust, -+ loff_t file_off, loff_t to_file) -+{ -+ struct reiser4_slide * win; -+ -+ assert("edward-185", clust != NULL); -+ assert("edward-438", clust->pages != NULL); -+ assert("edward-281", cluster_ok(clust, inode)); -+ -+ win = clust->win; -+ if (!win) -+ return; -+ -+ switch (win->stat) { -+ case DATA_WINDOW: -+ /* increment */ -+ clust->index++; -+ win->stat = DATA_WINDOW; -+ win->off = 0; -+ win->count = min((loff_t)inode_cluster_size(inode), to_file); -+ break; -+ case HOLE_WINDOW: -+ switch (next_window_stat(win)) { -+ case HOLE_WINDOW: -+ /* skip */ -+ clust->index = off_to_clust(file_off, inode); -+ win->stat = HOLE_WINDOW; -+ win->off = 0; -+ win->count = off_to_cloff(file_off, inode); -+ win->delta = min((loff_t)(inode_cluster_size(inode) - -+ win->count), to_file); -+ break; -+ case DATA_WINDOW: -+ /* stay */ -+ win->stat = DATA_WINDOW; -+ /* off+count+delta=inv */ -+ win->off = win->off + win->count; -+ win->count = win->delta; -+ win->delta = 0; -+ break; -+ default: -+ impossible("edward-282", "wrong next window state"); -+ } -+ break; -+ default: -+ impossible("edward-283", "wrong current window state"); -+ } -+ assert("edward-1068", cluster_ok(clust, inode)); -+} -+ -+static int update_sd_cryptcompress(struct inode *inode) -+{ -+ int result = 0; -+ -+ assert("edward-978", reiser4_schedulable()); -+ -+ result = reiser4_grab_space_force(/* one for stat data update */ -+ estimate_update_common(inode), -+ BA_CAN_COMMIT); -+ if (result) -+ return result; -+ inode->i_ctime = inode->i_mtime = CURRENT_TIME; -+ result = reiser4_update_sd(inode); -+ -+ return result; -+} -+ -+static void uncapture_cluster_jnode(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert_spin_locked(&(node->guard)); -+ -+ atom = jnode_get_atom(node); -+ if (atom == NULL) { -+ assert("jmacd-7111", 
!JF_ISSET(node, JNODE_DIRTY)); -+ spin_unlock_jnode(node); -+ return; -+ } -+ reiser4_uncapture_block(node); -+ spin_unlock_atom(atom); -+ jput(node); -+} -+ -+static void put_found_pages(struct page **pages, int nr) -+{ -+ int i; -+ for (i = 0; i < nr; i++) { -+ assert("edward-1045", pages[i] != NULL); -+ put_cluster_page(pages[i]); -+ } -+} -+ -+/* Lifecycle of a logical cluster in the system. -+ * -+ * -+ * Logical cluster of a cryptcompress file is represented in the system by -+ * . page cluster (in memory, primary cache, contains plain text); -+ * . disk cluster (in memory, secondary cache, contains transformed text). -+ * Primary cache is to reduce number of transform operations (compression, -+ * encryption), i.e. to implement transform-caching strategy. -+ * Secondary cache is to reduce number of I/O operations, i.e. for usual -+ * write-caching strategy. Page cluster is a set of pages, i.e. mapping of -+ * a logical cluster to the primary cache. Disk cluster is a set of items -+ * of the same type defined by some reiser4 item plugin id. -+ * -+ * 1. Performing modifications -+ * -+ * Every modification of a cryptcompress file is considered as a set of -+ * operations performed on file's logical clusters. Every such "atomic" -+ * modification is truncate, append and(or) overwrite some bytes of a -+ * logical cluster performed in the primary cache with the following -+ * synchronization with the secondary cache (in flush time). Disk clusters, -+ * which live in the secondary cache, are supposed to be synchronized with -+ * disk. The mechanism of synchronization of primary and secondary caches -+ * includes so-called checkin/checkout technique described below. -+ * -+ * 2. Submitting modifications -+ * -+ * Each page cluster has associated jnode (a special in-memory header to -+ * keep a track of transactions in reiser4), which is attached to its first -+ * page when grabbing page cluster for modifications (see grab_page_cluster). 
-+ * Submitting modifications (see checkin_logical_cluster) proceeds per logical -+ * cluster and includes: -+ * . checkin_cluster_size; -+ * . checkin_page_cluster. -+ * checkin_cluster_size() is resolved to a file size update, which completely -+ * defines the new size of the logical cluster (the number of the file's bytes -+ * in that logical cluster). -+ * checkin_page_cluster() captures jnode of a page cluster and installs -+ * jnode's dirty flag (if needed) to indicate that modifications are -+ * successfully checked in. -+ * -+ * 3. Checking out modifications -+ * -+ * Proceeds per logical cluster at flush time (see checkout_logical_cluster). -+ * This is the time of synchronizing primary and secondary caches. -+ * checkout_logical_cluster() includes: -+ * . checkout_page_cluster (retrieving checked in pages). -+ * . uncapture jnode (including clearing the dirty flag and unlocking). -+ * -+ * 4. Committing modifications -+ * -+ * Performing the synchronization of primary and secondary caches. When checking -+ * out page cluster (the phase above) pages are locked/flushed/unlocked -+ * one-by-one in ascending order of their indexes to contiguous stream, which -+ * is supposed to be transformed (compressed, encrypted), chopped up into items -+ * and committed to disk as a disk cluster. -+ * -+ * 5. Managing page references -+ * -+ * Every checked in page has a special additional "control" reference, -+ * which is dropped at checkout. We need this to avoid unexpected eviction of -+ * pages from memory before checkout. Control references are managed so -+ * they are not accumulated with every checkin: -+ * -+ * 0 -+ * checkin -> 1 -+ * 0 -> checkout -+ * checkin -> 1 -+ * checkin -> 1 -+ * checkin -> 1 -+ * 0 -> checkout -+ * ... -+ * -+ * Every page cluster has its own unique "cluster lock". Update/drop -+ * references are serialized via this lock. Number of checked in cluster -+ * pages is calculated by i_size under cluster lock.
File size is updated -+ * at every checkin action also under cluster lock (except cases of -+ * appending/truncating fake logical clusters). -+ * -+ * Proof of correctness: -+ * -+ * Since we update file size under cluster lock, in the case of non-fake -+ * logical cluster with its lock held we do have expected number of checked -+ * in pages. On the other hand, append/truncate of fake logical clusters -+ * doesn't change number of checked in pages of any cluster. -+ * -+ * NOTE-EDWARD: As cluster lock we use guard (spinlock_t) of its jnode. -+ * Currently, I don't see any reason to create a special lock for those -+ * needs. -+ */ -+ -+static inline void lock_cluster(jnode * node) -+{ -+ spin_lock_jnode(node); -+} -+ -+static inline void unlock_cluster(jnode * node) -+{ -+ spin_unlock_jnode(node); -+} -+ -+static inline void unlock_cluster_uncapture(jnode * node) -+{ -+ uncapture_cluster_jnode(node); -+} -+ -+/* Set new file size by window. Cluster lock is required. */ -+static void checkin_file_size(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ loff_t new_size; -+ struct reiser4_slide * win; -+ -+ assert("edward-1181", clust != NULL); -+ assert("edward-1182", inode != NULL); -+ assert("edward-1473", clust->pages != NULL); -+ assert("edward-1474", clust->pages[0] != NULL); -+ assert("edward-1475", jprivate(clust->pages[0]) != NULL); -+ assert_spin_locked(&(jprivate(clust->pages[0])->guard)); -+ -+ -+ win = clust->win; -+ assert("edward-1183", win != NULL); -+ -+ new_size = clust_to_off(clust->index, inode) + win->off; -+ -+ switch (clust->op) { -+ case LC_APPOV: -+ if (new_size + win->count <= i_size_read(inode)) -+ /* overwrite only */ -+ return; -+ new_size += win->count; -+ break; -+ case LC_TRUNC: -+ break; -+ default: -+ impossible("edward-1184", "bad page cluster option"); -+ break; -+ } -+ inode_check_scale_nolock(inode, i_size_read(inode), new_size); -+ i_size_write(inode, new_size); -+ return; -+} -+ -+static inline void 
checkin_cluster_size(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ if (clust->win) -+ checkin_file_size(clust, inode); -+} -+ -+static int checkin_page_cluster(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ int result; -+ jnode * node; -+ int old_nrpages = clust->old_nrpages; -+ int new_nrpages = get_new_nrpages(clust); -+ -+ node = clust->node; -+ -+ assert("edward-221", node != NULL); -+ assert("edward-971", clust->reserved == 1); -+ assert("edward-1263", -+ clust->reserved_prepped == estimate_update_cluster(inode)); -+ assert("edward-1264", clust->reserved_unprepped == 0); -+ -+ if (JF_ISSET(node, JNODE_DIRTY)) { -+ /* -+ * page cluster was checked in, but not yet -+ * checked out, so release related resources -+ */ -+ free_reserved4cluster(inode, clust, -+ estimate_update_cluster(inode)); -+ __put_page_cluster(0, clust->old_nrpages, -+ clust->pages, inode); -+ } else { -+ result = capture_cluster_jnode(node); -+ if (unlikely(result)) { -+ unlock_cluster(node); -+ return result; -+ } -+ jnode_make_dirty_locked(node); -+ clust->reserved = 0; -+ } -+ unlock_cluster(node); -+ -+ if (new_nrpages < old_nrpages) { -+ /* truncate >= 1 complete pages */ -+ __put_page_cluster(new_nrpages, -+ old_nrpages - new_nrpages, -+ clust->pages, inode); -+ truncate_page_cluster_range(inode, -+ clust->pages, clust->index, -+ new_nrpages, -+ old_nrpages - new_nrpages, -+ 0); -+ } -+#if REISER4_DEBUG -+ clust->reserved_prepped -= estimate_update_cluster(inode); -+#endif -+ return 0; -+} -+ -+/* Submit modifications of a logical cluster */ -+static int checkin_logical_cluster(struct cluster_handle * clust, -+ struct inode *inode) -+{ -+ int result = 0; -+ jnode * node; -+ -+ node = clust->node; -+ -+ assert("edward-1035", node != NULL); -+ assert("edward-1029", clust != NULL); -+ assert("edward-1030", clust->reserved == 1); -+ assert("edward-1031", clust->nr_pages != 0); -+ assert("edward-1032", clust->pages != NULL); -+ assert("edward-1033", 
clust->pages[0] != NULL); -+ assert("edward-1446", jnode_is_cluster_page(node)); -+ assert("edward-1476", node == jprivate(clust->pages[0])); -+ -+ lock_cluster(node); -+ checkin_cluster_size(clust, inode); -+ /* this will unlock cluster */ -+ result = checkin_page_cluster(clust, inode); -+ jput(node); -+ clust->node = NULL; -+ return result; -+} -+ -+/* -+ * Retrieve size of logical cluster that was checked in at -+ * the latest modifying session (cluster lock is required) -+ */ -+static inline void checkout_cluster_size(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ struct tfm_cluster *tc = &clust->tc; -+ -+ tc->len = lbytes(clust->index, inode); -+ assert("edward-1478", tc->len != 0); -+} -+ -+/* -+ * Retrieve a page cluster with the latest submitted modifications -+ * and flush its pages to previously allocated contiguous stream. -+ */ -+static void checkout_page_cluster(struct cluster_handle * clust, -+ jnode * node, struct inode * inode) -+{ -+ int i; -+ int found; -+ int to_put; -+ struct tfm_cluster *tc = &clust->tc; -+ -+ /* find and put checked in pages: cluster is locked, -+ * so we must get expected number (to_put) of pages -+ */ -+ to_put = size_in_pages(lbytes(clust->index, inode)); -+ found = find_get_pages(inode->i_mapping, -+ clust_to_pg(clust->index, inode), -+ to_put, clust->pages); -+ BUG_ON(found != to_put); -+ -+ __put_page_cluster(0, to_put, clust->pages, inode); -+ unlock_cluster_uncapture(node); -+ -+ /* Flush found pages. -+ * -+ * Note, that we don't disable modifications while flushing, -+ * moreover, some found pages can be truncated, as we have -+ * released cluster lock. 
	 */
	for (i = 0; i < found; i++) {
		int in_page;
		char * data;
		assert("edward-1479",
		       clust->pages[i]->index == clust->pages[0]->index + i);

		lock_page(clust->pages[i]);
		if (!PageUptodate(clust->pages[i])) {
			/* page was truncated */
			assert("edward-1480",
			       i_size_read(inode) <= page_offset(clust->pages[i]));
			assert("edward-1481",
			       clust->pages[i]->mapping != inode->i_mapping);
			unlock_page(clust->pages[i]);
			break;
		}
		/* Update the number of bytes in the logical cluster,
		 * as it could be partially truncated. Note, that only
		 * partial truncate is possible (complete truncate can
		 * not go here, as it is performed via ->kill_hook()
		 * called by cut_file_items(), and the last one must
		 * wait for znode locked with parent coord).
		 */
		checkout_cluster_size(clust, inode);

		/* this can be zero, as new file size is
		   checked in before truncating pages */
		in_page = __mbp(tc->len, i);

		data = kmap(clust->pages[i]);
		memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
		       data, in_page);
		kunmap(clust->pages[i]);

		if (PageDirty(clust->pages[i]))
			cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);

		unlock_page(clust->pages[i]);

		if (in_page < PAGE_CACHE_SIZE)
			/* end of the file */
			break;
	}
	put_found_pages(clust->pages, found); /* find_get_pages */
	tc->lsize = tc->len;
	return;
}

/*
 * Check out modifications of a logical cluster.
 *
 * Grabs the input transform stream of @clust, locks the cluster jnode @node
 * and hands over to checkout_page_cluster().
 *
 * Returns 0 on success. Returns -E_REPEAT when the input stream could not
 * be grabbed (the original error code is only logged), or when @node turned
 * out not to be dirty any more.
 */
int checkout_logical_cluster(struct cluster_handle * clust,
			     jnode * node, struct inode *inode)
{
	int result;
	struct tfm_cluster *tc = &clust->tc;

	assert("edward-980", node != NULL);
	assert("edward-236", inode != NULL);
	assert("edward-237", clust != NULL);
	assert("edward-240", !clust->win);
	assert("edward-241", reiser4_schedulable());
	assert("edward-718", cryptcompress_inode_ok(inode));

	result = grab_tfm_stream(inode, tc, INPUT_STREAM);
	if (result) {
		warning("edward-1430", "alloc stream failed with ret=%d",
			result);
		return RETERR(-E_REPEAT);
	}
	lock_cluster(node);

	if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) {
		/* race with another flush */
		warning("edward-982",
			"checking out logical cluster %lu of inode %llu: "
			"jnode is not dirty", clust->index,
			(unsigned long long)get_inode_oid(inode));
		unlock_cluster(node);
		return RETERR(-E_REPEAT);
	}
	/* move the blocks reserved for updating this cluster to "grabbed" */
	cluster_reserved2grabbed(estimate_update_cluster(inode));

	/* this will unlock cluster */
	checkout_page_cluster(clust, node, inode);
	return 0;
}

/*
 * Set hint for the cluster of the index @index: build the key of the first
 * byte of that logical cluster, re-initialize the hint's seal with it and
 * record the key offset and the lock mode @mode in @hint.
 */
static void set_hint_cluster(struct inode *inode, hint_t * hint,
			     cloff_t index, znode_lock_mode mode)
{
	reiser4_key key;
	assert("edward-722", cryptcompress_inode_ok(inode));
	assert("edward-723",
	       inode_file_plugin(inode) ==
	       file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));

	inode_file_plugin(inode)->key_by_inode(inode,
					       clust_to_off(index, inode),
					       &key);

	reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
	hint->offset = get_key_offset(&key);
	hint->mode = mode;
}

/*
 * Release the lock handle kept in the hint of @clust and mark the hint
 * as not valid.
 */
void invalidate_hint_cluster(struct cluster_handle * clust)
{
	assert("edward-1291", clust != NULL);
	assert("edward-1292", clust->hint != NULL);

	done_lh(&clust->hint->lh);
	hint_clr_valid(clust->hint);
}

/*
 * Point the hint of @clust at the next logical cluster (index + 1) with
 * lock mode @mode, then drop its lock handle and clear its valid flag.
 */
static void put_hint_cluster(struct cluster_handle * clust,
			     struct inode *inode, znode_lock_mode mode)
{
	assert("edward-1286", clust != NULL);
	assert("edward-1287", clust->hint != NULL);

	set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
	invalidate_hint_cluster(clust);
}

/*
 * Advance the write window, update the stat data and throttle the writer.
 *
 * @off, @to_file: passed through to move_update_window();
 * @nr_dirtied: number of pages dirtied, must be in [1, cluster_nrpages()].
 *
 * The inode's checkin_mutex is dropped around the transaction restart and
 * balance_dirty_pages_ratelimited_nr(), and is taken again before returning
 * (presumably the caller holds it on entry - confirm against callers).
 *
 * Returns 0, or the error of update_sd_cryptcompress(); in the latter case
 * the mutex is not touched.
 */
static int balance_dirty_page_cluster(struct cluster_handle * clust,
				      struct inode *inode, loff_t off,
				      loff_t to_file,
				      int nr_dirtied)
{
	int result;
	struct cryptcompress_info * info;

	assert("edward-724", inode != NULL);
	assert("edward-725", cryptcompress_inode_ok(inode));
	assert("edward-1547",
	       nr_dirtied != 0 && nr_dirtied <= cluster_nrpages(inode));

	/* set next window params */
	move_update_window(inode, clust, off, to_file);

	result = update_sd_cryptcompress(inode);
	if (result)
		return result;
	assert("edward-726", clust->hint->lh.owner == NULL);
	info = cryptcompress_inode_data(inode);

	mutex_unlock(&info->checkin_mutex);
	reiser4_txn_restart_current();
	balance_dirty_pages_ratelimited_nr(inode->i_mapping, nr_dirtied);
	mutex_lock(&info->checkin_mutex);
	return 0;
}

/*
 * Set zeroes to the page cluster, proceed it, and maybe, try to capture
 * its pages.
 *
 * The window of @clust must be a HOLE_WINDOW. A window that covers a whole
 * logical cluster is not written at all (it stays a "fake" cluster); the
 * window is only moved forward. Otherwise win->count bytes starting at
 * win->off are zeroed page by page. If no user data follows the hole in
 * this cluster (win->delta == 0) the cluster is checked in right away.
 *
 * Returns 0 or the error of checkin_logical_cluster() /
 * balance_dirty_page_cluster().
 */
static int write_hole(struct inode *inode, struct cluster_handle * clust,
		      loff_t file_off, loff_t to_file)
{
	int result = 0;
	unsigned cl_off, cl_count = 0;
	unsigned to_pg, pg_off;
	struct reiser4_slide * win;

	assert("edward-190", clust != NULL);
	assert("edward-1069", clust->win != NULL);
	assert("edward-191", inode != NULL);
	assert("edward-727", cryptcompress_inode_ok(inode));
	assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
	assert("edward-1154",
	       ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));

	win = clust->win;

	assert("edward-1070", win != NULL);
	assert("edward-201", win->stat == HOLE_WINDOW);
	assert("edward-192", cluster_ok(clust, inode));

	if (win->off == 0 && win->count == inode_cluster_size(inode)) {
		/* This part of the hole will be represented by "fake"
		 * logical cluster, i.e. which doesn't have appropriate
		 * disk cluster until someone modify this logical cluster
		 * and make it dirty.
		 * So go forward here..
		 */
		move_update_window(inode, clust, file_off, to_file);
		return 0;
	}
	cl_count = win->count;	/* number of zeroes to write */
	cl_off = win->off;
	pg_off = off_to_pgoff(win->off);

	while (cl_count) {
		struct page *page;
		page = clust->pages[off_to_pg(cl_off)];

		assert("edward-284", page != NULL);

		/* zero up to the end of this page, but not more than is left */
		to_pg = min((typeof(pg_off))PAGE_CACHE_SIZE - pg_off, cl_count);
		lock_page(page);
		zero_user_page(page, pg_off, to_pg, KM_USER0);
		SetPageUptodate(page);
		reiser4_set_page_dirty_internal(page);
		mark_page_accessed(page);
		unlock_page(page);

		cl_off += to_pg;
		cl_count -= to_pg;
		/* every following page is zeroed from its beginning */
		pg_off = 0;
	}
	if (!win->delta) {
		/* only zeroes in this window, try to capture
		 */
		result = checkin_logical_cluster(clust, inode);
		if (result)
			return result;
		put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
		result = balance_dirty_page_cluster(clust,
						    inode, file_off, to_file,
						    win_count_to_nrpages(win));
	} else
		move_update_window(inode, clust, file_off, to_file);
	return result;
}

/*
   The main disk search procedure for cryptcompress plugin, which
   . scans all items of disk cluster with the lock mode @mode
   . maybe reads each one (if @read)
   . 
maybe makes its znode dirty (if write lock mode was specified)

   NOTE-EDWARD: Callers should handle the case when disk cluster
   is incomplete (-EIO)
*/
/*
 * On return clust->dstat is set to FAKE_DISK_CLUSTER (first item absent),
 * UNPR_DISK_CLUSTER, TRNC_DISK_CLUSTER or PREP_DISK_CLUSTER, and for found
 * clusters clust->tc.len / clust->tc.lsize are filled in. Any blocks
 * grabbed by this function are returned to the free pool before it exits.
 */
int find_disk_cluster(struct cluster_handle * clust,
		      struct inode *inode, int read, znode_lock_mode mode)
{
	flow_t f;
	hint_t *hint;
	int result = 0;
	int was_grabbed;
	ra_info_t ra_info;
	file_plugin *fplug;
	item_plugin *iplug;
	struct tfm_cluster *tc;
	struct cryptcompress_info * info;

	assert("edward-138", clust != NULL);
	assert("edward-728", clust->hint != NULL);
	assert("edward-226", reiser4_schedulable());
	assert("edward-137", inode != NULL);
	assert("edward-729", cryptcompress_inode_ok(inode));

	hint = clust->hint;
	fplug = inode_file_plugin(inode);
	/* remember the grab counter so that only our own grab is undone */
	was_grabbed = get_current_context()->grabbed_blocks;
	info = cryptcompress_inode_data(inode);
	tc = &clust->tc;

	assert("edward-462", !tfm_cluster_is_uptodate(tc));
	assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));

	dclust_init_extension(hint);

	/* set key of the first disk cluster item */
	fplug->flow_by_inode(inode,
			     (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
			     0 /* kernel space */ ,
			     inode_scaled_cluster_size(inode),
			     clust_to_off(clust->index, inode), READ_OP, &f);
	if (mode == ZNODE_WRITE_LOCK) {
		/* reserve for flush to make dirty all the leaf nodes
		   which contain disk cluster */
		result =
		    reiser4_grab_space_force(estimate_dirty_cluster(inode),
					     BA_CAN_COMMIT);
		if (result)
			goto out;
	}

	/* read-ahead info is used by zload_ra() below */
	ra_info.key_to_stop = f.key;
	set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));

	/* f.length shrinks as items are consumed by ->read() */
	while (f.length) {
		result = find_cluster_item(hint, &f.key, mode,
					   NULL, FIND_EXACT,
					   (mode == ZNODE_WRITE_LOCK ?
					    CBK_FOR_INSERT : 0));
		switch (result) {
		case CBK_COORD_NOTFOUND:
			result = 0;
			if (inode_scaled_offset
			    (inode, clust_to_off(clust->index, inode)) ==
			    get_key_offset(&f.key)) {
				/* first item not found, this is treated
				   as disk cluster is absent */
				clust->dstat = FAKE_DISK_CLUSTER;
				goto out;
			}
			/* we are outside the cluster, stop search here */
			assert("edward-146",
			       f.length != inode_scaled_cluster_size(inode));
			goto ok;
		case CBK_COORD_FOUND:
			assert("edward-148",
			       hint->ext_coord.coord.between == AT_UNIT);
			assert("edward-460",
			       hint->ext_coord.coord.unit_pos == 0);

			coord_clear_iplug(&hint->ext_coord.coord);
			result = zload_ra(hint->ext_coord.coord.node, &ra_info);
			if (unlikely(result))
				goto out;
			iplug = item_plugin_by_coord(&hint->ext_coord.coord);
			assert("edward-147",
			       item_id_by_coord(&hint->ext_coord.coord) ==
			       CTAIL_ID);

			result = iplug->s.file.read(NULL, &f, hint);
			if (result) {
				zrelse(hint->ext_coord.coord.node);
				goto out;
			}
			if (mode == ZNODE_WRITE_LOCK) {
				/* Don't make dirty more nodes than it was
				   estimated (see comments before
				   estimate_dirty_cluster). Missed nodes will be
				   read up in flush time if they are evicted from
				   memory */
				if (dclust_get_extension_ncount(hint) <=
				    estimate_dirty_cluster(inode))
					znode_make_dirty(hint->ext_coord.coord.node);

				znode_set_convertible(hint->ext_coord.coord.node);
			}
			zrelse(hint->ext_coord.coord.node);
			break;
		default:
			/* lookup error: return it as is */
			goto out;
		}
	}
 ok:
	/* at least one item was found  */
	/* NOTE-EDWARD: Callers should handle the case
	   when disk cluster is incomplete (-EIO) */
	tc->len = inode_scaled_cluster_size(inode) - f.length;
	tc->lsize = lbytes(clust->index, inode);
	assert("edward-1196", tc->len > 0);
	assert("edward-1406", tc->lsize > 0);

	if (hint_is_unprepped_dclust(clust->hint)) {
		clust->dstat = UNPR_DISK_CLUSTER;
	} else if (clust->index == info->trunc_index) {
		clust->dstat = TRNC_DISK_CLUSTER;
	} else {
		clust->dstat = PREP_DISK_CLUSTER;
		dclust_set_extension_dsize(clust->hint, tc->len);
	}
 out:
	assert("edward-1339",
	       get_current_context()->grabbed_blocks >= was_grabbed);
	/* give back whatever was grabbed since function entry */
	grabbed2free(get_current_context(),
		     get_current_super_private(),
		     get_current_context()->grabbed_blocks - was_grabbed);
	return result;
}

/*
 * Get a locked tree position for the disk cluster of @clust.
 *
 * If the hint of @clust is already valid, no lookup is done and the result
 * is derived from clust->dstat: CBK_COORD_NOTFOUND for FAKE_DISK_CLUSTER,
 * CBK_COORD_FOUND otherwise. Otherwise the first item of the cluster is
 * looked up with @lock_mode and the result of find_cluster_item() is
 * returned.
 *
 * NOTE(review): ra_info is filled in below but find_cluster_item() is
 * called with a NULL ra_info, so the value is not used here - confirm
 * whether it was meant to be passed.
 */
int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode,
			    znode_lock_mode lock_mode)
{
	reiser4_key key;
	ra_info_t ra_info;

	assert("edward-730", reiser4_schedulable());
	assert("edward-731", clust != NULL);
	assert("edward-732", inode != NULL);

	if (hint_is_valid(clust->hint)) {
		assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
		assert("edward-1294",
		       znode_is_write_locked(clust->hint->lh.node));
		/* already have a valid locked position */
		return (clust->dstat ==
			FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
			CBK_COORD_FOUND);
	}
	key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
				   &key);
	ra_info.key_to_stop = key;
	set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));

	return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
				 CBK_FOR_INSERT);
}

/* Read needed cluster pages before modifying.
   If success, @clust->hint contains locked position in the tree.
   Also:
   . 
find and set disk cluster state
   . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
*/
/*
 * Returns 0 or the error of do_readpage_ctail() / find_disk_cluster().
 * The "uptodate" flag of the transform cluster is always cleared on the
 * way out of the disk-search path.
 *
 * NOTE(review): iplug is assigned but not used anywhere in this function.
 */
static int read_some_cluster_pages(struct inode * inode,
				   struct cluster_handle * clust)
{
	int i;
	int result = 0;
	item_plugin *iplug;
	struct reiser4_slide * win = clust->win;
	znode_lock_mode mode = ZNODE_WRITE_LOCK;

	iplug = item_plugin_by_id(CTAIL_ID);

	assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));

#if REISER4_DEBUG
	if (clust->nr_pages == 0) {
		/* start write hole from fake disk cluster */
		assert("edward-1117", win != NULL);
		assert("edward-1118", win->stat == HOLE_WINDOW);
		assert("edward-1119", new_logical_cluster(clust, inode));
	}
#endif
	if (new_logical_cluster(clust, inode)) {
		/*
		   new page cluster is about to be written, nothing to read,
		 */
		assert("edward-734", reiser4_schedulable());
		assert("edward-735", clust->hint->lh.owner == NULL);

		if (clust->nr_pages) {
			int off;
			struct page * pg;
			assert("edward-1419", clust->pages != NULL);
			pg = clust->pages[clust->nr_pages - 1];
			assert("edward-1420", pg != NULL);
			/* zero the part of the last page behind the window.
			   NOTE(review): win is dereferenced without a NULL
			   check here - presumably always set on this path */
			off = off_to_pgoff(win->off+win->count+win->delta);
			if (off) {
				lock_page(pg);
				zero_user_page(pg, off, PAGE_CACHE_SIZE - off,
					       KM_USER0);
				unlock_page(pg);
			}
		}
		clust->dstat = FAKE_DISK_CLUSTER;
		return 0;
	}
	/*
	   Here we should search for disk cluster to figure out its real state.
	   Also there is one more important reason to do disk search: we need
	   to make disk cluster _dirty_ if it exists
	 */

	/* if window is specified, read only the pages
	   that will be modified partially */

	for (i = 0; i < clust->nr_pages; i++) {
		struct page *pg = clust->pages[i];

		lock_page(pg);
		if (PageUptodate(pg)) {
			unlock_page(pg);
			continue;
		}
		unlock_page(pg);

		if (win &&
		    i >= size_in_pages(win->off) &&
		    i < off_to_pg(win->off + win->count + win->delta))
			/* page will be completely overwritten */
			continue;

		if (win && (i == clust->nr_pages - 1) &&
		    /* the last page is
		       partially modified,
		       not uptodate .. */
		    (size_in_pages(i_size_read(inode)) <= pg->index)) {
			/* .. and appended,
			   so set zeroes to the rest */
			int offset;
			lock_page(pg);
			assert("edward-1260",
			       size_in_pages(win->off + win->count +
					     win->delta) - 1 == i);

			offset =
			    off_to_pgoff(win->off + win->count + win->delta);
			zero_user_page(pg, offset, PAGE_CACHE_SIZE - offset,
				       KM_USER0);
			unlock_page(pg);
			/* still not uptodate */
			break;
		}
		lock_page(pg);
		result = do_readpage_ctail(inode, clust, pg, mode);

		assert("edward-1526", ergo(!result, PageUptodate(pg)));
		unlock_page(pg);
		if (result) {
			warning("edward-219", "do_readpage_ctail failed");
			goto out;
		}
	}
	if (!tfm_cluster_is_uptodate(&clust->tc)) {
		/* disk cluster unclaimed, but we need to make its znodes dirty
		 * to make flush update convert its content
		 */
		result = find_disk_cluster(clust, inode,
					   0 /* do not read items */,
					   mode);
	}
 out:
	tfm_cluster_clr_uptodate(&clust->tc);
	return result;
}

/*
 * Tell whether an unprepped disk cluster has to be inserted for @clust.
 *
 * Returns 0 for PREP_DISK_CLUSTER and UNPR_DISK_CLUSTER, and for a
 * FAKE_DISK_CLUSTER that is a pure hole (HOLE_WINDOW with no pages);
 * returns 1 for any other FAKE_DISK_CLUSTER. Every other state triggers
 * impossible() and yields 0.
 *
 * NOTE(review): there is no case for TRNC_DISK_CLUSTER, which
 * find_disk_cluster() can set - confirm that it cannot reach this point.
 */
static int should_create_unprepped_cluster(struct cluster_handle * clust,
					   struct inode * inode)
{
	assert("edward-737", clust != NULL);

	switch (clust->dstat) {
	case PREP_DISK_CLUSTER:
	case UNPR_DISK_CLUSTER:
		return 0;
	case FAKE_DISK_CLUSTER:
		if (clust->win &&
		    clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
			assert("edward-1172",
			       new_logical_cluster(clust, inode));
			return 0;
		}
		return 1;
	default:
		impossible("edward-1173", "bad disk cluster state");
		return 0;
	}
}

/*
 * Insert an unprepped disk cluster for @clust if one is needed.
 *
 * When no cluster has to be created, the space reserved for the insertion
 * (estimate_insert_cluster()) is returned to the free pool. Otherwise that
 * reservation is converted to grabbed space, the unprepped cluster is
 * inserted, a full cluster size is added to the inode's byte count and
 * clust->dstat becomes UNPR_DISK_CLUSTER.
 *
 * Returns 0 or the error of ctail_insert_unprepped_cluster().
 *
 * NOTE(review): the assertion label "edward-737" is also used in
 * should_create_unprepped_cluster().
 */
static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust,
						struct inode *inode)
{
	int result;

	assert("edward-1123", reiser4_schedulable());
	assert("edward-737", clust != NULL);
	assert("edward-738", inode != NULL);
	assert("edward-739", cryptcompress_inode_ok(inode));
	assert("edward-1053", clust->hint != NULL);

	if (!should_create_unprepped_cluster(clust, inode)) {
		if (clust->reserved) {
			cluster_reserved2free(estimate_insert_cluster(inode));
#if REISER4_DEBUG
			assert("edward-1267",
			       clust->reserved_unprepped ==
			       estimate_insert_cluster(inode));
			clust->reserved_unprepped -=
				estimate_insert_cluster(inode);
#endif
		}
		return 0;
	}
	assert("edward-1268", clust->reserved);
	cluster_reserved2grabbed(estimate_insert_cluster(inode));
#if REISER4_DEBUG
	assert("edward-1441",
	       clust->reserved_unprepped == estimate_insert_cluster(inode));
	clust->reserved_unprepped -= estimate_insert_cluster(inode);
#endif
	result = ctail_insert_unprepped_cluster(clust, inode);
	if (result)
		return result;

	inode_add_bytes(inode, inode_cluster_size(inode));

	assert("edward-743", cryptcompress_inode_ok(inode));
	assert("edward-744", znode_is_write_locked(clust->hint->lh.node));

	clust->dstat = UNPR_DISK_CLUSTER;
	return 0;
}

/* . Grab page cluster for read, write, setattr, etc. operations;
 * . 
Truncate its complete pages, if needed; -+ */ -+int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust, -+ rw_op rw) -+{ -+ assert("edward-177", inode != NULL); -+ assert("edward-741", cryptcompress_inode_ok(inode)); -+ assert("edward-740", clust->pages != NULL); -+ -+ set_cluster_nrpages(clust, inode); -+ reset_cluster_pgset(clust, cluster_nrpages(inode)); -+ return grab_page_cluster(inode, clust, rw); -+} -+ -+/* Truncate complete page cluster of index @index. -+ * This is called by ->kill_hook() method of item -+ * plugin when deleting a disk cluster of such index. -+ */ -+void truncate_complete_page_cluster(struct inode *inode, cloff_t index, -+ int even_cows) -+{ -+ int found; -+ int nr_pages; -+ jnode *node; -+ struct page *pages[MAX_CLUSTER_NRPAGES]; -+ -+ node = jlookup(current_tree, get_inode_oid(inode), -+ clust_to_pg(index, inode)); -+ nr_pages = size_in_pages(lbytes(index, inode)); -+ assert("edward-1483", nr_pages != 0); -+ if (!node) -+ goto truncate; -+ found = find_get_pages(inode->i_mapping, -+ clust_to_pg(index, inode), -+ cluster_nrpages(inode), pages); -+ if (!found) { -+ assert("edward-1484", jnode_truncate_ok(inode, index)); -+ return; -+ } -+ lock_cluster(node); -+ -+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) -+ && index == 0) -+ /* converting to unix_file is in progress */ -+ JF_CLR(node, JNODE_CLUSTER_PAGE); -+ if (JF_ISSET(node, JNODE_DIRTY)) { -+ /* -+ * @nr_pages were checked in, but not yet checked out - -+ * we need to release them. (also there can be pages -+ * attached to page cache by read(), etc. - don't take -+ * them into account). 
-+ */ -+ assert("edward-1198", found >= nr_pages); -+ -+ /* free disk space grabbed for disk cluster converting */ -+ cluster_reserved2grabbed(estimate_update_cluster(inode)); -+ grabbed2free(get_current_context(), -+ get_current_super_private(), -+ estimate_update_cluster(inode)); -+ __put_page_cluster(0, nr_pages, pages, inode); -+ -+ /* This will clear dirty bit, uncapture and unlock jnode */ -+ unlock_cluster_uncapture(node); -+ } else -+ unlock_cluster(node); -+ jput(node); /* jlookup */ -+ put_found_pages(pages, found); /* find_get_pages */ -+ truncate: -+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) && -+ index == 0) -+ return; -+ truncate_page_cluster_range(inode, pages, index, 0, -+ cluster_nrpages(inode), -+ even_cows); -+ assert("edward-1201", -+ ergo(!reiser4_inode_get_flag(inode, -+ REISER4_FILE_CONV_IN_PROGRESS), -+ jnode_truncate_ok(inode, index))); -+ return; -+} -+ -+/* -+ * Set cluster handle @clust of a logical cluster before -+ * modifications which are supposed to be committed. -+ * -+ * . grab cluster pages; -+ * . reserve disk space; -+ * . maybe read pages from disk and set the disk cluster dirty; -+ * . maybe write hole and check in (partially zeroed) logical cluster; -+ * . create 'unprepped' disk cluster for new or fake logical one. 
 */
static int prepare_logical_cluster(struct inode *inode,
				   loff_t file_off, /* write position
						       in the file */
				   loff_t to_file, /* bytes of users data
						      to write to the file */
				   struct cluster_handle * clust,
				   logical_cluster_op op)
{
	int result = 0;
	struct reiser4_slide * win = clust->win;

	reset_cluster_params(clust);
	cluster_set_tfm_act(&clust->tc, TFMA_READ);
#if REISER4_DEBUG
	clust->ctx = get_current_context();
#endif
	assert("edward-1190", op != LC_INVAL);

	clust->op = op;

	result = prepare_page_cluster(inode, clust, WRITE_OP);
	if (result)
		return result;
	assert("edward-1447",
	       ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
	assert("edward-1448",
	       ergo(clust->nr_pages != 0,
		    jnode_is_cluster_page(jprivate(clust->pages[0]))));

	result = reserve4cluster(inode, clust);
	if (result)
		goto err1;
	result = read_some_cluster_pages(inode, clust);
	if (result) {
		/* nothing was consumed yet: give back both reservations */
		free_reserved4cluster(inode,
				      clust,
				      estimate_update_cluster(inode) +
				      estimate_insert_cluster(inode));
		goto err1;
	}
	assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);

	result = cryptcompress_make_unprepped_cluster(clust, inode);
	if (result)
		goto err2;
	if (win && win->stat == HOLE_WINDOW) {
		result = write_hole(inode, clust, file_off, to_file);
		if (result)
			goto err2;
	}
	return 0;
 err2:
	/* the insert reservation is already gone, free the update part only */
	free_reserved4cluster(inode, clust,
			      estimate_update_cluster(inode));
 err1:
	put_page_cluster(clust, inode, WRITE_OP);
	/* NOTE(review): asserts that -ENOSPC is the only possible failure,
	   although read and insert errors also arrive here - confirm */
	assert("edward-1125", result == -ENOSPC);
	return result;
}

/*
 * Set window by two offsets: point @clust at the logical cluster that
 * contains file offset @o1 and make @win cover the bytes from @o1 up to
 * @o2, clipped at the end of that cluster. win->delta is reset to 0 and
 * @win is attached to @clust.
 */
static void set_window(struct cluster_handle * clust,
		       struct reiser4_slide * win, struct inode *inode,
		       loff_t o1, loff_t o2)
{
	assert("edward-295", clust != NULL);
	assert("edward-296", inode != NULL);
	assert("edward-1071", win != NULL);
	assert("edward-297", o1 <= o2);

	clust->index = off_to_clust(o1, inode);

	win->off = off_to_cloff(o1, inode);
	win->count = min((loff_t)(inode_cluster_size(inode) - win->off),
			 o2 - o1);
	win->delta = 0;

	clust->win = win;
}

/*
 * Allocate the page set of @clust and set up the first window for a write
 * of @length bytes at @file_off.
 *
 * A write that starts behind the current file size gets a HOLE_WINDOW
 * covering the gap between i_size and @file_off; if the hole ends inside
 * the cluster, win->delta is set to the amount of user data that still
 * fits into it. Any other write gets a DATA_WINDOW.
 *
 * Returns 0 or the error of alloc_cluster_pgset().
 */
static int set_cluster_by_window(struct inode *inode,
				 struct cluster_handle * clust,
				 struct reiser4_slide * win, size_t length,
				 loff_t file_off)
{
	int result;

	assert("edward-197", clust != NULL);
	assert("edward-1072", win != NULL);
	assert("edward-198", inode != NULL);

	result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
	if (result)
		return result;

	if (file_off > i_size_read(inode)) {
		/* Uhmm, hole in cryptcompress file... */
		loff_t hole_size;
		hole_size = file_off - inode->i_size;

		set_window(clust, win, inode, inode->i_size, file_off);
		win->stat = HOLE_WINDOW;
		if (win->off + hole_size < inode_cluster_size(inode))
			/* there is also user's data to append to the hole */
			win->delta = min(inode_cluster_size(inode) -
					 (win->off + win->count), length);
		return 0;
	}
	set_window(clust, win, inode, file_off, file_off + length);
	win->stat = DATA_WINDOW;
	return 0;
}

/*
 * Prepare the page set of @clust for @count pages (reset it if one is
 * already allocated, allocate it otherwise) and set the cluster index to
 * the logical cluster that contains @page.
 *
 * Returns the result of the reset/alloc call; the index is set either way.
 */
int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
			int count)
{
	int result = 0;
	int (*setting_actor)(struct cluster_handle * clust, int count);

	assert("edward-1358", clust != NULL);
	assert("edward-1359", page != NULL);
	assert("edward-1360", page->mapping != NULL);
	assert("edward-1361", page->mapping->host != NULL);

	setting_actor =
		(clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
	result = setting_actor(clust, count);
	clust->index = pg_to_clust(page->index, page->mapping->host);
	return result;
}

/*
 * Reset all the params that do not get updated: the disk cluster state
 * becomes INVAL_DISK_CLUSTER and the transform cluster is marked as not
 * uptodate with zero length.
 *
 * NOTE(review): assertion label "edward-197" is also used in
 * set_cluster_by_window().
 */
void reset_cluster_params(struct cluster_handle * clust)
{
	assert("edward-197", clust != NULL);

	clust->dstat = INVAL_DISK_CLUSTER;
	clust->tc.uptodate = 0;
	clust->tc.len = 0;
}

/*
 * The heart of write_cryptcompress.
 *
 * Writes @to_write bytes from user buffer @buf at file position @pos, one
 * logical cluster per loop iteration, under the inode's checkin_mutex.
 * A leading hole (write behind i_size) is handled once before the loop.
 *
 * Returns the number of bytes written if that is not zero; otherwise the
 * last result (0 or a negative error).
 */
static loff_t do_write_cryptcompress(struct file *file, struct inode *inode,
				     const char __user *buf, size_t to_write,
				     loff_t pos, struct psched_context *cont)
{
	int i;
	hint_t *hint;
	int result = 0;
	size_t count;
	struct reiser4_slide win;
	struct cluster_handle clust;
	struct cryptcompress_info * info;

	assert("edward-154", buf != NULL);
	assert("edward-161", reiser4_schedulable());
	assert("edward-748", cryptcompress_inode_ok(inode));
	assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
	assert("edward-1274", get_current_context()->grabbed_blocks == 0);

	hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
	if (hint == NULL)
		return RETERR(-ENOMEM);

	result = load_file_hint(file, hint);
	if (result) {
		kfree(hint);
		return result;
	}
	/* bytes still to be written */
	count = to_write;

	reiser4_slide_init(&win);
	cluster_init_read(&clust, &win);
	clust.hint = hint;
	info = cryptcompress_inode_data(inode);

	mutex_lock(&info->checkin_mutex);

	result = set_cluster_by_window(inode, &clust, &win, to_write, pos);
	if (result)
		goto out;

	if (next_window_stat(&win) == HOLE_WINDOW) {
		/* write hole in this iteration
		   separated from the loop below */
		result = write_pschedule_hook(file, inode,
					      pos,
					      &clust,
					      cont);
		if (result)
			goto out;
		result = prepare_logical_cluster(inode, pos, count, &clust,
						 LC_APPOV);
		if (result)
			goto out;
	}
	do {
		const char __user * src;
		unsigned page_off, to_page;

		assert("edward-750", reiser4_schedulable());

		result = write_pschedule_hook(file, inode,
					      pos + to_write - count,
					      &clust,
					      cont);
		if (result)
			goto out;
		if (cont->state == PSCHED_ASSIGNED_NEW)
			/* NOTE(review): this exit skips done_lh(), the
			   mutex_unlock() and save_file_hint() - presumably
			   the hook has already released them; confirm */
			goto out_no_release;

		result = prepare_logical_cluster(inode, pos, count, &clust,
						 LC_APPOV);
		if (result)
			goto out;

		assert("edward-751", cryptcompress_inode_ok(inode));
		assert("edward-204", win.stat == DATA_WINDOW);
		assert("edward-1288", hint_is_valid(clust.hint));
		assert("edward-752",
		       znode_is_write_locked(hint->ext_coord.coord.node));
		put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);

		/* set write position in page */
		page_off = off_to_pgoff(win.off);

		/* copy user's data to cluster pages */
		for (i = off_to_pg(win.off), src = buf;
		     i < size_in_pages(win.off + win.count);
		     i++, src += to_page) {
			to_page = __mbp(win.off + win.count, i) - page_off;
			assert("edward-1039",
			       page_off + to_page <= PAGE_CACHE_SIZE);
			assert("edward-287", clust.pages[i] != NULL);

			fault_in_pages_readable(src, to_page);

			lock_page(clust.pages[i]);
			result =
			    __copy_from_user((char *)kmap(clust.pages[i]) +
					     page_off, src, to_page);
			kunmap(clust.pages[i]);
			if (unlikely(result)) {
				unlock_page(clust.pages[i]);
				result = -EFAULT;
				goto err2;
			}
			SetPageUptodate(clust.pages[i]);
			reiser4_set_page_dirty_internal(clust.pages[i]);
			flush_dcache_page(clust.pages[i]);
			mark_page_accessed(clust.pages[i]);
			unlock_page(clust.pages[i]);
			/* only the first page starts at a non-zero offset */
			page_off = 0;
		}
		assert("edward-753", cryptcompress_inode_ok(inode));

		result = checkin_logical_cluster(&clust, inode);
		if (result)
			goto err2;

		buf   += win.count;
		count -= win.count;

		result = balance_dirty_page_cluster(&clust, inode, 0, count,
						    win_count_to_nrpages(&win));
		if (result)
			goto err1;
		assert("edward-755", hint->lh.owner == NULL);
		reset_cluster_params(&clust);
		continue;
	err2:
		put_page_cluster(&clust, inode, WRITE_OP);
	err1:
		if (clust.reserved)
			free_reserved4cluster(inode,
					      &clust,
					      estimate_update_cluster(inode));
		break;
	} while (count);
 out:
	done_lh(&hint->lh);
	mutex_unlock(&info->checkin_mutex);
	save_file_hint(file, hint);
 out_no_release:
	kfree(hint);
	put_cluster_handle(&clust);
	assert("edward-195",
	       ergo((to_write == count),
		    (result < 0 || cont->state == PSCHED_ASSIGNED_NEW)));
	/* a partial write is reported as success with the byte count */
	return (to_write - count) ? (to_write - count) : result;
}

/**
 * plugin->write()
 * @file: file to write to
 * @buf: address of user-space buffer
 * @count: number of bytes to write
 * @off: position in file to write to; advanced by the number of bytes
 *       written on success
 * @cont: plugin scheduling context, passed on to do_write_cryptcompress()
 *
 * Returns the number of bytes written, 0 if there was nothing to write,
 * or a negative error code.
 *
 * NOTE(review): @info is assigned but not used in this function.
 */
ssize_t write_cryptcompress(struct file *file, const char __user *buf,
			    size_t count, loff_t *off,
			    struct psched_context *cont)
{
	ssize_t result;
	struct inode *inode;
	reiser4_context *ctx;
	loff_t pos = *off;
	struct cryptcompress_info *info;

	assert("edward-1449", cont->state == PSCHED_INVAL_STATE);

	inode = file->f_dentry->d_inode;
	assert("edward-196", cryptcompress_inode_ok(inode));

	info = cryptcompress_inode_data(inode);
	ctx = get_current_context();

	result = generic_write_checks(file, &pos, &count, 0);
	if (unlikely(result != 0)) {
		context_set_commit_async(ctx);
		return result;
	}
	if (unlikely(count == 0))
		return 0;
	result = remove_suid(file->f_dentry);
	if (unlikely(result != 0)) {
		context_set_commit_async(ctx);
		return result;
	}
	/* remove_suid might create a transaction */
	reiser4_txn_restart(ctx);

	result = do_write_cryptcompress(file, inode, buf, count, pos, cont);

	if (unlikely(result < 0)) {
		context_set_commit_async(ctx);
		return result;
	}
	/* update position in a file */
	*off = pos + result;
	return result;
}

/*
 * plugin->readpages
 *
 * Reads the pages on list @pages through readpages_ctail() inside its own
 * reiser4 context. If the context cannot be created, or the read fails,
 * the pages left on the list are released with put_pages_list().
 * @nr_pages is not used.
 */
int readpages_cryptcompress(struct file *file, struct address_space *mapping,
			    struct list_head *pages, unsigned nr_pages)
{
	reiser4_context * ctx;
	int ret;

	ctx = reiser4_init_context(mapping->host->i_sb);
	if (IS_ERR(ctx)) {
		ret = PTR_ERR(ctx);
		goto err;
	}
	/* cryptcompress file can be built of ctail items only */
	ret = readpages_ctail(file, mapping, pages);
	reiser4_txn_restart(ctx);
	reiser4_exit_context(ctx);
	if (ret) {
err:
		put_pages_list(pages);
	}
	return ret;
}

/*
 * Number of blocks to reserve for a read: the common stat-data update
 * estimate.
 */
static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
{
	/* reserve one block to update stat data item */
	assert("edward-1193",
	       inode_file_plugin(inode)->estimate.update ==
	       estimate_update_common);
	return estimate_update_common(inode);
}

/**
 * plugin->read
 * @file: file to read from
 * @buf: address of user-space buffer
 * @size: number of bytes to read
 * @off: position in file to read from
 *
 * Grabs space for a stat-data update and delegates to do_sync_read()
 * inside a fresh reiser4 context. Returns the result of do_sync_read(),
 * or the error of context creation / space reservation.
 *
 * NOTE(review): @info is assigned but not used in this function.
 */
ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
			   loff_t * off)
{
	ssize_t result;
	struct inode *inode;
	reiser4_context *ctx;
	struct cryptcompress_info *info;
	reiser4_block_nr needed;

	inode = file->f_dentry->d_inode;
	assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));

	ctx = reiser4_init_context(inode->i_sb);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	info = cryptcompress_inode_data(inode);
	needed = cryptcompress_estimate_read(inode);

	result = reiser4_grab_space(needed, BA_CAN_COMMIT);
	if (result != 0) {
		reiser4_exit_context(ctx);
		return result;
	}
	result = do_sync_read(file, buf, size, off);

	context_set_commit_async(ctx);
	reiser4_exit_context(ctx);

	return result;
}

/* Look for a disk cluster and keep lookup result in @found.
 * If @index > 0, then find disk cluster of the index (@index - 1);
 * If @index == 0, then find the rightmost disk cluster.
 * Keep incremented index of the found disk cluster in @found.
-+ * @found == 0 means that disk cluster was not found (in the last -+ * case (@index == 0) it means that file doesn't have disk clusters). -+ */ -+static int lookup_disk_cluster(struct inode *inode, cloff_t * found, -+ cloff_t index) -+{ -+ int result; -+ reiser4_key key; -+ loff_t offset; -+ hint_t *hint; -+ lock_handle *lh; -+ lookup_bias bias; -+ coord_t *coord; -+ item_plugin *iplug; -+ -+ assert("edward-1131", inode != NULL); -+ assert("edward-95", cryptcompress_inode_ok(inode)); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN); -+ offset = -+ (index ? clust_to_off(index, inode) - -+ 1 : get_key_offset(reiser4_max_key())); -+ -+ key_by_inode_cryptcompress(inode, offset, &key); -+ -+ /* find the last item of this object */ -+ result = -+ find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */, -+ bias, 0); -+ if (cbk_errored(result)) { -+ done_lh(lh); -+ kfree(hint); -+ return result; -+ } -+ if (result == CBK_COORD_NOTFOUND) { -+ /* no real disk clusters */ -+ done_lh(lh); -+ kfree(hint); -+ *found = 0; -+ return 0; -+ } -+ /* disk cluster is found */ -+ coord = &hint->ext_coord.coord; -+ coord_clear_iplug(coord); -+ result = zload(coord->node); -+ if (unlikely(result)) { -+ done_lh(lh); -+ kfree(hint); -+ return result; -+ } -+ iplug = item_plugin_by_coord(coord); -+ assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID)); -+ assert("edward-1202", ctail_ok(coord)); -+ -+ item_key_by_coord(coord, &key); -+ *found = off_to_clust(get_key_offset(&key), inode) + 1; -+ -+ assert("edward-1132", ergo(index, index == *found)); -+ -+ zrelse(coord->node); -+ done_lh(lh); -+ kfree(hint); -+ return 0; -+} -+ -+static int find_fake_appended(struct inode *inode, cloff_t * index) -+{ -+ return lookup_disk_cluster(inode, index, -+ 0 /* find last real one */ ); -+} -+ -+/* Set left coord when unit 
is not found after node_lookup() -+ This takes into account that there can be holes in a sequence -+ of disk clusters */ -+ -+static void adjust_left_coord(coord_t * left_coord) -+{ -+ switch (left_coord->between) { -+ case AFTER_UNIT: -+ left_coord->between = AFTER_ITEM; -+ case AFTER_ITEM: -+ case BEFORE_UNIT: -+ break; -+ default: -+ impossible("edward-1204", "bad left coord to cut"); -+ } -+ return; -+} -+ -+#define CRC_CUT_TREE_MIN_ITERATIONS 64 -+ -+/* plugin->cut_tree_worker */ -+int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed, -+ struct inode *object, int truncate, -+ int *progress) -+{ -+ lock_handle next_node_lock; -+ coord_t left_coord; -+ int result; -+ -+ assert("edward-1158", tap->coord->node != NULL); -+ assert("edward-1159", znode_is_write_locked(tap->coord->node)); -+ assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL); -+ -+ *progress = 0; -+ init_lh(&next_node_lock); -+ -+ while (1) { -+ znode *node; /* node from which items are cut */ -+ node_plugin *nplug; /* node plugin for @node */ -+ -+ node = tap->coord->node; -+ -+ /* Move next_node_lock to the next node on the left. */ -+ result = -+ reiser4_get_left_neighbor(&next_node_lock, node, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result != 0 && result != -E_NO_NEIGHBOR) -+ break; -+ /* FIXME-EDWARD: Check can we delete the node as a whole. 
*/ -+ result = reiser4_tap_load(tap); -+ if (result) -+ return result; -+ -+ /* Prepare the second (right) point for cut_node() */ -+ if (*progress) -+ coord_init_last_unit(tap->coord, node); -+ -+ else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL) -+ /* set rightmost unit for the items without lookup method */ -+ tap->coord->unit_pos = coord_last_unit_pos(tap->coord); -+ -+ nplug = node->nplug; -+ -+ assert("edward-1161", nplug); -+ assert("edward-1162", nplug->lookup); -+ -+ /* left_coord is leftmost unit cut from @node */ -+ result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord); -+ -+ if (IS_CBKERR(result)) -+ break; -+ -+ if (result == CBK_COORD_NOTFOUND) -+ adjust_left_coord(&left_coord); -+ -+ /* adjust coordinates so that they are set to existing units */ -+ if (coord_set_to_right(&left_coord) -+ || coord_set_to_left(tap->coord)) { -+ result = 0; -+ break; -+ } -+ -+ if (coord_compare(&left_coord, tap->coord) == -+ COORD_CMP_ON_RIGHT) { -+ /* keys from @from_key to @to_key are not in the tree */ -+ result = 0; -+ break; -+ } -+ -+ /* cut data from one node */ -+ *smallest_removed = *reiser4_min_key(); -+ result = kill_node_content(&left_coord, -+ tap->coord, -+ from_key, -+ to_key, -+ smallest_removed, -+ next_node_lock.node, -+ object, truncate); -+ reiser4_tap_relse(tap); -+ -+ if (result) -+ break; -+ -+ ++(*progress); -+ -+ /* Check whether all items with keys >= from_key were removed -+ * from the tree. */ -+ if (keyle(smallest_removed, from_key)) -+ /* result = 0; */ -+ break; -+ -+ if (next_node_lock.node == NULL) -+ break; -+ -+ result = reiser4_tap_move(tap, &next_node_lock); -+ done_lh(&next_node_lock); -+ if (result) -+ break; -+ -+ /* Break long cut_tree operation (deletion of a large file) if -+ * atom requires commit. 
*/ -+ if (*progress > CRC_CUT_TREE_MIN_ITERATIONS -+ && current_atom_should_commit()) { -+ result = -E_REPEAT; -+ break; -+ } -+ } -+ done_lh(&next_node_lock); -+ return result; -+} -+ -+/* Append or expand hole in two steps: -+ * 1) set zeroes to the rightmost page of the rightmost non-fake -+ * logical cluster; -+ * 2) expand hole via fake logical clusters (just increase i_size) -+ */ -+static int cryptcompress_append_hole(struct inode *inode /* with old size */, -+ loff_t new_size) -+{ -+ int result = 0; -+ hint_t *hint; -+ lock_handle *lh; -+ loff_t hole_size; -+ int nr_zeroes; -+ struct reiser4_slide win; -+ struct cluster_handle clust; -+ -+ assert("edward-1133", inode->i_size < new_size); -+ assert("edward-1134", reiser4_schedulable()); -+ assert("edward-1135", cryptcompress_inode_ok(inode)); -+ assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE); -+ assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ reiser4_slide_init(&win); -+ cluster_init_read(&clust, &win); -+ clust.hint = hint; -+ -+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); -+ if (result) -+ goto out; -+ if (off_to_cloff(inode->i_size, inode) == 0) -+ goto append_fake; -+ hole_size = new_size - inode->i_size; -+ nr_zeroes = -+ inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode); -+ if (hole_size < nr_zeroes) -+ nr_zeroes = hole_size; -+ set_window(&clust, &win, inode, inode->i_size, -+ inode->i_size + nr_zeroes); -+ win.stat = HOLE_WINDOW; -+ -+ assert("edward-1137", -+ clust.index == off_to_clust(inode->i_size, inode)); -+ -+ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_APPOV); -+ -+ assert("edward-1271", !result || result == -ENOSPC); -+ if (result) -+ goto out; -+ assert("edward-1139", -+ clust.dstat == PREP_DISK_CLUSTER || -+ clust.dstat == UNPR_DISK_CLUSTER); -+ -+ 
assert("edward-1431", hole_size >= nr_zeroes); -+ if (hole_size == nr_zeroes) -+ /* nothing to append anymore */ -+ goto out; -+ append_fake: -+ INODE_SET_SIZE(inode, new_size); -+ out: -+ done_lh(lh); -+ kfree(hint); -+ put_cluster_handle(&clust); -+ return result; -+} -+ -+static int update_cryptcompress_size(struct inode *inode, loff_t new_size, -+ int update_sd) -+{ -+ return (new_size & ((loff_t) (inode_cluster_size(inode)) - 1) -+ ? 0 : reiser4_update_file_size(inode, new_size, update_sd)); -+} -+ -+/* Prune cryptcompress file in two steps: -+ * 1) cut all nominated logical clusters except the leftmost one which -+ * is to be partially truncated. Note, that there can be "holes" -+ * represented by fake logical clusters. -+ * 2) set zeroes and capture leftmost partially truncated logical -+ * cluster, if it is not fake; otherwise prune fake logical cluster -+ * (just decrease i_size). -+ */ -+static int prune_cryptcompress(struct inode *inode, loff_t new_size, -+ int update_sd, cloff_t aidx) -+{ -+ int result = 0; -+ unsigned nr_zeroes; -+ loff_t to_prune; -+ loff_t old_size; -+ cloff_t ridx; -+ -+ hint_t *hint; -+ lock_handle *lh; -+ struct reiser4_slide win; -+ struct cluster_handle clust; -+ -+ assert("edward-1140", inode->i_size >= new_size); -+ assert("edward-1141", reiser4_schedulable()); -+ assert("edward-1142", cryptcompress_inode_ok(inode)); -+ assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE); -+ -+ old_size = inode->i_size; -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ reiser4_slide_init(&win); -+ cluster_init_read(&clust, &win); -+ clust.hint = hint; -+ -+ /* calculate index of the rightmost logical cluster -+ that will be completely truncated */ -+ ridx = size_in_lc(new_size, inode); -+ -+ /* truncate all disk clusters starting from @ridx */ -+ assert("edward-1174", ridx <= aidx); -+ old_size = inode->i_size; -+ if (ridx 
!= aidx) { -+ struct cryptcompress_info * info; -+ info = cryptcompress_inode_data(inode); -+ result = cut_file_items(inode, -+ clust_to_off(ridx, inode), -+ update_sd, -+ clust_to_off(aidx, inode), -+ update_cryptcompress_size); -+ info->trunc_index = ULONG_MAX; -+ if (result) -+ goto out; -+ } -+ /* -+ * there can be pages of fake logical clusters, truncate them -+ */ -+ truncate_inode_pages(inode->i_mapping, clust_to_off(ridx, inode)); -+ assert("edward-1524", -+ pages_truncate_ok(inode, clust_to_pg(ridx, inode))); -+ /* -+ * now perform partial truncate of last logical cluster -+ */ -+ if (!off_to_cloff(new_size, inode)) { -+ /* no partial truncate is needed */ -+ assert("edward-1145", inode->i_size == new_size); -+ goto truncate_fake; -+ } -+ assert("edward-1146", new_size < inode->i_size); -+ -+ to_prune = inode->i_size - new_size; -+ -+ /* check if the last logical cluster is fake */ -+ result = lookup_disk_cluster(inode, &aidx, ridx); -+ if (result) -+ goto out; -+ if (!aidx) -+ /* yup, this is fake one */ -+ goto truncate_fake; -+ -+ assert("edward-1148", aidx == ridx); -+ -+ /* do partial truncate of the last page cluster, -+ and try to capture this one */ -+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); -+ if (result) -+ goto out; -+ nr_zeroes = (off_to_pgoff(new_size) ? -+ PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0); -+ set_window(&clust, &win, inode, new_size, new_size + nr_zeroes); -+ win.stat = HOLE_WINDOW; -+ -+ assert("edward-1149", clust.index == ridx - 1); -+ -+ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_TRUNC); -+ if (result) -+ goto out; -+ assert("edward-1151", -+ clust.dstat == PREP_DISK_CLUSTER || -+ clust.dstat == UNPR_DISK_CLUSTER); -+ -+ assert("edward-1191", inode->i_size == new_size); -+ assert("edward-1206", body_truncate_ok(inode, ridx)); -+ truncate_fake: -+ /* drop all the pages that don't have jnodes (i.e. 
pages -+ which can not be truncated by cut_file_items() because -+ of holes represented by fake disk clusters) including -+ the pages of partially truncated cluster which was -+ released by prepare_logical_cluster() */ -+ INODE_SET_SIZE(inode, new_size); -+ truncate_inode_pages(inode->i_mapping, new_size); -+ out: -+ assert("edward-1334", !result || result == -ENOSPC); -+ assert("edward-1497", -+ pages_truncate_ok(inode, size_in_pages(new_size))); -+ -+ done_lh(lh); -+ kfree(hint); -+ put_cluster_handle(&clust); -+ return result; -+} -+ -+/* Prepare cryptcompress file for truncate: -+ * prune or append rightmost fake logical clusters (if any) -+ */ -+static int start_truncate_fake(struct inode *inode, cloff_t aidx, -+ loff_t new_size, int update_sd) -+{ -+ int result = 0; -+ int bytes; -+ -+ if (new_size > inode->i_size) { -+ /* append */ -+ if (inode->i_size < clust_to_off(aidx, inode)) -+ /* no fake bytes */ -+ return 0; -+ bytes = new_size - inode->i_size; -+ INODE_SET_SIZE(inode, inode->i_size + bytes); -+ } else { -+ /* prune */ -+ if (inode->i_size <= clust_to_off(aidx, inode)) -+ /* no fake bytes */ -+ return 0; -+ bytes = inode->i_size - -+ max(new_size, clust_to_off(aidx, inode)); -+ if (!bytes) -+ return 0; -+ INODE_SET_SIZE(inode, inode->i_size - bytes); -+ /* In the case of fake prune we need to drop page cluster. -+ There are only 2 cases for partially truncated page: -+ 1. If is is dirty, therefore it is anonymous -+ (was dirtied via mmap), and will be captured -+ later via ->capture(). -+ 2. If is clean, therefore it is filled by zeroes. -+ In both cases we don't need to make it dirty and -+ capture here. 
-+ */ -+ truncate_inode_pages(inode->i_mapping, inode->i_size); -+ } -+ if (update_sd) -+ result = update_sd_cryptcompress(inode); -+ return result; -+} -+ -+/** -+ * This is called in setattr_cryptcompress when it is used to truncate, -+ * and in delete_object_cryptcompress -+ */ -+static int cryptcompress_truncate(struct inode *inode, /* old size */ -+ loff_t new_size, /* new size */ -+ int update_sd) -+{ -+ int result; -+ cloff_t aidx; -+ -+ result = find_fake_appended(inode, &aidx); -+ if (result) -+ return result; -+ assert("edward-1208", -+ ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode))); -+ -+ result = start_truncate_fake(inode, aidx, new_size, update_sd); -+ if (result) -+ return result; -+ if (inode->i_size == new_size) -+ /* nothing to truncate anymore */ -+ return 0; -+ result = (inode->i_size < new_size ? -+ cryptcompress_append_hole(inode, new_size) : -+ prune_cryptcompress(inode, new_size, update_sd, aidx)); -+ if (!result && update_sd) -+ result = update_sd_cryptcompress(inode); -+ return result; -+} -+ -+/* Capture an anonymous pager cluster. (Page cluser is -+ * anonymous if it contains at least one anonymous page -+ */ -+static int capture_anon_page_cluster(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ int result; -+ -+ assert("edward-1073", clust != NULL); -+ assert("edward-1074", inode != NULL); -+ assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER); -+ -+ result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV); -+ if (result) -+ return result; -+ set_cluster_pages_dirty(clust, inode); -+ result = checkin_logical_cluster(clust, inode); -+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK); -+ if (unlikely(result)) -+ put_page_cluster(clust, inode, WRITE_OP); -+ return result; -+} -+ -+/* Starting from @index find tagged pages of the same page cluster. -+ * Clear the tag for each of them. Return number of found pages. 
-+ */ -+static int find_anon_page_cluster(struct address_space * mapping, -+ pgoff_t * index, struct page ** pages) -+{ -+ int i = 0; -+ int found; -+ write_lock_irq(&mapping->tree_lock); -+ do { -+ /* looking for one page */ -+ found = radix_tree_gang_lookup_tag(&mapping->page_tree, -+ (void **)&pages[i], -+ *index, 1, -+ PAGECACHE_TAG_REISER4_MOVED); -+ if (!found) -+ break; -+ if (!same_page_cluster(pages[0], pages[i])) -+ break; -+ -+ /* found */ -+ page_cache_get(pages[i]); -+ *index = pages[i]->index + 1; -+ -+ radix_tree_tag_clear(&mapping->page_tree, -+ pages[i]->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ if (last_page_in_cluster(pages[i++])) -+ break; -+ } while (1); -+ write_unlock_irq(&mapping->tree_lock); -+ return i; -+} -+ -+#define MAX_PAGES_TO_CAPTURE (1024) -+ -+/* Capture anonymous page clusters */ -+static int capture_anon_pages(struct address_space * mapping, pgoff_t * index, -+ int to_capture) -+{ -+ int count = 0; -+ int found = 0; -+ int result = 0; -+ hint_t *hint; -+ lock_handle *lh; -+ struct inode * inode; -+ struct cluster_handle clust; -+ struct page * pages[MAX_CLUSTER_NRPAGES]; -+ -+ assert("edward-1127", mapping != NULL); -+ assert("edward-1128", mapping->host != NULL); -+ assert("edward-1440", mapping->host->i_mapping == mapping); -+ -+ inode = mapping->host; -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ hint_init_zero(hint); -+ lh = &hint->lh; -+ -+ cluster_init_read(&clust, NULL); -+ clust.hint = hint; -+ -+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); -+ if (result) -+ goto out; -+ -+ while (to_capture > 0) { -+ found = find_anon_page_cluster(mapping, index, pages); -+ if (!found) { -+ *index = (pgoff_t) - 1; -+ break; -+ } -+ move_cluster_forward(&clust, inode, pages[0]->index); -+ result = capture_anon_page_cluster(&clust, inode); -+ -+ put_found_pages(pages, found); /* find_anon_page_cluster */ -+ if (result) -+ break; -+ to_capture -= 
clust.nr_pages; -+ count += clust.nr_pages; -+ } -+ if (result) { -+ warning("edward-1077", -+ "Capture failed (inode %llu, result=%i, captured=%d)\n", -+ (unsigned long long)get_inode_oid(inode), result, count); -+ } else { -+ assert("edward-1078", ergo(found > 0, count > 0)); -+ if (to_capture <= 0) -+ /* there may be left more pages */ -+ __mark_inode_dirty(inode, I_DIRTY_PAGES); -+ result = count; -+ } -+ out: -+ done_lh(lh); -+ kfree(hint); -+ put_cluster_handle(&clust); -+ return result; -+} -+ -+/* Returns true if inode's mapping has dirty pages -+ which do not belong to any atom */ -+static int cryptcompress_inode_has_anon_pages(struct inode *inode) -+{ -+ int result; -+ read_lock_irq(&inode->i_mapping->tree_lock); -+ result = radix_tree_tagged(&inode->i_mapping->page_tree, -+ PAGECACHE_TAG_REISER4_MOVED); -+ read_unlock_irq(&inode->i_mapping->tree_lock); -+ return result; -+} -+ -+/* plugin->writepages */ -+int writepages_cryptcompress(struct address_space *mapping, -+ struct writeback_control *wbc) -+{ -+ int result = 0; -+ long to_capture; -+ pgoff_t nrpages; -+ pgoff_t index = 0; -+ struct inode *inode; -+ struct cryptcompress_info *info; -+ -+ inode = mapping->host; -+ if (!cryptcompress_inode_has_anon_pages(inode)) -+ goto end; -+ info = cryptcompress_inode_data(inode); -+ nrpages = size_in_pages(i_size_read(inode)); -+ -+ if (wbc->sync_mode != WB_SYNC_ALL) -+ to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE); -+ else -+ to_capture = MAX_PAGES_TO_CAPTURE; -+ do { -+ reiser4_context *ctx; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ result = PTR_ERR(ctx); -+ break; -+ } -+ /* avoid recursive calls to ->sync_inodes */ -+ ctx->nobalance = 1; -+ -+ assert("edward-1079", -+ lock_stack_isclean(get_current_lock_stack())); -+ -+ reiser4_txn_restart_current(); -+ -+ if (get_current_context()->entd) { -+ if (mutex_trylock(&info->checkin_mutex) == 0) { -+ /* the mutex might be occupied by -+ entd caller */ -+ result = 
RETERR(-EBUSY); -+ reiser4_exit_context(ctx); -+ break; -+ } -+ } else -+ mutex_lock(&info->checkin_mutex); -+ -+ result = capture_anon_pages(inode->i_mapping, &index, -+ to_capture); -+ mutex_unlock(&info->checkin_mutex); -+ -+ if (result < 0) { -+ reiser4_exit_context(ctx); -+ break; -+ } -+ wbc->nr_to_write -= result; -+ if (wbc->sync_mode != WB_SYNC_ALL) { -+ reiser4_exit_context(ctx); -+ break; -+ } -+ result = txnmgr_force_commit_all(inode->i_sb, 0); -+ reiser4_exit_context(ctx); -+ } while (result >= 0 && index < nrpages); -+ -+ end: -+ if (is_in_reiser4_context()) { -+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) { -+ /* there are already pages to flush, flush them out, -+ do not delay until end of reiser4_sync_inodes */ -+ reiser4_writeout(inode->i_sb, wbc); -+ get_current_context()->nr_captured = 0; -+ } -+ } -+ return result; -+} -+ -+/* plugin->ioctl */ -+int ioctl_cryptcompress(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg) -+{ -+ return RETERR(-ENOSYS); -+} -+ -+/* plugin->mmap */ -+int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma) -+{ -+ int result; -+ struct inode *inode; -+ reiser4_context *ctx; -+ -+ inode = file->f_dentry->d_inode; -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ /* -+ * generic_file_mmap will do update_atime. Grab space for stat data -+ * update. 
-+ */ -+ result = reiser4_grab_space_force -+ (inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+ if (result) { -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ result = generic_file_mmap(file, vma); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* plugin->delete_object */ -+int delete_object_cryptcompress(struct inode *inode) -+{ -+ int result; -+ struct cryptcompress_info * info; -+ -+ assert("edward-429", inode->i_nlink == 0); -+ -+ reiser4_txn_restart_current(); -+ info = cryptcompress_inode_data(inode); -+ -+ mutex_lock(&info->checkin_mutex); -+ result = cryptcompress_truncate(inode, 0, 0); -+ mutex_unlock(&info->checkin_mutex); -+ -+ if (result) { -+ warning("edward-430", -+ "cannot truncate cryptcompress file %lli: %i", -+ (unsigned long long)get_inode_oid(inode), -+ result); -+ } -+ truncate_inode_pages(inode->i_mapping, 0); -+ assert("edward-1487", pages_truncate_ok(inode, 0)); -+ /* and remove stat data */ -+ return reiser4_delete_object_common(inode); -+} -+ -+/* -+ * plugin->setattr -+ * This implements actual truncate (see comments in reiser4/page_cache.c) -+ */ -+int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr) -+{ -+ int result; -+ struct inode *inode; -+ struct cryptcompress_info * info; -+ -+ inode = dentry->d_inode; -+ info = cryptcompress_inode_data(inode); -+ -+ if (attr->ia_valid & ATTR_SIZE) { -+ if (i_size_read(inode) != attr->ia_size) { -+ reiser4_context *ctx; -+ loff_t old_size; -+ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ result = setattr_pschedule_hook(inode); -+ if (result) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ old_size = i_size_read(inode); -+ inode_check_scale(inode, old_size, attr->ia_size); -+ -+ mutex_lock(&info->checkin_mutex); -+ result = cryptcompress_truncate(inode, -+ attr->ia_size, -+ 1/* update sd */); -+ mutex_unlock(&info->checkin_mutex); -+ if (result) 
{ -+ warning("edward-1192", -+ "truncate_cryptcompress failed: oid %lli, " -+ "old size %lld, new size %lld, retval %d", -+ (unsigned long long) -+ get_inode_oid(inode), old_size, -+ attr->ia_size, result); -+ } -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ } else -+ result = 0; -+ } else -+ result = reiser4_setattr_common(dentry, attr); -+ return result; -+} -+ -+/* plugin->release */ -+int release_cryptcompress(struct inode *inode, struct file *file) -+{ -+ reiser4_context *ctx = reiser4_init_context(inode->i_sb); -+ -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ reiser4_free_file_fsdata(file); -+ reiser4_exit_context(ctx); -+ return 0; -+} -+ -+/* plugin->prepare_write */ -+int prepare_write_cryptcompress(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ return -EINVAL; -+} -+ -+/* plugin->commit_write */ -+int commit_write_cryptcompress(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ BUG(); -+ return 0; -+} -+ -+/* plugin->bmap */ -+sector_t bmap_cryptcompress(struct address_space *mapping, sector_t lblock) -+{ -+ return -EINVAL; -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.24/fs/reiser4/plugin/file/cryptcompress.h ---- linux-2.6.24.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/file/cryptcompress.h 2008-01-25 11:39:06.980219023 +0300 -@@ -0,0 +1,616 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* See http://www.namesys.com/cryptcompress_design.html */ -+ -+#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ ) -+#define __FS_REISER4_CRYPTCOMPRESS_H__ -+ -+#include "../../page_cache.h" -+#include "../compress/compress.h" -+#include "../crypto/cipher.h" -+ -+#include -+ -+#define 
MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT -+#define MAX_CLUSTER_SHIFT 16 -+#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT) -+#define DC_CHECKSUM_SIZE 4 -+ -+#define MIN_LATTICE_FACTOR 1 -+#define MAX_LATTICE_FACTOR 32 -+ -+/* this mask contains all non-standard plugins that might -+ be present in reiser4-specific part of inode managed by -+ cryptcompress file plugin */ -+#define cryptcompress_mask \ -+ ((1 << PSET_FILE) | \ -+ (1 << PSET_CLUSTER) | \ -+ (1 << PSET_CIPHER) | \ -+ (1 << PSET_DIGEST) | \ -+ (1 << PSET_COMPRESSION) | \ -+ (1 << PSET_COMPRESSION_MODE)) -+ -+#if REISER4_DEBUG -+static inline int cluster_shift_ok(int shift) -+{ -+ return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT); -+} -+#endif -+ -+#if REISER4_DEBUG -+#define INODE_PGCOUNT(inode) \ -+({ \ -+ assert("edward-1530", inode_file_plugin(inode) == \ -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \ -+ atomic_read(&cryptcompress_inode_data(inode)->pgcount); \ -+ }) -+#define INODE_PGCOUNT_INC(inode) \ -+do { \ -+ assert("edward-1531", inode_file_plugin(inode) == \ -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \ -+ atomic_inc(&cryptcompress_inode_data(inode)->pgcount); \ -+} while (0) -+#define INODE_PGCOUNT_DEC(inode) \ -+do { \ -+ if (inode_file_plugin(inode) == \ -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) \ -+ atomic_dec(&cryptcompress_inode_data(inode)->pgcount); \ -+} while (0) -+#else -+#define INODE_PGCOUNT(inode) (0) -+#define INODE_PGCOUNT_INC(inode) -+#define INODE_PGCOUNT_DEC(inode) -+#endif /* REISER4_DEBUG */ -+ -+struct tfm_stream { -+ __u8 *data; -+ size_t size; -+}; -+ -+typedef enum { -+ INPUT_STREAM, -+ OUTPUT_STREAM, -+ LAST_STREAM -+} tfm_stream_id; -+ -+typedef struct tfm_stream * tfm_unit[LAST_STREAM]; -+ -+static inline __u8 *ts_data(struct tfm_stream * stm) -+{ -+ assert("edward-928", stm != NULL); -+ return stm->data; -+} -+ -+static inline size_t ts_size(struct tfm_stream * stm) -+{ -+ assert("edward-929", stm != 
NULL); -+ return stm->size; -+} -+ -+static inline void set_ts_size(struct tfm_stream * stm, size_t size) -+{ -+ assert("edward-930", stm != NULL); -+ -+ stm->size = size; -+} -+ -+static inline int alloc_ts(struct tfm_stream ** stm) -+{ -+ assert("edward-931", stm); -+ assert("edward-932", *stm == NULL); -+ -+ *stm = kzalloc(sizeof(**stm), reiser4_ctx_gfp_mask_get()); -+ if (!*stm) -+ return -ENOMEM; -+ return 0; -+} -+ -+static inline void free_ts(struct tfm_stream * stm) -+{ -+ assert("edward-933", !ts_data(stm)); -+ assert("edward-934", !ts_size(stm)); -+ -+ kfree(stm); -+} -+ -+static inline int alloc_ts_data(struct tfm_stream * stm, size_t size) -+{ -+ assert("edward-935", !ts_data(stm)); -+ assert("edward-936", !ts_size(stm)); -+ assert("edward-937", size != 0); -+ -+ stm->data = reiser4_vmalloc(size); -+ if (!stm->data) -+ return -ENOMEM; -+ set_ts_size(stm, size); -+ return 0; -+} -+ -+static inline void free_ts_data(struct tfm_stream * stm) -+{ -+ assert("edward-938", equi(ts_data(stm), ts_size(stm))); -+ -+ if (ts_data(stm)) -+ vfree(ts_data(stm)); -+ memset(stm, 0, sizeof *stm); -+} -+ -+/* Write modes for item conversion in flush convert phase */ -+typedef enum { -+ CRC_APPEND_ITEM = 1, -+ CRC_OVERWRITE_ITEM = 2, -+ CRC_CUT_ITEM = 3 -+} cryptcompress_write_mode_t; -+ -+typedef enum { -+ LC_INVAL = 0, /* invalid value */ -+ LC_APPOV = 1, /* append and/or overwrite */ -+ LC_TRUNC = 2 /* truncate */ -+} logical_cluster_op; -+ -+/* Transform cluster. 
-+ * Intermediate state between page cluster and disk cluster -+ * Is used for data transform (compression/encryption) -+ */ -+struct tfm_cluster { -+ coa_set coa; /* compression algorithms info */ -+ tfm_unit tun; /* plain and transformed streams */ -+ tfm_action act; -+ int uptodate; -+ int lsize; /* number of bytes in logical cluster */ -+ int len; /* length of the transform stream */ -+}; -+ -+static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id, -+ tfm_action act) -+{ -+ return tc->coa[id][act]; -+} -+ -+static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id, -+ tfm_action act, coa_t coa) -+{ -+ tc->coa[id][act] = coa; -+} -+ -+static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug) -+{ -+ coa_t coa; -+ -+ coa = cplug->alloc(tc->act); -+ if (IS_ERR(coa)) -+ return PTR_ERR(coa); -+ set_coa(tc, cplug->h.id, tc->act, coa); -+ return 0; -+} -+ -+static inline int -+grab_coa(struct tfm_cluster * tc, compression_plugin * cplug) -+{ -+ return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ? 
-+ alloc_coa(tc, cplug) : 0); -+} -+ -+static inline void free_coa_set(struct tfm_cluster * tc) -+{ -+ tfm_action j; -+ reiser4_compression_id i; -+ compression_plugin *cplug; -+ -+ assert("edward-810", tc != NULL); -+ -+ for (j = 0; j < TFMA_LAST; j++) -+ for (i = 0; i < LAST_COMPRESSION_ID; i++) { -+ if (!get_coa(tc, i, j)) -+ continue; -+ cplug = compression_plugin_by_id(i); -+ assert("edward-812", cplug->free != NULL); -+ cplug->free(get_coa(tc, i, j), j); -+ set_coa(tc, i, j, 0); -+ } -+ return; -+} -+ -+static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc, -+ tfm_stream_id id) -+{ -+ return tc->tun[id]; -+} -+ -+static inline void set_tfm_stream(struct tfm_cluster * tc, -+ tfm_stream_id id, struct tfm_stream * ts) -+{ -+ tc->tun[id] = ts; -+} -+ -+static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id) -+{ -+ return ts_data(get_tfm_stream(tc, id)); -+} -+ -+static inline void set_tfm_stream_data(struct tfm_cluster * tc, -+ tfm_stream_id id, __u8 * data) -+{ -+ get_tfm_stream(tc, id)->data = data; -+} -+ -+static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id) -+{ -+ return ts_size(get_tfm_stream(tc, id)); -+} -+ -+static inline void -+set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size) -+{ -+ get_tfm_stream(tc, id)->size = size; -+} -+ -+static inline int -+alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id) -+{ -+ assert("edward-939", tc != NULL); -+ assert("edward-940", !get_tfm_stream(tc, id)); -+ -+ tc->tun[id] = kzalloc(sizeof(struct tfm_stream), -+ reiser4_ctx_gfp_mask_get()); -+ if (!tc->tun[id]) -+ return -ENOMEM; -+ return alloc_ts_data(get_tfm_stream(tc, id), size); -+} -+ -+static inline int -+realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id) -+{ -+ assert("edward-941", tfm_stream_size(tc, id) < size); -+ free_ts_data(get_tfm_stream(tc, id)); -+ return alloc_ts_data(get_tfm_stream(tc, id), size); -+} -+ -+static 
inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id) -+{ -+ free_ts_data(get_tfm_stream(tc, id)); -+ free_ts(get_tfm_stream(tc, id)); -+ set_tfm_stream(tc, id, 0); -+} -+ -+static inline unsigned coa_overrun(compression_plugin * cplug, int ilen) -+{ -+ return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0); -+} -+ -+static inline void free_tfm_unit(struct tfm_cluster * tc) -+{ -+ tfm_stream_id id; -+ for (id = 0; id < LAST_STREAM; id++) { -+ if (!get_tfm_stream(tc, id)) -+ continue; -+ free_tfm_stream(tc, id); -+ } -+} -+ -+static inline void put_tfm_cluster(struct tfm_cluster * tc) -+{ -+ assert("edward-942", tc != NULL); -+ free_coa_set(tc); -+ free_tfm_unit(tc); -+} -+ -+static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc) -+{ -+ assert("edward-943", tc != NULL); -+ assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1); -+ return (tc->uptodate == 1); -+} -+ -+static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc) -+{ -+ assert("edward-945", tc != NULL); -+ assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1); -+ tc->uptodate = 1; -+ return; -+} -+ -+static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc) -+{ -+ assert("edward-947", tc != NULL); -+ assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1); -+ tc->uptodate = 0; -+ return; -+} -+ -+static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id) -+{ -+ return (get_tfm_stream(tc, id) && -+ tfm_stream_data(tc, id) && tfm_stream_size(tc, id)); -+} -+ -+static inline int tfm_cluster_is_set(struct tfm_cluster * tc) -+{ -+ int i; -+ for (i = 0; i < LAST_STREAM; i++) -+ if (!tfm_stream_is_set(tc, i)) -+ return 0; -+ return 1; -+} -+ -+static inline void alternate_streams(struct tfm_cluster * tc) -+{ -+ struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM); -+ -+ set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM)); -+ set_tfm_stream(tc, OUTPUT_STREAM, tmp); -+} -+ -+/* Set of states to 
indicate a kind of data -+ * that will be written to the window */ -+typedef enum { -+ DATA_WINDOW, /* user's data */ -+ HOLE_WINDOW /* zeroes (such kind of data can be written -+ * if we start to write from offset > i_size) */ -+} window_stat; -+ -+/* Window (of logical cluster size) discretely sliding along a file. -+ * Is used to locate hole region in a logical cluster to be properly -+ * represented on disk. -+ * We split a write to cryptcompress file into writes to its logical -+ * clusters. Before writing to a logical cluster we set a window, i.e. -+ * calculate values of the following fields: -+ */ -+struct reiser4_slide { -+ unsigned off; /* offset to write from */ -+ unsigned count; /* number of bytes to write */ -+ unsigned delta; /* number of bytes to append to the hole */ -+ window_stat stat; /* what kind of data will be written starting -+ from @off */ -+}; -+ -+/* Possible states of a disk cluster */ -+typedef enum { -+ INVAL_DISK_CLUSTER, /* unknown state */ -+ PREP_DISK_CLUSTER, /* disk cluster got converted by flush -+ * at least 1 time */ -+ UNPR_DISK_CLUSTER, /* disk cluster just created and should be -+ * converted by flush */ -+ FAKE_DISK_CLUSTER, /* disk cluster doesn't exist neither in memory -+ * nor on disk */ -+ TRNC_DISK_CLUSTER /* disk cluster is partially truncated */ -+} disk_cluster_stat; -+ -+/* The following structure represents various stages of the same logical -+ * cluster of index @index: -+ * . fixed slide -+ * . page cluster (stage in primary cache) -+ * . transform cluster (transition stage) -+ * . disk cluster (stage in secondary cache) -+ * This structure is used in transition and synchronizing operations, e.g. -+ * transform cluster is a transition state when synchronizing page cluster -+ * and disk cluster. -+ * FIXME: Encapsulate page cluster, disk cluster. 
-+ */ -+struct cluster_handle { -+ cloff_t index; /* offset in a file (unit is a cluster size) */ -+ int index_valid; /* for validating the index above, if needed */ -+ struct file *file; /* host file */ -+ -+ /* logical cluster */ -+ struct reiser4_slide *win; /* sliding window to locate holes */ -+ logical_cluster_op op; /* logical cluster operation (truncate or -+ append/overwrite) */ -+ /* transform cluster */ -+ struct tfm_cluster tc; /* contains all needed info to synchronize -+ page cluster and disk cluster) */ -+ /* page cluster */ -+ int nr_pages; /* number of pages of current checkin action */ -+ int old_nrpages; /* number of pages of last checkin action */ -+ struct page **pages; /* attached pages */ -+ jnode * node; /* jnode for capture */ -+ -+ /* disk cluster */ -+ hint_t *hint; /* current position in the tree */ -+ disk_cluster_stat dstat; /* state of the current disk cluster */ -+ int reserved; /* is space for disk cluster reserved */ -+#if REISER4_DEBUG -+ reiser4_context *ctx; -+ int reserved_prepped; -+ int reserved_unprepped; -+#endif -+ -+}; -+ -+static inline __u8 * tfm_input_data (struct cluster_handle * clust) -+{ -+ return tfm_stream_data(&clust->tc, INPUT_STREAM); -+} -+ -+static inline __u8 * tfm_output_data (struct cluster_handle * clust) -+{ -+ return tfm_stream_data(&clust->tc, OUTPUT_STREAM); -+} -+ -+static inline int reset_cluster_pgset(struct cluster_handle * clust, -+ int nrpages) -+{ -+ assert("edward-1057", clust->pages != NULL); -+ memset(clust->pages, 0, sizeof(*clust->pages) * nrpages); -+ return 0; -+} -+ -+static inline int alloc_cluster_pgset(struct cluster_handle * clust, -+ int nrpages) -+{ -+ assert("edward-949", clust != NULL); -+ assert("edward-1362", clust->pages == NULL); -+ assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES); -+ -+ clust->pages = kzalloc(sizeof(*clust->pages) * nrpages, -+ reiser4_ctx_gfp_mask_get()); -+ if (!clust->pages) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+static 
inline void move_cluster_pgset(struct cluster_handle *clust, -+ struct page ***pages, int * nr_pages) -+{ -+ assert("edward-1545", clust != NULL && clust->pages != NULL); -+ assert("edward-1546", pages != NULL && *pages == NULL); -+ *pages = clust->pages; -+ *nr_pages = clust->nr_pages; -+ clust->pages = NULL; -+} -+ -+static inline void free_cluster_pgset(struct cluster_handle * clust) -+{ -+ assert("edward-951", clust->pages != NULL); -+ kfree(clust->pages); -+ clust->pages = NULL; -+} -+ -+static inline void put_cluster_handle(struct cluster_handle * clust) -+{ -+ assert("edward-435", clust != NULL); -+ -+ put_tfm_cluster(&clust->tc); -+ if (clust->pages) -+ free_cluster_pgset(clust); -+ memset(clust, 0, sizeof *clust); -+} -+ -+static inline void inc_keyload_count(struct reiser4_crypto_info * data) -+{ -+ assert("edward-1410", data != NULL); -+ data->keyload_count++; -+} -+ -+static inline void dec_keyload_count(struct reiser4_crypto_info * data) -+{ -+ assert("edward-1411", data != NULL); -+ assert("edward-1412", data->keyload_count > 0); -+ data->keyload_count--; -+} -+ -+static inline int capture_cluster_jnode(jnode * node) -+{ -+ return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+} -+ -+/* cryptcompress specific part of reiser4_inode */ -+struct cryptcompress_info { -+ struct mutex checkin_mutex; /* This is to serialize -+ * checkin_logical_cluster operations */ -+ cloff_t trunc_index; /* Index of the leftmost truncated disk -+ * cluster (to resolve races with read) */ -+ struct reiser4_crypto_info *crypt; -+ /* -+ * the following 2 fields are controlled by compression mode plugin -+ */ -+ int compress_toggle; /* Current status of compressibility */ -+ int lattice_factor; /* Factor of dynamic lattice. 
FIXME: Have -+ * a compression_toggle to keep the factor -+ */ -+#if REISER4_DEBUG -+ atomic_t pgcount; /* number of grabbed pages */ -+#endif -+}; -+ -+static inline void set_compression_toggle (struct cryptcompress_info * info, int val) -+{ -+ info->compress_toggle = val; -+} -+ -+static inline int get_compression_toggle (struct cryptcompress_info * info) -+{ -+ return info->compress_toggle; -+} -+ -+static inline int compression_is_on(struct cryptcompress_info * info) -+{ -+ return get_compression_toggle(info) == 1; -+} -+ -+static inline void turn_on_compression(struct cryptcompress_info * info) -+{ -+ set_compression_toggle(info, 1); -+} -+ -+static inline void turn_off_compression(struct cryptcompress_info * info) -+{ -+ set_compression_toggle(info, 0); -+} -+ -+static inline void set_lattice_factor(struct cryptcompress_info * info, int val) -+{ -+ info->lattice_factor = val; -+} -+ -+static inline int get_lattice_factor(struct cryptcompress_info * info) -+{ -+ return info->lattice_factor; -+} -+ -+struct cryptcompress_info *cryptcompress_inode_data(const struct inode *); -+int equal_to_rdk(znode *, const reiser4_key *); -+int goto_right_neighbor(coord_t *, lock_handle *); -+int cryptcompress_inode_ok(struct inode *inode); -+int coord_is_unprepped_ctail(const coord_t * coord); -+extern int do_readpage_ctail(struct inode *, struct cluster_handle *, -+ struct page * page, znode_lock_mode mode); -+extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust, -+ struct inode * inode); -+extern int readpages_cryptcompress(struct file*, struct address_space*, -+ struct list_head*, unsigned); -+int bind_cryptcompress(struct inode *child, struct inode *parent); -+void destroy_inode_cryptcompress(struct inode * inode); -+int grab_page_cluster(struct inode *inode, struct cluster_handle * clust, -+ rw_op rw); -+int write_pschedule_hook(struct file *file, struct inode * inode, -+ loff_t pos, struct cluster_handle * clust, -+ struct psched_context * cont); 
-+int setattr_pschedule_hook(struct inode * inode); -+struct reiser4_crypto_info * inode_crypto_info(struct inode * inode); -+void inherit_crypto_info_common(struct inode * parent, struct inode * object, -+ int (*can_inherit)(struct inode * child, -+ struct inode * parent)); -+void reiser4_attach_crypto_info(struct inode * inode, -+ struct reiser4_crypto_info * info); -+void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new); -+struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode); -+ -+static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info) -+{ -+ return info->cipher; -+} -+ -+static inline void info_set_cipher(struct reiser4_crypto_info * info, -+ struct crypto_blkcipher * tfm) -+{ -+ info->cipher = tfm; -+} -+ -+static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info) -+{ -+ return info->digest; -+} -+ -+static inline void info_set_digest(struct reiser4_crypto_info * info, -+ struct crypto_hash * tfm) -+{ -+ info->digest = tfm; -+} -+ -+static inline void put_cluster_page(struct page * page) -+{ -+ page_cache_release(page); -+} -+ -+#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/file.c linux-2.6.24/fs/reiser4/plugin/file/file.c ---- linux-2.6.24.orig/fs/reiser4/plugin/file/file.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/file/file.c 2008-01-25 11:40:16.694168755 +0300 -@@ -0,0 +1,2724 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ * this file contains implementations of inode/file/address_space/file plugin -+ * operations specific for "unix file plugin" (plugin id is -+ * UNIX_FILE_PLUGIN_ID). 
"Unix file" is either built of tail items only -+ * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have -+ * no items but stat data) -+ */ -+ -+#include "../../inode.h" -+#include "../../super.h" -+#include "../../tree_walk.h" -+#include "../../carry.h" -+#include "../../page_cache.h" -+#include "../../ioctl.h" -+#include "../object.h" -+#include "../cluster.h" -+#include "../../safe_link.h" -+ -+#include -+#include -+#include -+ -+ -+static int unpack(struct file *file, struct inode *inode, int forever); -+static void drop_access(struct unix_file_info *); -+static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key, -+ znode_lock_mode lock_mode); -+ -+/* Get exclusive access and make sure that file is not partially -+ * converted (It may happen that another process is doing tail -+ * conversion. If so, wait until it completes) -+ */ -+static inline void get_exclusive_access_careful(struct unix_file_info * uf_info, -+ struct inode *inode) -+{ -+ do { -+ get_exclusive_access(uf_info); -+ if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)) -+ break; -+ drop_exclusive_access(uf_info); -+ schedule(); -+ } while (1); -+} -+ -+/* get unix file plugin specific portion of inode */ -+struct unix_file_info *unix_file_inode_data(const struct inode *inode) -+{ -+ return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info; -+} -+ -+/** -+ * equal_to_rdk - compare key and znode's right delimiting key -+ * @node: node whose right delimiting key to compare with @key -+ * @key: key to compare with @node's right delimiting key -+ * -+ * Returns true if @key is equal to right delimiting key of @node. 
-+ */ -+int equal_to_rdk(znode *node, const reiser4_key *key) -+{ -+ int result; -+ -+ read_lock_dk(znode_get_tree(node)); -+ result = keyeq(key, znode_get_rd_key(node)); -+ read_unlock_dk(znode_get_tree(node)); -+ return result; -+} -+ -+#if REISER4_DEBUG -+ -+/** -+ * equal_to_ldk - compare key and znode's left delimiting key -+ * @node: node whose left delimiting key to compare with @key -+ * @key: key to compare with @node's left delimiting key -+ * -+ * Returns true if @key is equal to left delimiting key of @node. -+ */ -+int equal_to_ldk(znode *node, const reiser4_key *key) -+{ -+ int result; -+ -+ read_lock_dk(znode_get_tree(node)); -+ result = keyeq(key, znode_get_ld_key(node)); -+ read_unlock_dk(znode_get_tree(node)); -+ return result; -+} -+ -+/** -+ * check_coord - check whether coord corresponds to key -+ * @coord: coord to check -+ * @key: key @coord has to correspond to -+ * -+ * Returns true if @coord is set as if it was set as result of lookup with @key -+ * in coord->node. -+ */ -+static int check_coord(const coord_t *coord, const reiser4_key *key) -+{ -+ coord_t twin; -+ -+ node_plugin_by_node(coord->node)->lookup(coord->node, key, -+ FIND_MAX_NOT_MORE_THAN, &twin); -+ return coords_equal(coord, &twin); -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/** -+ * init_uf_coord - initialize extended coord -+ * @uf_coord: -+ * @lh: -+ * -+ * -+ */ -+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh) -+{ -+ coord_init_zero(&uf_coord->coord); -+ coord_clear_iplug(&uf_coord->coord); -+ uf_coord->lh = lh; -+ init_lh(lh); -+ memset(&uf_coord->extension, 0, sizeof(uf_coord->extension)); -+ uf_coord->valid = 0; -+} -+ -+static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset) -+{ -+ assert("vs-1333", uf_coord->valid == 0); -+ -+ if (coord_is_between_items(&uf_coord->coord)) -+ return; -+ -+ assert("vs-1348", -+ item_plugin_by_coord(&uf_coord->coord)->s.file. 
-+ init_coord_extension); -+ -+ item_body_by_coord(&uf_coord->coord); -+ item_plugin_by_coord(&uf_coord->coord)->s.file. -+ init_coord_extension(uf_coord, offset); -+} -+ -+/** -+ * goto_right_neighbor - lock right neighbor, drop current node lock -+ * @coord: -+ * @lh: -+ * -+ * Obtain lock on right neighbor and drop lock on current node. -+ */ -+int goto_right_neighbor(coord_t *coord, lock_handle *lh) -+{ -+ int result; -+ lock_handle lh_right; -+ -+ assert("vs-1100", znode_is_locked(coord->node)); -+ -+ init_lh(&lh_right); -+ result = reiser4_get_right_neighbor(&lh_right, coord->node, -+ znode_is_wlocked(coord->node) ? -+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result) { -+ done_lh(&lh_right); -+ return result; -+ } -+ -+ /* -+ * we hold two longterm locks on neighboring nodes. Unlock left of -+ * them -+ */ -+ done_lh(lh); -+ -+ coord_init_first_unit_nocheck(coord, lh_right.node); -+ move_lh(lh, &lh_right); -+ -+ return 0; -+ -+} -+ -+/** -+ * set_file_state -+ * @uf_info: -+ * @cbk_result: -+ * @level: -+ * -+ * This is to be used by find_file_item and in find_file_state to -+ * determine real state of file -+ */ -+static void set_file_state(struct unix_file_info *uf_info, int cbk_result, -+ tree_level level) -+{ -+ if (cbk_errored(cbk_result)) -+ /* error happened in find_file_item */ -+ return; -+ -+ assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL); -+ -+ if (uf_info->container == UF_CONTAINER_UNKNOWN) { -+ if (cbk_result == CBK_COORD_NOTFOUND) -+ uf_info->container = UF_CONTAINER_EMPTY; -+ else if (level == LEAF_LEVEL) -+ uf_info->container = UF_CONTAINER_TAILS; -+ else -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ } else { -+ /* -+ * file state is known, check whether it is set correctly if -+ * file is not being tail converted -+ */ -+ if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info), -+ REISER4_PART_IN_CONV)) { -+ assert("vs-1162", -+ ergo(level == LEAF_LEVEL && -+ cbk_result == CBK_COORD_FOUND, 
-+ uf_info->container == UF_CONTAINER_TAILS)); -+ assert("vs-1165", -+ ergo(level == TWIG_LEVEL && -+ cbk_result == CBK_COORD_FOUND, -+ uf_info->container == UF_CONTAINER_EXTENTS)); -+ } -+ } -+} -+ -+int find_file_item_nohint(coord_t *coord, lock_handle *lh, -+ const reiser4_key *key, znode_lock_mode lock_mode, -+ struct inode *inode) -+{ -+ return reiser4_object_lookup(inode, key, coord, lh, lock_mode, -+ FIND_MAX_NOT_MORE_THAN, -+ TWIG_LEVEL, LEAF_LEVEL, -+ (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE : -+ (CBK_UNIQUE | CBK_FOR_INSERT), -+ NULL /* ra_info */ ); -+} -+ -+/** -+ * find_file_item - look for file item in the tree -+ * @hint: provides coordinate, lock handle, seal -+ * @key: key for search -+ * @mode: mode of lock to put on returned node -+ * @ra_info: -+ * @inode: -+ * -+ * This finds position in the tree corresponding to @key. It first tries to use -+ * @hint's seal if it is set. -+ */ -+int find_file_item(hint_t *hint, const reiser4_key *key, -+ znode_lock_mode lock_mode, -+ struct inode *inode) -+{ -+ int result; -+ coord_t *coord; -+ lock_handle *lh; -+ -+ assert("nikita-3030", reiser4_schedulable()); -+ assert("vs-1707", hint != NULL); -+ assert("vs-47", inode != NULL); -+ -+ coord = &hint->ext_coord.coord; -+ lh = hint->ext_coord.lh; -+ init_lh(lh); -+ -+ result = hint_validate(hint, key, 1 /* check key */, lock_mode); -+ if (!result) { -+ if (coord->between == AFTER_UNIT && -+ equal_to_rdk(coord->node, key)) { -+ result = goto_right_neighbor(coord, lh); -+ if (result == -E_NO_NEIGHBOR) -+ return RETERR(-EIO); -+ if (result) -+ return result; -+ assert("vs-1152", equal_to_ldk(coord->node, key)); -+ /* -+ * we moved to different node. 
Invalidate coord -+ * extension, zload is necessary to init it again -+ */ -+ hint->ext_coord.valid = 0; -+ } -+ -+ set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND, -+ znode_get_level(coord->node)); -+ -+ return CBK_COORD_FOUND; -+ } -+ -+ coord_init_zero(coord); -+ result = find_file_item_nohint(coord, lh, key, lock_mode, inode); -+ set_file_state(unix_file_inode_data(inode), result, -+ znode_get_level(coord->node)); -+ -+ /* FIXME: we might already have coord extension initialized */ -+ hint->ext_coord.valid = 0; -+ return result; -+} -+ -+/* plugin->u.file.write_flowom = NULL -+ plugin->u.file.read_flow = NULL */ -+ -+void hint_init_zero(hint_t * hint) -+{ -+ memset(hint, 0, sizeof(*hint)); -+ init_lh(&hint->lh); -+ hint->ext_coord.lh = &hint->lh; -+} -+ -+static int find_file_state(struct inode *inode, struct unix_file_info *uf_info) -+{ -+ int result; -+ reiser4_key key; -+ coord_t coord; -+ lock_handle lh; -+ -+ assert("vs-1628", ea_obtained(uf_info)); -+ -+ if (uf_info->container == UF_CONTAINER_UNKNOWN) { -+ key_by_inode_and_offset_common(inode, 0, &key); -+ init_lh(&lh); -+ result = find_file_item_nohint(&coord, &lh, &key, -+ ZNODE_READ_LOCK, inode); -+ set_file_state(uf_info, result, znode_get_level(coord.node)); -+ done_lh(&lh); -+ if (!cbk_errored(result)) -+ result = 0; -+ } else -+ result = 0; -+ assert("vs-1074", -+ ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN)); -+ reiser4_txn_restart_current(); -+ return result; -+} -+ -+/** -+ * Estimate and reserve space needed to truncate page -+ * which gets partially truncated: one block for page -+ * itself, stat-data update (estimate_one_insert_into_item) -+ * and one item insertion (estimate_one_insert_into_item) -+ * which may happen if page corresponds to hole extent and -+ * unallocated one will have to be created -+ */ -+static int reserve_partial_page(reiser4_tree * tree) -+{ -+ grab_space_enable(); -+ return reiser4_grab_reserved(reiser4_get_current_sb(), -+ 1 + -+ 2 * 
estimate_one_insert_into_item(tree), -+ BA_CAN_COMMIT); -+} -+ -+/* estimate and reserve space needed to cut one item and update one stat data */ -+static int reserve_cut_iteration(reiser4_tree * tree) -+{ -+ __u64 estimate = estimate_one_item_removal(tree) -+ + estimate_one_insert_into_item(tree); -+ -+ assert("nikita-3172", lock_stack_isclean(get_current_lock_stack())); -+ -+ grab_space_enable(); -+ /* We need to double our estimate now that we can delete more than one -+ node. */ -+ return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2, -+ BA_CAN_COMMIT); -+} -+ -+int reiser4_update_file_size(struct inode *inode, loff_t new_size, -+ int update_sd) -+{ -+ int result = 0; -+ -+ INODE_SET_SIZE(inode, new_size); -+ if (update_sd) { -+ inode->i_ctime = inode->i_mtime = CURRENT_TIME; -+ result = reiser4_update_sd(inode); -+ } -+ return result; -+} -+ -+/** -+ * Cut file items one by one starting from the last one until -+ * new file size (inode->i_size) is reached. Reserve space -+ * and update file stat data on every single cut from the tree -+ */ -+int cut_file_items(struct inode *inode, loff_t new_size, -+ int update_sd, loff_t cur_size, -+ int (*update_actor) (struct inode *, loff_t, int)) -+{ -+ reiser4_key from_key, to_key; -+ reiser4_key smallest_removed; -+ file_plugin *fplug = inode_file_plugin(inode); -+ int result; -+ int progress = 0; -+ -+ assert("vs-1248", -+ fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) || -+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ -+ fplug->key_by_inode(inode, new_size, &from_key); -+ to_key = from_key; -+ set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ ); -+ /* this loop normally runs just once */ -+ while (1) { -+ result = reserve_cut_iteration(reiser4_tree_by_inode(inode)); -+ if (result) -+ break; -+ -+ result = reiser4_cut_tree_object(current_tree, &from_key, &to_key, -+ &smallest_removed, inode, 1, -+ &progress); -+ if (result == -E_REPEAT) { -+ /** -+ * -E_REPEAT 
is a signal to interrupt a long -+ * file truncation process -+ */ -+ if (progress) { -+ result = update_actor(inode, -+ get_key_offset(&smallest_removed), -+ update_sd); -+ if (result) -+ break; -+ } -+ /* the below does up(sbinfo->delete_mutex). -+ * Do not get folled */ -+ reiser4_release_reserved(inode->i_sb); -+ /** -+ * reiser4_cut_tree_object() was interrupted probably -+ * because current atom requires commit, we have to -+ * release transaction handle to allow atom commit. -+ */ -+ reiser4_txn_restart_current(); -+ continue; -+ } -+ if (result -+ && !(result == CBK_COORD_NOTFOUND && new_size == 0 -+ && inode->i_size == 0)) -+ break; -+ -+ set_key_offset(&smallest_removed, new_size); -+ /* Final sd update after the file gets its correct size */ -+ result = update_actor(inode, get_key_offset(&smallest_removed), -+ update_sd); -+ break; -+ } -+ -+ /* the below does up(sbinfo->delete_mutex). Do not get folled */ -+ reiser4_release_reserved(inode->i_sb); -+ -+ return result; -+} -+ -+int find_or_create_extent(struct page *page); -+ -+/* part of truncate_file_body: it is called when truncate is used to make file -+ shorter */ -+static int shorten_file(struct inode *inode, loff_t new_size) -+{ -+ int result; -+ struct page *page; -+ int padd_from; -+ unsigned long index; -+ struct unix_file_info *uf_info; -+ -+ /* -+ * all items of ordinary reiser4 file are grouped together. That is why -+ * we can use reiser4_cut_tree. 
Plan B files (for instance) can not be -+ * truncated that simply -+ */ -+ result = cut_file_items(inode, new_size, 1 /*update_sd */ , -+ get_key_offset(reiser4_max_key()), -+ reiser4_update_file_size); -+ if (result) -+ return result; -+ -+ uf_info = unix_file_inode_data(inode); -+ assert("vs-1105", new_size == inode->i_size); -+ if (new_size == 0) { -+ uf_info->container = UF_CONTAINER_EMPTY; -+ return 0; -+ } -+ -+ result = find_file_state(inode, uf_info); -+ if (result) -+ return result; -+ if (uf_info->container == UF_CONTAINER_TAILS) -+ /* -+ * No need to worry about zeroing last page after new file -+ * end -+ */ -+ return 0; -+ -+ padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1); -+ if (!padd_from) -+ /* file is truncated to page boundary */ -+ return 0; -+ -+ result = reserve_partial_page(reiser4_tree_by_inode(inode)); -+ if (result) { -+ reiser4_release_reserved(inode->i_sb); -+ return result; -+ } -+ -+ /* last page is partially truncated - zero its content */ -+ index = (inode->i_size >> PAGE_CACHE_SHIFT); -+ page = read_mapping_page(inode->i_mapping, index, NULL); -+ if (IS_ERR(page)) { -+ /* -+ * the below does up(sbinfo->delete_mutex). Do not get -+ * confused -+ */ -+ reiser4_release_reserved(inode->i_sb); -+ if (likely(PTR_ERR(page) == -EINVAL)) { -+ /* looks like file is built of tail items */ -+ return 0; -+ } -+ return PTR_ERR(page); -+ } -+ wait_on_page_locked(page); -+ if (!PageUptodate(page)) { -+ page_cache_release(page); -+ /* -+ * the below does up(sbinfo->delete_mutex). Do not get -+ * confused -+ */ -+ reiser4_release_reserved(inode->i_sb); -+ return RETERR(-EIO); -+ } -+ -+ /* -+ * if page correspons to hole extent unit - unallocated one will be -+ * created here. This is not necessary -+ */ -+ result = find_or_create_extent(page); -+ -+ /* -+ * FIXME: cut_file_items has already updated inode. 
Probably it would -+ * be better to update it here when file is really truncated -+ */ -+ if (result) { -+ page_cache_release(page); -+ /* -+ * the below does up(sbinfo->delete_mutex). Do not get -+ * confused -+ */ -+ reiser4_release_reserved(inode->i_sb); -+ return result; -+ } -+ -+ lock_page(page); -+ assert("vs-1066", PageLocked(page)); -+ zero_user_page(page, padd_from, PAGE_CACHE_SIZE - padd_from, KM_USER0); -+ unlock_page(page); -+ page_cache_release(page); -+ /* the below does up(sbinfo->delete_mutex). Do not get confused */ -+ reiser4_release_reserved(inode->i_sb); -+ return 0; -+} -+ -+/** -+ * should_have_notail -+ * @uf_info: -+ * @new_size: -+ * -+ * Calls formatting plugin to see whether file of size @new_size has to be -+ * stored in unformatted nodes or in tail items. 0 is returned for later case. -+ */ -+static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size) -+{ -+ if (!uf_info->tplug) -+ return 1; -+ return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info), -+ new_size); -+ -+} -+ -+/** -+ * truncate_file_body - change length of file -+ * @inode: inode of file -+ * @new_size: new file length -+ * -+ * Adjusts items file @inode is built of to match @new_size. It may either cut -+ * items or add them to represent a hole at the end of file. The caller has to -+ * obtain exclusive access to the file. -+ */ -+static int truncate_file_body(struct inode *inode, struct iattr *attr) -+{ -+ int result; -+ loff_t new_size = attr->ia_size; -+ -+ if (inode->i_size < new_size) { -+ /* expanding truncate */ -+ struct unix_file_info *uf_info = unix_file_inode_data(inode); -+ -+ result = find_file_state(inode, uf_info); -+ if (result) -+ return result; -+ -+ if (should_have_notail(uf_info, new_size)) { -+ /* -+ * file of size @new_size has to be built of -+ * extents. 
If it is built of tails - convert to -+ * extents -+ */ -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ /* -+ * if file is being convered by another process -+ * - wait until it completes -+ */ -+ while (1) { -+ if (reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)) { -+ drop_exclusive_access(uf_info); -+ schedule(); -+ get_exclusive_access(uf_info); -+ continue; -+ } -+ break; -+ } -+ -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ result = tail2extent(uf_info); -+ if (result) -+ return result; -+ } -+ } -+ result = reiser4_write_extent(NULL, inode, NULL, -+ 0, &new_size); -+ if (result) -+ return result; -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ } else { -+ if (uf_info->container == UF_CONTAINER_EXTENTS) { -+ result = reiser4_write_extent(NULL, inode, NULL, -+ 0, &new_size); -+ if (result) -+ return result; -+ } else { -+ result = reiser4_write_tail(NULL, inode, NULL, -+ 0, &new_size); -+ if (result) -+ return result; -+ uf_info->container = UF_CONTAINER_TAILS; -+ } -+ } -+ BUG_ON(result > 0); -+ result = reiser4_update_file_size(inode, new_size, 1); -+ BUG_ON(result != 0); -+ } else -+ result = shorten_file(inode, new_size); -+ return result; -+} -+ -+/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */ -+ -+/** -+ * load_file_hint - copy hint from struct file to local variable -+ * @file: file to get hint from -+ * @hint: structure to fill -+ * -+ * Reiser4 specific portion of struct file may contain information (hint) -+ * stored on exiting from previous read or write. That information includes -+ * seal of znode and coord within that znode where previous read or write -+ * stopped. This function copies that information to @hint if it was stored or -+ * initializes @hint by 0s otherwise. 
-+ */ -+int load_file_hint(struct file *file, hint_t *hint) -+{ -+ reiser4_file_fsdata *fsdata; -+ -+ if (file) { -+ fsdata = reiser4_get_file_fsdata(file); -+ if (IS_ERR(fsdata)) -+ return PTR_ERR(fsdata); -+ -+ spin_lock_inode(file->f_dentry->d_inode); -+ if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) { -+ *hint = fsdata->reg.hint; -+ init_lh(&hint->lh); -+ hint->ext_coord.lh = &hint->lh; -+ spin_unlock_inode(file->f_dentry->d_inode); -+ /* -+ * force re-validation of the coord on the first -+ * iteration of the read/write loop. -+ */ -+ hint->ext_coord.valid = 0; -+ assert("nikita-19892", coords_equal(&hint->seal.coord1, -+ &hint->ext_coord. -+ coord)); -+ return 0; -+ } -+ memset(&fsdata->reg.hint, 0, sizeof(hint_t)); -+ spin_unlock_inode(file->f_dentry->d_inode); -+ } -+ hint_init_zero(hint); -+ return 0; -+} -+ -+/** -+ * save_file_hint - copy hint to reiser4 private struct file's part -+ * @file: file to save hint in -+ * @hint: hint to save -+ * -+ * This copies @hint to reiser4 private part of struct file. It can help -+ * speedup future accesses to the file. -+ */ -+void save_file_hint(struct file *file, const hint_t *hint) -+{ -+ reiser4_file_fsdata *fsdata; -+ -+ assert("edward-1337", hint != NULL); -+ -+ if (!file || !reiser4_seal_is_set(&hint->seal)) -+ return; -+ fsdata = reiser4_get_file_fsdata(file); -+ assert("vs-965", !IS_ERR(fsdata)); -+ assert("nikita-19891", -+ coords_equal(&hint->seal.coord1, &hint->ext_coord.coord)); -+ assert("vs-30", hint->lh.owner == NULL); -+ spin_lock_inode(file->f_dentry->d_inode); -+ fsdata->reg.hint = *hint; -+ spin_unlock_inode(file->f_dentry->d_inode); -+ return; -+} -+ -+void reiser4_unset_hint(hint_t * hint) -+{ -+ assert("vs-1315", hint); -+ hint->ext_coord.valid = 0; -+ reiser4_seal_done(&hint->seal); -+ done_lh(&hint->lh); -+} -+ -+/* coord must be set properly. 
So, that reiser4_set_hint -+ has nothing to do */ -+void reiser4_set_hint(hint_t * hint, const reiser4_key * key, -+ znode_lock_mode mode) -+{ -+ ON_DEBUG(coord_t * coord = &hint->ext_coord.coord); -+ assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key))); -+ -+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key); -+ hint->offset = get_key_offset(key); -+ hint->mode = mode; -+ done_lh(&hint->lh); -+} -+ -+int hint_is_set(const hint_t * hint) -+{ -+ return reiser4_seal_is_set(&hint->seal); -+} -+ -+#if REISER4_DEBUG -+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2) -+{ -+ return (get_key_locality(k1) == get_key_locality(k2) && -+ get_key_type(k1) == get_key_type(k2) && -+ get_key_band(k1) == get_key_band(k2) && -+ get_key_ordering(k1) == get_key_ordering(k2) && -+ get_key_objectid(k1) == get_key_objectid(k2)); -+} -+#endif -+ -+static int -+hint_validate(hint_t * hint, const reiser4_key * key, int check_key, -+ znode_lock_mode lock_mode) -+{ -+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode) -+ /* hint either not set or set by different operation */ -+ return RETERR(-E_REPEAT); -+ -+ assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key)); -+ -+ if (check_key && get_key_offset(key) != hint->offset) -+ /* hint is set for different key */ -+ return RETERR(-E_REPEAT); -+ -+ assert("vs-31", hint->ext_coord.lh == &hint->lh); -+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key, -+ hint->ext_coord.lh, lock_mode, -+ ZNODE_LOCK_LOPRI); -+} -+ -+/** -+ * Look for place at twig level for extent corresponding to page, -+ * call extent's writepage method to create unallocated extent if -+ * it does not exist yet, initialize jnode, capture page -+ */ -+int find_or_create_extent(struct page *page) -+{ -+ int result; -+ struct inode *inode; -+ int plugged_hole; -+ -+ jnode *node; -+ -+ assert("vs-1065", page->mapping && page->mapping->host); -+ inode = page->mapping->host; -+ -+ 
lock_page(page); -+ node = jnode_of_page(page); -+ if (IS_ERR(node)) { -+ unlock_page(page); -+ return PTR_ERR(node); -+ } -+ JF_SET(node, JNODE_WRITE_PREPARED); -+ unlock_page(page); -+ -+ if (node->blocknr == 0) { -+ plugged_hole = 0; -+ result = reiser4_update_extent(inode, node, page_offset(page), -+ &plugged_hole); -+ if (result) { -+ JF_CLR(node, JNODE_WRITE_PREPARED); -+ jput(node); -+ warning("edward-1549", -+ "reiser4_update_extent failed: %d", result); -+ return result; -+ } -+ if (plugged_hole) -+ reiser4_update_sd(inode); -+ } else { -+ spin_lock_jnode(node); -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ BUG_ON(result != 0); -+ jnode_make_dirty_locked(node); -+ spin_unlock_jnode(node); -+ } -+ -+ BUG_ON(node->atom == NULL); -+ JF_CLR(node, JNODE_WRITE_PREPARED); -+ jput(node); -+ -+ if (get_current_context()->entd) { -+ entd_context *ent = get_entd_context(node->tree->super); -+ -+ if (ent->cur_request->page == page) -+ ent->cur_request->node = node; -+ } -+ return 0; -+} -+ -+/** -+ * has_anonymous_pages - check whether inode has pages dirtied via mmap -+ * @inode: inode to check -+ * -+ * Returns true if inode's mapping has dirty pages which do not belong to any -+ * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page -+ * tree or were eflushed and can be found via jnodes tagged -+ * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes. -+ */ -+static int has_anonymous_pages(struct inode *inode) -+{ -+ int result; -+ -+ read_lock_irq(&inode->i_mapping->tree_lock); -+ result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED); -+ read_unlock_irq(&inode->i_mapping->tree_lock); -+ return result; -+} -+ -+/** -+ * capture_page_and_create_extent - -+ * @page: page to be captured -+ * -+ * Grabs space for extent creation and stat data update and calls function to -+ * do actual work. 
-+ */ -+static int capture_page_and_create_extent(struct page *page) -+{ -+ int result; -+ struct inode *inode; -+ -+ assert("vs-1084", page->mapping && page->mapping->host); -+ inode = page->mapping->host; -+ assert("vs-1139", -+ unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS); -+ /* page belongs to file */ -+ assert("vs-1393", -+ inode->i_size > page_offset(page)); -+ -+ /* page capture may require extent creation (if it does not exist yet) -+ and stat data's update (number of blocks changes on extent -+ creation) */ -+ grab_space_enable(); -+ result = reiser4_grab_space(2 * estimate_one_insert_into_item -+ (reiser4_tree_by_inode(inode)), -+ BA_CAN_COMMIT); -+ if (likely(!result)) -+ result = find_or_create_extent(page); -+ -+ if (result != 0) -+ SetPageError(page); -+ return result; -+} -+ -+/* this is implementation of method commit_write of struct -+ address_space_operations for unix file plugin */ -+int -+commit_write_unix_file(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ reiser4_context *ctx; -+ struct inode *inode; -+ int result; -+ -+ assert("umka-3101", file != NULL); -+ assert("umka-3102", page != NULL); -+ assert("umka-3093", PageLocked(page)); -+ -+ SetPageUptodate(page); -+ -+ inode = page->mapping->host; -+ ctx = reiser4_init_context(page->mapping->host->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ page_cache_get(page); -+ unlock_page(page); -+ result = capture_page_and_create_extent(page); -+ lock_page(page); -+ page_cache_release(page); -+ -+ /* don't commit transaction under inode semaphore */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* -+ * Support for "anonymous" pages and jnodes. -+ * -+ * When file is write-accessed through mmap pages can be dirtied from the user -+ * level. 
In this case kernel is not notified until one of following happens: -+ * -+ * (1) msync() -+ * -+ * (2) truncate() (either explicit or through unlink) -+ * -+ * (3) VM scanner starts reclaiming mapped pages, dirtying them before -+ * starting write-back. -+ * -+ * As a result of (3) ->writepage may be called on a dirty page without -+ * jnode. Such page is called "anonymous" in reiser4. Certain work-loads -+ * (iozone) generate huge number of anonymous pages. Emergency flush handles -+ * this situation by creating jnode for anonymous page, starting IO on the -+ * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of -+ * memory. Such jnode is also called anonymous. -+ * -+ * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into -+ * tree. This is done by capture_anonymous_*() functions below. -+ */ -+ -+/** -+ * capture_anonymous_page - involve page into transaction -+ * @pg: page to deal with -+ * -+ * Takes care that @page has corresponding metadata in the tree, creates jnode -+ * for @page and captures it. On success 1 is returned. -+ */ -+static int capture_anonymous_page(struct page *page) -+{ -+ int result; -+ -+ if (PageWriteback(page)) -+ /* FIXME: do nothing? */ -+ return 0; -+ -+ result = capture_page_and_create_extent(page); -+ if (result == 0) { -+ result = 1; -+ } else -+ warning("nikita-3329", -+ "Cannot capture anon page: %i", result); -+ -+ return result; -+} -+ -+/** -+ * capture_anonymous_pages - find and capture pages dirtied via mmap -+ * @mapping: address space where to look for pages -+ * @index: start index -+ * @to_capture: maximum number of pages to capture -+ * -+ * Looks for pages tagged REISER4_MOVED starting from the *@index-th page, -+ * captures (involves into atom) them, returns number of captured pages, -+ * updates @index to next page after the last captured one. 
-+ */ -+static int -+capture_anonymous_pages(struct address_space *mapping, pgoff_t *index, -+ unsigned int to_capture) -+{ -+ int result; -+ struct pagevec pvec; -+ unsigned int i, count; -+ int nr; -+ -+ pagevec_init(&pvec, 0); -+ count = min(pagevec_space(&pvec), to_capture); -+ nr = 0; -+ -+ /* find pages tagged MOVED */ -+ write_lock_irq(&mapping->tree_lock); -+ pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree, -+ (void **)pvec.pages, *index, count, -+ PAGECACHE_TAG_REISER4_MOVED); -+ if (pagevec_count(&pvec) == 0) { -+ /* -+ * there are no pages tagged MOVED in mapping->page_tree -+ * starting from *index -+ */ -+ write_unlock_irq(&mapping->tree_lock); -+ *index = (pgoff_t)-1; -+ return 0; -+ } -+ -+ /* clear MOVED tag for all found pages */ -+ for (i = 0; i < pagevec_count(&pvec); i++) { -+ page_cache_get(pvec.pages[i]); -+ radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ } -+ write_unlock_irq(&mapping->tree_lock); -+ -+ -+ *index = pvec.pages[i - 1]->index + 1; -+ -+ for (i = 0; i < pagevec_count(&pvec); i++) { -+ /* -+ * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by -+ * reiser4_set_page_dirty_internal which is called when jnode is -+ * captured -+ */ -+ result = capture_anonymous_page(pvec.pages[i]); -+ if (result == 1) -+ nr++; -+ else { -+ if (result < 0) { -+ warning("vs-1454", -+ "failed to capture page: " -+ "result=%d, captured=%d)\n", -+ result, i); -+ -+ /* -+ * set MOVED tag to all pages which left not -+ * captured -+ */ -+ write_lock_irq(&mapping->tree_lock); -+ for (; i < pagevec_count(&pvec); i ++) { -+ radix_tree_tag_set(&mapping->page_tree, -+ pvec.pages[i]->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ } -+ write_unlock_irq(&mapping->tree_lock); -+ -+ pagevec_release(&pvec); -+ return result; -+ } else { -+ /* -+ * result == 0. capture_anonymous_page returns -+ * 0 for Writeback-ed page. 
Set MOVED tag on -+ * that page -+ */ -+ write_lock_irq(&mapping->tree_lock); -+ radix_tree_tag_set(&mapping->page_tree, -+ pvec.pages[i]->index, -+ PAGECACHE_TAG_REISER4_MOVED); -+ write_unlock_irq(&mapping->tree_lock); -+ if (i == 0) -+ *index = pvec.pages[0]->index; -+ else -+ *index = pvec.pages[i - 1]->index + 1; -+ } -+ } -+ } -+ pagevec_release(&pvec); -+ return nr; -+} -+ -+/** -+ * capture_anonymous_jnodes - find and capture anonymous jnodes -+ * @mapping: address space where to look for jnodes -+ * @from: start index -+ * @to: end index -+ * @to_capture: maximum number of jnodes to capture -+ * -+ * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in -+ * the range of indexes @from-@to and captures them, returns number of captured -+ * jnodes, updates @from to next jnode after the last captured one. -+ */ -+static int -+capture_anonymous_jnodes(struct address_space *mapping, -+ pgoff_t *from, pgoff_t to, int to_capture) -+{ -+ *from = to; -+ return 0; -+} -+ -+/* -+ * Commit atom of the jnode of a page. -+ */ -+static int sync_page(struct page *page) -+{ -+ int result; -+ do { -+ jnode *node; -+ txn_atom *atom; -+ -+ lock_page(page); -+ node = jprivate(page); -+ if (node != NULL) { -+ spin_lock_jnode(node); -+ atom = jnode_get_atom(node); -+ spin_unlock_jnode(node); -+ } else -+ atom = NULL; -+ unlock_page(page); -+ result = reiser4_sync_atom(atom); -+ } while (result == -E_REPEAT); -+ /* -+ * ZAM-FIXME-HANS: document the logic of this loop, is it just to -+ * handle the case where more pages get added to the atom while we are -+ * syncing it? -+ */ -+ assert("nikita-3485", ergo(result == 0, -+ get_current_context()->trans->atom == NULL)); -+ return result; -+} -+ -+/* -+ * Commit atoms of pages on @pages list. 
-+ * call sync_page for each page from mapping's page tree -+ */ -+static int sync_page_list(struct inode *inode) -+{ -+ int result; -+ struct address_space *mapping; -+ unsigned long from; /* start index for radix_tree_gang_lookup */ -+ unsigned int found; /* return value for radix_tree_gang_lookup */ -+ -+ mapping = inode->i_mapping; -+ from = 0; -+ result = 0; -+ read_lock_irq(&mapping->tree_lock); -+ while (result == 0) { -+ struct page *page; -+ -+ found = -+ radix_tree_gang_lookup(&mapping->page_tree, (void **)&page, -+ from, 1); -+ assert("edward-1550", found < 2); -+ if (found == 0) -+ break; -+ /** -+ * page may not leave radix tree because it is protected from -+ * truncating by inode->i_mutex locked by sys_fsync -+ */ -+ page_cache_get(page); -+ read_unlock_irq(&mapping->tree_lock); -+ -+ from = page->index + 1; -+ -+ result = sync_page(page); -+ -+ page_cache_release(page); -+ read_lock_irq(&mapping->tree_lock); -+ } -+ -+ read_unlock_irq(&mapping->tree_lock); -+ return result; -+} -+ -+static int commit_file_atoms(struct inode *inode) -+{ -+ int result; -+ struct unix_file_info *uf_info; -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ get_exclusive_access(uf_info); -+ /* -+ * find what items file is made from -+ */ -+ result = find_file_state(inode, uf_info); -+ drop_exclusive_access(uf_info); -+ if (result != 0) -+ return result; -+ -+ /* -+ * file state cannot change because we are under ->i_mutex -+ */ -+ switch (uf_info->container) { -+ case UF_CONTAINER_EXTENTS: -+ /* find_file_state might open join an atom */ -+ reiser4_txn_restart_current(); -+ result = -+ /* -+ * when we are called by -+ * filemap_fdatawrite-> -+ * do_writepages()-> -+ * reiser4_writepages() -+ * -+ * inode->i_mapping->dirty_pages are spices into -+ * ->io_pages, leaving ->dirty_pages dirty. -+ * -+ * When we are called from -+ * reiser4_fsync()->sync_unix_file(), we have to -+ * commit atoms of all pages on the ->dirty_list. 
-+ * -+ * So for simplicity we just commit ->io_pages and -+ * ->dirty_pages. -+ */ -+ sync_page_list(inode); -+ break; -+ case UF_CONTAINER_TAILS: -+ /* -+ * NOTE-NIKITA probably we can be smarter for tails. For now -+ * just commit all existing atoms. -+ */ -+ result = txnmgr_force_commit_all(inode->i_sb, 0); -+ break; -+ case UF_CONTAINER_EMPTY: -+ result = 0; -+ break; -+ case UF_CONTAINER_UNKNOWN: -+ default: -+ result = -EIO; -+ break; -+ } -+ -+ /* -+ * commit current transaction: there can be captured nodes from -+ * find_file_state() and finish_conversion(). -+ */ -+ reiser4_txn_restart_current(); -+ return result; -+} -+ -+/** -+ * writepages_unix_file - writepages of struct address_space_operations -+ * @mapping: -+ * @wbc: -+ * -+ * This captures anonymous pages and anonymous jnodes. Anonymous pages are -+ * pages which are dirtied via mmapping. Anonymous jnodes are ones which were -+ * created by reiser4_writepage. -+ */ -+int writepages_unix_file(struct address_space *mapping, -+ struct writeback_control *wbc) -+{ -+ int result; -+ struct unix_file_info *uf_info; -+ pgoff_t pindex, jindex, nr_pages; -+ long to_capture; -+ struct inode *inode; -+ -+ inode = mapping->host; -+ if (!has_anonymous_pages(inode)) { -+ result = 0; -+ goto end; -+ } -+ jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT; -+ result = 0; -+ nr_pages = size_in_pages(i_size_read(inode)); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ do { -+ reiser4_context *ctx; -+ -+ if (wbc->sync_mode != WB_SYNC_ALL) -+ to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST); -+ else -+ to_capture = CAPTURE_APAGE_BURST; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ result = PTR_ERR(ctx); -+ break; -+ } -+ /* avoid recursive calls to ->sync_inodes */ -+ ctx->nobalance = 1; -+ assert("zam-760", lock_stack_isclean(get_current_lock_stack())); -+ assert("edward-1551", LOCK_CNT_NIL(inode_sem_w)); -+ assert("edward-1552", LOCK_CNT_NIL(inode_sem_r)); -+ -+ 
reiser4_txn_restart_current(); -+ -+ /* we have to get nonexclusive access to the file */ -+ if (get_current_context()->entd) { -+ /* -+ * use nonblocking version of nonexclusive_access to -+ * avoid deadlock which might look like the following: -+ * process P1 holds NEA on file F1 and called entd to -+ * reclaim some memory. Entd works for P1 and is going -+ * to capture pages of file F2. To do that entd has to -+ * get NEA to F2. F2 is held by process P2 which also -+ * called entd. But entd is serving P1 at the moment -+ * and P2 has to wait. Process P3 trying to get EA to -+ * file F2. Existence of pending EA request to file F2 -+ * makes impossible for entd to get NEA to file -+ * F2. Neither of these process can continue. Using -+ * nonblocking version of gettign NEA is supposed to -+ * avoid this deadlock. -+ */ -+ if (try_to_get_nonexclusive_access(uf_info) == 0) { -+ result = RETERR(-EBUSY); -+ reiser4_exit_context(ctx); -+ break; -+ } -+ } else -+ get_nonexclusive_access(uf_info); -+ -+ while (to_capture > 0) { -+ pgoff_t start; -+ -+ assert("vs-1727", jindex <= pindex); -+ if (pindex == jindex) { -+ start = pindex; -+ result = -+ capture_anonymous_pages(inode->i_mapping, -+ &pindex, -+ to_capture); -+ if (result <= 0) -+ break; -+ to_capture -= result; -+ wbc->nr_to_write -= result; -+ if (start + result == pindex) { -+ jindex = pindex; -+ continue; -+ } -+ if (to_capture <= 0) -+ break; -+ } -+ /* deal with anonymous jnodes between jindex and pindex */ -+ result = -+ capture_anonymous_jnodes(inode->i_mapping, &jindex, -+ pindex, to_capture); -+ if (result < 0) -+ break; -+ to_capture -= result; -+ get_current_context()->nr_captured += result; -+ -+ if (jindex == (pgoff_t) - 1) { -+ assert("vs-1728", pindex == (pgoff_t) - 1); -+ break; -+ } -+ } -+ if (to_capture <= 0) -+ /* there may be left more pages */ -+ __mark_inode_dirty(inode, I_DIRTY_PAGES); -+ -+ drop_nonexclusive_access(uf_info); -+ if (result < 0) { -+ /* error happened */ -+ 
reiser4_exit_context(ctx); -+ return result; -+ } -+ if (wbc->sync_mode != WB_SYNC_ALL) { -+ reiser4_exit_context(ctx); -+ return 0; -+ } -+ result = commit_file_atoms(inode); -+ reiser4_exit_context(ctx); -+ if (pindex >= nr_pages && jindex == pindex) -+ break; -+ } while (1); -+ -+ end: -+ if (is_in_reiser4_context()) { -+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) { -+ /* -+ * there are already pages to flush, flush them out, do -+ * not delay until end of reiser4_sync_inodes -+ */ -+ reiser4_writeout(inode->i_sb, wbc); -+ get_current_context()->nr_captured = 0; -+ } -+ } -+ return result; -+} -+ -+/** -+ * readpage_unix_file_nolock - readpage of struct address_space_operations -+ * @file: -+ * @page: -+ * -+ * Compose a key and search for item containing information about @page -+ * data. If item is found - its readpage method is called. -+ */ -+int readpage_unix_file(struct file *file, struct page *page) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *inode; -+ reiser4_key key; -+ item_plugin *iplug; -+ hint_t *hint; -+ lock_handle *lh; -+ coord_t *coord; -+ -+ assert("vs-1062", PageLocked(page)); -+ assert("vs-976", !PageUptodate(page)); -+ assert("vs-1061", page->mapping && page->mapping->host); -+ -+ if (page->mapping->host->i_size <= page_offset(page)) { -+ /* page is out of file */ -+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); -+ SetPageUptodate(page); -+ unlock_page(page); -+ return 0; -+ } -+ -+ inode = page->mapping->host; -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ unlock_page(page); -+ return PTR_ERR(ctx); -+ } -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) { -+ unlock_page(page); -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOMEM); -+ } -+ -+ result = load_file_hint(file, hint); -+ if (result) { -+ kfree(hint); -+ unlock_page(page); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ lh = &hint->lh; -+ -+ /* get key of first byte of the 
page */ -+ key_by_inode_and_offset_common(inode, page_offset(page), &key); -+ -+ /* look for file metadata corresponding to first byte of page */ -+ page_cache_get(page); -+ unlock_page(page); -+ result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode); -+ lock_page(page); -+ page_cache_release(page); -+ -+ if (page->mapping == NULL) { -+ /* -+ * readpage allows truncate to run concurrently. Page was -+ * truncated while it was not locked -+ */ -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return -EINVAL; -+ } -+ -+ if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) { -+ if (result == CBK_COORD_FOUND && -+ hint->ext_coord.coord.between != AT_UNIT) -+ /* file is truncated */ -+ result = -EINVAL; -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ /* -+ * item corresponding to page is found. It can not be removed because -+ * znode lock is held -+ */ -+ if (PageUptodate(page)) { -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return 0; -+ } -+ -+ coord = &hint->ext_coord.coord; -+ result = zload(coord->node); -+ if (result) { -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ validate_extended_coord(&hint->ext_coord, page_offset(page)); -+ -+ if (!coord_is_existing_unit(coord)) { -+ /* this indicates corruption */ -+ warning("vs-280", -+ "Looking for page %lu of file %llu (size %lli). " -+ "No file items found (%d). 
File is corrupted?\n", -+ page->index, (unsigned long long)get_inode_oid(inode), -+ inode->i_size, result); -+ zrelse(coord->node); -+ done_lh(lh); -+ kfree(hint); -+ unlock_page(page); -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-EIO); -+ } -+ -+ /* -+ * get plugin of found item or use plugin if extent if there are no -+ * one -+ */ -+ iplug = item_plugin_by_coord(coord); -+ if (iplug->s.file.readpage) -+ result = iplug->s.file.readpage(coord, page); -+ else -+ result = RETERR(-EINVAL); -+ -+ if (!result) { -+ set_key_offset(&key, -+ (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT); -+ /* FIXME should call reiser4_set_hint() */ -+ reiser4_unset_hint(hint); -+ } else { -+ unlock_page(page); -+ reiser4_unset_hint(hint); -+ } -+ assert("vs-979", -+ ergo(result == 0, (PageLocked(page) || PageUptodate(page)))); -+ assert("vs-9791", ergo(result != 0, !PageLocked(page))); -+ -+ zrelse(coord->node); -+ done_lh(lh); -+ -+ save_file_hint(file, hint); -+ kfree(hint); -+ -+ /* -+ * FIXME: explain why it is needed. HINT: page allocation in write can -+ * not be done when atom is not NULL because reiser4_writepage can not -+ * kick entd and have to eflush -+ */ -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+struct uf_readpages_context { -+ lock_handle lh; -+ coord_t coord; -+}; -+ -+/* A callback function for readpages_unix_file/read_cache_pages. -+ * If the file is build of tails, then return error (-ENOENT). -+ * -+ * @data -- a pointer to reiser4_readpages_context object, -+ * to save the twig lock and the coord between -+ * read_cache_page iterations. -+ * @page -- page to start read. 
-+ */ -+static int uf_readpages_filler(void * data, struct page * page) -+{ -+ struct uf_readpages_context *rc = data; -+ jnode * node; -+ int ret = 0; -+ reiser4_extent *ext; -+ __u64 ext_index; -+ int cbk_done = 0; -+ struct address_space * mapping = page->mapping; -+ -+ if (PageUptodate(page)) { -+ unlock_page(page); -+ return 0; -+ } -+ page_cache_get(page); -+ -+ if (rc->lh.node == 0) { -+ /* no twig lock - have to do tree search. */ -+ reiser4_key key; -+ repeat: -+ unlock_page(page); -+ key_by_inode_and_offset_common( -+ mapping->host, page_offset(page), &key); -+ ret = coord_by_key( -+ &get_super_private(mapping->host->i_sb)->tree, -+ &key, &rc->coord, &rc->lh, -+ ZNODE_READ_LOCK, FIND_EXACT, -+ TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL); -+ if (unlikely(ret)) -+ goto exit; -+ lock_page(page); -+ if (PageUptodate(page)) -+ goto unlock; -+ cbk_done = 1; -+ } -+ ret = zload(rc->coord.node); -+ if (unlikely(ret)) -+ goto unlock; -+ if (!coord_is_existing_item(&rc->coord) || -+ !item_is_extent(&rc->coord)) { -+ zrelse(rc->coord.node); -+ ret = RETERR(-EIO); -+ goto unlock; -+ } -+ ext = extent_by_coord(&rc->coord); -+ ext_index = extent_unit_index(&rc->coord); -+ if (page->index < ext_index || -+ page->index >= ext_index + extent_get_width(ext)) { -+ /* the page index doesn't belong to the extent unit -+ which the coord points to - release the lock and -+ repeat with tree search. */ -+ zrelse(rc->coord.node); -+ done_lh(&rc->lh); -+ /* we can be here after a CBK call only in case of -+ corruption of the tree or the tree lookup algorithm bug. 
*/ -+ if (unlikely(cbk_done)) { -+ ret = RETERR(-EIO); -+ goto unlock; -+ } -+ goto repeat; -+ } -+ node = jnode_of_page(page); -+ if (unlikely(IS_ERR(node))) { -+ zrelse(rc->coord.node); -+ ret = PTR_ERR(node); -+ goto unlock; -+ } -+ ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page); -+ jput(node); -+ zrelse(rc->coord.node); -+ if (likely(!ret)) -+ goto exit; -+ unlock: -+ unlock_page(page); -+ exit: -+ page_cache_release(page); -+ return ret; -+} -+ -+/** -+ * readpages_unix_file - called by the readahead code, starts reading for each -+ * page of given list of pages -+ */ -+int readpages_unix_file( -+ struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ reiser4_context *ctx; -+ struct uf_readpages_context rc; -+ int ret; -+ -+ ctx = reiser4_init_context(mapping->host->i_sb); -+ if (IS_ERR(ctx)) { -+ put_pages_list(pages); -+ return PTR_ERR(ctx); -+ } -+ init_lh(&rc.lh); -+ ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc); -+ done_lh(&rc.lh); -+ context_set_commit_async(ctx); -+ /* close the transaction to protect further page allocation from deadlocks */ -+ reiser4_txn_restart(ctx); -+ reiser4_exit_context(ctx); -+ return ret; -+} -+ -+static reiser4_block_nr unix_file_estimate_read(struct inode *inode, -+ loff_t count UNUSED_ARG) -+{ -+ /* We should reserve one block, because of updating of the stat data -+ item */ -+ assert("vs-1249", -+ inode_file_plugin(inode)->estimate.update == -+ estimate_update_common); -+ return estimate_update_common(inode); -+} -+ -+/* this is called with nonexclusive access obtained, file's container can not change */ -+static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from to */ -+ char __user *buf, /* address of user-space buffer */ -+ size_t count, /* number of bytes to read */ -+ loff_t *off) -+{ -+ int result; -+ struct inode *inode; -+ flow_t flow; -+ int (*read_f) (struct file *, flow_t *, hint_t *); -+ coord_t 
*coord; -+ znode *loaded; -+ -+ inode = file->f_dentry->d_inode; -+ -+ /* build flow */ -+ assert("vs-1250", -+ inode_file_plugin(inode)->flow_by_inode == -+ flow_by_inode_unix_file); -+ result = -+ flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count, -+ *off, READ_OP, &flow); -+ if (unlikely(result)) -+ return result; -+ -+ /* get seal and coord sealed with it from reiser4 private data -+ of struct file. The coord will tell us where our last read -+ of this file finished, and the seal will help to determine -+ if that location is still valid. -+ */ -+ coord = &hint->ext_coord.coord; -+ while (flow.length && result == 0) { -+ result = -+ find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode); -+ if (cbk_errored(result)) -+ /* error happened */ -+ break; -+ -+ if (coord->between != AT_UNIT) { -+ /* there were no items corresponding to given offset */ -+ done_lh(hint->ext_coord.lh); -+ break; -+ } -+ -+ loaded = coord->node; -+ result = zload(loaded); -+ if (unlikely(result)) { -+ done_lh(hint->ext_coord.lh); -+ break; -+ } -+ -+ if (hint->ext_coord.valid == 0) -+ validate_extended_coord(&hint->ext_coord, -+ get_key_offset(&flow.key)); -+ -+ assert("vs-4", hint->ext_coord.valid == 1); -+ assert("vs-33", hint->ext_coord.lh == &hint->lh); -+ /* call item's read method */ -+ read_f = item_plugin_by_coord(coord)->s.file.read; -+ result = read_f(file, &flow, hint); -+ zrelse(loaded); -+ done_lh(hint->ext_coord.lh); -+ } -+ -+ return (count - flow.length) ? (count - flow.length) : result; -+} -+ -+static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*); -+ -+/** -+ * read_unix_file - read of struct file_operations -+ * @file: file to read from -+ * @buf: address of user-space buffer -+ * @read_amount: number of bytes to read -+ * @off: position in file to read from -+ * -+ * This is implementation of vfs's read method of struct file_operations for -+ * unix file plugin. 
-+ */ -+ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount, -+ loff_t *off) -+{ -+ reiser4_context *ctx; -+ ssize_t result; -+ struct inode *inode; -+ struct unix_file_info *uf_info; -+ -+ if (unlikely(read_amount == 0)) -+ return 0; -+ -+ assert("umka-072", file != NULL); -+ assert("umka-074", off != NULL); -+ inode = file->f_dentry->d_inode; -+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ uf_info = unix_file_inode_data(inode); -+ if (uf_info->container == UF_CONTAINER_UNKNOWN) { -+ get_exclusive_access(uf_info); -+ result = find_file_state(inode, uf_info); -+ if (unlikely(result != 0)) -+ goto out; -+ } else -+ get_nonexclusive_access(uf_info); -+ result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount), -+ BA_CAN_COMMIT); -+ if (unlikely(result != 0)) -+ goto out; -+ if (uf_info->container == UF_CONTAINER_EXTENTS){ -+ result = do_sync_read(file, buf, read_amount, off); -+ } else if (uf_info->container == UF_CONTAINER_TAILS || -+ reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) || -+ reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { -+ result = read_unix_file_container_tails(file, buf, read_amount, off); -+ } else { -+ assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY); -+ result = 0; -+ } -+out: -+ drop_access(uf_info); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+static ssize_t read_unix_file_container_tails( -+ struct file *file, char __user *buf, size_t read_amount, loff_t *off) -+{ -+ int result; -+ struct inode *inode; -+ hint_t *hint; -+ struct unix_file_info *uf_info; -+ size_t count, read, left; -+ loff_t size; -+ -+ assert("umka-072", file != NULL); -+ assert("umka-074", off != NULL); -+ inode = file->f_dentry->d_inode; -+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ hint = kmalloc(sizeof(*hint), 
reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) -+ return RETERR(-ENOMEM); -+ -+ result = load_file_hint(file, hint); -+ if (result) { -+ kfree(hint); -+ return result; -+ } -+ -+ left = read_amount; -+ count = 0; -+ uf_info = unix_file_inode_data(inode); -+ while (left > 0) { -+ reiser4_txn_restart_current(); -+ size = i_size_read(inode); -+ if (*off >= size) -+ /* position to read from is past the end of file */ -+ break; -+ if (*off + left > size) -+ left = size - *off; -+ /* faultin user page */ -+ result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left); -+ if (result) -+ return RETERR(-EFAULT); -+ -+ read = read_file(hint, file, buf, -+ left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left, -+ off); -+ if (read < 0) { -+ result = read; -+ break; -+ } -+ left -= read; -+ buf += read; -+ -+ /* update position in a file */ -+ *off += read; -+ /* total number of read bytes */ -+ count += read; -+ } -+ done_lh(&hint->lh); -+ save_file_hint(file, hint); -+ kfree(hint); -+ if (count) -+ file_accessed(file); -+ /* return number of read bytes or error code if nothing is read */ -+ return count ? count : result; -+} -+ -+/* This function takes care about @file's pages. First of all it checks if -+ filesystems readonly and if so gets out. Otherwise, it throws out all -+ pages of file if it was mapped for read and going to be mapped for write -+ and consists of tails. This is done in order to not manage few copies -+ of the data (first in page cache and second one in tails them selves) -+ for the case of mapping files consisting tails. -+ -+ Here also tail2extent conversion is performed if it is allowed and file -+ is going to be written or mapped for write. This functions may be called -+ from write_unix_file() or mmap_unix_file(). 
*/ -+static int check_pages_unix_file(struct file *file, struct inode *inode) -+{ -+ reiser4_invalidate_pages(inode->i_mapping, 0, -+ (inode->i_size + PAGE_CACHE_SIZE - -+ 1) >> PAGE_CACHE_SHIFT, 0); -+ return unpack(file, inode, 0 /* not forever */ ); -+} -+ -+/** -+ * mmap_unix_file - mmap of struct file_operations -+ * @file: file to mmap -+ * @vma: -+ * -+ * This is implementation of vfs's mmap method of struct file_operations for -+ * unix file plugin. It converts file to extent if necessary. Sets -+ * reiser4_inode's flag - REISER4_HAS_MMAP. -+ */ -+int mmap_unix_file(struct file *file, struct vm_area_struct *vma) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *inode; -+ struct unix_file_info *uf_info; -+ reiser4_block_nr needed; -+ -+ inode = file->f_dentry->d_inode; -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ get_exclusive_access_careful(uf_info, inode); -+ -+ if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) { -+ /* -+ * we need file built of extent items. If it is still built of -+ * tail items we have to convert it. Find what items the file -+ * is built of -+ */ -+ result = find_file_state(inode, uf_info); -+ if (result != 0) { -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS || -+ uf_info->container == UF_CONTAINER_EXTENTS || -+ uf_info->container == UF_CONTAINER_EMPTY)); -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ /* -+ * invalidate all pages and convert file from tails to -+ * extents -+ */ -+ result = check_pages_unix_file(file, inode); -+ if (result) { -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ } -+ } -+ -+ /* -+ * generic_file_mmap will do update_atime. Grab space for stat data -+ * update. 
-+ */ -+ needed = inode_file_plugin(inode)->estimate.update(inode); -+ result = reiser4_grab_space_force(needed, BA_CAN_COMMIT); -+ if (result) { -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = generic_file_mmap(file, vma); -+ if (result == 0) { -+ /* mark file as having mapping. */ -+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP); -+ } -+ -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * find_first_item -+ * @inode: -+ * -+ * Finds file item which is responsible for first byte in the file. -+ */ -+static int find_first_item(struct inode *inode) -+{ -+ coord_t coord; -+ lock_handle lh; -+ reiser4_key key; -+ int result; -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ inode_file_plugin(inode)->key_by_inode(inode, 0, &key); -+ result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, -+ inode); -+ if (result == CBK_COORD_FOUND) { -+ if (coord.between == AT_UNIT) { -+ result = zload(coord.node); -+ if (result == 0) { -+ result = item_id_by_coord(&coord); -+ zrelse(coord.node); -+ if (result != EXTENT_POINTER_ID && -+ result != FORMATTING_ID) -+ result = RETERR(-EIO); -+ } -+ } else -+ result = RETERR(-EIO); -+ } -+ done_lh(&lh); -+ return result; -+} -+ -+/** -+ * open_unix_file -+ * @inode: -+ * @file: -+ * -+ * If filesystem is not readonly - complete uncompleted tail conversion if -+ * there was one -+ */ -+int open_unix_file(struct inode *inode, struct file *file) -+{ -+ int result; -+ reiser4_context *ctx; -+ struct unix_file_info *uf_info; -+ -+ if (IS_RDONLY(inode)) -+ return 0; -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) -+ return 0; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ get_exclusive_access_careful(uf_info, inode); -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { -+ /* -+ * other process completed the 
conversion -+ */ -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return 0; -+ } -+ -+ /* -+ * file left in semi converted state after unclean shutdown or another -+ * thread is doing conversion and dropped exclusive access which doing -+ * balance dirty pages. Complete the conversion -+ */ -+ result = find_first_item(inode); -+ if (result == EXTENT_POINTER_ID) -+ /* -+ * first item is extent, therefore there was incomplete -+ * tail2extent conversion. Complete it -+ */ -+ result = tail2extent(unix_file_inode_data(inode)); -+ else if (result == FORMATTING_ID) -+ /* -+ * first item is formatting item, therefore there was -+ * incomplete extent2tail conversion. Complete it -+ */ -+ result = extent2tail(file, unix_file_inode_data(inode)); -+ else -+ result = -EIO; -+ -+ assert("vs-1712", -+ ergo(result == 0, -+ (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) && -+ !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)))); -+ drop_exclusive_access(uf_info); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+#define NEITHER_OBTAINED 0 -+#define EA_OBTAINED 1 -+#define NEA_OBTAINED 2 -+ -+static void drop_access(struct unix_file_info *uf_info) -+{ -+ if (uf_info->exclusive_use) -+ drop_exclusive_access(uf_info); -+ else -+ drop_nonexclusive_access(uf_info); -+} -+ -+#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \ -+ __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) -+ -+/** -+ * write_unix_file - private ->write() method of unix_file plugin. -+ * -+ * @file: file to write to -+ * @buf: address of user-space buffer -+ * @count: number of bytes to write -+ * @pos: position in file to write to -+ * @cont: unused argument, as we don't perform plugin conversion when being -+ * managed by unix_file plugin. 
-+ */ -+ssize_t write_unix_file(struct file *file, const char __user *buf, -+ size_t count, loff_t *pos, struct psched_context *cont) -+{ -+ int result; -+ reiser4_context *ctx; -+ struct inode *inode; -+ struct unix_file_info *uf_info; -+ ssize_t written; -+ int try_free_space; -+ int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY; -+ size_t left; -+ ssize_t (*write_op)(struct file *, struct inode *, -+ const char __user *, size_t, -+ loff_t *pos); -+ int ea; -+ loff_t new_size; -+ -+ ctx = get_current_context(); -+ inode = file->f_dentry->d_inode; -+ -+ assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))); -+ -+ /* check amount of bytes to write and writing position */ -+ result = generic_write_checks(file, pos, &count, 0); -+ if (result) { -+ context_set_commit_async(ctx); -+ return result; -+ } -+ -+ result = remove_suid(file->f_dentry); -+ if (result) { -+ context_set_commit_async(ctx); -+ return result; -+ } -+ /* remove_suid might create a transaction */ -+ reiser4_txn_restart(ctx); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ current->backing_dev_info = inode->i_mapping->backing_dev_info; -+ written = 0; -+ try_free_space = 0; -+ left = count; -+ ea = NEITHER_OBTAINED; -+ -+ new_size = i_size_read(inode); -+ if (*pos + count > new_size) -+ new_size = *pos + count; -+ -+ while (left) { -+ if (left < to_write) -+ to_write = left; -+ -+ if (uf_info->container == UF_CONTAINER_EMPTY) { -+ get_exclusive_access(uf_info); -+ ea = EA_OBTAINED; -+ if (uf_info->container != UF_CONTAINER_EMPTY) { -+ /* file is made not empty by another process */ -+ drop_exclusive_access(uf_info); -+ ea = NEITHER_OBTAINED; -+ continue; -+ } -+ } else if (uf_info->container == UF_CONTAINER_UNKNOWN) { -+ /* -+ * get exclusive access directly just to not have to -+ * re-obtain it if file will appear empty -+ */ -+ get_exclusive_access(uf_info); -+ ea = EA_OBTAINED; -+ result = 
find_file_state(inode, uf_info); -+ if (result) { -+ drop_exclusive_access(uf_info); -+ ea = NEITHER_OBTAINED; -+ break; -+ } -+ } else { -+ get_nonexclusive_access(uf_info); -+ ea = NEA_OBTAINED; -+ } -+ -+ /* either EA or NEA is obtained. Choose item write method */ -+ if (uf_info->container == UF_CONTAINER_EXTENTS) { -+ /* file is built of extent items */ -+ write_op = reiser4_write_extent; -+ } else if (uf_info->container == UF_CONTAINER_EMPTY) { -+ /* file is empty */ -+ if (should_have_notail(uf_info, new_size)) -+ write_op = reiser4_write_extent; -+ else -+ write_op = reiser4_write_tail; -+ } else { -+ /* file is built of tail items */ -+ if (should_have_notail(uf_info, new_size)) { -+ if (ea == NEA_OBTAINED) { -+ drop_nonexclusive_access(uf_info); -+ get_exclusive_access(uf_info); -+ ea = EA_OBTAINED; -+ } -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ /* -+ * if file is being convered by another -+ * process - wait until it completes -+ */ -+ while (1) { -+ if (reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)) { -+ drop_exclusive_access(uf_info); -+ schedule(); -+ get_exclusive_access(uf_info); -+ continue; -+ } -+ break; -+ } -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ result = tail2extent(uf_info); -+ if (result) -+ break; -+ } -+ } -+ drop_exclusive_access(uf_info); -+ ea = NEITHER_OBTAINED; -+ continue; -+ } -+ write_op = reiser4_write_tail; -+ } -+ -+ written = write_op(file, inode, buf, to_write, pos); -+ if (written == -ENOSPC && try_free_space) { -+ drop_access(uf_info); -+ txnmgr_force_commit_all(inode->i_sb, 0); -+ try_free_space = 0; -+ continue; -+ } -+ if (written < 0) { -+ drop_access(uf_info); -+ result = written; -+ break; -+ } -+ /* something is written. */ -+ if (uf_info->container == UF_CONTAINER_EMPTY) { -+ assert("edward-1553", ea == EA_OBTAINED); -+ uf_info->container = -+ (write_op == reiser4_write_extent) ? 
-+ UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS; -+ } else { -+ assert("edward-1554", ergo(uf_info->container == UF_CONTAINER_EXTENTS, -+ write_op == reiser4_write_extent)); -+ assert("edward-1555", ergo(uf_info->container == UF_CONTAINER_TAILS, -+ write_op == reiser4_write_tail)); -+ } -+ if (*pos + written > inode->i_size) -+ INODE_SET_FIELD(inode, i_size, *pos + written); -+ file_update_time(file); -+ result = reiser4_update_sd(inode); -+ if (result) { -+ current->backing_dev_info = NULL; -+ drop_access(uf_info); -+ context_set_commit_async(ctx); -+ return result; -+ } -+ drop_access(uf_info); -+ ea = NEITHER_OBTAINED; -+ reiser4_txn_restart(ctx); -+ current->journal_info = NULL; -+ /* -+ * tell VM how many pages were dirtied. Maybe number of pages -+ * which were dirty already should not be counted -+ */ -+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, -+ (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE); -+ current->journal_info = ctx; -+ -+ left -= written; -+ buf += written; -+ *pos += written; -+ } -+ if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { -+ reiser4_txn_restart_current(); -+ grab_space_enable(); -+ result = reiser4_sync_file_common(file, file->f_dentry, -+ 0 /* data and stat data */); -+ if (result) -+ warning("reiser4-7", "failed to sync file %llu", -+ (unsigned long long)get_inode_oid(inode)); -+ } -+ -+ current->backing_dev_info = NULL; -+ -+ /* -+ * return number of written bytes or error code if nothing is -+ * written. Note, that it does not work correctly in case when -+ * sync_unix_file returns error -+ */ -+ return (count - left) ? (count - left) : result; -+} -+ -+/** -+ * release_unix_file - release of struct file_operations -+ * @inode: inode of released file -+ * @file: file to release -+ * -+ * Implementation of release method of struct file_operations for unix file -+ * plugin. If last reference to indode is released - convert all extent items -+ * into tail items if necessary. 
Frees reiser4 specific file data. -+ */ -+int release_unix_file(struct inode *inode, struct file *file) -+{ -+ reiser4_context *ctx; -+ struct unix_file_info *uf_info; -+ int result; -+ int in_reiser4; -+ -+ in_reiser4 = is_in_reiser4_context(); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ result = 0; -+ if (in_reiser4 == 0) { -+ uf_info = unix_file_inode_data(inode); -+ -+ get_exclusive_access_careful(uf_info, inode); -+ if (atomic_read(&file->f_dentry->d_count) == 1 && -+ uf_info->container == UF_CONTAINER_EXTENTS && -+ !should_have_notail(uf_info, inode->i_size) && -+ !rofs_inode(inode)) { -+ result = extent2tail(file, uf_info); -+ if (result != 0) { -+ warning("nikita-3233", -+ "Failed (%d) to convert in %s (%llu)", -+ result, __FUNCTION__, -+ (unsigned long long) -+ get_inode_oid(inode)); -+ } -+ } -+ drop_exclusive_access(uf_info); -+ } else { -+ /* -+ we are within reiser4 context already. How latter is -+ possible? Simple: -+ -+ (gdb) bt -+ #0 get_exclusive_access () -+ #2 0xc01e56d3 in release_unix_file () -+ #3 0xc01c3643 in reiser4_release () -+ #4 0xc014cae0 in __fput () -+ #5 0xc013ffc3 in remove_vm_struct () -+ #6 0xc0141786 in exit_mmap () -+ #7 0xc0118480 in mmput () -+ #8 0xc0133205 in oom_kill () -+ #9 0xc01332d1 in out_of_memory () -+ #10 0xc013bc1d in try_to_free_pages () -+ #11 0xc013427b in __alloc_pages () -+ #12 0xc013f058 in do_anonymous_page () -+ #13 0xc013f19d in do_no_page () -+ #14 0xc013f60e in handle_mm_fault () -+ #15 0xc01131e5 in do_page_fault () -+ #16 0xc0104935 in error_code () -+ #17 0xc025c0c6 in __copy_to_user_ll () -+ #18 0xc01d496f in reiser4_read_tail () -+ #19 0xc01e4def in read_unix_file () -+ #20 0xc01c3504 in reiser4_read () -+ #21 0xc014bd4f in vfs_read () -+ #22 0xc014bf66 in sys_read () -+ */ -+ warning("vs-44", "out of memory?"); -+ } -+ -+ reiser4_free_file_fsdata(file); -+ -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+static void 
set_file_notail(struct inode *inode) -+{ -+ reiser4_inode *state; -+ formatting_plugin *tplug; -+ -+ state = reiser4_inode_data(inode); -+ tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID); -+ force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug); -+} -+ -+/* if file is built of tails - convert it to extents */ -+static int unpack(struct file *filp, struct inode *inode, int forever) -+{ -+ int result = 0; -+ struct unix_file_info *uf_info; -+ -+ uf_info = unix_file_inode_data(inode); -+ assert("vs-1628", ea_obtained(uf_info)); -+ -+ result = find_file_state(inode, uf_info); -+ if (result) -+ return result; -+ assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN); -+ -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ /* -+ * if file is being convered by another process - wait until it -+ * completes -+ */ -+ while (1) { -+ if (reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)) { -+ drop_exclusive_access(uf_info); -+ schedule(); -+ get_exclusive_access(uf_info); -+ continue; -+ } -+ break; -+ } -+ if (uf_info->container == UF_CONTAINER_TAILS) { -+ result = tail2extent(uf_info); -+ if (result) -+ return result; -+ } -+ } -+ if (forever) { -+ /* safe new formatting plugin in stat data */ -+ __u64 tograb; -+ -+ set_file_notail(inode); -+ -+ grab_space_enable(); -+ tograb = inode_file_plugin(inode)->estimate.update(inode); -+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT); -+ result = reiser4_update_sd(inode); -+ } -+ -+ return result; -+} -+ -+/* implentation of vfs' ioctl method of struct file_operations for unix file -+ plugin -+*/ -+int -+ioctl_unix_file(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg UNUSED_ARG) -+{ -+ reiser4_context *ctx; -+ int result; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ switch (cmd) { -+ case REISER4_IOC_UNPACK: -+ get_exclusive_access(unix_file_inode_data(inode)); -+ result = unpack(filp, inode, 1 /* forever */ 
); -+ drop_exclusive_access(unix_file_inode_data(inode)); -+ break; -+ -+ default: -+ result = RETERR(-ENOSYS); -+ break; -+ } -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* implentation of vfs' bmap method of struct address_space_operations for unix -+ file plugin -+*/ -+sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock) -+{ -+ reiser4_context *ctx; -+ sector_t result; -+ reiser4_key key; -+ coord_t coord; -+ lock_handle lh; -+ struct inode *inode; -+ item_plugin *iplug; -+ sector_t block; -+ -+ inode = mapping->host; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ key_by_inode_and_offset_common(inode, -+ (loff_t) lblock * current_blocksize, -+ &key); -+ -+ init_lh(&lh); -+ result = -+ find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode); -+ if (cbk_errored(result)) { -+ done_lh(&lh); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = zload(coord.node); -+ if (result) { -+ done_lh(&lh); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ iplug = item_plugin_by_coord(&coord); -+ if (iplug->s.file.get_block) { -+ result = iplug->s.file.get_block(&coord, lblock, &block); -+ if (result == 0) -+ result = block; -+ } else -+ result = RETERR(-EINVAL); -+ -+ zrelse(coord.node); -+ done_lh(&lh); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * flow_by_inode_unix_file - initizlize structure flow -+ * @inode: inode of file for which read or write is abou -+ * @buf: buffer to perform read to or write from -+ * @user: flag showing whether @buf is user space or kernel space -+ * @size: size of buffer @buf -+ * @off: start offset fro read or write -+ * @op: READ or WRITE -+ * @flow: -+ * -+ * Initializes fields of @flow: key, size of data, i/o mode (read or write). 
-+ */ -+int flow_by_inode_unix_file(struct inode *inode, -+ const char __user *buf, int user, -+ loff_t size, loff_t off, -+ rw_op op, flow_t *flow) -+{ -+ assert("nikita-1100", inode != NULL); -+ -+ flow->length = size; -+ memcpy(&flow->data, &buf, sizeof(buf)); -+ flow->user = user; -+ flow->op = op; -+ assert("nikita-1931", inode_file_plugin(inode) != NULL); -+ assert("nikita-1932", -+ inode_file_plugin(inode)->key_by_inode == -+ key_by_inode_and_offset_common); -+ /* calculate key of write position and insert it into flow->key */ -+ return key_by_inode_and_offset_common(inode, off, &flow->key); -+} -+ -+/* plugin->u.file.set_plug_in_sd = NULL -+ plugin->u.file.set_plug_in_inode = NULL -+ plugin->u.file.create_blank_sd = NULL */ -+/* plugin->u.file.delete */ -+/* -+ plugin->u.file.add_link = reiser4_add_link_common -+ plugin->u.file.rem_link = NULL */ -+ -+/* plugin->u.file.owns_item -+ this is common_file_owns_item with assertion */ -+/* Audited by: green(2002.06.15) */ -+int -+owns_item_unix_file(const struct inode *inode /* object to check against */ , -+ const coord_t * coord /* coord to check */ ) -+{ -+ int result; -+ -+ result = owns_item_common(inode, coord); -+ if (!result) -+ return 0; -+ if (!plugin_of_group(item_plugin_by_coord(coord), -+ UNIX_FILE_METADATA_ITEM_TYPE)) -+ return 0; -+ assert("vs-547", -+ item_id_by_coord(coord) == EXTENT_POINTER_ID || -+ item_id_by_coord(coord) == FORMATTING_ID); -+ return 1; -+} -+ -+static int setattr_truncate(struct inode *inode, struct iattr *attr) -+{ -+ int result; -+ int s_result; -+ loff_t old_size; -+ reiser4_tree *tree; -+ -+ inode_check_scale(inode, inode->i_size, attr->ia_size); -+ -+ old_size = inode->i_size; -+ tree = reiser4_tree_by_inode(inode); -+ -+ result = safe_link_grab(tree, BA_CAN_COMMIT); -+ if (result == 0) -+ result = safe_link_add(inode, SAFE_TRUNCATE); -+ if (result == 0) -+ result = truncate_file_body(inode, attr); -+ if (result) -+ warning("vs-1588", "truncate_file failed: oid %lli, " -+ 
"old size %lld, new size %lld, retval %d", -+ (unsigned long long)get_inode_oid(inode), -+ old_size, attr->ia_size, result); -+ -+ s_result = safe_link_grab(tree, BA_CAN_COMMIT); -+ if (s_result == 0) -+ s_result = -+ safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE); -+ if (s_result != 0) { -+ warning("nikita-3417", "Cannot kill safelink %lli: %i", -+ (unsigned long long)get_inode_oid(inode), s_result); -+ } -+ safe_link_release(tree); -+ return result; -+} -+ -+/* plugin->u.file.setattr method */ -+/* This calls inode_setattr and if truncate is in effect it also takes -+ exclusive inode access to avoid races */ -+int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */ -+ struct iattr *attr /* change description */ ) -+{ -+ int result; -+ -+ if (attr->ia_valid & ATTR_SIZE) { -+ reiser4_context *ctx; -+ struct unix_file_info *uf_info; -+ -+ /* truncate does reservation itself and requires exclusive -+ access obtained */ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ uf_info = unix_file_inode_data(dentry->d_inode); -+ get_exclusive_access_careful(uf_info, dentry->d_inode); -+ result = setattr_truncate(dentry->d_inode, attr); -+ drop_exclusive_access(uf_info); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ } else -+ result = reiser4_setattr_common(dentry, attr); -+ -+ return result; -+} -+ -+/* plugin->u.file.init_inode_data */ -+void -+init_inode_data_unix_file(struct inode *inode, -+ reiser4_object_create_data * crd, int create) -+{ -+ struct unix_file_info *data; -+ -+ data = unix_file_inode_data(inode); -+ data->container = create ? 
UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN; -+ init_rwsem(&data->latch); -+ data->tplug = inode_formatting_plugin(inode); -+ data->exclusive_use = 0; -+ -+#if REISER4_DEBUG -+ data->ea_owner = NULL; -+ atomic_set(&data->nr_neas, 0); -+#endif -+ init_inode_ordering(inode, crd, create); -+} -+ -+/** -+ * delete_unix_file - delete_object of file_plugin -+ * @inode: inode to be deleted -+ * -+ * Truncates file to length 0, removes stat data and safe link. -+ */ -+int delete_object_unix_file(struct inode *inode) -+{ -+ struct unix_file_info *uf_info; -+ int result; -+ -+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD)) -+ return 0; -+ -+ /* truncate file bogy first */ -+ uf_info = unix_file_inode_data(inode); -+ get_exclusive_access(uf_info); -+ result = shorten_file(inode, 0 /* size */ ); -+ drop_exclusive_access(uf_info); -+ -+ if (result) -+ warning("edward-1556", -+ "failed to truncate file (%llu) on removal: %d", -+ get_inode_oid(inode), result); -+ -+ /* remove stat data and safe link */ -+ return reiser4_delete_object_common(inode); -+} -+ -+int -+prepare_write_unix_file(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ reiser4_context *ctx; -+ struct unix_file_info *uf_info; -+ int ret; -+ -+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ uf_info = unix_file_inode_data(file->f_dentry->d_inode); -+ get_exclusive_access(uf_info); -+ ret = find_file_state(file->f_dentry->d_inode, uf_info); -+ if (ret == 0) { -+ if (uf_info->container == UF_CONTAINER_TAILS) -+ ret = -EINVAL; -+ else -+ ret = do_prepare_write(file, page, from, to); -+ } -+ drop_exclusive_access(uf_info); -+ -+ /* don't commit transaction under inode semaphore */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return ret; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * 
End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.24/fs/reiser4/plugin/file/file_conversion.c ---- linux-2.6.24.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/file/file_conversion.c 2008-01-25 11:39:06.988221084 +0300 -@@ -0,0 +1,689 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, -+ licensing governed by reiser4/README */ -+ -+/** -+ * This file contains plugin schedule hooks, and plugin conversion methods. -+ * -+ * Plugin schedule hook makes a decision (at plugin schedule point) about the -+ * most reasonable plugins for managing a regular file. Usually such decisions -+ * is made by some O(1)-heuristic. -+ * -+ * By default we assign a unix_file plugin id when writing incompressible file -+ * managed by cryptcompress plugin id. Currently used heuristic for estimating -+ * compressibility is very simple: if first complete logical cluster (64K by -+ * default) of a file is incompressible, then we make a decision, that the whole -+ * file is incompressible (*). -+ * -+ * To enable a conversion we install a special "magic" compression mode plugin -+ * (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for details) -+ * at file creation time (**). -+ * -+ * Note, that we don't perform back conversion (unix_file->cryptcompress) -+ * because of compatibility reasons (see http://dev.namesys.com/Version4.X.Y -+ * for details). -+ * -+ * The conversion is accompanied by rebuilding disk structures of a file, so it -+ * is important to protect them from being interacted with other plugins which -+ * don't expect them to be in such inconsistent state. For this to be protected -+ * we serialize readers and writers of a file's conversion set (FCS). -+ * -+ * We define FCS as a file plugin installed in inode's pset plus file's data -+ * and metadata that this file plugin manipulates with (items, etc). -+ * Note, that FCS is defined per file. 
-+ * FCS reader is defined as a set of instruction of the following type: -+ * {inode_file_plugin(inode)->method()} (I.e. retrieving a file plugin id -+ * conjoined with all method's instructions should be atomic). -+ * FCS writer is a set of instructions that perform file plugin conversion -+ * (convert items, update pset, etc). -+ * Example: -+ * reiser4_write_careful() supplied to VFS as a ->write() file operation is -+ * composed of the following (optional) instructions: -+ * 1 2 3 -+ * *********************** ####### --------------------------------------------> -+ * -+ * 1) "****" are instructions performed on behalf of cryptcompress file plugin; -+ * 2) "####" is a FCS writer (performing a conversion cryptcompress->unix_file); -+ * 3) "----" are instructions performed on behalf of unix_file plugin; -+ * Here (1) and (3) are FCS readers. -+ * -+ * In this example FCS readers and writers are already serialized (by design), -+ * however there can be readers and writers executing at the same time in -+ * different contexts, so we need a common mechanism of serialization. -+ * -+ * Currently serialization of FCS readers and writers is performed via acquiring -+ * a special per-inode rw-semaphore (conv_sem). And yes, {down, up}_read is for -+ * FCS readers, and {down, up}_write is for FCS writers, see the macros below -+ * for passive/active protection. -+ * -+ * --- -+ * (*) This heuristic can be changed to a better one (benchmarking is needed). -+ * (**) Such technique allows to keep enable/disable state on disk. -+ */ -+ -+#include "../../inode.h" -+#include "../cluster.h" -+#include "file.h" -+ -+#define conversion_enabled(inode) \ -+ (inode_compression_mode_plugin(inode) == \ -+ compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID)) -+ -+/** -+ * Located sections (readers and writers of @pset) are not permanently -+ * critical: cryptcompress file can be converted only if the conversion -+ * is enabled (see the macrio above). 
Also we don't perform back -+ * conversion. The following helper macro is a sanity check to decide -+ * if we need the protection (locks are always additional overheads). -+ */ -+#define should_protect(inode) \ -+ (inode_file_plugin(inode) == \ -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \ -+ conversion_enabled(inode)) -+/** -+ * To avoid confusion with read/write file operations, we'll speak about -+ * "passive" protection for FCS readers and "active" protection for FCS -+ * writers. All methods with active or passive protection have suffix -+ * "careful". -+ */ -+/** -+ * Macros for passive protection. -+ * -+ * Construct invariant operation to be supplied to VFS. -+ * The macro accepts the following lexemes: -+ * @type - type of the value represented by the compound statement; -+ * @method - name of an operation to be supplied to VFS (reiser4 file -+ * plugin also should contain a method with such name). -+ */ -+#define PROT_PASSIVE(type, method, args) \ -+({ \ -+ type _result; \ -+ struct rw_semaphore * guard = \ -+ &reiser4_inode_data(inode)->conv_sem; \ -+ \ -+ if (should_protect(inode)) { \ -+ down_read(guard); \ -+ if (!should_protect(inode)) \ -+ up_read(guard); \ -+ } \ -+ _result = inode_file_plugin(inode)->method args; \ -+ if (should_protect(inode)) \ -+ up_read(guard); \ -+ _result; \ -+}) -+ -+#define PROT_PASSIVE_VOID(method, args) \ -+({ \ -+ struct rw_semaphore * guard = \ -+ &reiser4_inode_data(inode)->conv_sem; \ -+ \ -+ if (should_protect(inode)) { \ -+ down_read(guard); \ -+ if (!should_protect(inode)) \ -+ up_read(guard); \ -+ } \ -+ inode_file_plugin(inode)->method args; \ -+ \ -+ if (should_protect(inode)) \ -+ up_read(guard); \ -+}) -+ -+/* Pass management to the unix-file plugin with "notail" policy */ -+static int __cryptcompress2unixfile(struct file *file, struct inode * inode) -+{ -+ int result; -+ reiser4_inode *info; -+ struct unix_file_info * uf; -+ info = reiser4_inode_data(inode); -+ -+ result = 
aset_set_unsafe(&info->pset, -+ PSET_FILE, -+ (reiser4_plugin *) -+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)); -+ if (result) -+ return result; -+ result = aset_set_unsafe(&info->pset, -+ PSET_FORMATTING, -+ (reiser4_plugin *) -+ formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID)); -+ if (result) -+ return result; -+ /* get rid of non-standard plugins */ -+ info->plugin_mask &= ~cryptcompress_mask; -+ /* get rid of plugin stat-data extension */ -+ info->extmask &= ~(1 << PLUGIN_STAT); -+ -+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN); -+ -+ /* FIXME use init_inode_data_unix_file() instead, -+ but aviod init_inode_ordering() */ -+ /* Init unix-file specific part of inode */ -+ uf = unix_file_inode_data(inode); -+ uf->container = UF_CONTAINER_UNKNOWN; -+ init_rwsem(&uf->latch); -+ uf->tplug = inode_formatting_plugin(inode); -+ uf->exclusive_use = 0; -+#if REISER4_DEBUG -+ uf->ea_owner = NULL; -+ atomic_set(&uf->nr_neas, 0); -+#endif -+ /** -+ * we was carefull for file_ops, inode_ops and as_ops -+ * to be invariant for plugin conversion, so there is -+ * no need to update ones already installed in the -+ * vfs's residence. -+ */ -+ return 0; -+} -+ -+#if REISER4_DEBUG -+static int disabled_conversion_inode_ok(struct inode * inode) -+{ -+ __u64 extmask = reiser4_inode_data(inode)->extmask; -+ __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask; -+ -+ return ((extmask & (1 << LIGHT_WEIGHT_STAT)) && -+ (extmask & (1 << UNIX_STAT)) && -+ (extmask & (1 << LARGE_TIMES_STAT)) && -+ (extmask & (1 << PLUGIN_STAT)) && -+ (plugin_mask & (1 << PSET_COMPRESSION_MODE))); -+} -+#endif -+ -+/** -+ * Disable future attempts to schedule/convert file plugin. -+ * This function is called by plugin schedule hooks. -+ * -+ * To disable conversion we assign any compression mode plugin id -+ * different from CONVX_COMPRESSION_MODE_ID. 
-+ */ -+static int disable_conversion(struct inode * inode) -+{ -+ int result; -+ result = -+ force_plugin_pset(inode, -+ PSET_COMPRESSION_MODE, -+ (reiser4_plugin *)compression_mode_plugin_by_id -+ (LATTD_COMPRESSION_MODE_ID)); -+ assert("edward-1500", -+ ergo(!result, disabled_conversion_inode_ok(inode))); -+ return result; -+} -+ -+/** -+ * Check if we really have achieved plugin scheduling point -+ */ -+static int check_psched_point(struct inode * inode, -+ loff_t pos /* position in the -+ file to write from */, -+ struct cluster_handle * clust, -+ struct psched_context * cont) -+{ -+ assert("edward-1505", conversion_enabled(inode)); -+ /* -+ * if file size is more then cluster size, then compressible -+ * status must be figured out (i.e. compression was disabled, -+ * or file plugin was converted to unix_file) -+ */ -+ assert("edward-1506", inode->i_size <= inode_cluster_size(inode)); -+ -+ if (pos > inode->i_size) -+ /* first logical cluster will contain a (partial) hole */ -+ return disable_conversion(inode); -+ if (pos < inode_cluster_size(inode)) -+ /* writing to the first logical cluster */ -+ return 0; -+ /* -+ * here we have: -+ * cluster_size <= pos <= i_size <= cluster_size, -+ * and, hence, pos == i_size == cluster_size -+ */ -+ assert("edward-1498", -+ pos == inode->i_size && -+ pos == inode_cluster_size(inode)); -+ assert("edward-1539", cont != NULL); -+ assert("edward-1540", cont->state == PSCHED_INVAL_STATE); -+ -+ cont->state = PSCHED_SCHED_POINT; -+ return 0; -+} -+ -+static void start_check_compressibility(struct inode * inode, -+ struct cluster_handle * clust, -+ hint_t * hint) -+{ -+ assert("edward-1507", clust->index == 1); -+ assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc)); -+ assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ); -+ -+ hint_init_zero(hint); -+ clust->hint = hint; -+ clust->index --; -+ clust->nr_pages = size_in_pages(lbytes(clust->index, inode)); -+ -+ /* first logical cluster (of index #0) must 
be complete */ -+ assert("edward-1510", lbytes(clust->index, inode) == -+ inode_cluster_size(inode)); -+} -+ -+static void finish_check_compressibility(struct inode * inode, -+ struct cluster_handle * clust, -+ hint_t * hint) -+{ -+ reiser4_unset_hint(clust->hint); -+ clust->hint = hint; -+ clust->index ++; -+} -+ -+#if REISER4_DEBUG -+static int prepped_dclust_ok(hint_t * hint) -+{ -+ reiser4_key key; -+ coord_t * coord = &hint->ext_coord.coord; -+ -+ item_key_by_coord(coord, &key); -+ return (item_id_by_coord(coord) == CTAIL_ID && -+ !coord_is_unprepped_ctail(coord) && -+ (get_key_offset(&key) + nr_units_ctail(coord) == -+ dclust_get_extension_dsize(hint))); -+} -+#endif -+ -+#define fifty_persent(size) (size >> 1) -+/* evaluation of data compressibility */ -+#define data_is_compressible(osize, isize) \ -+ (osize < fifty_persent(isize)) -+ -+/** -+ * A simple O(1)-heuristic for compressibility. -+ * This is called not more then one time per file's life. -+ * Read first logical cluster (of index #0) and estimate its compressibility. -+ * Save estimation result in @cont. 
-+ */ -+static int read_check_compressibility(struct inode * inode, -+ struct cluster_handle * clust, -+ struct psched_context * cont) -+{ -+ int i; -+ int result; -+ __u32 dst_len; -+ hint_t tmp_hint; -+ hint_t * cur_hint = clust->hint; -+ assert("edward-1541", cont->state == PSCHED_SCHED_POINT); -+ -+ start_check_compressibility(inode, clust, &tmp_hint); -+ -+ reset_cluster_pgset(clust, cluster_nrpages(inode)); -+ result = grab_page_cluster(inode, clust, READ_OP); -+ if (result) -+ return result; -+ /* Read page cluster here */ -+ for (i = 0; i < clust->nr_pages; i++) { -+ struct page *page = clust->pages[i]; -+ lock_page(page); -+ result = do_readpage_ctail(inode, clust, page, -+ ZNODE_READ_LOCK); -+ unlock_page(page); -+ if (result) -+ goto error; -+ } -+ tfm_cluster_clr_uptodate(&clust->tc); -+ -+ cluster_set_tfm_act(&clust->tc, TFMA_WRITE); -+ -+ if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) { -+ /* lenght of compressed data is known, no need to compress */ -+ assert("edward-1511", -+ znode_is_any_locked(tmp_hint.lh.node)); -+ assert("edward-1512", -+ WITH_DATA(tmp_hint.ext_coord.coord.node, -+ prepped_dclust_ok(&tmp_hint))); -+ dst_len = dclust_get_extension_dsize(&tmp_hint); -+ } -+ else { -+ struct tfm_cluster * tc = &clust->tc; -+ compression_plugin * cplug = inode_compression_plugin(inode); -+ result = grab_tfm_stream(inode, tc, INPUT_STREAM); -+ if (result) -+ goto error; -+ for (i = 0; i < clust->nr_pages; i++) { -+ char *data; -+ lock_page(clust->pages[i]); -+ BUG_ON(!PageUptodate(clust->pages[i])); -+ data = kmap(clust->pages[i]); -+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i), -+ data, PAGE_CACHE_SIZE); -+ kunmap(clust->pages[i]); -+ unlock_page(clust->pages[i]); -+ } -+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM); -+ if (result) -+ goto error; -+ result = grab_coa(tc, cplug); -+ if (result) -+ goto error; -+ tc->len = tc->lsize = lbytes(clust->index, inode); -+ assert("edward-1513", tc->len == 
inode_cluster_size(inode)); -+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM); -+ cplug->compress(get_coa(tc, cplug->h.id, tc->act), -+ tfm_input_data(clust), tc->len, -+ tfm_output_data(clust), &dst_len); -+ assert("edward-1514", -+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM)); -+ } -+ finish_check_compressibility(inode, clust, cur_hint); -+ cont->state = -+ (data_is_compressible(dst_len, inode_cluster_size(inode)) ? -+ PSCHED_REMAINS_OLD : -+ PSCHED_ASSIGNED_NEW); -+ return 0; -+ error: -+ put_page_cluster(clust, inode, READ_OP); -+ return result; -+} -+ -+/* Cut disk cluster of index @idx */ -+static int cut_disk_cluster(struct inode * inode, cloff_t idx) -+{ -+ reiser4_key from, to; -+ assert("edward-1515", inode_file_plugin(inode) == -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from); -+ to = from; -+ set_key_offset(&to, -+ get_key_offset(&from) + inode_cluster_size(inode) - 1); -+ return reiser4_cut_tree(reiser4_tree_by_inode(inode), -+ &from, &to, inode, 0); -+} -+ -+static int reserve_cryptcompress2unixfile(struct inode *inode) -+{ -+ reiser4_block_nr unformatted_nodes; -+ reiser4_tree *tree; -+ -+ tree = reiser4_tree_by_inode(inode); -+ -+ /* number of unformatted nodes which will be created */ -+ unformatted_nodes = cluster_nrpages(inode); /* N */ -+ -+ /* -+ * space required for one iteration of extent->tail conversion: -+ * -+ * 1. kill ctail items -+ * -+ * 2. insert N unformatted nodes -+ * -+ * 3. insert N (worst-case single-block -+ * extents) extent units. -+ * -+ * 4. drilling to the leaf level by coord_by_key() -+ * -+ * 5. 
possible update of stat-data -+ * -+ */ -+ grab_space_enable(); -+ return reiser4_grab_space -+ (2 * tree->height + -+ unformatted_nodes + -+ unformatted_nodes * estimate_one_insert_into_item(tree) + -+ 1 + estimate_one_insert_item(tree) + -+ inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+} -+ -+/** -+ * Convert cryptcompress file plugin to unix_file plugin. -+ */ -+static int cryptcompress2unixfile(struct file * file, struct inode * inode, -+ struct psched_context * cont) -+{ -+ int i; -+ int result = 0; -+ struct cryptcompress_info *cr_info; -+ struct unix_file_info *uf_info; -+ assert("edward-1516", cont->pages[0]->index == 0); -+ -+ /* release all cryptcompress-specific resources */ -+ cr_info = cryptcompress_inode_data(inode); -+ result = reserve_cryptcompress2unixfile(inode); -+ if (result) -+ goto out; -+ /* tell kill_hook to not truncate pages */ -+ reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS); -+ result = cut_disk_cluster(inode, 0); -+ if (result) -+ goto out; -+ /* captured jnode of cluster and assotiated resources (pages, -+ reserved disk space) were released by ->kill_hook() method -+ of the item plugin */ -+ -+ result = __cryptcompress2unixfile(file, inode); -+ if (result) -+ goto out; -+ /* At this point file is managed by unix file plugin */ -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ assert("edward-1518", -+ ergo(jprivate(cont->pages[0]), -+ !jnode_is_cluster_page(jprivate(cont->pages[0])))); -+ for(i = 0; i < cont->nr_pages; i++) { -+ assert("edward-1519", cont->pages[i]); -+ assert("edward-1520", PageUptodate(cont->pages[i])); -+ -+ result = find_or_create_extent(cont->pages[i]); -+ if (result) -+ break; -+ } -+ if (unlikely(result)) -+ goto out; -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ result = reiser4_update_sd(inode); -+ out: -+ all_grabbed2free(); -+ return result; -+} -+ -+#define convert_file_plugin cryptcompress2unixfile -+ -+/** -+ * This is called by ->write() method of a 
cryptcompress file plugin. -+ * Make a decision about the most reasonable file plugin id to manage -+ * the file. -+ */ -+int write_pschedule_hook(struct file * file, struct inode * inode, -+ loff_t pos, struct cluster_handle * clust, -+ struct psched_context * cont) -+{ -+ int result; -+ if (!conversion_enabled(inode)) -+ return 0; -+ result = check_psched_point(inode, pos, clust, cont); -+ if (result || cont->state != PSCHED_SCHED_POINT) -+ return result; -+ result = read_check_compressibility(inode, clust, cont); -+ if (result) -+ return result; -+ if (cont->state == PSCHED_REMAINS_OLD) { -+ put_page_cluster(clust, inode, READ_OP); -+ return disable_conversion(inode); -+ } -+ assert("edward-1543", cont->state == PSCHED_ASSIGNED_NEW); -+ /* -+ * page cluster is grabbed and uptodate. It will be -+ * released with a pgset after plugin conversion is -+ * finished, see put_psched_context(). -+ */ -+ reiser4_unset_hint(clust->hint); -+ move_cluster_pgset(clust, &cont->pages, &cont->nr_pages); -+ return 0; -+} -+ -+/** -+ * This is called by ->setattr() method of cryptcompress file plugin. -+ */ -+int setattr_pschedule_hook(struct inode * inode) -+{ -+ if (conversion_enabled(inode)) -+ return disable_conversion(inode); -+ return 0; -+} -+ -+static inline void init_psched_context(struct psched_context * cont) -+{ -+ memset(cont, 0, sizeof(*cont)); -+} -+ -+static inline void done_psched_context(struct psched_context * cont, -+ struct inode * inode) -+{ -+ if (cont->pages) { -+ __put_page_cluster(0, cont->nr_pages, cont->pages, inode); -+ kfree(cont->pages); -+ } -+} -+/** -+ * Here are wrappers with "protection", aka Reiser4 "careful" methods. -+ * They are used by vfs (as methods of file_ops, inode_ops or as_ops), -+ * which is not aware of plugin conversion performed by Reiser4. -+ */ -+ -+/* -+ * Wrappers with active protection for: -+ * -+ * ->write(); -+ */ -+ -+/* -+ * ->write() file operation supplied to VFS. 
-+ * Write a file in 3 steps (some of them can be optional). -+ */ -+ssize_t reiser4_write_careful(struct file *file, const char __user *buf, -+ size_t count, loff_t *off) -+{ -+ int result; -+ reiser4_context *ctx; -+ ssize_t written_old = 0; /* bytes written with initial plugin */ -+ ssize_t written_new = 0; /* bytes written with new plugin */ -+ struct psched_context cont; -+ struct inode * inode = file->f_dentry->d_inode; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ init_psched_context(&cont); -+ mutex_lock(&inode->i_mutex); -+ /** -+ * First step. -+ * Start write with initial file plugin. -+ * Keep a plugin schedule status at @cont (if any). -+ */ -+ written_old = inode_file_plugin(inode)->write(file, -+ buf, -+ count, -+ off, -+ &cont); -+ if (cont.state != PSCHED_ASSIGNED_NEW || written_old < 0) -+ goto exit; -+ /** -+ * Second step. -+ * New file plugin has been scheduled. -+ * Perform conversion to the new plugin. -+ */ -+ down_read(&reiser4_inode_data(inode)->conv_sem); -+ result = convert_file_plugin(file, inode, &cont); -+ up_read(&reiser4_inode_data(inode)->conv_sem); -+ if (result) { -+ warning("edward-1544", -+ "Inode %llu: file plugin conversion failed (%d)", -+ (unsigned long long)get_inode_oid(inode), -+ result); -+ context_set_commit_async(ctx); -+ goto exit; -+ } -+ reiser4_txn_restart(ctx); -+ /** -+ * Third step: -+ * Finish write with the new file plugin. -+ */ -+ assert("edward-1536", -+ inode_file_plugin(inode) == -+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)); -+ -+ written_new = inode_file_plugin(inode)->write(file, -+ buf + written_old, -+ count - written_old, -+ off, -+ NULL); -+ exit: -+ mutex_unlock(&inode->i_mutex); -+ done_psched_context(&cont, inode); -+ reiser4_exit_context(ctx); -+ -+ return written_old + (written_new < 0 ? 
0 : written_new); -+} -+ -+/* Wrappers with passive protection for: -+ * -+ * ->open(); -+ * ->read(); -+ * ->ioctl(); -+ * ->mmap(); -+ * ->release(); -+ * ->bmap(). -+ */ -+ -+int reiser4_open_careful(struct inode *inode, struct file *file) -+{ -+ return PROT_PASSIVE(int, open, (inode, file)); -+} -+ -+ssize_t reiser4_read_careful(struct file * file, char __user * buf, -+ size_t size, loff_t * off) -+{ -+ struct inode * inode = file->f_dentry->d_inode; -+ return PROT_PASSIVE(ssize_t, read, (file, buf, size, off)); -+} -+ -+int reiser4_ioctl_careful(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg) -+{ -+ return PROT_PASSIVE(int, ioctl, (inode, filp, cmd, arg)); -+} -+ -+int reiser4_mmap_careful(struct file *file, struct vm_area_struct *vma) -+{ -+ struct inode *inode = file->f_dentry->d_inode; -+ return PROT_PASSIVE(int, mmap, (file, vma)); -+} -+ -+int reiser4_release_careful(struct inode *inode, struct file *file) -+{ -+ return PROT_PASSIVE(int, release, (inode, file)); -+} -+ -+sector_t reiser4_bmap_careful(struct address_space * mapping, sector_t lblock) -+{ -+ struct inode *inode = mapping->host; -+ return PROT_PASSIVE(sector_t, bmap, (mapping, lblock)); -+} -+ -+/* -+ * Wrappers without protection for: -+ * -+ * ->setattr() -+ */ -+int reiser4_setattr(struct dentry *dentry, struct iattr *attr) -+{ -+ return inode_file_plugin(dentry->d_inode)->setattr(dentry, attr); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/file.h linux-2.6.24/fs/reiser4/plugin/file/file.h ---- linux-2.6.24.orig/fs/reiser4/plugin/file/file.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/file/file.h 2008-01-25 11:40:16.694168755 +0300 -@@ -0,0 +1,331 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* this 
file contains declarations of methods implementing -+ file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID -+ and SYMLINK_FILE_PLUGIN_ID) */ -+ -+#if !defined( __REISER4_FILE_H__ ) -+#define __REISER4_FILE_H__ -+ -+/* possible states when scheduling a new file plugin */ -+typedef enum { -+ PSCHED_INVAL_STATE, /* invalid state */ -+ PSCHED_SCHED_POINT, /* scheduling point has been achieved */ -+ PSCHED_REMAINS_OLD, /* made a decision to be managed by old plugin */ -+ PSCHED_ASSIGNED_NEW /* new plugin has been scheduled */ -+} psched_state; -+ -+struct psched_context { -+ int nr_pages; -+ struct page **pages; -+ psched_state state; -+}; -+ -+/** -+ * Declarations of common/careful/generic methods. -+ * Suppose ->foo() is a vs method (of f_ops, i_ops, or a_ops); -+ * Then common reiser4 method for foo looks like reiser4_foo_common; -+ * careful method looks like reiser4_foo_careful; -+ * generic method looks like reiser4_foo. -+ * -+ * Common method is a simple instruction set eligible for more -+ * then one plugin id. -+ * -+ * Generic method looks at the plugin installed in inode's -+ * plugin set and calls its appropriate method. -+ * -+ * Careful method looks like generic method with protected pset -+ * (see plugin/file/file_conversion.c for details). 
-+ */ -+ -+/* inode operations */ -+int reiser4_setattr(struct dentry *, struct iattr *); -+ -+/* file operations */ -+ssize_t reiser4_read_careful(struct file *, char __user *buf, -+ size_t count, loff_t *off); -+ssize_t reiser4_write_careful(struct file *, const char __user *buf, -+ size_t count, loff_t * off); -+int reiser4_ioctl_careful(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg); -+int reiser4_mmap_careful(struct file *, struct vm_area_struct *); -+int reiser4_open_careful(struct inode *inode, struct file *file); -+int reiser4_release_careful(struct inode *, struct file *); -+int reiser4_sync_file_common(struct file *, struct dentry *, int datasync); -+ -+/* address space operations */ -+int reiser4_readpage(struct file *, struct page *); -+int reiser4_readpages(struct file*, struct address_space*, struct list_head*, -+ unsigned); -+int reiser4_writepages(struct address_space *, struct writeback_control *); -+int reiser4_prepare_write(struct file *, struct page *, unsigned from, -+ unsigned to); -+int reiser4_commit_write(struct file *, struct page *, unsigned from, -+ unsigned to); -+sector_t reiser4_bmap_careful(struct address_space *, sector_t lblock); -+ -+/* -+ * Private methods of unix-file plugin -+ * (UNIX_FILE_PLUGIN_ID) -+ */ -+ -+/* private inode operations */ -+int setattr_unix_file(struct dentry *, struct iattr *); -+ -+/* private file operations */ -+ -+ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount, -+ loff_t *off); -+ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount, -+ loff_t * off, struct psched_context * cont); -+int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd, -+ unsigned long arg); -+int mmap_unix_file(struct file *, struct vm_area_struct *); -+int open_unix_file(struct inode *, struct file *); -+int release_unix_file(struct inode *, struct file *); -+ -+/* private address space operations */ -+int readpage_unix_file(struct 
file *, struct page *); -+int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned); -+int writepages_unix_file(struct address_space *, struct writeback_control *); -+int prepare_write_unix_file(struct file *, struct page *, unsigned from, -+ unsigned to); -+int commit_write_unix_file(struct file *, struct page *, unsigned from, -+ unsigned to); -+sector_t bmap_unix_file(struct address_space *, sector_t lblock); -+ -+/* other private methods */ -+int delete_object_unix_file(struct inode *); -+int flow_by_inode_unix_file(struct inode *, const char __user *buf, -+ int user, loff_t, loff_t, rw_op, flow_t *); -+int owns_item_unix_file(const struct inode *, const coord_t *); -+void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *, -+ int create); -+ -+/* -+ * Private methods of cryptcompress file plugin -+ * (CRYPTCOMPRESS_FILE_PLUGIN_ID) -+ */ -+ -+/* private inode operations */ -+int setattr_cryptcompress(struct dentry *, struct iattr *); -+ -+/* private file operations */ -+ssize_t read_cryptcompress(struct file *, char __user *buf, -+ size_t count, loff_t *off); -+ssize_t write_cryptcompress(struct file *, const char __user *buf, -+ size_t count, loff_t * off, -+ struct psched_context *cont); -+int ioctl_cryptcompress(struct inode *, struct file *, unsigned int cmd, -+ unsigned long arg); -+int mmap_cryptcompress(struct file *, struct vm_area_struct *); -+int open_cryptcompress(struct inode *, struct file *); -+int release_cryptcompress(struct inode *, struct file *); -+ -+/* private address space operations */ -+int readpage_cryptcompress(struct file *, struct page *); -+int readpages_cryptcompress(struct file*, struct address_space*, -+ struct list_head*, unsigned); -+int writepages_cryptcompress(struct address_space *, -+ struct writeback_control *); -+int prepare_write_cryptcompress(struct file *, struct page *, unsigned from, -+ unsigned to); -+int commit_write_cryptcompress(struct file *, struct page *, 
unsigned from, -+ unsigned to); -+sector_t bmap_cryptcompress(struct address_space *, sector_t lblock); -+ -+/* other private methods */ -+int flow_by_inode_cryptcompress(struct inode *, const char __user *buf, -+ int user, loff_t, loff_t, rw_op, flow_t *); -+int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *); -+int create_object_cryptcompress(struct inode *, struct inode *, -+ reiser4_object_create_data *); -+int delete_object_cryptcompress(struct inode *); -+void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *, -+ int create); -+int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed, -+ struct inode *object, int truncate, -+ int *progress); -+void destroy_inode_cryptcompress(struct inode *); -+ -+/* -+ * Private methods of symlink file plugin -+ * (SYMLINK_FILE_PLUGIN_ID) -+ */ -+int reiser4_create_symlink(struct inode *symlink, struct inode *dir, -+ reiser4_object_create_data *); -+void destroy_inode_symlink(struct inode *); -+ -+/* -+ * all the write into unix file is performed by item write method. Write method -+ * of unix file plugin only decides which item plugin (extent or tail) and in -+ * which mode (one from the enum below) to call -+ */ -+typedef enum { -+ FIRST_ITEM = 1, -+ APPEND_ITEM = 2, -+ OVERWRITE_ITEM = 3 -+} write_mode_t; -+ -+/* unix file may be in one the following states */ -+typedef enum { -+ UF_CONTAINER_UNKNOWN = 0, -+ UF_CONTAINER_TAILS = 1, -+ UF_CONTAINER_EXTENTS = 2, -+ UF_CONTAINER_EMPTY = 3 -+} file_container_t; -+ -+struct formatting_plugin; -+struct inode; -+ -+/* unix file plugin specific part of reiser4 inode */ -+struct unix_file_info { -+ /* -+ * this read-write lock protects file containerization change. Accesses -+ * which do not change file containerization (see file_container_t) -+ * (read, readpage, writepage, write (until tail conversion is -+ * involved)) take read-lock. 
Accesses which modify file -+ * containerization (truncate, conversion from tail to extent and back) -+ * take write-lock. -+ */ -+ struct rw_semaphore latch; -+ /* this enum specifies which items are used to build the file */ -+ file_container_t container; -+ /* -+ * plugin which controls when file is to be converted to extents and -+ * back to tail -+ */ -+ struct formatting_plugin *tplug; -+ /* if this is set, file is in exclusive use */ -+ int exclusive_use; -+#if REISER4_DEBUG -+ /* pointer to task struct of thread owning exclusive access to file */ -+ void *ea_owner; -+ atomic_t nr_neas; -+ void *last_reader; -+#endif -+}; -+ -+struct unix_file_info *unix_file_inode_data(const struct inode *inode); -+void get_exclusive_access(struct unix_file_info *); -+void drop_exclusive_access(struct unix_file_info *); -+void get_nonexclusive_access(struct unix_file_info *); -+void drop_nonexclusive_access(struct unix_file_info *); -+int try_to_get_nonexclusive_access(struct unix_file_info *); -+int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode, -+ struct inode *); -+int find_file_item_nohint(coord_t *, lock_handle *, -+ const reiser4_key *, znode_lock_mode, -+ struct inode *); -+ -+int load_file_hint(struct file *, hint_t *); -+void save_file_hint(struct file *, const hint_t *); -+ -+#include "../item/extent.h" -+#include "../item/tail.h" -+#include "../item/ctail.h" -+ -+struct uf_coord { -+ coord_t coord; -+ lock_handle *lh; -+ int valid; -+ union { -+ struct extent_coord_extension extent; -+ struct tail_coord_extension tail; -+ struct ctail_coord_extension ctail; -+ } extension; -+}; -+ -+#include "../../forward.h" -+#include "../../seal.h" -+#include "../../lock.h" -+ -+/* -+ * This structure is used to speed up file operations (reads and writes). A -+ * hint is a suggestion about where a key resolved to last time. A seal -+ * indicates whether a node has been modified since a hint was last recorded. 
-+ * You check the seal, and if the seal is still valid, you can use the hint -+ * without traversing the tree again. -+ */ -+struct hint { -+ seal_t seal; /* a seal over last file item accessed */ -+ uf_coord_t ext_coord; -+ loff_t offset; -+ znode_lock_mode mode; -+ lock_handle lh; -+}; -+ -+static inline int hint_is_valid(hint_t * hint) -+{ -+ return hint->ext_coord.valid; -+} -+ -+static inline void hint_set_valid(hint_t * hint) -+{ -+ hint->ext_coord.valid = 1; -+} -+ -+static inline void hint_clr_valid(hint_t * hint) -+{ -+ hint->ext_coord.valid = 0; -+} -+ -+int load_file_hint(struct file *, hint_t *); -+void save_file_hint(struct file *, const hint_t *); -+void hint_init_zero(hint_t *); -+void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode); -+int hint_is_set(const hint_t *); -+void reiser4_unset_hint(hint_t *); -+ -+int reiser4_update_file_size(struct inode *, loff_t, int update_sd); -+int cut_file_items(struct inode *, loff_t new_size, -+ int update_sd, loff_t cur_size, -+ int (*update_actor) (struct inode *, loff_t, int)); -+#if REISER4_DEBUG -+ -+/* return 1 is exclusive access is obtained, 0 - otherwise */ -+static inline int ea_obtained(struct unix_file_info * uf_info) -+{ -+ int ret; -+ -+ ret = down_read_trylock(&uf_info->latch); -+ if (ret) -+ up_read(&uf_info->latch); -+ return !ret; -+} -+ -+#endif -+ -+#define WRITE_GRANULARITY 32 -+ -+int tail2extent(struct unix_file_info *); -+int extent2tail(struct file *, struct unix_file_info *); -+ -+int goto_right_neighbor(coord_t *, lock_handle *); -+int find_or_create_extent(struct page *); -+int equal_to_ldk(znode *, const reiser4_key *); -+ -+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh); -+ -+static inline int cbk_errored(int cbk_result) -+{ -+ return (cbk_result != CBK_COORD_NOTFOUND -+ && cbk_result != CBK_COORD_FOUND); -+} -+ -+/* __REISER4_FILE_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ 
* tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/Makefile linux-2.6.24/fs/reiser4/plugin/file/Makefile ---- linux-2.6.24.orig/fs/reiser4/plugin/file/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/file/Makefile 2008-01-25 11:39:06.988221084 +0300 -@@ -0,0 +1,7 @@ -+obj-$(CONFIG_REISER4_FS) += file_plugins.o -+ -+file_plugins-objs := \ -+ file.o \ -+ tail_conversion.o \ -+ symlink.o \ -+ cryptcompress.o -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.24/fs/reiser4/plugin/file/symfile.c ---- linux-2.6.24.orig/fs/reiser4/plugin/file/symfile.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/file/symfile.c 2008-01-25 11:39:06.992222114 +0300 -@@ -0,0 +1,87 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Symfiles are a generalization of Unix symlinks. -+ -+ A symfile when read behaves as though you took its contents and -+ substituted them into the reiser4 naming system as the right hand side -+ of an assignment, and then read that which you had assigned to it. -+ -+ A key issue for symfiles is how to implement writes through to -+ subfiles. In general, one must have some method of determining what -+ of that which is written to the symfile is written to what subfile. -+ This can be done by use of custom plugin methods written by users, or -+ by using a few general methods we provide for those willing to endure -+ the insertion of delimiters into what is read. -+ -+ Writing to symfiles without delimiters to denote what is written to -+ what subfile is not supported by any plugins we provide in this -+ release. Our most sophisticated support for writes is that embodied -+ by the invert plugin (see invert.c). 
-+ -+ A read only version of the /etc/passwd file might be -+ constructed as a symfile whose contents are as follows: -+ -+ /etc/passwd/userlines/* -+ -+ or -+ -+ /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root -+ -+ or -+ -+ /etc/passwd/userlines/(demidov+edward+reiser+root) -+ -+ A symfile with contents -+ -+ /filenameA+"(some text stored in the uninvertable symfile)+/filenameB -+ -+ will return when read -+ -+ The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB -+ -+ and write of what has been read will not be possible to implement as -+ an identity operation because there are no delimiters denoting the -+ boundaries of what is to be written to what subfile. -+ -+ Note that one could make this a read/write symfile if one specified -+ delimiters, and the write method understood those delimiters delimited -+ what was written to subfiles. -+ -+ So, specifying the symfile in a manner that allows writes: -+ -+ /etc/passwd/userlines/demidov+"( -+ )+/etc/passwd/userlines/edward+"( -+ )+/etc/passwd/userlines/reiser+"( -+ )+/etc/passwd/userlines/root+"( -+ ) -+ -+ or -+ -+ /etc/passwd/userlines/(demidov+"( -+ )+edward+"( -+ )+reiser+"( -+ )+root+"( -+ )) -+ -+ and the file demidov might be specified as: -+ -+ /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell -+ -+ or -+ -+ /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell) -+ -+ Notice that if the file demidov has a carriage return in it, the -+ parsing fails, but then if you put carriage returns in the wrong place -+ in a normal /etc/passwd file it breaks things also. 
-+ -+ Note that it is forbidden to have no text between two interpolations -+ if one wants to be able to define what parts of a write go to what -+ subfiles referenced in an interpolation. -+ -+ If one wants to be able to add new lines by writing to the file, one -+ must either write a custom plugin for /etc/passwd that knows how to -+ name an added line, or one must use an invert, or one must use a more -+ sophisticated symfile syntax that we are not planning to write for -+ version 4.0. -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.24/fs/reiser4/plugin/file/symlink.c ---- linux-2.6.24.orig/fs/reiser4/plugin/file/symlink.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/file/symlink.c 2008-01-25 11:39:06.992222114 +0300 -@@ -0,0 +1,95 @@ -+/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../inode.h" -+ -+#include -+#include -+ -+/* file plugin methods specific for symlink files -+ (SYMLINK_FILE_PLUGIN_ID) */ -+ -+/* this is implementation of create_object method of file plugin for -+ SYMLINK_FILE_PLUGIN_ID -+ */ -+ -+/** -+ * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID -+ * @symlink: inode of symlink object -+ * @dir: inode of parent directory -+ * @info: parameters of new object -+ * -+ * Inserts stat data with symlink extension where into the tree. 
-+ */ -+int reiser4_create_symlink(struct inode *symlink, -+ struct inode *dir UNUSED_ARG, -+ reiser4_object_create_data *data /* info passed to us -+ * this is filled by -+ * reiser4() syscall -+ * in particular */) -+{ -+ int result; -+ -+ assert("nikita-680", symlink != NULL); -+ assert("nikita-681", S_ISLNK(symlink->i_mode)); -+ assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD)); -+ assert("nikita-682", dir != NULL); -+ assert("nikita-684", data != NULL); -+ assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID); -+ -+ /* -+ * stat data of symlink has symlink extension in which we store -+ * symlink content, that is, path symlink is pointing to. -+ */ -+ reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT); -+ -+ assert("vs-838", symlink->i_private == NULL); -+ symlink->i_private = (void *)data->name; -+ -+ assert("vs-843", symlink->i_size == 0); -+ INODE_SET_FIELD(symlink, i_size, strlen(data->name)); -+ -+ /* insert stat data appended with data->name */ -+ result = inode_file_plugin(symlink)->write_sd_by_inode(symlink); -+ if (result) { -+ /* FIXME-VS: Make sure that symlink->i_private is not attached -+ to kmalloced data */ -+ INODE_SET_FIELD(symlink, i_size, 0); -+ } else { -+ assert("vs-849", symlink->i_private -+ && reiser4_inode_get_flag(symlink, -+ REISER4_GENERIC_PTR_USED)); -+ assert("vs-850", -+ !memcmp((char *)symlink->i_private, data->name, -+ (size_t) symlink->i_size + 1)); -+ } -+ return result; -+} -+ -+/* this is implementation of destroy_inode method of file plugin for -+ SYMLINK_FILE_PLUGIN_ID -+ */ -+void destroy_inode_symlink(struct inode *inode) -+{ -+ assert("edward-799", -+ inode_file_plugin(inode) == -+ file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID)); -+ assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode)); -+ assert("edward-801", reiser4_inode_get_flag(inode, -+ REISER4_GENERIC_PTR_USED)); -+ assert("vs-839", S_ISLNK(inode->i_mode)); -+ -+ kfree(inode->i_private); -+ inode->i_private = NULL; -+ 
reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.24/fs/reiser4/plugin/file/tail_conversion.c ---- linux-2.6.24.orig/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/file/tail_conversion.c 2008-01-25 11:40:16.694168755 +0300 -@@ -0,0 +1,726 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../inode.h" -+#include "../../super.h" -+#include "../../page_cache.h" -+#include "../../carry.h" -+#include "../../safe_link.h" -+#include "../../vfs_ops.h" -+ -+#include -+ -+/* this file contains: -+ tail2extent and extent2tail */ -+ -+/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */ -+void get_exclusive_access(struct unix_file_info * uf_info) -+{ -+ assert("nikita-3028", reiser4_schedulable()); -+ assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w)); -+ assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r)); -+ /* -+ * "deadlock avoidance": sometimes we commit a transaction under -+ * rw-semaphore on a file. Such commit can deadlock with another -+ * thread that captured some block (hence preventing atom from being -+ * committed) and waits on rw-semaphore. 
-+ */ -+ reiser4_txn_restart_current(); -+ LOCK_CNT_INC(inode_sem_w); -+ down_write(&uf_info->latch); -+ uf_info->exclusive_use = 1; -+ assert("vs-1713", uf_info->ea_owner == NULL); -+ assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0); -+ ON_DEBUG(uf_info->ea_owner = current); -+} -+ -+void drop_exclusive_access(struct unix_file_info * uf_info) -+{ -+ assert("vs-1714", uf_info->ea_owner == current); -+ assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0); -+ ON_DEBUG(uf_info->ea_owner = NULL); -+ uf_info->exclusive_use = 0; -+ up_write(&uf_info->latch); -+ assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r)); -+ assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w)); -+ LOCK_CNT_DEC(inode_sem_w); -+ reiser4_txn_restart_current(); -+} -+ -+/** -+ * nea_grabbed - do something when file semaphore is down_read-ed -+ * @uf_info: -+ * -+ * This is called when nonexclisive access is obtained on file. All it does is -+ * for debugging purposes. -+ */ -+static void nea_grabbed(struct unix_file_info *uf_info) -+{ -+#if REISER4_DEBUG -+ LOCK_CNT_INC(inode_sem_r); -+ assert("vs-1716", uf_info->ea_owner == NULL); -+ atomic_inc(&uf_info->nr_neas); -+ uf_info->last_reader = current; -+#endif -+} -+ -+/** -+ * get_nonexclusive_access - get nonexclusive access to a file -+ * @uf_info: unix file specific part of inode to obtain access to -+ * -+ * Nonexclusive access is obtained on a file before read, write, readpage. -+ */ -+void get_nonexclusive_access(struct unix_file_info *uf_info) -+{ -+ assert("nikita-3029", reiser4_schedulable()); -+ assert("nikita-3361", get_current_context()->trans->atom == NULL); -+ -+ down_read(&uf_info->latch); -+ nea_grabbed(uf_info); -+} -+ -+/** -+ * try_to_get_nonexclusive_access - try to get nonexclusive access to a file -+ * @uf_info: unix file specific part of inode to obtain access to -+ * -+ * Non-blocking version of nonexclusive access obtaining. 
-+ */ -+int try_to_get_nonexclusive_access(struct unix_file_info *uf_info) -+{ -+ int result; -+ -+ result = down_read_trylock(&uf_info->latch); -+ if (result) -+ nea_grabbed(uf_info); -+ return result; -+} -+ -+void drop_nonexclusive_access(struct unix_file_info * uf_info) -+{ -+ assert("vs-1718", uf_info->ea_owner == NULL); -+ assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0); -+ ON_DEBUG(atomic_dec(&uf_info->nr_neas)); -+ -+ up_read(&uf_info->latch); -+ -+ LOCK_CNT_DEC(inode_sem_r); -+ reiser4_txn_restart_current(); -+} -+ -+/* part of tail2extent. Cut all items covering @count bytes starting from -+ @offset */ -+/* Audited by: green(2002.06.15) */ -+static int cut_formatting_items(struct inode *inode, loff_t offset, int count) -+{ -+ reiser4_key from, to; -+ -+ /* AUDIT: How about putting an assertion here, what would check -+ all provided range is covered by tail items only? */ -+ /* key of first byte in the range to be cut */ -+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from); -+ -+ /* key of last byte in that range */ -+ to = from; -+ set_key_offset(&to, (__u64) (offset + count - 1)); -+ -+ /* cut everything between those keys */ -+ return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to, -+ inode, 0); -+} -+ -+static void release_all_pages(struct page **pages, unsigned nr_pages) -+{ -+ unsigned i; -+ -+ for (i = 0; i < nr_pages; i++) { -+ if (pages[i] == NULL) { -+ unsigned j; -+ for (j = i + 1; j < nr_pages; j++) -+ assert("vs-1620", pages[j] == NULL); -+ break; -+ } -+ page_cache_release(pages[i]); -+ pages[i] = NULL; -+ } -+} -+ -+/* part of tail2extent. replace tail items with extent one. Content of tail -+ items (@count bytes) being cut are copied already into -+ pages. 
extent_writepage method is called to create extents corresponding to -+ those pages */ -+static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count) -+{ -+ int result; -+ unsigned i; -+ STORE_COUNTERS; -+ -+ if (nr_pages == 0) -+ return 0; -+ -+ assert("vs-596", pages[0]); -+ -+ /* cut copied items */ -+ result = cut_formatting_items(inode, page_offset(pages[0]), count); -+ if (result) -+ return result; -+ -+ CHECK_COUNTERS; -+ -+ /* put into tree replacement for just removed items: extent item, namely */ -+ for (i = 0; i < nr_pages; i++) { -+ result = add_to_page_cache_lru(pages[i], inode->i_mapping, -+ pages[i]->index, -+ mapping_gfp_mask(inode-> -+ i_mapping)); -+ if (result) -+ break; -+ unlock_page(pages[i]); -+ result = find_or_create_extent(pages[i]); -+ if (result) -+ break; -+ SetPageUptodate(pages[i]); -+ } -+ return result; -+} -+ -+#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail -+ * items */ -+ -+static int reserve_tail2extent_iteration(struct inode *inode) -+{ -+ reiser4_block_nr unformatted_nodes; -+ reiser4_tree *tree; -+ -+ tree = reiser4_tree_by_inode(inode); -+ -+ /* number of unformatted nodes which will be created */ -+ unformatted_nodes = TAIL2EXTENT_PAGE_NUM; -+ -+ /* -+ * space required for one iteration of extent->tail conversion: -+ * -+ * 1. kill N tail items -+ * -+ * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes -+ * -+ * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block -+ * extents) extent units. -+ * -+ * 4. drilling to the leaf level by coord_by_key() -+ * -+ * 5. 
possible update of stat-data -+ * -+ */ -+ grab_space_enable(); -+ return reiser4_grab_space -+ (2 * tree->height + -+ TAIL2EXTENT_PAGE_NUM + -+ TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) + -+ 1 + estimate_one_insert_item(tree) + -+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT); -+} -+ -+/* clear stat data's flag indicating that conversion is being converted */ -+static int complete_conversion(struct inode *inode) -+{ -+ int result; -+ -+ grab_space_enable(); -+ result = -+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT); -+ if (result == 0) { -+ reiser4_inode_clr_flag(inode, REISER4_PART_MIXED); -+ result = reiser4_update_sd(inode); -+ } -+ if (result) -+ warning("vs-1696", "Failed to clear converting bit of %llu: %i", -+ (unsigned long long)get_inode_oid(inode), result); -+ return 0; -+} -+ -+/** -+ * find_start -+ * @inode: -+ * @id: -+ * @offset: -+ * -+ * this is used by tail2extent and extent2tail to detect where previous -+ * uncompleted conversion stopped -+ */ -+static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset) -+{ -+ int result; -+ lock_handle lh; -+ coord_t coord; -+ struct unix_file_info *ufo; -+ int found; -+ reiser4_key key; -+ -+ ufo = unix_file_inode_data(inode); -+ init_lh(&lh); -+ result = 0; -+ found = 0; -+ inode_file_plugin(inode)->key_by_inode(inode, *offset, &key); -+ do { -+ init_lh(&lh); -+ result = find_file_item_nohint(&coord, &lh, &key, -+ ZNODE_READ_LOCK, inode); -+ -+ if (result == CBK_COORD_FOUND) { -+ if (coord.between == AT_UNIT) { -+ /*coord_clear_iplug(&coord); */ -+ result = zload(coord.node); -+ if (result == 0) { -+ if (item_id_by_coord(&coord) == id) -+ found = 1; -+ else -+ item_plugin_by_coord(&coord)->s. 
-+ file.append_key(&coord, -+ &key); -+ zrelse(coord.node); -+ } -+ } else -+ result = RETERR(-ENOENT); -+ } -+ done_lh(&lh); -+ } while (result == 0 && !found); -+ *offset = get_key_offset(&key); -+ return result; -+} -+ -+/** -+ * tail2extent -+ * @uf_info: -+ * -+ * -+ */ -+int tail2extent(struct unix_file_info *uf_info) -+{ -+ int result; -+ reiser4_key key; /* key of next byte to be moved to page */ -+ char *p_data; /* data of page */ -+ unsigned page_off = 0, /* offset within the page where to copy data */ -+ count; /* number of bytes of item which can be -+ * copied to page */ -+ struct page *pages[TAIL2EXTENT_PAGE_NUM]; -+ struct page *page; -+ int done; /* set to 1 when all file is read */ -+ char *item; -+ int i; -+ struct inode *inode; -+ int first_iteration; -+ int bytes; -+ __u64 offset; -+ -+ assert("nikita-3362", ea_obtained(uf_info)); -+ inode = unix_file_info_to_inode(uf_info); -+ assert("nikita-3412", !IS_RDONLY(inode)); -+ assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS); -+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)); -+ -+ offset = 0; -+ first_iteration = 1; -+ result = 0; -+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { -+ /* -+ * file is marked on disk as there was a conversion which did -+ * not complete due to either crash or some error. 
Find which -+ * offset tail conversion stopped at -+ */ -+ result = find_start(inode, FORMATTING_ID, &offset); -+ if (result == -ENOENT) { -+ /* no tail items found, everything is converted */ -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ complete_conversion(inode); -+ return 0; -+ } else if (result != 0) -+ /* some other error */ -+ return result; -+ first_iteration = 0; -+ } -+ -+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV); -+ -+ /* get key of first byte of a file */ -+ inode_file_plugin(inode)->key_by_inode(inode, offset, &key); -+ -+ done = 0; -+ while (done == 0) { -+ memset(pages, 0, sizeof(pages)); -+ result = reserve_tail2extent_iteration(inode); -+ if (result != 0) -+ goto out; -+ if (first_iteration) { -+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED); -+ reiser4_update_sd(inode); -+ first_iteration = 0; -+ } -+ bytes = 0; -+ for (i = 0; i < sizeof_array(pages) && done == 0; i++) { -+ assert("vs-598", -+ (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0); -+ page = alloc_page(reiser4_ctx_gfp_mask_get()); -+ if (!page) { -+ result = RETERR(-ENOMEM); -+ goto error; -+ } -+ -+ page->index = -+ (unsigned long)(get_key_offset(&key) >> -+ PAGE_CACHE_SHIFT); -+ /* -+ * usually when one is going to longterm lock znode (as -+ * find_file_item does, for instance) he must not hold -+ * locked pages. However, there is an exception for -+ * case tail2extent. 
Pages appearing here are not -+ * reachable to everyone else, they are clean, they do -+ * not have jnodes attached so keeping them locked do -+ * not risk deadlock appearance -+ */ -+ assert("vs-983", !PagePrivate(page)); -+ reiser4_invalidate_pages(inode->i_mapping, page->index, -+ 1, 0); -+ -+ for (page_off = 0; page_off < PAGE_CACHE_SIZE;) { -+ coord_t coord; -+ lock_handle lh; -+ -+ /* get next item */ -+ /* FIXME: we might want to readahead here */ -+ init_lh(&lh); -+ result = -+ find_file_item_nohint(&coord, &lh, &key, -+ ZNODE_READ_LOCK, -+ inode); -+ if (result != CBK_COORD_FOUND) { -+ /* -+ * error happened of not items of file -+ * were found -+ */ -+ done_lh(&lh); -+ page_cache_release(page); -+ goto error; -+ } -+ -+ if (coord.between == AFTER_UNIT) { -+ /* -+ * end of file is reached. Padd page -+ * with zeros -+ */ -+ done_lh(&lh); -+ done = 1; -+ p_data = kmap_atomic(page, KM_USER0); -+ memset(p_data + page_off, 0, -+ PAGE_CACHE_SIZE - page_off); -+ kunmap_atomic(p_data, KM_USER0); -+ break; -+ } -+ -+ result = zload(coord.node); -+ if (result) { -+ page_cache_release(page); -+ done_lh(&lh); -+ goto error; -+ } -+ assert("vs-856", coord.between == AT_UNIT); -+ item = ((char *)item_body_by_coord(&coord)) + -+ coord.unit_pos; -+ -+ /* how many bytes to copy */ -+ count = -+ item_length_by_coord(&coord) - -+ coord.unit_pos; -+ /* limit length of copy to end of page */ -+ if (count > PAGE_CACHE_SIZE - page_off) -+ count = PAGE_CACHE_SIZE - page_off; -+ -+ /* -+ * copy item (as much as will fit starting from -+ * the beginning of the item) into the page -+ */ -+ p_data = kmap_atomic(page, KM_USER0); -+ memcpy(p_data + page_off, item, count); -+ kunmap_atomic(p_data, KM_USER0); -+ -+ page_off += count; -+ bytes += count; -+ set_key_offset(&key, -+ get_key_offset(&key) + count); -+ -+ zrelse(coord.node); -+ done_lh(&lh); -+ } /* end of loop which fills one page by content of -+ * formatting items */ -+ -+ if (page_off) { -+ /* something was copied into 
page */ -+ pages[i] = page; -+ } else { -+ page_cache_release(page); -+ assert("vs-1648", done == 1); -+ break; -+ } -+ } /* end of loop through pages of one conversion iteration */ -+ -+ if (i > 0) { -+ result = replace(inode, pages, i, bytes); -+ release_all_pages(pages, sizeof_array(pages)); -+ if (result) -+ goto error; -+ /* -+ * We have to drop exclusive access to avoid deadlock -+ * which may happen because called by reiser4_writepages -+ * capture_unix_file requires to get non-exclusive -+ * access to a file. It is safe to drop EA in the middle -+ * of tail2extent conversion because write_unix_file, -+ * setattr_unix_file(truncate), mmap_unix_file, -+ * release_unix_file(extent2tail) checks if conversion -+ * is not in progress (see comments before -+ * get_exclusive_access_careful(). -+ * Other processes that acquire non-exclusive access -+ * (read_unix_file, reiser4_writepages, etc) should work -+ * on partially converted files. -+ */ -+ drop_exclusive_access(uf_info); -+ /* throttle the conversion */ -+ reiser4_throttle_write(inode); -+ get_exclusive_access(uf_info); -+ -+ /* -+ * nobody is allowed to complete conversion but a -+ * process which started it -+ */ -+ assert("", reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED)); -+ } -+ } -+ -+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); -+ -+ if (result == 0) { -+ /* file is converted to extent items */ -+ assert("vs-1697", reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED)); -+ -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ complete_conversion(inode); -+ } else { -+ /* -+ * conversion is not complete. Inode was already marked as -+ * REISER4_PART_CONV and stat-data were updated at the first -+ * iteration of the loop above. 
-+ */ -+ error: -+ release_all_pages(pages, sizeof_array(pages)); -+ warning("nikita-2282", "Partial conversion of %llu: %i", -+ (unsigned long long)get_inode_oid(inode), result); -+ } -+ -+ out: -+ return result; -+} -+ -+static int reserve_extent2tail_iteration(struct inode *inode) -+{ -+ reiser4_tree *tree; -+ -+ tree = reiser4_tree_by_inode(inode); -+ /* -+ * reserve blocks for (in this order): -+ * -+ * 1. removal of extent item -+ * -+ * 2. insertion of tail by insert_flow() -+ * -+ * 3. drilling to the leaf level by coord_by_key() -+ * -+ * 4. possible update of stat-data -+ */ -+ grab_space_enable(); -+ return reiser4_grab_space -+ (estimate_one_item_removal(tree) + -+ estimate_insert_flow(tree->height) + -+ 1 + estimate_one_insert_item(tree) + -+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT); -+} -+ -+/* for every page of file: read page, cut part of extent pointing to this page, -+ put data of page tree by tail item */ -+int extent2tail(struct file * file, struct unix_file_info *uf_info) -+{ -+ int result; -+ struct inode *inode; -+ struct page *page; -+ unsigned long num_pages, i; -+ unsigned long start_page; -+ reiser4_key from; -+ reiser4_key to; -+ unsigned count; -+ __u64 offset; -+ -+ assert("nikita-3362", ea_obtained(uf_info)); -+ inode = unix_file_info_to_inode(uf_info); -+ assert("nikita-3412", !IS_RDONLY(inode)); -+ assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS); -+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)); -+ -+ offset = 0; -+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) { -+ /* -+ * file is marked on disk as there was a conversion which did -+ * not complete due to either crash or some error. 
Find which -+ * offset tail conversion stopped at -+ */ -+ result = find_start(inode, EXTENT_POINTER_ID, &offset); -+ if (result == -ENOENT) { -+ /* no extent found, everything is converted */ -+ uf_info->container = UF_CONTAINER_TAILS; -+ complete_conversion(inode); -+ return 0; -+ } else if (result != 0) -+ /* some other error */ -+ return result; -+ } -+ -+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV); -+ -+ /* number of pages in the file */ -+ num_pages = -+ (inode->i_size + - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -+ start_page = offset >> PAGE_CACHE_SHIFT; -+ -+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from); -+ to = from; -+ -+ result = 0; -+ for (i = 0; i < num_pages; i++) { -+ __u64 start_byte; -+ -+ result = reserve_extent2tail_iteration(inode); -+ if (result != 0) -+ break; -+ if (i == 0 && offset == 0) { -+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED); -+ reiser4_update_sd(inode); -+ } -+ -+ page = read_mapping_page(inode->i_mapping, -+ (unsigned)(i + start_page), NULL); -+ if (IS_ERR(page)) { -+ result = PTR_ERR(page); -+ break; -+ } -+ -+ wait_on_page_locked(page); -+ -+ if (!PageUptodate(page)) { -+ page_cache_release(page); -+ result = RETERR(-EIO); -+ break; -+ } -+ -+ /* cut part of file we have read */ -+ start_byte = (__u64) ((i + start_page) << PAGE_CACHE_SHIFT); -+ set_key_offset(&from, start_byte); -+ set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1); -+ /* -+ * reiser4_cut_tree_object() returns -E_REPEAT to allow atom -+ * commits during over-long truncates. But -+ * extent->tail conversion should be performed in one -+ * transaction. 
-+ */ -+ result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, -+ &to, inode, 0); -+ -+ if (result) { -+ page_cache_release(page); -+ break; -+ } -+ -+ /* put page data into tree via tail_write */ -+ count = PAGE_CACHE_SIZE; -+ if ((i == (num_pages - 1)) && -+ (inode->i_size & ~PAGE_CACHE_MASK)) -+ /* last page can be incompleted */ -+ count = (inode->i_size & ~PAGE_CACHE_MASK); -+ while (count) { -+ loff_t pos = start_byte; -+ -+ assert("edward-1537", -+ file != NULL && file->f_dentry != NULL); -+ assert("edward-1538", -+ file->f_dentry->d_inode == inode); -+ -+ result = reiser4_write_tail(file, inode, -+ (char __user *)kmap(page), -+ count, &pos); -+ reiser4_free_file_fsdata(file); -+ if (result <= 0) { -+ warning("", "reiser4_write_tail failed"); -+ page_cache_release(page); -+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); -+ return result; -+ } -+ count -= result; -+ } -+ -+ /* release page */ -+ lock_page(page); -+ /* page is already detached from jnode and mapping. */ -+ assert("vs-1086", page->mapping == NULL); -+ assert("nikita-2690", -+ (!PagePrivate(page) && jprivate(page) == 0)); -+ /* waiting for writeback completion with page lock held is -+ * perfectly valid. 
*/ -+ wait_on_page_writeback(page); -+ reiser4_drop_page(page); -+ /* release reference taken by read_cache_page() above */ -+ page_cache_release(page); -+ -+ drop_exclusive_access(uf_info); -+ /* throttle the conversion */ -+ reiser4_throttle_write(inode); -+ get_exclusive_access(uf_info); -+ /* -+ * nobody is allowed to complete conversion but a process which -+ * started it -+ */ -+ assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED)); -+ } -+ -+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV); -+ -+ if (i == num_pages) { -+ /* file is converted to formatted items */ -+ assert("vs-1698", reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED)); -+ assert("vs-1260", -+ inode_has_no_jnodes(reiser4_inode_data(inode))); -+ -+ uf_info->container = UF_CONTAINER_TAILS; -+ complete_conversion(inode); -+ return 0; -+ } -+ /* -+ * conversion is not complete. Inode was already marked as -+ * REISER4_PART_MIXED and stat-data were updated at the first * -+ * iteration of the loop above. 
-+ */ -+ warning("nikita-2282", -+ "Partial conversion of %llu: %lu of %lu: %i", -+ (unsigned long long)get_inode_oid(inode), i, -+ num_pages, result); -+ -+ return result; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file_ops.c linux-2.6.24/fs/reiser4/plugin/file_ops.c ---- linux-2.6.24.orig/fs/reiser4/plugin/file_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/file_ops.c 2008-01-25 11:39:06.992222114 +0300 -@@ -0,0 +1,205 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* this file contains typical implementations for some of methods of -+ struct file_operations and of struct address_space_operations -+*/ -+ -+#include "../inode.h" -+#include "object.h" -+ -+/* file operations */ -+ -+/* implementation of vfs's llseek method of struct file_operations for -+ typical directory can be found in readdir_common.c -+*/ -+loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin); -+ -+/* implementation of vfs's readdir method of struct file_operations for -+ typical directory can be found in readdir_common.c -+*/ -+int reiser4_readdir_common(struct file *, void *dirent, filldir_t); -+ -+/** -+ * reiser4_release_dir_common - release of struct file_operations -+ * @inode: inode of released file -+ * @file: file to release -+ * -+ * Implementation of release method of struct file_operations for typical -+ * directory. All it does is freeing of reiser4 specific file data. 
-+*/ -+int reiser4_release_dir_common(struct inode *inode, struct file *file) -+{ -+ reiser4_context *ctx; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ reiser4_free_file_fsdata(file); -+ reiser4_exit_context(ctx); -+ return 0; -+} -+ -+/* this is common implementation of vfs's fsync method of struct -+ file_operations -+*/ -+int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync) -+{ -+ reiser4_context *ctx; -+ int result; -+ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* -+ * common sync method for regular files. -+ * -+ * We are trying to be smart here. Instead of committing all atoms (original -+ * solution), we scan dirty pages of this file and commit all atoms they are -+ * part of. -+ * -+ * Situation is complicated by anonymous pages: i.e., extent-less pages -+ * dirtied through mmap. Fortunately sys_fsync() first calls -+ * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert -+ * all missing extents and capture anonymous pages. 
-+ */ -+int reiser4_sync_file_common(struct file *file, -+ struct dentry *dentry, int datasync) -+{ -+ reiser4_context *ctx; -+ txn_atom *atom; -+ reiser4_block_nr reserve; -+ -+ ctx = reiser4_init_context(dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ reserve = estimate_update_common(dentry->d_inode); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOSPC); -+ } -+ write_sd_by_inode_common(dentry->d_inode); -+ -+ atom = get_current_atom_locked(); -+ spin_lock_txnh(ctx->trans); -+ force_commit_atom(ctx->trans); -+ reiser4_exit_context(ctx); -+ return 0; -+} -+ -+/* this is common implementation of vfs's sendfile method of struct -+ file_operations -+ -+ Reads @count bytes from @file and calls @actor for every page read. This is -+ needed for loop back devices support. -+*/ -+#if 0 -+ssize_t -+sendfile_common(struct file *file, loff_t *ppos, size_t count, -+ read_actor_t actor, void *target) -+{ -+ reiser4_context *ctx; -+ ssize_t result; -+ -+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ result = generic_file_sendfile(file, ppos, count, actor, target); -+ reiser4_exit_context(ctx); -+ return result; -+} -+#endif /* 0 */ -+ -+/* address space operations */ -+ -+/* this is common implementation of vfs's prepare_write method of struct -+ address_space_operations -+*/ -+int -+prepare_write_common(struct file *file, struct page *page, unsigned from, -+ unsigned to) -+{ -+ reiser4_context *ctx; -+ int result; -+ -+ ctx = reiser4_init_context(page->mapping->host->i_sb); -+ result = do_prepare_write(file, page, from, to); -+ -+ /* don't commit transaction under inode semaphore */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ -+ return result; -+} -+ -+/* this is helper for prepare_write_common and prepare_write_unix_file -+ */ -+int -+do_prepare_write(struct file *file, struct page *page, unsigned from, -+ 
unsigned to) -+{ -+ int result; -+ file_plugin *fplug; -+ struct inode *inode; -+ -+ assert("umka-3099", file != NULL); -+ assert("umka-3100", page != NULL); -+ assert("umka-3095", PageLocked(page)); -+ -+ if (to - from == PAGE_CACHE_SIZE || PageUptodate(page)) -+ return 0; -+ -+ inode = page->mapping->host; -+ fplug = inode_file_plugin(inode); -+ -+ if (page->mapping->a_ops->readpage == NULL) -+ return RETERR(-EINVAL); -+ -+ result = page->mapping->a_ops->readpage(file, page); -+ if (result != 0) { -+ SetPageError(page); -+ ClearPageUptodate(page); -+ /* All reiser4 readpage() implementations should return the -+ * page locked in case of error. */ -+ assert("nikita-3472", PageLocked(page)); -+ } else { -+ /* -+ * ->readpage() either: -+ * -+ * 1. starts IO against @page. @page is locked for IO in -+ * this case. -+ * -+ * 2. doesn't start IO. @page is unlocked. -+ * -+ * In either case, page should be locked. -+ */ -+ lock_page(page); -+ /* -+ * IO (if any) is completed at this point. Check for IO -+ * errors. -+ */ -+ if (!PageUptodate(page)) -+ result = RETERR(-EIO); -+ } -+ assert("umka-3098", PageLocked(page)); -+ return result; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.24/fs/reiser4/plugin/file_ops_readdir.c ---- linux-2.6.24.orig/fs/reiser4/plugin/file_ops_readdir.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/file_ops_readdir.c 2008-01-25 11:39:06.996223145 +0300 -@@ -0,0 +1,658 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "../inode.h" -+ -+/* return true, iff @coord points to the valid directory item that is part of -+ * @inode directory. 
*/ -+static int is_valid_dir_coord(struct inode *inode, coord_t * coord) -+{ -+ return plugin_of_group(item_plugin_by_coord(coord), -+ DIR_ENTRY_ITEM_TYPE) && -+ inode_file_plugin(inode)->owns_item(inode, coord); -+} -+ -+/* compare two logical positions within the same directory */ -+static cmp_t dir_pos_cmp(const struct dir_pos * p1, const struct dir_pos * p2) -+{ -+ cmp_t result; -+ -+ assert("nikita-2534", p1 != NULL); -+ assert("nikita-2535", p2 != NULL); -+ -+ result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key); -+ if (result == EQUAL_TO) { -+ int diff; -+ -+ diff = p1->pos - p2->pos; -+ result = -+ (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO); -+ } -+ return result; -+} -+ -+/* see comment before reiser4_readdir_common() for overview of why "adjustment" is -+ * necessary. */ -+static void -+adjust_dir_pos(struct file *dir, struct readdir_pos * readdir_spot, -+ const struct dir_pos * mod_point, int adj) -+{ -+ struct dir_pos *pos; -+ -+ /* -+ * new directory entry was added (adj == +1) or removed (adj == -1) at -+ * the @mod_point. Directory file descriptor @dir is doing readdir and -+ * is currently positioned at @readdir_spot. Latter has to be updated -+ * to maintain stable readdir. -+ */ -+ /* directory is positioned to the beginning. */ -+ if (readdir_spot->entry_no == 0) -+ return; -+ -+ pos = &readdir_spot->position; -+ switch (dir_pos_cmp(mod_point, pos)) { -+ case LESS_THAN: -+ /* @mod_pos is _before_ @readdir_spot, that is, entry was -+ * added/removed on the left (in key order) of current -+ * position. 
*/ -+ /* logical number of directory entry readdir is "looking" at -+ * changes */ -+ readdir_spot->entry_no += adj; -+ assert("nikita-2577", -+ ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0)); -+ if (de_id_cmp(&pos->dir_entry_key, -+ &mod_point->dir_entry_key) == EQUAL_TO) { -+ assert("nikita-2575", mod_point->pos < pos->pos); -+ /* -+ * if entry added/removed has the same key as current -+ * for readdir, update counter of duplicate keys in -+ * @readdir_spot. -+ */ -+ pos->pos += adj; -+ } -+ break; -+ case GREATER_THAN: -+ /* directory is modified after @pos: nothing to do. */ -+ break; -+ case EQUAL_TO: -+ /* cannot insert an entry readdir is looking at, because it -+ already exists. */ -+ assert("nikita-2576", adj < 0); -+ /* directory entry to which @pos points to is being -+ removed. -+ -+ NOTE-NIKITA: Right thing to do is to update @pos to point -+ to the next entry. This is complex (we are under spin-lock -+ for one thing). Just rewind it to the beginning. Next -+ readdir will have to scan the beginning of -+ directory. Proper solution is to use semaphore in -+ spin lock's stead and use rewind_right() here. -+ -+ NOTE-NIKITA: now, semaphore is used, so... -+ */ -+ memset(readdir_spot, 0, sizeof *readdir_spot); -+ } -+} -+ -+/* scan all file-descriptors for this directory and adjust their -+ positions respectively. Should be used by implementations of -+ add_entry and rem_entry of dir plugin */ -+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de, -+ int offset, int adj) -+{ -+ reiser4_file_fsdata *scan; -+ struct dir_pos mod_point; -+ -+ assert("nikita-2536", dir != NULL); -+ assert("nikita-2538", de != NULL); -+ assert("nikita-2539", adj != 0); -+ -+ build_de_id(dir, &de->d_name, &mod_point.dir_entry_key); -+ mod_point.pos = offset; -+ -+ spin_lock_inode(dir); -+ -+ /* -+ * new entry was added/removed in directory @dir. Scan all file -+ * descriptors for @dir that are currently involved into @readdir and -+ * update them. 
-+ */ -+ -+ list_for_each_entry(scan, get_readdir_list(dir), dir.linkage) -+ adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj); -+ -+ spin_unlock_inode(dir); -+} -+ -+/* -+ * traverse tree to start/continue readdir from the readdir position @pos. -+ */ -+static int dir_go_to(struct file *dir, struct readdir_pos * pos, tap_t * tap) -+{ -+ reiser4_key key; -+ int result; -+ struct inode *inode; -+ -+ assert("nikita-2554", pos != NULL); -+ -+ inode = dir->f_dentry->d_inode; -+ result = inode_dir_plugin(inode)->build_readdir_key(dir, &key); -+ if (result != 0) -+ return result; -+ result = reiser4_object_lookup(inode, -+ &key, -+ tap->coord, -+ tap->lh, -+ tap->mode, -+ FIND_EXACT, -+ LEAF_LEVEL, LEAF_LEVEL, -+ 0, &tap->ra_info); -+ if (result == CBK_COORD_FOUND) -+ result = rewind_right(tap, (int)pos->position.pos); -+ else { -+ tap->coord->node = NULL; -+ done_lh(tap->lh); -+ result = RETERR(-EIO); -+ } -+ return result; -+} -+ -+/* -+ * handling of non-unique keys: calculate at what ordinal position within -+ * sequence of directory items with identical keys @pos is. 
-+ */ -+static int set_pos(struct inode *inode, struct readdir_pos * pos, tap_t * tap) -+{ -+ int result; -+ coord_t coord; -+ lock_handle lh; -+ tap_t scan; -+ de_id *did; -+ reiser4_key de_key; -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK); -+ reiser4_tap_copy(&scan, tap); -+ reiser4_tap_load(&scan); -+ pos->position.pos = 0; -+ -+ did = &pos->position.dir_entry_key; -+ -+ if (is_valid_dir_coord(inode, scan.coord)) { -+ -+ build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did); -+ -+ while (1) { -+ -+ result = go_prev_unit(&scan); -+ if (result != 0) -+ break; -+ -+ if (!is_valid_dir_coord(inode, scan.coord)) { -+ result = -EINVAL; -+ break; -+ } -+ -+ /* get key of directory entry */ -+ unit_key_by_coord(scan.coord, &de_key); -+ if (de_id_key_cmp(did, &de_key) != EQUAL_TO) { -+ /* duplicate-sequence is over */ -+ break; -+ } -+ pos->position.pos++; -+ } -+ } else -+ result = RETERR(-ENOENT); -+ reiser4_tap_relse(&scan); -+ reiser4_tap_done(&scan); -+ return result; -+} -+ -+/* -+ * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly. 
-+ */ -+static int dir_rewind(struct file *dir, struct readdir_pos * pos, tap_t * tap) -+{ -+ __u64 destination; -+ __s64 shift; -+ int result; -+ struct inode *inode; -+ loff_t dirpos; -+ -+ assert("nikita-2553", dir != NULL); -+ assert("nikita-2548", pos != NULL); -+ assert("nikita-2551", tap->coord != NULL); -+ assert("nikita-2552", tap->lh != NULL); -+ -+ dirpos = reiser4_get_dir_fpos(dir); -+ shift = dirpos - pos->fpos; -+ /* this is logical directory entry within @dir which we are rewinding -+ * to */ -+ destination = pos->entry_no + shift; -+ -+ inode = dir->f_dentry->d_inode; -+ if (dirpos < 0) -+ return RETERR(-EINVAL); -+ else if (destination == 0ll || dirpos == 0) { -+ /* rewind to the beginning of directory */ -+ memset(pos, 0, sizeof *pos); -+ return dir_go_to(dir, pos, tap); -+ } else if (destination >= inode->i_size) -+ return RETERR(-ENOENT); -+ -+ if (shift < 0) { -+ /* I am afraid of negative numbers */ -+ shift = -shift; -+ /* rewinding to the left */ -+ if (shift <= (int)pos->position.pos) { -+ /* destination is within sequence of entries with -+ duplicate keys. */ -+ result = dir_go_to(dir, pos, tap); -+ } else { -+ shift -= pos->position.pos; -+ while (1) { -+ /* repetitions: deadlock is possible when -+ going to the left. */ -+ result = dir_go_to(dir, pos, tap); -+ if (result == 0) { -+ result = rewind_left(tap, shift); -+ if (result == -E_DEADLOCK) { -+ reiser4_tap_done(tap); -+ continue; -+ } -+ } -+ break; -+ } -+ } -+ } else { -+ /* rewinding to the right */ -+ result = dir_go_to(dir, pos, tap); -+ if (result == 0) -+ result = rewind_right(tap, shift); -+ } -+ if (result == 0) { -+ result = set_pos(inode, pos, tap); -+ if (result == 0) { -+ /* update pos->position.pos */ -+ pos->entry_no = destination; -+ pos->fpos = dirpos; -+ } -+ } -+ return result; -+} -+ -+/* -+ * Function that is called by common_readdir() on each directory entry while -+ * doing readdir. 
->filldir callback may block, so we had to release long term -+ * lock while calling it. To avoid repeating tree traversal, seal is used. If -+ * seal is broken, we return -E_REPEAT. Node is unlocked in this case. -+ * -+ * Whether node is unlocked in case of any other error is undefined. It is -+ * guaranteed to be still locked if success (0) is returned. -+ * -+ * When ->filldir() wants no more, feed_entry() returns 1, and node is -+ * unlocked. -+ */ -+static int -+feed_entry(struct file *f, struct readdir_pos * pos, tap_t * tap, -+ filldir_t filldir, void *dirent) -+{ -+ item_plugin *iplug; -+ char *name; -+ reiser4_key sd_key; -+ int result; -+ char buf[DE_NAME_BUF_LEN]; -+ char name_buf[32]; -+ char *local_name; -+ unsigned file_type; -+ seal_t seal; -+ coord_t *coord; -+ reiser4_key entry_key; -+ -+ coord = tap->coord; -+ iplug = item_plugin_by_coord(coord); -+ -+ /* pointer to name within the node */ -+ name = iplug->s.dir.extract_name(coord, buf); -+ assert("nikita-1371", name != NULL); -+ -+ /* key of object the entry points to */ -+ if (iplug->s.dir.extract_key(coord, &sd_key) != 0) -+ return RETERR(-EIO); -+ -+ /* we must release longterm znode lock before calling filldir to avoid -+ deadlock which may happen if filldir causes page fault. So, copy -+ name to intermediate buffer */ -+ if (strlen(name) + 1 > sizeof(name_buf)) { -+ local_name = kmalloc(strlen(name) + 1, -+ reiser4_ctx_gfp_mask_get()); -+ if (local_name == NULL) -+ return RETERR(-ENOMEM); -+ } else -+ local_name = name_buf; -+ -+ strcpy(local_name, name); -+ file_type = iplug->s.dir.extract_file_type(coord); -+ -+ unit_key_by_coord(coord, &entry_key); -+ reiser4_seal_init(&seal, coord, &entry_key); -+ -+ longterm_unlock_znode(tap->lh); -+ -+ /* -+ * send information about directory entry to the ->filldir() filler -+ * supplied to us by caller (VFS). -+ * -+ * ->filldir is entitled to do weird things. For example, ->filldir -+ * supplied by knfsd re-enters file system. 
Make sure no locks are -+ * held. -+ */ -+ assert("nikita-3436", lock_stack_isclean(get_current_lock_stack())); -+ -+ reiser4_txn_restart_current(); -+ result = filldir(dirent, name, (int)strlen(name), -+ /* offset of this entry */ -+ f->f_pos, -+ /* inode number of object bounden by this entry */ -+ oid_to_uino(get_key_objectid(&sd_key)), file_type); -+ if (local_name != name_buf) -+ kfree(local_name); -+ if (result < 0) -+ /* ->filldir() is satisfied. (no space in buffer, IOW) */ -+ result = 1; -+ else -+ result = reiser4_seal_validate(&seal, coord, &entry_key, -+ tap->lh, tap->mode, -+ ZNODE_LOCK_HIPRI); -+ return result; -+} -+ -+static void move_entry(struct readdir_pos * pos, coord_t * coord) -+{ -+ reiser4_key de_key; -+ de_id *did; -+ -+ /* update @pos */ -+ ++pos->entry_no; -+ did = &pos->position.dir_entry_key; -+ -+ /* get key of directory entry */ -+ unit_key_by_coord(coord, &de_key); -+ -+ if (de_id_key_cmp(did, &de_key) == EQUAL_TO) -+ /* we are within sequence of directory entries -+ with duplicate keys. */ -+ ++pos->position.pos; -+ else { -+ pos->position.pos = 0; -+ build_de_id_by_key(&de_key, did); -+ } -+ ++pos->fpos; -+} -+ -+/* -+ * STATELESS READDIR -+ * -+ * readdir support in reiser4 relies on ability to update readdir_pos embedded -+ * into reiser4_file_fsdata on each directory modification (name insertion and -+ * removal), see reiser4_readdir_common() function below. This obviously doesn't -+ * work when reiser4 is accessed over NFS, because NFS doesn't keep any state -+ * across client READDIR requests for the same directory. -+ * -+ * To address this we maintain a "pool" of detached reiser4_file_fsdata -+ * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to -+ * find detached reiser4_file_fsdata corresponding to previous readdir -+ * request. In other words, additional state is maintained on the -+ * server. (This is somewhat contrary to the design goals of NFS protocol.) 
-+ * -+ * To efficiently detect when our ->readdir() method is called by NFS server, -+ * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by -+ * file_is_stateless() function). -+ * -+ * To find out d_cursor in the pool, we encode client id (cid) in the highest -+ * bits of NFS readdir cookie: when first readdir request comes to the given -+ * directory from the given client, cookie is set to 0. This situation is -+ * detected, global cid_counter is incremented, and stored in highest bits of -+ * all direntry offsets returned to the client, including last one. As the -+ * only valid readdir cookie is one obtained as direntry->offset, we are -+ * guaranteed that next readdir request (continuing current one) will have -+ * current cid in the highest bits of starting readdir cookie. All d_cursors -+ * are hashed into per-super-block hash table by (oid, cid) key. -+ * -+ * In addition d_cursors are placed into per-super-block radix tree where they -+ * are keyed by oid alone. This is necessary to efficiently remove them during -+ * rmdir. -+ * -+ * At last, currently unused d_cursors are linked into special list. This list -+ * is used d_cursor_shrink to reclaim d_cursors on memory pressure. -+ * -+ */ -+ -+/* -+ * prepare for readdir. -+ */ -+static int dir_readdir_init(struct file *f, tap_t * tap, -+ struct readdir_pos ** pos) -+{ -+ struct inode *inode; -+ reiser4_file_fsdata *fsdata; -+ int result; -+ -+ assert("nikita-1359", f != NULL); -+ inode = f->f_dentry->d_inode; -+ assert("nikita-1360", inode != NULL); -+ -+ if (!S_ISDIR(inode->i_mode)) -+ return RETERR(-ENOTDIR); -+ -+ /* try to find detached readdir state */ -+ result = reiser4_attach_fsdata(f, inode); -+ if (result != 0) -+ return result; -+ -+ fsdata = reiser4_get_file_fsdata(f); -+ assert("nikita-2571", fsdata != NULL); -+ if (IS_ERR(fsdata)) -+ return PTR_ERR(fsdata); -+ -+ /* add file descriptor to the readdir list hanging of directory -+ * inode. 
This list is used to scan "readdirs-in-progress" while -+ * inserting or removing names in the directory. */ -+ spin_lock_inode(inode); -+ if (list_empty_careful(&fsdata->dir.linkage)) -+ list_add(&fsdata->dir.linkage, get_readdir_list(inode)); -+ *pos = &fsdata->dir.readdir; -+ spin_unlock_inode(inode); -+ -+ /* move @tap to the current position */ -+ return dir_rewind(f, *pos, tap); -+} -+ -+/* this is implementation of vfs's llseek method of struct file_operations for -+ typical directory -+ See comment before reiser4_readdir_common() for explanation. -+*/ -+loff_t reiser4_llseek_dir_common(struct file * file, loff_t off, int origin) -+{ -+ reiser4_context *ctx; -+ loff_t result; -+ struct inode *inode; -+ -+ inode = file->f_dentry->d_inode; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ mutex_lock(&inode->i_mutex); -+ -+ /* update ->f_pos */ -+ result = default_llseek(file, off, origin); -+ if (result >= 0) { -+ int ff; -+ coord_t coord; -+ lock_handle lh; -+ tap_t tap; -+ struct readdir_pos *pos; -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK); -+ -+ ff = dir_readdir_init(file, &tap, &pos); -+ reiser4_detach_fsdata(file); -+ if (ff != 0) -+ result = (loff_t) ff; -+ reiser4_tap_done(&tap); -+ } -+ reiser4_detach_fsdata(file); -+ mutex_unlock(&inode->i_mutex); -+ -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* this is common implementation of vfs's readdir method of struct -+ file_operations -+ -+ readdir problems: -+ -+ readdir(2)/getdents(2) interface is based on implicit assumption that -+ readdir can be restarted from any particular point by supplying file system -+ with off_t-full of data. That is, file system fills ->d_off field in struct -+ dirent and later user passes ->d_off to the seekdir(3), which is, actually, -+ implemented by glibc as lseek(2) on directory. 
-+ -+ Reiser4 cannot restart readdir from 64 bits of data, because two last -+ components of the key of directory entry are unknown, which given 128 bits: -+ locality and type fields in the key of directory entry are always known, to -+ start readdir() from given point objectid and offset fields have to be -+ filled. -+ -+ Traditional UNIX API for scanning through directory -+ (readdir/seekdir/telldir/opendir/closedir/rewindir/getdents) is based on the -+ assumption that directory is structured very much like regular file, in -+ particular, it is implied that each name within given directory (directory -+ entry) can be uniquely identified by scalar offset and that such offset is -+ stable across the life-time of the name is identifies. -+ -+ This is manifestly not so for reiser4. In reiser4 the only stable unique -+ identifies for the directory entry is its key that doesn't fit into -+ seekdir/telldir API. -+ -+ solution: -+ -+ Within each file descriptor participating in readdir-ing of directory -+ plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of -+ the "current" directory entry that file descriptor looks at. It contains a -+ key of directory entry (plus some additional info to deal with non-unique -+ keys that we wouldn't dwell onto here) and a logical position of this -+ directory entry starting from the beginning of the directory, that is -+ ordinal number of this entry in the readdir order. -+ -+ Obviously this logical position is not stable in the face of directory -+ modifications. To work around this, on each addition or removal of directory -+ entry all file descriptors for directory inode are scanned and their -+ readdir_pos are updated accordingly (adjust_dir_pos()). 
-+*/ -+int reiser4_readdir_common(struct file *f /* directory file being read */, -+ void *dirent /* opaque data passed to us by VFS */, -+ filldir_t filld /* filler function passed to us -+ * by VFS */) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *inode; -+ coord_t coord; -+ lock_handle lh; -+ tap_t tap; -+ struct readdir_pos *pos; -+ -+ assert("nikita-1359", f != NULL); -+ inode = f->f_dentry->d_inode; -+ assert("nikita-1360", inode != NULL); -+ -+ if (!S_ISDIR(inode->i_mode)) -+ return RETERR(-ENOTDIR); -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK); -+ -+ reiser4_readdir_readahead_init(inode, &tap); -+ -+ repeat: -+ result = dir_readdir_init(f, &tap, &pos); -+ if (result == 0) { -+ result = reiser4_tap_load(&tap); -+ /* scan entries one by one feeding them to @filld */ -+ while (result == 0) { -+ coord_t *coord; -+ -+ coord = tap.coord; -+ assert("nikita-2572", coord_is_existing_unit(coord)); -+ assert("nikita-3227", is_valid_dir_coord(inode, coord)); -+ -+ result = feed_entry(f, pos, &tap, filld, dirent); -+ if (result > 0) { -+ break; -+ } else if (result == 0) { -+ ++f->f_pos; -+ result = go_next_unit(&tap); -+ if (result == -E_NO_NEIGHBOR || -+ result == -ENOENT) { -+ result = 0; -+ break; -+ } else if (result == 0) { -+ if (is_valid_dir_coord(inode, coord)) -+ move_entry(pos, coord); -+ else -+ break; -+ } -+ } else if (result == -E_REPEAT) { -+ /* feed_entry() had to restart. 
*/ -+ ++f->f_pos; -+ reiser4_tap_relse(&tap); -+ goto repeat; -+ } else -+ warning("vs-1617", -+ "reiser4_readdir_common: unexpected error %d", -+ result); -+ } -+ reiser4_tap_relse(&tap); -+ -+ if (result >= 0) -+ f->f_version = inode->i_version; -+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) -+ result = 0; -+ reiser4_tap_done(&tap); -+ reiser4_detach_fsdata(f); -+ -+ /* try to update directory's atime */ -+ if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode), -+ BA_CAN_COMMIT) != 0) -+ warning("", "failed to update atime on readdir: %llu", -+ get_inode_oid(inode)); -+ else -+ file_accessed(f); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ -+ return (result <= 0) ? result : 0; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.24/fs/reiser4/plugin/file_plugin_common.c ---- linux-2.6.24.orig/fs/reiser4/plugin/file_plugin_common.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/file_plugin_common.c 2008-01-25 11:55:43.900543447 +0300 -@@ -0,0 +1,1009 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ reiser4/README */ -+ -+/* this file contains typical implementations for most of methods of -+ file plugin -+*/ -+ -+#include "../inode.h" -+#include "object.h" -+#include "../safe_link.h" -+ -+#include -+ -+static int insert_new_sd(struct inode *inode); -+static int update_sd(struct inode *inode); -+ -+/* this is common implementation of write_sd_by_inode method of file plugin -+ either insert stat data or update it -+ */ -+int write_sd_by_inode_common(struct inode *inode /* object to save */ ) -+{ -+ int result; -+ -+ assert("nikita-730", inode != NULL); -+ -+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD)) -+ /* object doesn't have stat-data yet */ -+ result = 
insert_new_sd(inode); -+ else -+ result = update_sd(inode); -+ if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM) -+ /* Don't issue warnings about "name is too long" */ -+ warning("nikita-2221", "Failed to save sd for %llu: %i", -+ (unsigned long long)get_inode_oid(inode), result); -+ return result; -+} -+ -+/* this is common implementation of key_by_inode method of file plugin -+ */ -+int -+key_by_inode_and_offset_common(struct inode *inode, loff_t off, -+ reiser4_key * key) -+{ -+ reiser4_key_init(key); -+ set_key_locality(key, reiser4_inode_data(inode)->locality_id); -+ set_key_ordering(key, get_inode_ordering(inode)); -+ set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */ -+ set_key_type(key, KEY_BODY_MINOR); -+ set_key_offset(key, (__u64) off); -+ return 0; -+} -+ -+/* this is common implementation of set_plug_in_inode method of file plugin -+ */ -+int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ , -+ struct inode *parent /* parent object */ , -+ reiser4_object_create_data * data /* creational -+ * data */ ) -+{ -+ __u64 mask; -+ -+ object->i_mode = data->mode; -+ /* this should be plugin decision */ -+ object->i_uid = current->fsuid; -+ object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME; -+ -+ /* support for BSD style group-id assignment. 
See mount's manual page -+ description of bsdgroups ext2 mount options for more details */ -+ if (reiser4_is_set(object->i_sb, REISER4_BSD_GID)) -+ object->i_gid = parent->i_gid; -+ else if (parent->i_mode & S_ISGID) { -+ /* parent directory has sguid bit */ -+ object->i_gid = parent->i_gid; -+ if (S_ISDIR(object->i_mode)) -+ /* sguid is inherited by sub-directories */ -+ object->i_mode |= S_ISGID; -+ } else -+ object->i_gid = current->fsgid; -+ -+ /* this object doesn't have stat-data yet */ -+ reiser4_inode_set_flag(object, REISER4_NO_SD); -+#if 0 -+ /* this is now called after all inode plugins are initialized: -+ do_create_vfs_child after adjust_to_parent */ -+ /* setup inode and file-operations for this inode */ -+ setup_inode_ops(object, data); -+#endif -+ object->i_nlink = 0; -+ reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL); -+ mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT); -+ if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES)) -+ mask |= (1 << LARGE_TIMES_STAT); -+ -+ reiser4_inode_data(object)->extmask = mask; -+ return 0; -+} -+ -+/* this is common implementation of adjust_to_parent method of file plugin for -+ regular files -+ */ -+int adjust_to_parent_common(struct inode *object /* new object */ , -+ struct inode *parent /* parent directory */ , -+ struct inode *root /* root directory */ ) -+{ -+ assert("nikita-2165", object != NULL); -+ if (parent == NULL) -+ parent = root; -+ assert("nikita-2069", parent != NULL); -+ -+ /* -+ * inherit missing plugins from parent -+ */ -+ -+ grab_plugin_pset(object, parent, PSET_FILE); -+ grab_plugin_pset(object, parent, PSET_SD); -+ grab_plugin_pset(object, parent, PSET_FORMATTING); -+ grab_plugin_pset(object, parent, PSET_PERM); -+ return 0; -+} -+ -+/* this is common implementation of adjust_to_parent method of file plugin for -+ typical directories -+ */ -+int adjust_to_parent_common_dir(struct inode *object /* new object */ , -+ struct inode *parent /* parent directory */ , -+ 
struct inode *root /* root directory */ ) -+{ -+ int result = 0; -+ pset_member memb; -+ -+ assert("nikita-2166", object != NULL); -+ if (parent == NULL) -+ parent = root; -+ assert("nikita-2167", parent != NULL); -+ -+ /* -+ * inherit missing plugins from parent -+ */ -+ for (memb = 0; memb < PSET_LAST; ++memb) { -+ result = grab_plugin_pset(object, parent, memb); -+ if (result != 0) -+ break; -+ } -+ return result; -+} -+ -+int adjust_to_parent_cryptcompress(struct inode *object /* new object */ , -+ struct inode *parent /* parent directory */, -+ struct inode *root /* root directory */) -+{ -+ int result; -+ result = adjust_to_parent_common(object, parent, root); -+ if (result) -+ return result; -+ assert("edward-1416", parent != NULL); -+ -+ grab_plugin_pset(object, parent, PSET_CLUSTER); -+ grab_plugin_pset(object, parent, PSET_CIPHER); -+ grab_plugin_pset(object, parent, PSET_DIGEST); -+ grab_plugin_pset(object, parent, PSET_COMPRESSION); -+ grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE); -+ -+ return 0; -+} -+ -+/* this is common implementation of create_object method of file plugin -+ */ -+int reiser4_create_object_common(struct inode *object, struct inode *parent, -+ reiser4_object_create_data * data) -+{ -+ reiser4_block_nr reserve; -+ assert("nikita-744", object != NULL); -+ assert("nikita-745", parent != NULL); -+ assert("nikita-747", data != NULL); -+ assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD)); -+ -+ reserve = estimate_create_common(object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ return write_sd_by_inode_common(object); -+} -+ -+static int common_object_delete_no_reserve(struct inode *inode); -+ -+/** -+ * reiser4_delete_object_common - delete_object of file_plugin -+ * @inode: inode to be deleted -+ * -+ * This is common implementation of delete_object method of file_plugin. It -+ * applies to object its deletion consists of removing two items - stat data -+ * and safe-link. 
-+ */ -+int reiser4_delete_object_common(struct inode *inode) -+{ -+ int result; -+ -+ assert("nikita-1477", inode != NULL); -+ /* FIXME: if file body deletion failed (i/o error, for instance), -+ inode->i_size can be != 0 here */ -+ assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode)); -+ assert("nikita-3421", inode->i_nlink == 0); -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) { -+ reiser4_block_nr reserve; -+ -+ /* grab space which is needed to remove 2 items from the tree: -+ stat data and safe-link */ -+ reserve = 2 * -+ estimate_one_item_removal(reiser4_tree_by_inode(inode)); -+ if (reiser4_grab_space_force(reserve, -+ BA_RESERVED | BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ result = common_object_delete_no_reserve(inode); -+ } else -+ result = 0; -+ return result; -+} -+ -+/** -+ * reiser4_delete_dir_common - delete_object of file_plugin -+ * @inode: inode to be deleted -+ * -+ * This is common implementation of delete_object method of file_plugin for -+ * typical directory. It calls done method of dir_plugin to remove "." and -+ * removes stat data and safe-link. 
-+ */ -+int reiser4_delete_dir_common(struct inode *inode) -+{ -+ int result; -+ dir_plugin *dplug; -+ -+ assert("", (get_current_context() && -+ get_current_context()->trans->atom == NULL)); -+ -+ dplug = inode_dir_plugin(inode); -+ assert("vs-1101", dplug && dplug->done); -+ -+ /* kill cursors which might be attached to inode */ -+ reiser4_kill_cursors(inode); -+ -+ /* grab space enough for removing two items */ -+ if (reiser4_grab_space -+ (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)), -+ BA_RESERVED | BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ -+ result = dplug->done(inode); -+ if (!result) -+ result = common_object_delete_no_reserve(inode); -+ return result; -+} -+ -+/* this is common implementation of add_link method of file plugin -+ */ -+int reiser4_add_link_common(struct inode *object, struct inode *parent) -+{ -+ /* -+ * increment ->i_nlink and update ->i_ctime -+ */ -+ -+ INODE_INC_FIELD(object, i_nlink); -+ object->i_ctime = CURRENT_TIME; -+ return 0; -+} -+ -+/* this is common implementation of rem_link method of file plugin -+ */ -+int reiser4_rem_link_common(struct inode *object, struct inode *parent) -+{ -+ assert("nikita-2021", object != NULL); -+ assert("nikita-2163", object->i_nlink > 0); -+ -+ /* -+ * decrement ->i_nlink and update ->i_ctime -+ */ -+ -+ INODE_DEC_FIELD(object, i_nlink); -+ object->i_ctime = CURRENT_TIME; -+ return 0; -+} -+ -+/* this is common implementation of rem_link method of file plugin for typical -+ directory -+*/ -+int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG) -+{ -+ assert("nikita-20211", object != NULL); -+ assert("nikita-21631", object->i_nlink > 0); -+ -+ /* -+ * decrement ->i_nlink and update ->i_ctime -+ */ -+ INODE_DEC_FIELD(object, i_nlink); -+ if (object->i_nlink == 1) -+ INODE_DEC_FIELD(object, i_nlink); -+ object->i_ctime = CURRENT_TIME; -+ return 0; -+} -+ -+/* this is common implementation of owns_item method of file plugin -+ compare objectids of keys in 
inode and coord */ -+int owns_item_common(const struct inode *inode, /* object to check -+ * against */ -+ const coord_t * coord /* coord to check */ ) -+{ -+ reiser4_key item_key; -+ reiser4_key file_key; -+ -+ assert("nikita-760", inode != NULL); -+ assert("nikita-761", coord != NULL); -+ -+ return coord_is_existing_item(coord) && -+ (get_key_objectid(build_sd_key(inode, &file_key)) == -+ get_key_objectid(item_key_by_coord(coord, &item_key))); -+} -+ -+/* this is common implementation of owns_item method of file plugin -+ for typical directory -+*/ -+int owns_item_common_dir(const struct inode *inode, /* object to check against */ -+ const coord_t * coord /* coord of item to check */ ) -+{ -+ reiser4_key item_key; -+ -+ assert("nikita-1335", inode != NULL); -+ assert("nikita-1334", coord != NULL); -+ -+ if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE)) -+ return get_key_locality(item_key_by_coord(coord, &item_key)) == -+ get_inode_oid(inode); -+ else -+ return owns_item_common(inode, coord); -+} -+ -+/* this is common implementation of can_add_link method of file plugin -+ checks whether yet another hard links to this object can be added -+*/ -+int can_add_link_common(const struct inode *object /* object to check */ ) -+{ -+ assert("nikita-732", object != NULL); -+ -+ /* inode->i_nlink is unsigned int, so just check for integer -+ overflow */ -+ return object->i_nlink + 1 != 0; -+} -+ -+/* this is common implementation of can_rem_link method of file plugin for -+ typical directory -+*/ -+int can_rem_link_common_dir(const struct inode *inode) -+{ -+ /* is_dir_empty() returns 0 is dir is empty */ -+ return !is_dir_empty(inode); -+} -+ -+/* this is common implementation of detach method of file plugin for typical -+ directory -+*/ -+int reiser4_detach_common_dir(struct inode *child, struct inode *parent) -+{ -+ dir_plugin *dplug; -+ -+ dplug = inode_dir_plugin(child); -+ assert("nikita-2883", dplug != NULL); -+ assert("nikita-2884", 
dplug->detach != NULL); -+ return dplug->detach(child, parent); -+} -+ -+/* this is common implementation of bind method of file plugin for typical -+ directory -+*/ -+int reiser4_bind_common_dir(struct inode *child, struct inode *parent) -+{ -+ dir_plugin *dplug; -+ -+ dplug = inode_dir_plugin(child); -+ assert("nikita-2646", dplug != NULL); -+ return dplug->attach(child, parent); -+} -+ -+static int process_truncate(struct inode *, __u64 size); -+ -+/* this is common implementation of safelink method of file plugin -+ */ -+int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value) -+{ -+ int result; -+ -+ assert("vs-1705", get_current_context()->trans->atom == NULL); -+ if (link == SAFE_UNLINK) -+ /* nothing to do. iput() in the caller (process_safelink) will -+ * finish with file */ -+ result = 0; -+ else if (link == SAFE_TRUNCATE) -+ result = process_truncate(object, value); -+ else { -+ warning("nikita-3438", "Unrecognized safe-link type: %i", link); -+ result = RETERR(-EIO); -+ } -+ return result; -+} -+ -+/* this is common implementation of estimate.create method of file plugin -+ can be used when object creation involves insertion of one item (usually stat -+ data) into tree -+*/ -+reiser4_block_nr estimate_create_common(const struct inode * object) -+{ -+ return estimate_one_insert_item(reiser4_tree_by_inode(object)); -+} -+ -+/* this is common implementation of estimate.create method of file plugin for -+ typical directory -+ can be used when directory creation involves insertion of two items (usually -+ stat data and item containing "." 
and "..") into tree -+*/ -+reiser4_block_nr estimate_create_common_dir(const struct inode * object) -+{ -+ return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object)); -+} -+ -+/* this is common implementation of estimate.update method of file plugin -+ can be used when stat data update does not do more than inserting a unit -+ into a stat data item which is probably true for most cases -+*/ -+reiser4_block_nr estimate_update_common(const struct inode * inode) -+{ -+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode)); -+} -+ -+/* this is common implementation of estimate.unlink method of file plugin -+ */ -+reiser4_block_nr -+estimate_unlink_common(const struct inode * object UNUSED_ARG, -+ const struct inode * parent UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* this is common implementation of estimate.unlink method of file plugin for -+ typical directory -+*/ -+reiser4_block_nr -+estimate_unlink_common_dir(const struct inode * object, -+ const struct inode * parent) -+{ -+ dir_plugin *dplug; -+ -+ dplug = inode_dir_plugin(object); -+ assert("nikita-2888", dplug != NULL); -+ assert("nikita-2887", dplug->estimate.unlink != NULL); -+ return dplug->estimate.unlink(object, parent); -+} -+ -+char *wire_write_common(struct inode *inode, char *start) -+{ -+ return build_inode_onwire(inode, start); -+} -+ -+char *wire_read_common(char *addr, reiser4_object_on_wire * obj) -+{ -+ if (!obj) -+ return locate_obj_key_id_onwire(addr); -+ return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id); -+} -+ -+struct dentry *wire_get_common(struct super_block *sb, -+ reiser4_object_on_wire * obj) -+{ -+ struct inode *inode; -+ struct dentry *dentry; -+ reiser4_key key; -+ -+ extract_key_from_id(&obj->u.std.key_id, &key); -+ inode = reiser4_iget(sb, &key, 1); -+ if (!IS_ERR(inode)) { -+ reiser4_iget_complete(inode); -+ dentry = d_alloc_anon(inode); -+ if (dentry == NULL) { -+ iput(inode); -+ dentry = ERR_PTR(-ENOMEM); -+ } else -+ dentry->d_op = 
&get_super_private(sb)->ops.dentry; -+ } else if (PTR_ERR(inode) == -ENOENT) -+ /* -+ * inode wasn't found at the key encoded in the file -+ * handle. Hence, file handle is stale. -+ */ -+ dentry = ERR_PTR(RETERR(-ESTALE)); -+ else -+ dentry = (void *)inode; -+ return dentry; -+} -+ -+int wire_size_common(struct inode *inode) -+{ -+ return inode_onwire_size(inode); -+} -+ -+void wire_done_common(reiser4_object_on_wire * obj) -+{ -+ /* nothing to do */ -+} -+ -+/* helper function to print errors */ -+static void key_warning(const reiser4_key * key /* key to print */ , -+ const struct inode *inode, -+ int code /* error code to print */ ) -+{ -+ assert("nikita-716", key != NULL); -+ -+ if (code != -ENOMEM) { -+ warning("nikita-717", "Error for inode %llu (%i)", -+ (unsigned long long)get_key_objectid(key), code); -+ reiser4_print_key("for key", key); -+ } -+} -+ -+/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */ -+#if REISER4_DEBUG -+static void -+check_inode_seal(const struct inode *inode, -+ const coord_t * coord, const reiser4_key * key) -+{ -+ reiser4_key unit_key; -+ -+ unit_key_by_coord(coord, &unit_key); -+ assert("nikita-2752", -+ WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key))); -+ assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key)); -+} -+ -+static void check_sd_coord(coord_t * coord, const reiser4_key * key) -+{ -+ reiser4_key ukey; -+ -+ coord_clear_iplug(coord); -+ if (zload(coord->node)) -+ return; -+ -+ if (!coord_is_existing_unit(coord) || -+ !item_plugin_by_coord(coord) || -+ !keyeq(unit_key_by_coord(coord, &ukey), key) || -+ (znode_get_level(coord->node) != LEAF_LEVEL) || -+ !item_is_statdata(coord)) { -+ warning("nikita-1901", "Conspicuous seal"); -+ reiser4_print_key("key", key); -+ print_coord("coord", coord, 1); -+ impossible("nikita-2877", "no way"); -+ } -+ zrelse(coord->node); -+} -+ -+#else -+#define check_inode_seal(inode, coord, key) noop -+#define check_sd_coord(coord, key) noop -+#endif -+ -+/* 
insert new stat-data into tree. Called with inode state -+ locked. Return inode state locked. */ -+static int insert_new_sd(struct inode *inode /* inode to create sd for */ ) -+{ -+ int result; -+ reiser4_key key; -+ coord_t coord; -+ reiser4_item_data data; -+ char *area; -+ reiser4_inode *ref; -+ lock_handle lh; -+ oid_t oid; -+ -+ assert("nikita-723", inode != NULL); -+ assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ ref = reiser4_inode_data(inode); -+ spin_lock_inode(inode); -+ -+ if (ref->plugin_mask != 0) -+ /* inode has non-standard plugins */ -+ inode_set_extension(inode, PLUGIN_STAT); -+ /* -+ * prepare specification of new item to be inserted -+ */ -+ -+ data.iplug = inode_sd_plugin(inode); -+ data.length = data.iplug->s.sd.save_len(inode); -+ spin_unlock_inode(inode); -+ -+ data.data = NULL; -+ data.user = 0; -+/* could be optimized for case where there is only one node format in -+ * use in the filesystem, probably there are lots of such -+ * places we could optimize for only one node layout.... -Hans */ -+ if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()){ -+ /* This is silly check, but we don't know actual node where -+ insertion will go into. */ -+ return RETERR(-ENAMETOOLONG); -+ } -+ oid = oid_allocate(inode->i_sb); -+/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */ -+ if (oid == ABSOLUTE_MAX_OID) -+ return RETERR(-EOVERFLOW); -+ -+ set_inode_oid(inode, oid); -+ -+ coord_init_zero(&coord); -+ init_lh(&lh); -+ -+ result = insert_by_key(reiser4_tree_by_inode(inode), -+ build_sd_key(inode, &key), &data, &coord, &lh, -+ /* stat data lives on a leaf level */ -+ LEAF_LEVEL, CBK_UNIQUE); -+ -+ /* we don't want to re-check that somebody didn't insert -+ stat-data while we were doing io, because if it did, -+ insert_by_key() returned error. 
*/ -+ /* but what _is_ possible is that plugin for inode's stat-data, -+ list of non-standard plugins or their state would change -+ during io, so that stat-data wouldn't fit into sd. To avoid -+ this race we keep inode_state lock. This lock has to be -+ taken each time you access inode in a way that would cause -+ changes in sd size: changing plugins etc. -+ */ -+ -+ if (result == IBK_INSERT_OK) { -+ coord_clear_iplug(&coord); -+ result = zload(coord.node); -+ if (result == 0) { -+ /* have we really inserted stat data? */ -+ assert("nikita-725", item_is_statdata(&coord)); -+ -+ /* inode was just created. It is inserted into hash -+ table, but no directory entry was yet inserted into -+ parent. So, inode is inaccessible through -+ ->lookup(). All places that directly grab inode -+ from hash-table (like old knfsd), should check -+ IMMUTABLE flag that is set by common_create_child. -+ */ -+ assert("nikita-3240", data.iplug != NULL); -+ assert("nikita-3241", data.iplug->s.sd.save != NULL); -+ area = item_body_by_coord(&coord); -+ result = data.iplug->s.sd.save(inode, &area); -+ znode_make_dirty(coord.node); -+ if (result == 0) { -+ /* object has stat-data now */ -+ reiser4_inode_clr_flag(inode, REISER4_NO_SD); -+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN); -+ /* initialise stat-data seal */ -+ reiser4_seal_init(&ref->sd_seal, &coord, &key); -+ ref->sd_coord = coord; -+ check_inode_seal(inode, &coord, &key); -+ } else if (result != -ENOMEM) -+ /* -+ * convert any other error code to -EIO to -+ * avoid confusing user level with unexpected -+ * errors. 
-+ */ -+ result = RETERR(-EIO); -+ zrelse(coord.node); -+ } -+ } -+ done_lh(&lh); -+ -+ if (result != 0) -+ key_warning(&key, inode, result); -+ else -+ oid_count_allocated(); -+ -+ return result; -+} -+ -+/* find sd of inode in a tree, deal with errors */ -+int lookup_sd(struct inode *inode /* inode to look sd for */ , -+ znode_lock_mode lock_mode /* lock mode */ , -+ coord_t * coord /* resulting coord */ , -+ lock_handle * lh /* resulting lock handle */ , -+ const reiser4_key * key /* resulting key */ , -+ int silent) -+{ -+ int result; -+ __u32 flags; -+ -+ assert("nikita-1692", inode != NULL); -+ assert("nikita-1693", coord != NULL); -+ assert("nikita-1694", key != NULL); -+ -+ /* look for the object's stat data in a tree. -+ This returns in "node" pointer to a locked znode and in "pos" -+ position of an item found in node. Both are only valid if -+ coord_found is returned. */ -+ flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0; -+ flags |= CBK_UNIQUE; -+ /* -+ * traverse tree to find stat data. We cannot use vroot here, because -+ * it only covers _body_ of the file, and stat data don't belong -+ * there. 
-+ */ -+ result = coord_by_key(reiser4_tree_by_inode(inode), -+ key, -+ coord, -+ lh, -+ lock_mode, -+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL); -+ if (REISER4_DEBUG && result == 0) -+ check_sd_coord(coord, key); -+ -+ if (result != 0 && !silent) -+ key_warning(key, inode, result); -+ return result; -+} -+ -+static int -+locate_inode_sd(struct inode *inode, -+ reiser4_key * key, coord_t * coord, lock_handle * lh) -+{ -+ reiser4_inode *state; -+ seal_t seal; -+ int result; -+ -+ assert("nikita-3483", inode != NULL); -+ -+ state = reiser4_inode_data(inode); -+ spin_lock_inode(inode); -+ *coord = state->sd_coord; -+ coord_clear_iplug(coord); -+ seal = state->sd_seal; -+ spin_unlock_inode(inode); -+ -+ build_sd_key(inode, key); -+ if (reiser4_seal_is_set(&seal)) { -+ /* first, try to use seal */ -+ result = reiser4_seal_validate(&seal, -+ coord, -+ key, -+ lh, ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_LOPRI); -+ if (result == 0) -+ check_sd_coord(coord, key); -+ } else -+ result = -E_REPEAT; -+ -+ if (result != 0) { -+ coord_init_zero(coord); -+ result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0); -+ } -+ return result; -+} -+ -+#if REISER4_DEBUG -+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2) -+{ -+ return (get_key_locality(k1) == get_key_locality(k2) && -+ get_key_type(k1) == get_key_type(k2) && -+ get_key_band(k1) == get_key_band(k2) && -+ get_key_ordering(k1) == get_key_ordering(k2) && -+ get_key_objectid(k1) == get_key_objectid(k2)); -+} -+ -+#include "../tree_walk.h" -+ -+/* make some checks before and after stat-data resize operation */ -+static int check_sd_resize(struct inode * inode, coord_t * coord, -+ int length, int progress /* 1 means after resize */) -+{ -+ int ret = 0; -+ lock_handle left_lock; -+ coord_t left_coord; -+ reiser4_key left_key; -+ reiser4_key key; -+ -+ if (inode_file_plugin(inode) != -+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) -+ return 0; -+ if (!length) -+ return 0; -+ if 
(coord->item_pos != 0) -+ return 0; -+ -+ init_lh(&left_lock); -+ ret = reiser4_get_left_neighbor(&left_lock, -+ coord->node, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR || -+ ret == -ENOENT || ret == -EINVAL -+ || ret == -E_DEADLOCK) { -+ ret = 0; -+ goto exit; -+ } -+ ret = zload(left_lock.node); -+ if (ret) -+ goto exit; -+ coord_init_last_unit(&left_coord, left_lock.node); -+ item_key_by_coord(&left_coord, &left_key); -+ item_key_by_coord(coord, &key); -+ -+ if (all_but_offset_key_eq(&key, &left_key)) -+ /* corruption occured */ -+ ret = 1; -+ zrelse(left_lock.node); -+ exit: -+ done_lh(&left_lock); -+ return ret; -+} -+#endif -+ -+/* update stat-data at @coord */ -+static int -+update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key, -+ lock_handle * lh) -+{ -+ int result; -+ reiser4_item_data data; -+ char *area; -+ reiser4_inode *state; -+ znode *loaded; -+ -+ state = reiser4_inode_data(inode); -+ -+ coord_clear_iplug(coord); -+ result = zload(coord->node); -+ if (result != 0) -+ return result; -+ loaded = coord->node; -+ -+ spin_lock_inode(inode); -+ assert("nikita-728", inode_sd_plugin(inode) != NULL); -+ data.iplug = inode_sd_plugin(inode); -+ -+ /* if inode has non-standard plugins, add appropriate stat data -+ * extension */ -+ if (state->extmask & (1 << PLUGIN_STAT)) { -+ if (state->plugin_mask == 0) -+ inode_clr_extension(inode, PLUGIN_STAT); -+ } else if (state->plugin_mask != 0) -+ inode_set_extension(inode, PLUGIN_STAT); -+ -+ if (state->extmask & (1 << HEIR_STAT)) { -+ if (state->heir_mask == 0) -+ inode_clr_extension(inode, HEIR_STAT); -+ } else if (state->heir_mask != 0) -+ inode_set_extension(inode, HEIR_STAT); -+ -+ /* data.length is how much space to add to (or remove -+ from if negative) sd */ -+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) { -+ /* recalculate stat-data length */ -+ data.length = -+ data.iplug->s.sd.save_len(inode) - -+ 
item_length_by_coord(coord); -+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN); -+ } else -+ data.length = 0; -+ spin_unlock_inode(inode); -+ -+ /* if on-disk stat data is of different length than required -+ for this inode, resize it */ -+ -+ if (data.length != 0) { -+ data.data = NULL; -+ data.user = 0; -+ -+ assert("edward-1441", -+ !check_sd_resize(inode, coord, -+ data.length, 0/* before resize */)); -+ -+ /* insertion code requires that insertion point (coord) was -+ * between units. */ -+ coord->between = AFTER_UNIT; -+ result = reiser4_resize_item(coord, &data, key, lh, -+ COPI_DONT_SHIFT_LEFT); -+ if (result != 0) { -+ key_warning(key, inode, result); -+ zrelse(loaded); -+ return result; -+ } -+ if (loaded != coord->node) { -+ /* reiser4_resize_item moved coord to another node. -+ Zload it */ -+ zrelse(loaded); -+ coord_clear_iplug(coord); -+ result = zload(coord->node); -+ if (result != 0) -+ return result; -+ loaded = coord->node; -+ } -+ assert("edward-1442", -+ !check_sd_resize(inode, coord, -+ data.length, 1/* after resize */)); -+ } -+ area = item_body_by_coord(coord); -+ spin_lock_inode(inode); -+ result = data.iplug->s.sd.save(inode, &area); -+ znode_make_dirty(coord->node); -+ -+ /* re-initialise stat-data seal */ -+ -+ /* -+ * coord.between was possibly skewed from AT_UNIT when stat-data size -+ * was changed and new extensions were pasted into item. -+ */ -+ coord->between = AT_UNIT; -+ reiser4_seal_init(&state->sd_seal, coord, key); -+ state->sd_coord = *coord; -+ spin_unlock_inode(inode); -+ check_inode_seal(inode, coord, key); -+ zrelse(loaded); -+ return result; -+} -+ -+/* Update existing stat-data in a tree. Called with inode state locked. Return -+ inode state locked. */ -+static int update_sd(struct inode *inode /* inode to update sd for */ ) -+{ -+ int result; -+ reiser4_key key; -+ coord_t coord; -+ lock_handle lh; -+ -+ assert("nikita-726", inode != NULL); -+ -+ /* no stat-data, nothing to update?! 
*/ -+ assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD)); -+ -+ init_lh(&lh); -+ -+ result = locate_inode_sd(inode, &key, &coord, &lh); -+ if (result == 0) -+ result = update_sd_at(inode, &coord, &key, &lh); -+ done_lh(&lh); -+ -+ return result; -+} -+ -+/* helper for reiser4_delete_object_common and reiser4_delete_dir_common. -+ Remove object stat data. Space for that must be reserved by caller before -+*/ -+static int -+common_object_delete_no_reserve(struct inode *inode /* object to remove */ ) -+{ -+ int result; -+ -+ assert("nikita-1477", inode != NULL); -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) { -+ reiser4_key sd_key; -+ -+ DQUOT_FREE_INODE(inode); -+ DQUOT_DROP(inode); -+ -+ build_sd_key(inode, &sd_key); -+ result = -+ reiser4_cut_tree(reiser4_tree_by_inode(inode), -+ &sd_key, &sd_key, NULL, 0); -+ if (result == 0) { -+ reiser4_inode_set_flag(inode, REISER4_NO_SD); -+ result = oid_release(inode->i_sb, get_inode_oid(inode)); -+ if (result == 0) { -+ oid_count_released(); -+ -+ result = safe_link_del(reiser4_tree_by_inode(inode), -+ get_inode_oid(inode), -+ SAFE_UNLINK); -+ } -+ } -+ } else -+ result = 0; -+ return result; -+} -+ -+/* helper for safelink_common */ -+static int process_truncate(struct inode *inode, __u64 size) -+{ -+ int result; -+ struct iattr attr; -+ file_plugin *fplug; -+ reiser4_context *ctx; -+ struct dentry dentry; -+ -+ assert("vs-21", is_in_reiser4_context()); -+ ctx = reiser4_init_context(inode->i_sb); -+ assert("vs-22", !IS_ERR(ctx)); -+ -+ attr.ia_size = size; -+ attr.ia_valid = ATTR_SIZE | ATTR_CTIME; -+ fplug = inode_file_plugin(inode); -+ -+ mutex_lock(&inode->i_mutex); -+ assert("vs-1704", get_current_context()->trans->atom == NULL); -+ dentry.d_inode = inode; -+ result = inode->i_op->setattr(&dentry, &attr); -+ mutex_unlock(&inode->i_mutex); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ -+ return result; -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ 
mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/hash.c linux-2.6.24/fs/reiser4/plugin/hash.c ---- linux-2.6.24.orig/fs/reiser4/plugin/hash.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/hash.c 2008-01-25 11:39:06.996223145 +0300 -@@ -0,0 +1,353 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Hash functions */ -+ -+#include "../debug.h" -+#include "plugin_header.h" -+#include "plugin.h" -+#include "../super.h" -+#include "../inode.h" -+ -+#include -+ -+/* old rupasov (yura) hash */ -+static __u64 hash_rupasov(const unsigned char *name /* name to hash */ , -+ int len /* @name's length */ ) -+{ -+ int i; -+ int j; -+ int pow; -+ __u64 a; -+ __u64 c; -+ -+ assert("nikita-672", name != NULL); -+ assert("nikita-673", len >= 0); -+ -+ for (pow = 1, i = 1; i < len; ++i) -+ pow = pow * 10; -+ -+ if (len == 1) -+ a = name[0] - 48; -+ else -+ a = (name[0] - 48) * pow; -+ -+ for (i = 1; i < len; ++i) { -+ c = name[i] - 48; -+ for (pow = 1, j = i; j < len - 1; ++j) -+ pow = pow * 10; -+ a = a + c * pow; -+ } -+ for (; i < 40; ++i) { -+ c = '0' - 48; -+ for (pow = 1, j = i; j < len - 1; ++j) -+ pow = pow * 10; -+ a = a + c * pow; -+ } -+ -+ for (; i < 256; ++i) { -+ c = i; -+ for (pow = 1, j = i; j < len - 1; ++j) -+ pow = pow * 10; -+ a = a + c * pow; -+ } -+ -+ a = a << 7; -+ return a; -+} -+ -+/* r5 hash */ -+static __u64 hash_r5(const unsigned char *name /* name to hash */ , -+ int len UNUSED_ARG /* @name's length */ ) -+{ -+ __u64 a = 0; -+ -+ assert("nikita-674", name != NULL); -+ assert("nikita-675", len >= 0); -+ -+ while (*name) { -+ a += *name << 4; -+ a += *name >> 4; -+ a *= 11; -+ name++; -+ } -+ return a; -+} -+ -+/* Keyed 32-bit hash function using TEA in a Davis-Meyer function -+ H0 = Key -+ Hi = E Mi(Hi-1) + Hi-1 -+ -+ (see Applied Cryptography, 2nd edition, p448). 
-+ -+ Jeremy Fitzhardinge 1998 -+ -+ Jeremy has agreed to the contents of reiserfs/README. -Hans -+ -+ This code was blindly upgraded to __u64 by s/__u32/__u64/g. -+*/ -+static __u64 hash_tea(const unsigned char *name /* name to hash */ , -+ int len /* @name's length */ ) -+{ -+ __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u }; -+ -+ __u64 h0 = k[0], h1 = k[1]; -+ __u64 a, b, c, d; -+ __u64 pad; -+ int i; -+ -+ assert("nikita-676", name != NULL); -+ assert("nikita-677", len >= 0); -+ -+#define DELTA 0x9E3779B9u -+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ -+#define PARTROUNDS 6 /* 6 gets complete mixing */ -+ -+/* a, b, c, d - data; h0, h1 - accumulated hash */ -+#define TEACORE(rounds) \ -+ do { \ -+ __u64 sum = 0; \ -+ int n = rounds; \ -+ __u64 b0, b1; \ -+ \ -+ b0 = h0; \ -+ b1 = h1; \ -+ \ -+ do \ -+ { \ -+ sum += DELTA; \ -+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ -+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ -+ } while(--n); \ -+ \ -+ h0 += b0; \ -+ h1 += b1; \ -+ } while(0) -+ -+ pad = (__u64) len | ((__u64) len << 8); -+ pad |= pad << 16; -+ -+ while (len >= 16) { -+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << -+ 16 | (__u64) name[3] << 24; -+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << -+ 16 | (__u64) name[7] << 24; -+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << -+ 16 | (__u64) name[11] << 24; -+ d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14] -+ << 16 | (__u64) name[15] << 24; -+ -+ TEACORE(PARTROUNDS); -+ -+ len -= 16; -+ name += 16; -+ } -+ -+ if (len >= 12) { -+ //assert(len < 16); -+ if (len >= 16) -+ *(int *)0 = 0; -+ -+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << -+ 16 | (__u64) name[3] << 24; -+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << -+ 16 | (__u64) name[7] << 24; -+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << -+ 16 | (__u64) name[11] << 24; -+ -+ d = 
pad; -+ for (i = 12; i < len; i++) { -+ d <<= 8; -+ d |= name[i]; -+ } -+ } else if (len >= 8) { -+ //assert(len < 12); -+ if (len >= 12) -+ *(int *)0 = 0; -+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << -+ 16 | (__u64) name[3] << 24; -+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << -+ 16 | (__u64) name[7] << 24; -+ -+ c = d = pad; -+ for (i = 8; i < len; i++) { -+ c <<= 8; -+ c |= name[i]; -+ } -+ } else if (len >= 4) { -+ //assert(len < 8); -+ if (len >= 8) -+ *(int *)0 = 0; -+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << -+ 16 | (__u64) name[3] << 24; -+ -+ b = c = d = pad; -+ for (i = 4; i < len; i++) { -+ b <<= 8; -+ b |= name[i]; -+ } -+ } else { -+ //assert(len < 4); -+ if (len >= 4) -+ *(int *)0 = 0; -+ a = b = c = d = pad; -+ for (i = 0; i < len; i++) { -+ a <<= 8; -+ a |= name[i]; -+ } -+ } -+ -+ TEACORE(FULLROUNDS); -+ -+/* return 0;*/ -+ return h0 ^ h1; -+ -+} -+ -+/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash. -+ -+ See http://www.isthe.com/chongo/tech/comp/fnv/ for details. -+ -+ Excerpts: -+ -+ FNV hashes are designed to be fast while maintaining a low collision -+ rate. -+ -+ [This version also seems to preserve lexicographical order locally.] -+ -+ FNV hash algorithms and source code have been released into the public -+ domain. 
-+ -+*/ -+static __u64 hash_fnv1(const unsigned char *name /* name to hash */ , -+ int len UNUSED_ARG /* @name's length */ ) -+{ -+ unsigned long long a = 0xcbf29ce484222325ull; -+ const unsigned long long fnv_64_prime = 0x100000001b3ull; -+ -+ assert("nikita-678", name != NULL); -+ assert("nikita-679", len >= 0); -+ -+ /* FNV-1 hash each octet in the buffer */ -+ for (; *name; ++name) { -+ /* multiply by the 32 bit FNV magic prime mod 2^64 */ -+ a *= fnv_64_prime; -+ /* xor the bottom with the current octet */ -+ a ^= (unsigned long long)(*name); -+ } -+ /* return our new hash value */ -+ return a; -+} -+ -+/* degenerate hash function used to simplify testing of non-unique key -+ handling */ -+static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ , -+ int len UNUSED_ARG /* @name's length */ ) -+{ -+ return 0xc0c0c0c010101010ull; -+} -+ -+static int change_hash(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ int result; -+ -+ assert("nikita-3503", inode != NULL); -+ assert("nikita-3504", plugin != NULL); -+ -+ assert("nikita-3505", is_reiser4_inode(inode)); -+ assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE); -+ -+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE)) -+ return RETERR(-EINVAL); -+ -+ result = 0; -+ if (inode_hash_plugin(inode) == NULL || -+ inode_hash_plugin(inode)->h.id != plugin->h.id) { -+ if (is_dir_empty(inode) == 0) -+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset, -+ PSET_HASH, plugin); -+ else -+ result = RETERR(-ENOTEMPTY); -+ -+ } -+ return result; -+} -+ -+static reiser4_plugin_ops hash_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = change_hash -+}; -+ -+/* hash plugins */ -+hash_plugin hash_plugins[LAST_HASH_ID] = { -+ [RUPASOV_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = RUPASOV_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "rupasov", -+ .desc = 
"Original Yura's hash", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_rupasov -+ }, -+ [R5_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = R5_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "r5", -+ .desc = "r5 hash", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_r5 -+ }, -+ [TEA_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = TEA_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "tea", -+ .desc = "tea hash", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_tea -+ }, -+ [FNV1_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = FNV1_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "fnv1", -+ .desc = "fnv1 hash", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_fnv1 -+ }, -+ [DEGENERATE_HASH_ID] = { -+ .h = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .id = DEGENERATE_HASH_ID, -+ .pops = &hash_plugin_ops, -+ .label = "degenerate hash", -+ .desc = "Degenerate hash: only for testing", -+ .linkage = {NULL, NULL} -+ }, -+ .hash = hash_deg -+ } -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.24/fs/reiser4/plugin/inode_ops.c ---- linux-2.6.24.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/inode_ops.c 2008-01-25 11:39:07.000224175 +0300 -@@ -0,0 +1,897 @@ -+/* -+ * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README -+ */ -+ -+/* -+ * this file contains typical implementations for most of methods of struct -+ * inode_operations -+ */ -+ -+#include "../inode.h" -+#include "../safe_link.h" -+ -+#include -+#include -+ -+static int create_vfs_object(struct inode *parent, struct dentry *dentry, -+ reiser4_object_create_data *data); -+ -+/** -+ * reiser4_create_common - create of inode operations -+ * @parent: inode of parent directory -+ * @dentry: dentry of new object to create -+ * @mode: the permissions to use -+ * @nameidata: -+ * -+ * This is common implementation of vfs's create method of struct -+ * inode_operations. -+ * Creates regular file using file plugin from parent directory plugin set. -+ */ -+int reiser4_create_common(struct inode *parent, struct dentry *dentry, -+ int mode, struct nameidata *nameidata) -+{ -+ reiser4_object_create_data data; -+ file_plugin *fplug; -+ -+ memset(&data, 0, sizeof data); -+ data.mode = S_IFREG | mode; -+ fplug = child_create_plugin(parent) ? 
: inode_create_plugin(parent); -+ if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) { -+ warning("vpf-1900", "'%s' is not a regular file plugin.", -+ fplug->h.label); -+ return RETERR(-EIO); -+ } -+ data.id = fplug->h.id; -+ return create_vfs_object(parent, dentry, &data); -+} -+ -+int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *); -+void check_light_weight(struct inode *inode, struct inode *parent); -+ -+/** -+ * reiser4_lookup_common - lookup of inode operations -+ * @parent: inode of directory to lookup into -+ * @dentry: name to look for -+ * @nameidata: -+ * -+ * This is common implementation of vfs's lookup method of struct -+ * inode_operations. -+ */ -+struct dentry *reiser4_lookup_common(struct inode *parent, -+ struct dentry *dentry, -+ struct nameidata *nameidata) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct dentry *new; -+ struct inode *inode; -+ reiser4_dir_entry_desc entry; -+ -+ ctx = reiser4_init_context(parent->i_sb); -+ if (IS_ERR(ctx)) -+ return (struct dentry *)ctx; -+ -+ /* set up operations on dentry. */ -+ dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry; -+ -+ result = reiser4_lookup_name(parent, dentry, &entry.key); -+ if (result) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ if (result == -ENOENT) { -+ /* object not found */ -+ if (!IS_DEADDIR(parent)) -+ d_add(dentry, NULL); -+ return NULL; -+ } -+ return ERR_PTR(result); -+ } -+ -+ inode = reiser4_iget(parent->i_sb, &entry.key, 0); -+ if (IS_ERR(inode)) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return ERR_PTR(PTR_ERR(inode)); -+ } -+ -+ /* success */ -+ check_light_weight(inode, parent); -+ new = d_splice_alias(inode, dentry); -+ reiser4_iget_complete(inode); -+ -+ /* prevent balance_dirty_pages() from being called: we don't want to -+ * do this under directory i_mutex. 
*/ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return new; -+} -+ -+static reiser4_block_nr common_estimate_link(struct inode *parent, -+ struct inode *object); -+int reiser4_update_dir(struct inode *); -+ -+/** -+ * reiser4_link_common - link of inode operations -+ * @existing: dentry of object which is to get new name -+ * @parent: directory where new name is to be created -+ * @newname: new name -+ * -+ * This is common implementation of vfs's link method of struct -+ * inode_operations. -+ */ -+int reiser4_link_common(struct dentry *existing, struct inode *parent, -+ struct dentry *newname) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *object; -+ dir_plugin *parent_dplug; -+ reiser4_dir_entry_desc entry; -+ reiser4_object_create_data data; -+ reiser4_block_nr reserve; -+ -+ ctx = reiser4_init_context(parent->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ assert("nikita-1431", existing != NULL); -+ assert("nikita-1432", parent != NULL); -+ assert("nikita-1433", newname != NULL); -+ -+ object = existing->d_inode; -+ assert("nikita-1434", object != NULL); -+ -+ /* check for race with create_object() */ -+ if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-E_REPEAT); -+ } -+ -+ parent_dplug = inode_dir_plugin(parent); -+ -+ memset(&entry, 0, sizeof entry); -+ entry.obj = object; -+ -+ data.mode = object->i_mode; -+ data.id = inode_file_plugin(object)->h.id; -+ -+ reserve = common_estimate_link(parent, existing->d_inode); -+ if ((__s64) reserve < 0) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return reserve; -+ } -+ -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOSPC); -+ } -+ -+ /* -+ * Subtle race handling: sys_link() doesn't take i_mutex on @parent. 
It -+ * means that link(2) can race against unlink(2) or rename(2), and -+ * inode is dead (->i_nlink == 0) when reiser4_link() is entered. -+ * -+ * For such inode we have to undo special processing done in -+ * reiser4_unlink() viz. creation of safe-link. -+ */ -+ if (unlikely(object->i_nlink == 0)) { -+ result = safe_link_del(reiser4_tree_by_inode(object), -+ get_inode_oid(object), SAFE_UNLINK); -+ if (result != 0) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ } -+ -+ /* increment nlink of @existing and update its stat data */ -+ result = reiser4_add_nlink(object, parent, 1); -+ if (result == 0) { -+ /* add entry to the parent */ -+ result = -+ parent_dplug->add_entry(parent, newname, &data, &entry); -+ if (result != 0) { -+ /* failed to add entry to the parent, decrement nlink -+ of @existing */ -+ reiser4_del_nlink(object, parent, 1); -+ /* -+ * now, if that failed, we have a file with too big -+ * nlink---space leak, much better than directory -+ * entry pointing to nowhere -+ */ -+ } -+ } -+ if (result == 0) { -+ atomic_inc(&object->i_count); -+ /* -+ * Upon successful completion, link() shall mark for update -+ * the st_ctime field of the file. Also, the st_ctime and -+ * st_mtime fields of the directory that contains the new -+ * entry shall be marked for update. --SUS -+ */ -+ result = reiser4_update_dir(parent); -+ } -+ if (result == 0) -+ d_instantiate(newname, existing->d_inode); -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim); -+ -+/** -+ * reiser4_unlink_common - unlink of inode operations -+ * @parent: inode of directory to remove name from -+ * @victim: name to be removed -+ * -+ * This is common implementation of vfs's unlink method of struct -+ * inode_operations. 
-+ */ -+int reiser4_unlink_common(struct inode *parent, struct dentry *victim) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *object; -+ file_plugin *fplug; -+ -+ ctx = reiser4_init_context(parent->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ object = victim->d_inode; -+ fplug = inode_file_plugin(object); -+ assert("nikita-2882", fplug->detach != NULL); -+ -+ result = unlink_check_and_grab(parent, victim); -+ if (result != 0) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = fplug->detach(object, parent); -+ if (result == 0) { -+ dir_plugin *parent_dplug; -+ reiser4_dir_entry_desc entry; -+ -+ parent_dplug = inode_dir_plugin(parent); -+ memset(&entry, 0, sizeof entry); -+ -+ /* first, delete directory entry */ -+ result = parent_dplug->rem_entry(parent, victim, &entry); -+ if (result == 0) { -+ /* -+ * if name was removed successfully, we _have_ to -+ * return 0 from this function, because upper level -+ * caller (vfs_{rmdir,unlink}) expect this. -+ * -+ * now that directory entry is removed, update -+ * stat-data -+ */ -+ reiser4_del_nlink(object, parent, 1); -+ /* -+ * Upon successful completion, unlink() shall mark for -+ * update the st_ctime and st_mtime fields of the -+ * parent directory. Also, if the file's link count is -+ * not 0, the st_ctime field of the file shall be -+ * marked for update. --SUS -+ */ -+ reiser4_update_dir(parent); -+ /* add safe-link for this file */ -+ if (object->i_nlink == 0) -+ safe_link_add(object, SAFE_UNLINK); -+ } -+ } -+ -+ if (unlikely(result != 0)) { -+ if (result != -ENOMEM) -+ warning("nikita-3398", "Cannot unlink %llu (%i)", -+ (unsigned long long)get_inode_oid(object), -+ result); -+ /* if operation failed commit pending inode modifications to -+ * the stat-data */ -+ reiser4_update_sd(object); -+ reiser4_update_sd(parent); -+ } -+ -+ reiser4_release_reserved(object->i_sb); -+ -+ /* @object's i_ctime was updated by ->rem_link() method(). 
*/ -+ -+ /* @victim can be already removed from the disk by this time. Inode is -+ then marked so that iput() wouldn't try to remove stat data. But -+ inode itself is still there. -+ */ -+ -+ /* -+ * we cannot release directory semaphore here, because name has -+ * already been deleted, but dentry (@victim) still exists. Prevent -+ * balance_dirty_pages() from being called on exiting this context: we -+ * don't want to do this under directory i_mutex. -+ */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/** -+ * reiser4_symlink_common - symlink of inode operations -+ * @parent: inode of parent directory -+ * @dentry: dentry of object to be created -+ * @linkname: string symlink is to contain -+ * -+ * This is common implementation of vfs's symlink method of struct -+ * inode_operations. -+ * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID. -+ */ -+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry, -+ const char *linkname) -+{ -+ reiser4_object_create_data data; -+ -+ memset(&data, 0, sizeof data); -+ data.name = linkname; -+ data.id = SYMLINK_FILE_PLUGIN_ID; -+ data.mode = S_IFLNK | S_IRWXUGO; -+ return create_vfs_object(parent, dentry, &data); -+} -+ -+/** -+ * reiser4_mkdir_common - mkdir of inode operations -+ * @parent: inode of parent directory -+ * @dentry: dentry of object to be created -+ * @mode: the permissions to use -+ * -+ * This is common implementation of vfs's mkdir method of struct -+ * inode_operations. -+ * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID. 
-+ */ -+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode) -+{ -+ reiser4_object_create_data data; -+ -+ memset(&data, 0, sizeof data); -+ data.mode = S_IFDIR | mode; -+ data.id = DIRECTORY_FILE_PLUGIN_ID; -+ return create_vfs_object(parent, dentry, &data); -+} -+ -+/** -+ * reiser4_mknod_common - mknod of inode operations -+ * @parent: inode of parent directory -+ * @dentry: dentry of object to be created -+ * @mode: the permissions to use and file type -+ * @rdev: minor and major of new device file -+ * -+ * This is common implementation of vfs's mknod method of struct -+ * inode_operations. -+ * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID. -+ */ -+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry, -+ int mode, dev_t rdev) -+{ -+ reiser4_object_create_data data; -+ -+ memset(&data, 0, sizeof data); -+ data.mode = mode; -+ data.rdev = rdev; -+ data.id = SPECIAL_FILE_PLUGIN_ID; -+ return create_vfs_object(parent, dentry, &data); -+} -+ -+/* -+ * implementation of vfs's rename method of struct inode_operations for typical -+ * directory is in inode_ops_rename.c -+ */ -+ -+/** -+ * reiser4_follow_link_common - follow_link of inode operations -+ * @dentry: dentry of symlink -+ * @data: -+ * -+ * This is common implementation of vfs's followlink method of struct -+ * inode_operations. -+ * Assumes that inode's i_private points to the content of symbolic link. 
-+ */ -+void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd) -+{ -+ assert("vs-851", S_ISLNK(dentry->d_inode->i_mode)); -+ -+ if (!dentry->d_inode->i_private -+ || !reiser4_inode_get_flag(dentry->d_inode, -+ REISER4_GENERIC_PTR_USED)) -+ return ERR_PTR(RETERR(-EINVAL)); -+ nd_set_link(nd, dentry->d_inode->i_private); -+ return NULL; -+} -+ -+/** -+ * reiser4_permission_common - permission of inode operations -+ * @inode: inode to check permissions for -+ * @mask: mode bits to check permissions for -+ * @nameidata: -+ * -+ * Uses generic function to check for rwx permissions. -+ */ -+int reiser4_permission_common(struct inode *inode, int mask, -+ struct nameidata *nameidata) -+{ -+ return generic_permission(inode, mask, NULL); -+} -+ -+static int setattr_reserve(reiser4_tree *); -+ -+/* this is common implementation of vfs's setattr method of struct -+ inode_operations -+*/ -+int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr) -+{ -+ reiser4_context *ctx; -+ struct inode *inode; -+ int result; -+ -+ inode = dentry->d_inode; -+ result = inode_change_ok(inode, attr); -+ if (result) -+ return result; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE)); -+ -+ /* -+ * grab disk space and call standard inode_setattr(). -+ */ -+ result = setattr_reserve(reiser4_tree_by_inode(inode)); -+ if (!result) { -+ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) -+ || (attr->ia_valid & ATTR_GID -+ && attr->ia_gid != inode->i_gid)) { -+ result = DQUOT_TRANSFER(inode, attr) ? 
-EDQUOT : 0; -+ if (result) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ } -+ result = inode_setattr(inode, attr); -+ if (!result) -+ reiser4_update_sd(inode); -+ } -+ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* this is common implementation of vfs's getattr method of struct -+ inode_operations -+*/ -+int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG, -+ struct dentry *dentry, struct kstat *stat) -+{ -+ struct inode *obj; -+ -+ assert("nikita-2298", dentry != NULL); -+ assert("nikita-2299", stat != NULL); -+ assert("nikita-2300", dentry->d_inode != NULL); -+ -+ obj = dentry->d_inode; -+ -+ stat->dev = obj->i_sb->s_dev; -+ stat->ino = oid_to_uino(get_inode_oid(obj)); -+ stat->mode = obj->i_mode; -+ /* don't confuse userland with huge nlink. This is not entirely -+ * correct, because nlink_t is not necessary 16 bit signed. */ -+ stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff); -+ stat->uid = obj->i_uid; -+ stat->gid = obj->i_gid; -+ stat->rdev = obj->i_rdev; -+ stat->atime = obj->i_atime; -+ stat->mtime = obj->i_mtime; -+ stat->ctime = obj->i_ctime; -+ stat->size = obj->i_size; -+ stat->blocks = -+ (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS; -+ /* "preferred" blocksize for efficient file system I/O */ -+ stat->blksize = get_super_private(obj->i_sb)->optimal_io_size; -+ -+ return 0; -+} -+ -+/* Estimate the maximum amount of nodes which might be allocated or changed on -+ typical new object creation. Typical creation consists of calling create -+ method of file plugin, adding directory entry to parent and update parent -+ directory's stat data. 
-+*/ -+static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */ -+ struct inode *object -+ /* object */ ) -+{ -+ assert("vpf-309", parent != NULL); -+ assert("vpf-307", object != NULL); -+ -+ return -+ /* object creation estimation */ -+ inode_file_plugin(object)->estimate.create(object) + -+ /* stat data of parent directory estimation */ -+ inode_file_plugin(parent)->estimate.update(parent) + -+ /* adding entry estimation */ -+ inode_dir_plugin(parent)->estimate.add_entry(parent) + -+ /* to undo in the case of failure */ -+ inode_dir_plugin(parent)->estimate.rem_entry(parent); -+} -+ -+/* Create child in directory. -+ -+ . get object's plugin -+ . get fresh inode -+ . initialize inode -+ . add object's stat-data -+ . initialize object's directory -+ . add entry to the parent -+ . instantiate dentry -+ -+*/ -+static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new -+ object */ -+ struct inode **retobj) -+{ -+ int result; -+ -+ struct dentry *dentry; /* parent object */ -+ struct inode *parent; /* new name */ -+ -+ dir_plugin *par_dir; /* directory plugin on the parent */ -+ dir_plugin *obj_dir; /* directory plugin on the new object */ -+ file_plugin *obj_plug; /* object plugin on the new object */ -+ struct inode *object; /* new object */ -+ reiser4_block_nr reserve; -+ -+ reiser4_dir_entry_desc entry; /* new directory entry */ -+ -+ assert("nikita-1420", data != NULL); -+ parent = data->parent; -+ dentry = data->dentry; -+ -+ assert("nikita-1418", parent != NULL); -+ assert("nikita-1419", dentry != NULL); -+ -+ /* check, that name is acceptable for parent */ -+ par_dir = inode_dir_plugin(parent); -+ if (par_dir->is_name_acceptable && -+ !par_dir->is_name_acceptable(parent, -+ dentry->d_name.name, -+ (int)dentry->d_name.len)) -+ return RETERR(-ENAMETOOLONG); -+ -+ result = 0; -+ obj_plug = file_plugin_by_id((int)data->id); -+ if (obj_plug == NULL) { -+ warning("nikita-430", "Cannot find plugin %i", 
data->id); -+ return RETERR(-ENOENT); -+ } -+ object = new_inode(parent->i_sb); -+ if (object == NULL) -+ return RETERR(-ENOMEM); -+ /* we'll update i_nlink below */ -+ object->i_nlink = 0; -+ /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0, -+ * to simplify error handling: if some error occurs before i_ino is -+ * initialized with oid, i_ino should already be set to some -+ * distinguished value. */ -+ object->i_ino = 0; -+ -+ /* So that on error iput will be called. */ -+ *retobj = object; -+ -+ if (DQUOT_ALLOC_INODE(object)) { -+ DQUOT_DROP(object); -+ object->i_flags |= S_NOQUOTA; -+ return RETERR(-EDQUOT); -+ } -+ -+ memset(&entry, 0, sizeof entry); -+ entry.obj = object; -+ -+ set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE, -+ file_plugin_to_plugin(obj_plug)); -+ result = obj_plug->set_plug_in_inode(object, parent, data); -+ if (result) { -+ warning("nikita-431", "Cannot install plugin %i on %llx", -+ data->id, (unsigned long long)get_inode_oid(object)); -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ return result; -+ } -+ -+ /* reget plugin after installation */ -+ obj_plug = inode_file_plugin(object); -+ -+ if (obj_plug->create_object == NULL) { -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ return RETERR(-EPERM); -+ } -+ -+ /* if any of hash, tail, sd or permission plugins for newly created -+ object are not set yet set them here inheriting them from parent -+ directory -+ */ -+ assert("nikita-2070", obj_plug->adjust_to_parent != NULL); -+ result = obj_plug->adjust_to_parent(object, -+ parent, -+ object->i_sb->s_root->d_inode); -+ if (result == 0) -+ result = finish_pset(object); -+ if (result != 0) { -+ warning("nikita-432", "Cannot inherit from %llx to %llx", -+ (unsigned long long)get_inode_oid(parent), -+ (unsigned long long)get_inode_oid(object)); -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ return result; -+ } -+ -+ /* setup inode and file-operations for this inode 
*/ -+ setup_inode_ops(object, data); -+ -+ /* call file plugin's method to initialize plugin specific part of -+ * inode */ -+ if (obj_plug->init_inode_data) -+ obj_plug->init_inode_data(object, data, 1 /*create */ ); -+ -+ /* obtain directory plugin (if any) for new object. */ -+ obj_dir = inode_dir_plugin(object); -+ if (obj_dir != NULL && obj_dir->init == NULL) { -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ return RETERR(-EPERM); -+ } -+ -+ reiser4_inode_data(object)->locality_id = get_inode_oid(parent); -+ -+ reserve = estimate_create_vfs_object(parent, object); -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) { -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ return RETERR(-ENOSPC); -+ } -+ -+ /* mark inode `immutable'. We disable changes to the file being -+ created until valid directory entry for it is inserted. Otherwise, -+ if file were expanded and insertion of directory entry fails, we -+ have to remove file, but we only alloted enough space in -+ transaction to remove _empty_ file. 3.x code used to remove stat -+ data in different transaction thus possibly leaking disk space on -+ crash. This all only matters if it's possible to access file -+ without name, for example, by inode number -+ */ -+ reiser4_inode_set_flag(object, REISER4_IMMUTABLE); -+ -+ /* create empty object, this includes allocation of new objectid. For -+ directories this implies creation of dot and dotdot */ -+ assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD)); -+ -+ /* mark inode as `loaded'. From this point onward -+ reiser4_delete_inode() will try to remove its stat-data. 
*/ -+ reiser4_inode_set_flag(object, REISER4_LOADED); -+ -+ result = obj_plug->create_object(object, parent, data); -+ if (result != 0) { -+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE); -+ if (result != -ENAMETOOLONG && result != -ENOMEM) -+ warning("nikita-2219", -+ "Failed to create sd for %llu", -+ (unsigned long long)get_inode_oid(object)); -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ return result; -+ } -+ -+ if (obj_dir != NULL) -+ result = obj_dir->init(object, parent, data); -+ if (result == 0) { -+ assert("nikita-434", !reiser4_inode_get_flag(object, -+ REISER4_NO_SD)); -+ /* insert inode into VFS hash table */ -+ insert_inode_hash(object); -+ /* create entry */ -+ result = par_dir->add_entry(parent, dentry, data, &entry); -+ if (result == 0) { -+ result = reiser4_add_nlink(object, parent, 0); -+ /* If O_CREAT is set and the file did not previously -+ exist, upon successful completion, open() shall -+ mark for update the st_atime, st_ctime, and -+ st_mtime fields of the file and the st_ctime and -+ st_mtime fields of the parent directory. --SUS -+ */ -+ /* @object times are already updated by -+ reiser4_add_nlink() */ -+ if (result == 0) -+ reiser4_update_dir(parent); -+ if (result != 0) -+ /* cleanup failure to add nlink */ -+ par_dir->rem_entry(parent, dentry, &entry); -+ } -+ if (result != 0) -+ /* cleanup failure to add entry */ -+ obj_plug->detach(object, parent); -+ } else if (result != -ENOMEM) -+ warning("nikita-2219", "Failed to initialize dir for %llu: %i", -+ (unsigned long long)get_inode_oid(object), result); -+ -+ /* -+ * update stat-data, committing all pending modifications to the inode -+ * fields. 
-+ */ -+ reiser4_update_sd(object); -+ if (result != 0) { -+ DQUOT_FREE_INODE(object); -+ object->i_flags |= S_NOQUOTA; -+ /* if everything was ok (result == 0), parent stat-data is -+ * already updated above (update_parent_dir()) */ -+ reiser4_update_sd(parent); -+ /* failure to create entry, remove object */ -+ obj_plug->delete_object(object); -+ } -+ -+ /* file has name now, clear immutable flag */ -+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE); -+ -+ /* on error, iput() will call ->delete_inode(). We should keep track -+ of the existence of stat-data for this inode and avoid attempt to -+ remove it in reiser4_delete_inode(). This is accomplished through -+ REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags -+ */ -+ return result; -+} -+ -+/* this is helper for common implementations of reiser4_mkdir, reiser4_create, -+ reiser4_mknod and reiser4_symlink -+*/ -+static int -+create_vfs_object(struct inode *parent, -+ struct dentry *dentry, reiser4_object_create_data * data) -+{ -+ reiser4_context *ctx; -+ int result; -+ struct inode *child; -+ -+ ctx = reiser4_init_context(parent->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ context_set_commit_async(ctx); -+ -+ data->parent = parent; -+ data->dentry = dentry; -+ child = NULL; -+ result = do_create_vfs_child(data, &child); -+ if (unlikely(result != 0)) { -+ if (child != NULL) { -+ reiser4_make_bad_inode(child); -+ iput(child); -+ } -+ } else -+ d_instantiate(dentry, child); -+ -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+/* helper for link_common. 
Estimate disk space necessary to add a link -+ from @parent to @object -+*/ -+static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */ -+ struct inode *object -+ /* object to which new link is being cerated */ -+ ) -+{ -+ reiser4_block_nr res = 0; -+ file_plugin *fplug; -+ dir_plugin *dplug; -+ -+ assert("vpf-317", object != NULL); -+ assert("vpf-318", parent != NULL); -+ -+ fplug = inode_file_plugin(object); -+ dplug = inode_dir_plugin(parent); -+ /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */ -+ /* reiser4_add_nlink(object) */ -+ res += fplug->estimate.update(object); -+ /* add_entry(parent) */ -+ res += dplug->estimate.add_entry(parent); -+ /* reiser4_del_nlink(object) */ -+ res += fplug->estimate.update(object); -+ /* update_dir(parent) */ -+ res += inode_file_plugin(parent)->estimate.update(parent); -+ /* safe-link */ -+ res += estimate_one_item_removal(reiser4_tree_by_inode(object)); -+ -+ return res; -+} -+ -+/* Estimate disk space necessary to remove a link between @parent and -+ @object. -+*/ -+static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */ -+ struct inode *object -+ /* object to which new link is being cerated */ -+ ) -+{ -+ reiser4_block_nr res = 0; -+ file_plugin *fplug; -+ dir_plugin *dplug; -+ -+ assert("vpf-317", object != NULL); -+ assert("vpf-318", parent != NULL); -+ -+ fplug = inode_file_plugin(object); -+ dplug = inode_dir_plugin(parent); -+ -+ /* rem_entry(parent) */ -+ res += dplug->estimate.rem_entry(parent); -+ /* reiser4_del_nlink(object) */ -+ res += fplug->estimate.update(object); -+ /* update_dir(parent) */ -+ res += inode_file_plugin(parent)->estimate.update(parent); -+ /* fplug->unlink */ -+ res += fplug->estimate.unlink(object, parent); -+ /* safe-link */ -+ res += estimate_one_insert_item(reiser4_tree_by_inode(object)); -+ -+ return res; -+} -+ -+/* helper for reiser4_unlink_common. 
Estimate and grab space for unlink. */ -+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim) -+{ -+ file_plugin *fplug; -+ struct inode *child; -+ int result; -+ -+ result = 0; -+ child = victim->d_inode; -+ fplug = inode_file_plugin(child); -+ -+ /* check for race with create_object() */ -+ if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE)) -+ return RETERR(-E_REPEAT); -+ /* object being deleted should have stat data */ -+ assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD)); -+ -+ /* ask object plugin */ -+ if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child)) -+ return RETERR(-ENOTEMPTY); -+ -+ result = (int)estimate_unlink(parent, child); -+ if (result < 0) -+ return result; -+ -+ return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT); -+} -+ -+/* helper for reiser4_setattr_common */ -+static int setattr_reserve(reiser4_tree * tree) -+{ -+ assert("vs-1096", is_grab_enabled(get_current_context())); -+ return reiser4_grab_space(estimate_one_insert_into_item(tree), -+ BA_CAN_COMMIT); -+} -+ -+/* helper function. Standards require that for many file-system operations -+ on success ctime and mtime of parent directory is to be updated. 
*/ -+int reiser4_update_dir(struct inode *dir) -+{ -+ assert("nikita-2525", dir != NULL); -+ -+ dir->i_ctime = dir->i_mtime = CURRENT_TIME; -+ return reiser4_update_sd(dir); -+} -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.24/fs/reiser4/plugin/inode_ops_rename.c ---- linux-2.6.24.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/inode_ops_rename.c 2008-01-25 11:39:07.000224175 +0300 -@@ -0,0 +1,912 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "../inode.h" -+#include "../safe_link.h" -+ -+static const char *possible_leak = "Possible disk space leak."; -+ -+/* re-bind existing name at @from_coord in @from_dir to point to @to_inode. -+ -+ Helper function called from hashed_rename() */ -+static int replace_name(struct inode *to_inode, /* inode where @from_coord is -+ * to be re-targeted at */ -+ struct inode *from_dir, /* directory where @from_coord -+ * lives */ -+ struct inode *from_inode, /* inode @from_coord -+ * originally point to */ -+ coord_t * from_coord, /* where directory entry is in -+ * the tree */ -+ lock_handle * from_lh /* lock handle on @from_coord */ ) -+{ -+ item_plugin *from_item; -+ int result; -+ znode *node; -+ -+ coord_clear_iplug(from_coord); -+ node = from_coord->node; -+ result = zload(node); -+ if (result != 0) -+ return result; -+ from_item = item_plugin_by_coord(from_coord); -+ if (plugin_of_group(item_plugin_by_coord(from_coord), -+ DIR_ENTRY_ITEM_TYPE)) -+ { -+ reiser4_key to_key; -+ -+ build_sd_key(to_inode, &to_key); -+ -+ /* everything is found and prepared to change directory entry -+ at @from_coord to point to @to_inode. -+ -+ @to_inode is just about to get new name, so bump its link -+ counter. 
-+ -+ */ -+ result = reiser4_add_nlink(to_inode, from_dir, 0); -+ if (result != 0) { -+ /* Don't issue warning: this may be plain -EMLINK */ -+ zrelse(node); -+ return result; -+ } -+ -+ result = -+ from_item->s.dir.update_key(from_coord, &to_key, from_lh); -+ if (result != 0) { -+ reiser4_del_nlink(to_inode, from_dir, 0); -+ zrelse(node); -+ return result; -+ } -+ -+ /* @from_inode just lost its name, he-he. -+ -+ If @from_inode was directory, it contained dotdot pointing -+ to @from_dir. @from_dir i_nlink will be decreased when -+ iput() will be called on @from_inode. -+ -+ If file-system is not ADG (hard-links are -+ supported on directories), iput(from_inode) will not remove -+ @from_inode, and thus above is incorrect, but hard-links on -+ directories are problematic in many other respects. -+ */ -+ result = reiser4_del_nlink(from_inode, from_dir, 0); -+ if (result != 0) { -+ warning("nikita-2330", -+ "Cannot remove link from source: %i. %s", -+ result, possible_leak); -+ } -+ /* Has to return success, because entry is already -+ * modified. */ -+ result = 0; -+ -+ /* NOTE-NIKITA consider calling plugin method in stead of -+ accessing inode fields directly. */ -+ from_dir->i_mtime = CURRENT_TIME; -+ } else { -+ warning("nikita-2326", "Unexpected item type"); -+ result = RETERR(-EIO); -+ } -+ zrelse(node); -+ return result; -+} -+ -+/* add new entry pointing to @inode into @dir at @coord, locked by @lh -+ -+ Helper function used by hashed_rename(). 
*/ -+static int add_name(struct inode *inode, /* inode where @coord is to be -+ * re-targeted at */ -+ struct inode *dir, /* directory where @coord lives */ -+ struct dentry *name, /* new name */ -+ coord_t * coord, /* where directory entry is in the tree */ -+ lock_handle * lh, /* lock handle on @coord */ -+ int is_dir /* true, if @inode is directory */ ) -+{ -+ int result; -+ reiser4_dir_entry_desc entry; -+ -+ assert("nikita-2333", lh->node == coord->node); -+ assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode)); -+ -+ memset(&entry, 0, sizeof entry); -+ entry.obj = inode; -+ /* build key of directory entry description */ -+ inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key); -+ -+ /* ext2 does this in different order: first inserts new entry, -+ then increases directory nlink. We don't want do this, -+ because reiser4_add_nlink() calls ->add_link() plugin -+ method that can fail for whatever reason, leaving as with -+ cleanup problems. -+ */ -+ /* @inode is getting new name */ -+ reiser4_add_nlink(inode, dir, 0); -+ /* create @new_name in @new_dir pointing to -+ @old_inode */ -+ result = WITH_COORD(coord, -+ inode_dir_item_plugin(dir)->s.dir.add_entry(dir, -+ coord, -+ lh, -+ name, -+ &entry)); -+ if (result != 0) { -+ int result2; -+ result2 = reiser4_del_nlink(inode, dir, 0); -+ if (result2 != 0) { -+ warning("nikita-2327", -+ "Cannot drop link on %lli %i. 
%s", -+ (unsigned long long)get_inode_oid(inode), -+ result2, possible_leak); -+ } -+ } else -+ INODE_INC_FIELD(dir, i_size); -+ return result; -+} -+ -+static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */ -+ struct dentry *old_name, /* old name */ -+ struct inode *new_dir, /* directory where @new is located */ -+ struct dentry *new_name /* new name */ ) -+{ -+ reiser4_block_nr res1, res2; -+ dir_plugin *p_parent_old, *p_parent_new; -+ file_plugin *p_child_old, *p_child_new; -+ -+ assert("vpf-311", old_dir != NULL); -+ assert("vpf-312", new_dir != NULL); -+ assert("vpf-313", old_name != NULL); -+ assert("vpf-314", new_name != NULL); -+ -+ p_parent_old = inode_dir_plugin(old_dir); -+ p_parent_new = inode_dir_plugin(new_dir); -+ p_child_old = inode_file_plugin(old_name->d_inode); -+ if (new_name->d_inode) -+ p_child_new = inode_file_plugin(new_name->d_inode); -+ else -+ p_child_new = NULL; -+ -+ /* find_entry - can insert one leaf. */ -+ res1 = res2 = 1; -+ -+ /* replace_name */ -+ { -+ /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */ -+ res1 += 2 * p_child_old->estimate.update(old_name->d_inode); -+ /* update key */ -+ res1 += 1; -+ /* reiser4_del_nlink(p_child_new) */ -+ if (p_child_new) -+ res1 += p_child_new->estimate.update(new_name->d_inode); -+ } -+ -+ /* else add_name */ -+ { -+ /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */ -+ res2 += -+ 2 * inode_file_plugin(new_dir)->estimate.update(new_dir); -+ /* reiser4_add_nlink(p_parent_old) */ -+ res2 += p_child_old->estimate.update(old_name->d_inode); -+ /* add_entry(p_parent_new) */ -+ res2 += p_parent_new->estimate.add_entry(new_dir); -+ /* reiser4_del_nlink(p_parent_old) */ -+ res2 += p_child_old->estimate.update(old_name->d_inode); -+ } -+ -+ res1 = res1 < res2 ? 
res2 : res1; -+ -+ /* reiser4_write_sd(p_parent_new) */ -+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); -+ -+ /* reiser4_write_sd(p_child_new) */ -+ if (p_child_new) -+ res1 += p_child_new->estimate.update(new_name->d_inode); -+ -+ /* hashed_rem_entry(p_parent_old) */ -+ res1 += p_parent_old->estimate.rem_entry(old_dir); -+ -+ /* reiser4_del_nlink(p_child_old) */ -+ res1 += p_child_old->estimate.update(old_name->d_inode); -+ -+ /* replace_name */ -+ { -+ /* reiser4_add_nlink(p_parent_dir_new) */ -+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); -+ /* update_key */ -+ res1 += 1; -+ /* reiser4_del_nlink(p_parent_new) */ -+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); -+ /* reiser4_del_nlink(p_parent_old) */ -+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir); -+ } -+ -+ /* reiser4_write_sd(p_parent_old) */ -+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir); -+ -+ /* reiser4_write_sd(p_child_old) */ -+ res1 += p_child_old->estimate.update(old_name->d_inode); -+ -+ return res1; -+} -+ -+static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */ -+ struct dentry *old_name, /* old name */ -+ struct inode *new_dir, /* directory where @new is located */ -+ struct dentry *new_name -+ /* new name */ ) -+{ -+ reiser4_block_nr reserve; -+ -+ reserve = estimate_rename(old_dir, old_name, new_dir, new_name); -+ -+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) -+ return RETERR(-ENOSPC); -+ -+ return 0; -+} -+ -+/* check whether @old_inode and @new_inode can be moved within file system -+ * tree. This singles out attempts to rename pseudo-files, for example. 
*/ -+static int can_rename(struct inode *old_dir, struct inode *old_inode, -+ struct inode *new_dir, struct inode *new_inode) -+{ -+ file_plugin *fplug; -+ dir_plugin *dplug; -+ -+ assert("nikita-3370", old_inode != NULL); -+ -+ dplug = inode_dir_plugin(new_dir); -+ fplug = inode_file_plugin(old_inode); -+ -+ if (dplug == NULL) -+ return RETERR(-ENOTDIR); -+ else if (new_dir->i_op->create == NULL) -+ return RETERR(-EPERM); -+ else if (!fplug->can_add_link(old_inode)) -+ return RETERR(-EMLINK); -+ else if (new_inode != NULL) { -+ fplug = inode_file_plugin(new_inode); -+ if (fplug->can_rem_link != NULL && -+ !fplug->can_rem_link(new_inode)) -+ return RETERR(-EBUSY); -+ } -+ return 0; -+} -+ -+int reiser4_find_entry(struct inode *, struct dentry *, lock_handle *, -+ znode_lock_mode, reiser4_dir_entry_desc *); -+int reiser4_update_dir(struct inode *); -+ -+/* this is common implementation of vfs's rename method of struct -+ inode_operations -+ See comments in the body. -+ -+ It is arguable that this function can be made generic so, that it -+ will be applicable to any kind of directory plugin that deals with -+ directories composed out of directory entries. The only obstacle -+ here is that we don't have any data-type to represent directory -+ entry. This should be re-considered when more than one different -+ directory plugin will be implemented. -+*/ -+int reiser4_rename_common(struct inode *old_dir /* directory where @old -+ * is located */ , -+ struct dentry *old_name /* old name */ , -+ struct inode *new_dir /* directory where @new -+ * is located */ , -+ struct dentry *new_name /* new name */ ) -+{ -+ /* From `The Open Group Base Specifications Issue 6' -+ -+ If either the old or new argument names a symbolic link, rename() -+ shall operate on the symbolic link itself, and shall not resolve -+ the last component of the argument. 
If the old argument and the new -+ argument resolve to the same existing file, rename() shall return -+ successfully and perform no other action. -+ -+ [this is done by VFS: vfs_rename()] -+ -+ If the old argument points to the pathname of a file that is not a -+ directory, the new argument shall not point to the pathname of a -+ directory. -+ -+ [checked by VFS: vfs_rename->may_delete()] -+ -+ If the link named by the new argument exists, it shall -+ be removed and old renamed to new. In this case, a link named new -+ shall remain visible to other processes throughout the renaming -+ operation and refer either to the file referred to by new or old -+ before the operation began. -+ -+ [we should assure this] -+ -+ Write access permission is required for -+ both the directory containing old and the directory containing new. -+ -+ [checked by VFS: vfs_rename->may_delete(), may_create()] -+ -+ If the old argument points to the pathname of a directory, the new -+ argument shall not point to the pathname of a file that is not a -+ directory. -+ -+ [checked by VFS: vfs_rename->may_delete()] -+ -+ If the directory named by the new argument exists, it -+ shall be removed and old renamed to new. In this case, a link named -+ new shall exist throughout the renaming operation and shall refer -+ either to the directory referred to by new or old before the -+ operation began. -+ -+ [we should assure this] -+ -+ If new names an existing directory, it shall be -+ required to be an empty directory. -+ -+ [we should check this] -+ -+ If the old argument points to a pathname of a symbolic link, the -+ symbolic link shall be renamed. If the new argument points to a -+ pathname of a symbolic link, the symbolic link shall be removed. -+ -+ The new pathname shall not contain a path prefix that names -+ old. Write access permission is required for the directory -+ containing old and the directory containing new. 
If the old -+ argument points to the pathname of a directory, write access -+ permission may be required for the directory named by old, and, if -+ it exists, the directory named by new. -+ -+ [checked by VFS: vfs_rename(), vfs_rename_dir()] -+ -+ If the link named by the new argument exists and the file's link -+ count becomes 0 when it is removed and no process has the file -+ open, the space occupied by the file shall be freed and the file -+ shall no longer be accessible. If one or more processes have the -+ file open when the last link is removed, the link shall be removed -+ before rename() returns, but the removal of the file contents shall -+ be postponed until all references to the file are closed. -+ -+ [iput() handles this, but we can do this manually, a la -+ reiser4_unlink()] -+ -+ Upon successful completion, rename() shall mark for update the -+ st_ctime and st_mtime fields of the parent directory of each file. -+ -+ [N/A] -+ -+ */ -+ reiser4_context *ctx; -+ int result; -+ int is_dir; /* is @old_name directory */ -+ -+ struct inode *old_inode; -+ struct inode *new_inode; -+ coord_t *new_coord; -+ -+ struct reiser4_dentry_fsdata *new_fsdata; -+ dir_plugin *dplug; -+ file_plugin *fplug; -+ -+ reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry; -+ lock_handle *new_lh, *dotdot_lh; -+ struct dentry *dotdot_name; -+ struct reiser4_dentry_fsdata *dataonstack; -+ -+ ctx = reiser4_init_context(old_dir->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ old_entry = kzalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) + -+ sizeof(*dotdot_name) + sizeof(*dataonstack), -+ reiser4_ctx_gfp_mask_get()); -+ if (!old_entry) { -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOMEM); -+ } -+ -+ new_entry = old_entry + 1; -+ dotdot_entry = old_entry + 2; -+ new_lh = (lock_handle *)(old_entry + 3); -+ dotdot_lh = new_lh + 1; -+ dotdot_name = (struct dentry *)(new_lh + 2); -+ dataonstack = (struct reiser4_dentry_fsdata 
*)(dotdot_name + 1); -+ -+ assert("nikita-2318", old_dir != NULL); -+ assert("nikita-2319", new_dir != NULL); -+ assert("nikita-2320", old_name != NULL); -+ assert("nikita-2321", new_name != NULL); -+ -+ old_inode = old_name->d_inode; -+ new_inode = new_name->d_inode; -+ -+ dplug = inode_dir_plugin(old_dir); -+ fplug = NULL; -+ -+ new_fsdata = reiser4_get_dentry_fsdata(new_name); -+ if (IS_ERR(new_fsdata)) { -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return PTR_ERR(new_fsdata); -+ } -+ -+ new_coord = &new_fsdata->dec.entry_coord; -+ coord_clear_iplug(new_coord); -+ -+ is_dir = S_ISDIR(old_inode->i_mode); -+ -+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir); -+ -+ /* if target is existing directory and it's not empty---return error. -+ -+ This check is done specifically, because is_dir_empty() requires -+ tree traversal and have to be done before locks are taken. -+ */ -+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) { -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return RETERR(-ENOTEMPTY); -+ } -+ -+ result = can_rename(old_dir, old_inode, new_dir, new_inode); -+ if (result != 0) { -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ result = hashed_rename_estimate_and_grab(old_dir, old_name, -+ new_dir, new_name); -+ if (result != 0) { -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ init_lh(new_lh); -+ -+ /* find entry for @new_name */ -+ result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK, -+ new_entry); -+ -+ if (IS_CBKERR(result)) { -+ done_lh(new_lh); -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+ } -+ -+ reiser4_seal_done(&new_fsdata->dec.entry_seal); -+ -+ /* add or replace name for @old_inode as @new_name */ -+ if (new_inode != NULL) { -+ 
/* target (@new_name) exists. */ -+ /* Not clear what to do with objects that are -+ both directories and files at the same time. */ -+ if (result == CBK_COORD_FOUND) { -+ result = replace_name(old_inode, -+ new_dir, -+ new_inode, new_coord, new_lh); -+ if (result == 0) -+ fplug = inode_file_plugin(new_inode); -+ } else if (result == CBK_COORD_NOTFOUND) { -+ /* VFS told us that @new_name is bound to existing -+ inode, but we failed to find directory entry. */ -+ warning("nikita-2324", "Target not found"); -+ result = RETERR(-ENOENT); -+ } -+ } else { -+ /* target (@new_name) doesn't exists. */ -+ if (result == CBK_COORD_NOTFOUND) -+ result = add_name(old_inode, -+ new_dir, -+ new_name, new_coord, new_lh, is_dir); -+ else if (result == CBK_COORD_FOUND) { -+ /* VFS told us that @new_name is "negative" dentry, -+ but we found directory entry. */ -+ warning("nikita-2331", "Target found unexpectedly"); -+ result = RETERR(-EIO); -+ } -+ } -+ -+ assert("nikita-3462", ergo(result == 0, -+ old_inode->i_nlink >= 2 + !!is_dir)); -+ -+ /* We are done with all modifications to the @new_dir, release lock on -+ node. */ -+ done_lh(new_lh); -+ -+ if (fplug != NULL) { -+ /* detach @new_inode from name-space */ -+ result = fplug->detach(new_inode, new_dir); -+ if (result != 0) -+ warning("nikita-2330", "Cannot detach %lli: %i. %s", -+ (unsigned long long)get_inode_oid(new_inode), -+ result, possible_leak); -+ } -+ -+ if (new_inode != NULL) -+ reiser4_update_sd(new_inode); -+ -+ if (result == 0) { -+ old_entry->obj = old_inode; -+ -+ dplug->build_entry_key(old_dir, -+ &old_name->d_name, &old_entry->key); -+ -+ /* At this stage new name was introduced for -+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink -+ counters were updated. -+ -+ We want to remove @old_name now. If @old_inode wasn't -+ directory this is simple. 
-+ */ -+ result = dplug->rem_entry(old_dir, old_name, old_entry); -+ if (result != 0 && result != -ENOMEM) { -+ warning("nikita-2335", -+ "Cannot remove old name: %i", result); -+ } else { -+ result = reiser4_del_nlink(old_inode, old_dir, 0); -+ if (result != 0 && result != -ENOMEM) { -+ warning("nikita-2337", -+ "Cannot drop link on old: %i", result); -+ } -+ } -+ -+ if (result == 0 && is_dir) { -+ /* @old_inode is directory. We also have to update -+ dotdot entry. */ -+ coord_t *dotdot_coord; -+ -+ memset(dataonstack, 0, sizeof dataonstack); -+ memset(dotdot_entry, 0, sizeof dotdot_entry); -+ dotdot_entry->obj = old_dir; -+ memset(dotdot_name, 0, sizeof dotdot_name); -+ dotdot_name->d_name.name = ".."; -+ dotdot_name->d_name.len = 2; -+ /* -+ * allocate ->d_fsdata on the stack to avoid using -+ * reiser4_get_dentry_fsdata(). Locking is not needed, -+ * because dentry is private to the current thread. -+ */ -+ dotdot_name->d_fsdata = dataonstack; -+ init_lh(dotdot_lh); -+ -+ dotdot_coord = &dataonstack->dec.entry_coord; -+ coord_clear_iplug(dotdot_coord); -+ -+ result = reiser4_find_entry(old_inode, dotdot_name, -+ dotdot_lh, ZNODE_WRITE_LOCK, -+ dotdot_entry); -+ if (result == 0) { -+ /* replace_name() decreases i_nlink on -+ * @old_dir */ -+ result = replace_name(new_dir, -+ old_inode, -+ old_dir, -+ dotdot_coord, dotdot_lh); -+ } else -+ result = RETERR(-EIO); -+ done_lh(dotdot_lh); -+ } -+ } -+ reiser4_update_dir(new_dir); -+ reiser4_update_dir(old_dir); -+ reiser4_update_sd(old_inode); -+ if (result == 0) { -+ file_plugin *fplug; -+ -+ if (new_inode != NULL) { -+ /* add safe-link for target file (in case we removed -+ * last reference to the poor fellow */ -+ fplug = inode_file_plugin(new_inode); -+ if (new_inode->i_nlink == 0) -+ result = safe_link_add(new_inode, SAFE_UNLINK); -+ } -+ } -+ kfree(old_entry); -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+ -+#if 0 -+int reiser4_rename_common(struct inode *old_dir /* 
directory where @old -+ * is located */ , -+ struct dentry *old_name /* old name */ , -+ struct inode *new_dir /* directory where @new -+ * is located */ , -+ struct dentry *new_name /* new name */ ) -+{ -+ /* From `The Open Group Base Specifications Issue 6' -+ -+ If either the old or new argument names a symbolic link, rename() -+ shall operate on the symbolic link itself, and shall not resolve -+ the last component of the argument. If the old argument and the new -+ argument resolve to the same existing file, rename() shall return -+ successfully and perform no other action. -+ -+ [this is done by VFS: vfs_rename()] -+ -+ If the old argument points to the pathname of a file that is not a -+ directory, the new argument shall not point to the pathname of a -+ directory. -+ -+ [checked by VFS: vfs_rename->may_delete()] -+ -+ If the link named by the new argument exists, it shall -+ be removed and old renamed to new. In this case, a link named new -+ shall remain visible to other processes throughout the renaming -+ operation and refer either to the file referred to by new or old -+ before the operation began. -+ -+ [we should assure this] -+ -+ Write access permission is required for -+ both the directory containing old and the directory containing new. -+ -+ [checked by VFS: vfs_rename->may_delete(), may_create()] -+ -+ If the old argument points to the pathname of a directory, the new -+ argument shall not point to the pathname of a file that is not a -+ directory. -+ -+ [checked by VFS: vfs_rename->may_delete()] -+ -+ If the directory named by the new argument exists, it -+ shall be removed and old renamed to new. In this case, a link named -+ new shall exist throughout the renaming operation and shall refer -+ either to the directory referred to by new or old before the -+ operation began. -+ -+ [we should assure this] -+ -+ If new names an existing directory, it shall be -+ required to be an empty directory. 
-+ -+ [we should check this] -+ -+ If the old argument points to a pathname of a symbolic link, the -+ symbolic link shall be renamed. If the new argument points to a -+ pathname of a symbolic link, the symbolic link shall be removed. -+ -+ The new pathname shall not contain a path prefix that names -+ old. Write access permission is required for the directory -+ containing old and the directory containing new. If the old -+ argument points to the pathname of a directory, write access -+ permission may be required for the directory named by old, and, if -+ it exists, the directory named by new. -+ -+ [checked by VFS: vfs_rename(), vfs_rename_dir()] -+ -+ If the link named by the new argument exists and the file's link -+ count becomes 0 when it is removed and no process has the file -+ open, the space occupied by the file shall be freed and the file -+ shall no longer be accessible. If one or more processes have the -+ file open when the last link is removed, the link shall be removed -+ before rename() returns, but the removal of the file contents shall -+ be postponed until all references to the file are closed. -+ -+ [iput() handles this, but we can do this manually, a la -+ reiser4_unlink()] -+ -+ Upon successful completion, rename() shall mark for update the -+ st_ctime and st_mtime fields of the parent directory of each file. 
-+ -+ [N/A] -+ -+ */ -+ reiser4_context *ctx; -+ int result; -+ int is_dir; /* is @old_name directory */ -+ struct inode *old_inode; -+ struct inode *new_inode; -+ reiser4_dir_entry_desc old_entry; -+ reiser4_dir_entry_desc new_entry; -+ coord_t *new_coord; -+ struct reiser4_dentry_fsdata *new_fsdata; -+ lock_handle new_lh; -+ dir_plugin *dplug; -+ file_plugin *fplug; -+ -+ ctx = reiser4_init_context(old_dir->i_sb); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ assert("nikita-2318", old_dir != NULL); -+ assert("nikita-2319", new_dir != NULL); -+ assert("nikita-2320", old_name != NULL); -+ assert("nikita-2321", new_name != NULL); -+ -+ old_inode = old_name->d_inode; -+ new_inode = new_name->d_inode; -+ -+ dplug = inode_dir_plugin(old_dir); -+ fplug = NULL; -+ -+ new_fsdata = reiser4_get_dentry_fsdata(new_name); -+ if (IS_ERR(new_fsdata)) { -+ result = PTR_ERR(new_fsdata); -+ goto exit; -+ } -+ -+ new_coord = &new_fsdata->dec.entry_coord; -+ coord_clear_iplug(new_coord); -+ -+ is_dir = S_ISDIR(old_inode->i_mode); -+ -+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir); -+ -+ /* if target is existing directory and it's not empty---return error. -+ -+ This check is done specifically, because is_dir_empty() requires -+ tree traversal and have to be done before locks are taken. 
-+ */ -+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) -+ return RETERR(-ENOTEMPTY); -+ -+ result = can_rename(old_dir, old_inode, new_dir, new_inode); -+ if (result != 0) -+ goto exit; -+ -+ result = hashed_rename_estimate_and_grab(old_dir, old_name, -+ new_dir, new_name); -+ if (result != 0) -+ goto exit; -+ -+ init_lh(&new_lh); -+ -+ /* find entry for @new_name */ -+ result = reiser4_find_entry(new_dir, new_name, &new_lh, -+ ZNODE_WRITE_LOCK, &new_entry); -+ -+ if (IS_CBKERR(result)) { -+ done_lh(&new_lh); -+ goto exit; -+ } -+ -+ reiser4_seal_done(&new_fsdata->dec.entry_seal); -+ -+ /* add or replace name for @old_inode as @new_name */ -+ if (new_inode != NULL) { -+ /* target (@new_name) exists. */ -+ /* Not clear what to do with objects that are -+ both directories and files at the same time. */ -+ if (result == CBK_COORD_FOUND) { -+ result = replace_name(old_inode, -+ new_dir, -+ new_inode, new_coord, &new_lh); -+ if (result == 0) -+ fplug = inode_file_plugin(new_inode); -+ } else if (result == CBK_COORD_NOTFOUND) { -+ /* VFS told us that @new_name is bound to existing -+ inode, but we failed to find directory entry. */ -+ warning("nikita-2324", "Target not found"); -+ result = RETERR(-ENOENT); -+ } -+ } else { -+ /* target (@new_name) doesn't exists. */ -+ if (result == CBK_COORD_NOTFOUND) -+ result = add_name(old_inode, -+ new_dir, -+ new_name, new_coord, &new_lh, is_dir); -+ else if (result == CBK_COORD_FOUND) { -+ /* VFS told us that @new_name is "negative" dentry, -+ but we found directory entry. */ -+ warning("nikita-2331", "Target found unexpectedly"); -+ result = RETERR(-EIO); -+ } -+ } -+ -+ assert("nikita-3462", ergo(result == 0, -+ old_inode->i_nlink >= 2 + !!is_dir)); -+ -+ /* We are done with all modifications to the @new_dir, release lock on -+ node. 
*/ -+ done_lh(&new_lh); -+ -+ if (fplug != NULL) { -+ /* detach @new_inode from name-space */ -+ result = fplug->detach(new_inode, new_dir); -+ if (result != 0) -+ warning("nikita-2330", "Cannot detach %lli: %i. %s", -+ (unsigned long long)get_inode_oid(new_inode), -+ result, possible_leak); -+ } -+ -+ if (new_inode != NULL) -+ reiser4_update_sd(new_inode); -+ -+ if (result == 0) { -+ memset(&old_entry, 0, sizeof old_entry); -+ old_entry.obj = old_inode; -+ -+ dplug->build_entry_key(old_dir, -+ &old_name->d_name, &old_entry.key); -+ -+ /* At this stage new name was introduced for -+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink -+ counters were updated. -+ -+ We want to remove @old_name now. If @old_inode wasn't -+ directory this is simple. -+ */ -+ result = dplug->rem_entry(old_dir, old_name, &old_entry); -+ /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */ -+ if (result != 0 && result != -ENOMEM) { -+ warning("nikita-2335", -+ "Cannot remove old name: %i", result); -+ } else { -+ result = reiser4_del_nlink(old_inode, old_dir, 0); -+ if (result != 0 && result != -ENOMEM) { -+ warning("nikita-2337", -+ "Cannot drop link on old: %i", result); -+ } -+ } -+ -+ if (result == 0 && is_dir) { -+ /* @old_inode is directory. We also have to update -+ dotdot entry. */ -+ coord_t *dotdot_coord; -+ lock_handle dotdot_lh; -+ struct dentry dotdot_name; -+ reiser4_dir_entry_desc dotdot_entry; -+ struct reiser4_dentry_fsdata dataonstack; -+ struct reiser4_dentry_fsdata *fsdata; -+ -+ memset(&dataonstack, 0, sizeof dataonstack); -+ memset(&dotdot_entry, 0, sizeof dotdot_entry); -+ dotdot_entry.obj = old_dir; -+ memset(&dotdot_name, 0, sizeof dotdot_name); -+ dotdot_name.d_name.name = ".."; -+ dotdot_name.d_name.len = 2; -+ /* -+ * allocate ->d_fsdata on the stack to avoid using -+ * reiser4_get_dentry_fsdata(). Locking is not needed, -+ * because dentry is private to the current thread. 
-+ */ -+ dotdot_name.d_fsdata = &dataonstack; -+ init_lh(&dotdot_lh); -+ -+ fsdata = &dataonstack; -+ dotdot_coord = &fsdata->dec.entry_coord; -+ coord_clear_iplug(dotdot_coord); -+ -+ result = reiser4_find_entry(old_inode, -+ &dotdot_name, -+ &dotdot_lh, -+ ZNODE_WRITE_LOCK, -+ &dotdot_entry); -+ if (result == 0) { -+ /* replace_name() decreases i_nlink on -+ * @old_dir */ -+ result = replace_name(new_dir, -+ old_inode, -+ old_dir, -+ dotdot_coord, &dotdot_lh); -+ } else -+ result = RETERR(-EIO); -+ done_lh(&dotdot_lh); -+ } -+ } -+ reiser4_update_dir(new_dir); -+ reiser4_update_dir(old_dir); -+ reiser4_update_sd(old_inode); -+ if (result == 0) { -+ file_plugin *fplug; -+ -+ if (new_inode != NULL) { -+ /* add safe-link for target file (in case we removed -+ * last reference to the poor fellow */ -+ fplug = inode_file_plugin(new_inode); -+ if (new_inode->i_nlink == 0) -+ result = safe_link_add(new_inode, SAFE_UNLINK); -+ } -+ } -+ exit: -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+ return result; -+} -+#endif -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/acl.h linux-2.6.24/fs/reiser4/plugin/item/acl.h ---- linux-2.6.24.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/acl.h 2008-01-25 11:39:07.000224175 +0300 -@@ -0,0 +1,66 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Directory entry. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ) -+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+ -+#include -+#include /* for struct dentry */ -+ -+typedef struct directory_entry_format { -+ /* key of object stat-data. It's not necessary to store whole -+ key here, because it's always key of stat-data, so minor -+ packing locality and offset can be omitted here. 
But this -+ relies on particular key allocation scheme for stat-data, so, -+ for extensibility sake, whole key can be stored here. -+ -+ We store key as array of bytes, because we don't want 8-byte -+ alignment of dir entries. -+ */ -+ obj_key_id id; -+ /* file name. Null terminated string. */ -+ d8 name[0]; -+} directory_entry_format; -+ -+void print_de(const char *prefix, coord_t * coord); -+int extract_key_de(const coord_t * coord, reiser4_key * key); -+int update_key_de(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh); -+char *extract_name_de(const coord_t * coord, char *buf); -+unsigned extract_file_type_de(const coord_t * coord); -+int add_entry_de(struct inode *dir, coord_t * coord, -+ lock_handle * lh, const struct dentry *name, -+ reiser4_dir_entry_desc * entry); -+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord, -+ lock_handle * lh, reiser4_dir_entry_desc * entry); -+int max_name_len_de(const struct inode *dir); -+ -+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length); -+ -+char *extract_dent_name(const coord_t * coord, -+ directory_entry_format * dent, char *buf); -+ -+#if REISER4_LARGE_KEY -+#define DE_NAME_BUF_LEN (24) -+#else -+#define DE_NAME_BUF_LEN (16) -+#endif -+ -+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.24/fs/reiser4/plugin/item/blackbox.c ---- linux-2.6.24.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/blackbox.c 2008-01-25 11:39:07.004225206 +0300 -@@ -0,0 +1,142 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Black box item implementation */ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../coord.h" -+#include "../../tree.h" -+#include "../../lock.h" -+ -+#include "blackbox.h" -+#include "item.h" -+#include "../plugin.h" -+ -+int -+store_black_box(reiser4_tree * tree, -+ const reiser4_key * key, void *data, int length) -+{ -+ int result; -+ reiser4_item_data idata; -+ coord_t coord; -+ lock_handle lh; -+ -+ memset(&idata, 0, sizeof idata); -+ -+ idata.data = data; -+ idata.user = 0; -+ idata.length = length; -+ idata.iplug = item_plugin_by_id(BLACK_BOX_ID); -+ -+ init_lh(&lh); -+ result = insert_by_key(tree, key, -+ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE); -+ -+ assert("nikita-3413", -+ ergo(result == 0, -+ WITH_COORD(&coord, -+ item_length_by_coord(&coord) == length))); -+ -+ done_lh(&lh); -+ return result; -+} -+ -+int -+load_black_box(reiser4_tree * tree, -+ reiser4_key * key, void *data, int length, int exact) -+{ -+ int result; -+ coord_t coord; -+ lock_handle lh; -+ -+ init_lh(&lh); -+ result = coord_by_key(tree, key, -+ &coord, &lh, ZNODE_READ_LOCK, -+ exact ? 
FIND_EXACT : FIND_MAX_NOT_MORE_THAN, -+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL); -+ -+ if (result == 0) { -+ int ilen; -+ -+ result = zload(coord.node); -+ if (result == 0) { -+ ilen = item_length_by_coord(&coord); -+ if (ilen <= length) { -+ memcpy(data, item_body_by_coord(&coord), ilen); -+ unit_key_by_coord(&coord, key); -+ } else if (exact) { -+ /* -+ * item is larger than buffer provided by the -+ * user. Only issue a warning if @exact is -+ * set. If @exact is false, we are iterating -+ * over all safe-links and here we are reaching -+ * the end of the iteration. -+ */ -+ warning("nikita-3415", -+ "Wrong black box length: %i > %i", -+ ilen, length); -+ result = RETERR(-EIO); -+ } -+ zrelse(coord.node); -+ } -+ } -+ -+ done_lh(&lh); -+ return result; -+ -+} -+ -+int -+update_black_box(reiser4_tree * tree, -+ const reiser4_key * key, void *data, int length) -+{ -+ int result; -+ coord_t coord; -+ lock_handle lh; -+ -+ init_lh(&lh); -+ result = coord_by_key(tree, key, -+ &coord, &lh, ZNODE_READ_LOCK, -+ FIND_EXACT, -+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL); -+ if (result == 0) { -+ int ilen; -+ -+ result = zload(coord.node); -+ if (result == 0) { -+ ilen = item_length_by_coord(&coord); -+ if (length <= ilen) { -+ memcpy(item_body_by_coord(&coord), data, -+ length); -+ } else { -+ warning("nikita-3437", -+ "Wrong black box length: %i < %i", -+ ilen, length); -+ result = RETERR(-EIO); -+ } -+ zrelse(coord.node); -+ } -+ } -+ -+ done_lh(&lh); -+ return result; -+ -+} -+ -+int kill_black_box(reiser4_tree * tree, const reiser4_key * key) -+{ -+ return reiser4_cut_tree(tree, key, key, NULL, 1); -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.24/fs/reiser4/plugin/item/blackbox.h ---- linux-2.6.24.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/blackbox.h 2008-01-25 11:39:07.004225206 +0300 -@@ -0,0 +1,33 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* "Black box" entry to fixed-width contain user supplied data */ -+ -+#if !defined( __FS_REISER4_BLACK_BOX_H__ ) -+#define __FS_REISER4_BLACK_BOX_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+ -+extern int store_black_box(reiser4_tree * tree, -+ const reiser4_key * key, void *data, int length); -+extern int load_black_box(reiser4_tree * tree, -+ reiser4_key * key, void *data, int length, int exact); -+extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key); -+extern int update_black_box(reiser4_tree * tree, -+ const reiser4_key * key, void *data, int length); -+ -+/* __FS_REISER4_BLACK_BOX_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/cde.c linux-2.6.24/fs/reiser4/plugin/item/cde.c ---- linux-2.6.24.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/cde.c 2008-01-25 11:39:07.004225206 +0300 -@@ -0,0 +1,1008 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Directory entry implementation */ -+ -+/* DESCRIPTION: -+ -+ This is "compound" directory item plugin implementation. 
This directory -+ item type is compound (as opposed to the "simple directory item" in -+ fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory -+ entries. -+ -+ The reason behind this decision is disk space efficiency: all directory -+ entries inside the same directory have identical fragment in their -+ keys. This, of course, depends on key assignment policy. In our default key -+ assignment policy, all directory entries have the same locality which is -+ equal to the object id of their directory. -+ -+ Composing directory item out of several directory entries for the same -+ directory allows us to store said key fragment only once. That is, this is -+ some ad hoc form of key compression (stem compression) that is implemented -+ here, because general key compression is not supposed to be implemented in -+ v4.0. -+ -+ Another decision that was made regarding all directory item plugins, is -+ that they will store entry keys unaligned. This is for that sake of disk -+ space efficiency again. -+ -+ In should be noted, that storing keys unaligned increases CPU consumption, -+ at least on some architectures. -+ -+ Internal on-disk structure of the compound directory item is the following: -+ -+ HEADER cde_item_format. Here number of entries is stored. -+ ENTRY_HEADER_0 cde_unit_header. Here part of entry key and -+ ENTRY_HEADER_1 offset of entry body are stored. -+ ENTRY_HEADER_2 (basically two last parts of key) -+ ... -+ ENTRY_HEADER_N -+ ENTRY_BODY_0 directory_entry_format. Here part of stat data key and -+ ENTRY_BODY_1 NUL-terminated name are stored. -+ ENTRY_BODY_2 (part of statadta key in the -+ sence that since all SDs have -+ zero offset, this offset is not -+ stored on disk). -+ ... -+ ENTRY_BODY_N -+ -+ When it comes to the balancing, each directory entry in compound directory -+ item is unit, that is, something that can be cut from one item and pasted -+ into another item of the same type. 
Handling of unit cut and paste is major -+ reason for the complexity of code below. -+ -+*/ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "sde.h" -+#include "cde.h" -+#include "item.h" -+#include "../node/node.h" -+#include "../plugin.h" -+#include "../../znode.h" -+#include "../../carry.h" -+#include "../../tree.h" -+#include "../../inode.h" -+ -+#include /* for struct inode */ -+#include /* for struct dentry */ -+#include -+ -+#if 0 -+#define CHECKME(coord) \ -+({ \ -+ const char *message; \ -+ coord_t dup; \ -+ \ -+ coord_dup_nocheck(&dup, (coord)); \ -+ dup.unit_pos = 0; \ -+ assert("nikita-2871", cde_check(&dup, &message) == 0); \ -+}) -+#else -+#define CHECKME(coord) noop -+#endif -+ -+/* return body of compound directory item at @coord */ -+static inline cde_item_format *formatted_at(const coord_t * coord) -+{ -+ assert("nikita-1282", coord != NULL); -+ return item_body_by_coord(coord); -+} -+ -+/* return entry header at @coord */ -+static inline cde_unit_header *header_at(const coord_t * -+ coord /* coord of item */ , -+ int idx /* index of unit */ ) -+{ -+ assert("nikita-1283", coord != NULL); -+ return &formatted_at(coord)->entry[idx]; -+} -+ -+/* return number of units in compound directory item at @coord */ -+static int units(const coord_t * coord /* coord of item */ ) -+{ -+ return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries)); -+} -+ -+/* return offset of the body of @idx-th entry in @coord */ -+static unsigned int offset_of(const coord_t * coord /* coord of item */ , -+ int idx /* index of unit */ ) -+{ -+ if (idx < units(coord)) -+ return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset)); -+ else if (idx == units(coord)) -+ return item_length_by_coord(coord); -+ else -+ impossible("nikita-1308", "Wrong idx"); -+ return 0; -+} -+ -+/* set offset of the body of @idx-th entry in @coord */ 
-+static void set_offset(const coord_t * coord /* coord of item */ , -+ int idx /* index of unit */ , -+ unsigned int offset /* new offset */ ) -+{ -+ put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset); -+} -+ -+static void adj_offset(const coord_t * coord /* coord of item */ , -+ int idx /* index of unit */ , -+ int delta /* offset change */ ) -+{ -+ d16 *doffset; -+ __u16 offset; -+ -+ doffset = &header_at(coord, idx)->offset; -+ offset = le16_to_cpu(get_unaligned(doffset)); -+ offset += delta; -+ put_unaligned(cpu_to_le16((__u16) offset), doffset); -+} -+ -+/* return pointer to @offset-th byte from the beginning of @coord */ -+static char *address(const coord_t * coord /* coord of item */ , -+ int offset) -+{ -+ return ((char *)item_body_by_coord(coord)) + offset; -+} -+ -+/* return pointer to the body of @idx-th entry in @coord */ -+static directory_entry_format *entry_at(const coord_t * coord /* coord of -+ * item */ , -+ int idx /* index of unit */ ) -+{ -+ return (directory_entry_format *) address(coord, -+ (int)offset_of(coord, idx)); -+} -+ -+/* return number of unit referenced by @coord */ -+static int idx_of(const coord_t * coord /* coord of item */ ) -+{ -+ assert("nikita-1285", coord != NULL); -+ return coord->unit_pos; -+} -+ -+/* find position where entry with @entry_key would be inserted into @coord */ -+static int find(const coord_t * coord /* coord of item */ , -+ const reiser4_key * entry_key /* key to look for */ , -+ cmp_t * last /* result of last comparison */ ) -+{ -+ int entries; -+ -+ int left; -+ int right; -+ -+ cde_unit_header *header; -+ -+ assert("nikita-1295", coord != NULL); -+ assert("nikita-1296", entry_key != NULL); -+ assert("nikita-1297", last != NULL); -+ -+ entries = units(coord); -+ left = 0; -+ right = entries - 1; -+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) { -+ int median; -+ -+ median = (left + right) >> 1; -+ -+ header = header_at(coord, median); -+ *last = de_id_key_cmp(&header->hash, 
entry_key); -+ switch (*last) { -+ case LESS_THAN: -+ left = median; -+ break; -+ case GREATER_THAN: -+ right = median; -+ break; -+ case EQUAL_TO:{ -+ do { -+ median--; -+ header--; -+ } while (median >= 0 && -+ de_id_key_cmp(&header->hash, -+ entry_key) == EQUAL_TO); -+ return median + 1; -+ } -+ } -+ } -+ header = header_at(coord, left); -+ for (; left < entries; ++left, ++header) { -+ prefetch(header + 1); -+ *last = de_id_key_cmp(&header->hash, entry_key); -+ if (*last != LESS_THAN) -+ break; -+ } -+ if (left < entries) -+ return left; -+ else -+ return RETERR(-ENOENT); -+ -+} -+ -+/* expand @coord as to accommodate for insertion of @no new entries starting -+ from @pos, with total bodies size @size. */ -+static int expand_item(const coord_t * coord /* coord of item */ , -+ int pos /* unit position */ , int no /* number of new -+ * units*/ , -+ int size /* total size of new units' data */ , -+ unsigned int data_size /* free space already reserved -+ * in the item for insertion */ ) -+{ -+ int entries; -+ cde_unit_header *header; -+ char *dent; -+ int i; -+ -+ assert("nikita-1310", coord != NULL); -+ assert("nikita-1311", pos >= 0); -+ assert("nikita-1312", no > 0); -+ assert("nikita-1313", data_size >= no * sizeof(directory_entry_format)); -+ assert("nikita-1343", -+ item_length_by_coord(coord) >= -+ (int)(size + data_size + no * sizeof *header)); -+ -+ entries = units(coord); -+ -+ if (pos == entries) -+ dent = address(coord, size); -+ else -+ dent = (char *)entry_at(coord, pos); -+ /* place where new header will be in */ -+ header = header_at(coord, pos); -+ /* free space for new entry headers */ -+ memmove(header + no, header, -+ (unsigned)(address(coord, size) - (char *)header)); -+ /* if adding to the end initialise first new header */ -+ if (pos == entries) { -+ set_offset(coord, pos, (unsigned)size); -+ } -+ -+ /* adjust entry pointer and size */ -+ dent = dent + no * sizeof *header; -+ size += no * sizeof *header; -+ /* free space for new entries */ -+ 
memmove(dent + data_size, dent, -+ (unsigned)(address(coord, size) - dent)); -+ -+ /* increase counter */ -+ entries += no; -+ put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries); -+ -+ /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header ) -+ bytes. */ -+ for (i = 0; i <= pos; ++i) -+ adj_offset(coord, i, no * sizeof *header); -+ /* [ pos + no ... +\infty ) entries were shifted by ( no * -+ sizeof *header + data_size ) bytes */ -+ for (i = pos + no; i < entries; ++i) -+ adj_offset(coord, i, no * sizeof *header + data_size); -+ return 0; -+} -+ -+/* insert new @entry into item */ -+static int expand(const coord_t * coord /* coord of item */ , -+ struct cde_entry * entry /* entry to insert */ , -+ int len /* length of @entry data */ , -+ int *pos /* position to insert */ , -+ reiser4_dir_entry_desc * dir_entry /* parameters for new -+ * entry */ ) -+{ -+ cmp_t cmp_res; -+ int datasize; -+ -+ *pos = find(coord, &dir_entry->key, &cmp_res); -+ if (*pos < 0) -+ *pos = units(coord); -+ -+ datasize = sizeof(directory_entry_format); -+ if (is_longname(entry->name->name, entry->name->len)) -+ datasize += entry->name->len + 1; -+ -+ expand_item(coord, *pos, 1, item_length_by_coord(coord) - len, -+ datasize); -+ return 0; -+} -+ -+/* paste body of @entry into item */ -+static int paste_entry(const coord_t * coord /* coord of item */ , -+ struct cde_entry * entry /* new entry */ , -+ int pos /* position to insert */ , -+ reiser4_dir_entry_desc * dir_entry /* parameters for -+ * new entry */ ) -+{ -+ cde_unit_header *header; -+ directory_entry_format *dent; -+ const char *name; -+ int len; -+ -+ header = header_at(coord, pos); -+ dent = entry_at(coord, pos); -+ -+ build_de_id_by_key(&dir_entry->key, &header->hash); -+ build_inode_key_id(entry->obj, &dent->id); -+ /* AUDIT unsafe strcpy() operation! 
It should be replaced with -+ much less CPU hungry -+ memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len ); -+ -+ Also a more major thing is that there should be a way to figure out -+ amount of space in dent -> name and be able to check that we are -+ not going to overwrite more than we supposed to */ -+ name = entry->name->name; -+ len = entry->name->len; -+ if (is_longname(name, len)) { -+ strcpy((unsigned char *)dent->name, name); -+ put_unaligned(0, &dent->name[len]); -+ } -+ return 0; -+} -+ -+/* estimate how much space is necessary in item to insert/paste set of entries -+ described in @data. */ -+int estimate_cde(const coord_t * coord /* coord of item */ , -+ const reiser4_item_data * data /* parameters for new item */ ) -+{ -+ struct cde_entry_data *e; -+ int result; -+ int i; -+ -+ e = (struct cde_entry_data *) data->data; -+ -+ assert("nikita-1288", e != NULL); -+ assert("nikita-1289", e->num_of_entries >= 0); -+ -+ if (coord == NULL) -+ /* insert */ -+ result = sizeof(cde_item_format); -+ else -+ /* paste */ -+ result = 0; -+ -+ result += e->num_of_entries * -+ (sizeof(cde_unit_header) + sizeof(directory_entry_format)); -+ for (i = 0; i < e->num_of_entries; ++i) { -+ const char *name; -+ int len; -+ -+ name = e->entry[i].name->name; -+ len = e->entry[i].name->len; -+ assert("nikita-2054", strlen(name) == len); -+ if (is_longname(name, len)) -+ result += len + 1; -+ } -+ ((reiser4_item_data *) data)->length = result; -+ return result; -+} -+ -+/* ->nr_units() method for this item plugin. */ -+pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ ) -+{ -+ return units(coord); -+} -+ -+/* ->unit_key() method for this item plugin. 
*/ -+reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ , -+ reiser4_key * key /* resulting key */ ) -+{ -+ assert("nikita-1452", coord != NULL); -+ assert("nikita-1345", idx_of(coord) < units(coord)); -+ assert("nikita-1346", key != NULL); -+ -+ item_key_by_coord(coord, key); -+ extract_key_from_de_id(extract_dir_id_from_key(key), -+ &header_at(coord, idx_of(coord))->hash, key); -+ return key; -+} -+ -+/* mergeable_cde(): implementation of ->mergeable() item method. -+ -+ Two directory items are mergeable iff they are from the same -+ directory. That simple. -+ -+*/ -+int mergeable_cde(const coord_t * p1 /* coord of first item */ , -+ const coord_t * p2 /* coord of second item */ ) -+{ -+ reiser4_key k1; -+ reiser4_key k2; -+ -+ assert("nikita-1339", p1 != NULL); -+ assert("nikita-1340", p2 != NULL); -+ -+ return -+ (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) && -+ (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) == -+ extract_dir_id_from_key(item_key_by_coord(p2, &k2))); -+ -+} -+ -+/* ->max_key_inside() method for this item plugin. */ -+reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ , -+ reiser4_key * result /* resulting key */ ) -+{ -+ assert("nikita-1342", coord != NULL); -+ -+ item_key_by_coord(coord, result); -+ set_key_ordering(result, get_key_ordering(reiser4_max_key())); -+ set_key_fulloid(result, get_key_fulloid(reiser4_max_key())); -+ set_key_offset(result, get_key_offset(reiser4_max_key())); -+ return result; -+} -+ -+/* @data contains data which are to be put into tree */ -+int can_contain_key_cde(const coord_t * coord /* coord of item */ , -+ const reiser4_key * key /* key to check */ , -+ const reiser4_item_data * data /* parameters of new -+ * item/unit being -+ * created */ ) -+{ -+ reiser4_key item_key; -+ -+ /* FIXME-VS: do not rely on anything but iplug field of @data. 
Only -+ data->iplug is initialized */ -+ assert("vs-457", data && data->iplug); -+/* assert( "vs-553", data -> user == 0 );*/ -+ item_key_by_coord(coord, &item_key); -+ -+ return (item_plugin_by_coord(coord) == data->iplug) && -+ (extract_dir_id_from_key(&item_key) == -+ extract_dir_id_from_key(key)); -+} -+ -+#if REISER4_DEBUG -+/* cde_check ->check() method for compressed directory items -+ -+ used for debugging, every item should have here the most complete -+ possible check of the consistency of the item that the inventor can -+ construct -+*/ -+int reiser4_check_cde(const coord_t * coord /* coord of item to check */, -+ const char **error /* where to store error message */) -+{ -+ int i; -+ int result; -+ char *item_start; -+ char *item_end; -+ reiser4_key key; -+ -+ coord_t c; -+ -+ assert("nikita-1357", coord != NULL); -+ assert("nikita-1358", error != NULL); -+ -+ if (!ergo(coord->item_pos != 0, -+ is_dot_key(item_key_by_coord(coord, &key)))) { -+ *error = "CDE doesn't start with dot"; -+ return -1; -+ } -+ item_start = item_body_by_coord(coord); -+ item_end = item_start + item_length_by_coord(coord); -+ -+ coord_dup(&c, coord); -+ result = 0; -+ for (i = 0; i < units(coord); ++i) { -+ directory_entry_format *entry; -+ -+ if ((char *)(header_at(coord, i) + 1) > -+ item_end - units(coord) * sizeof *entry) { -+ *error = "CDE header is out of bounds"; -+ result = -1; -+ break; -+ } -+ entry = entry_at(coord, i); -+ if ((char *)entry < item_start + sizeof(cde_item_format)) { -+ *error = "CDE header is too low"; -+ result = -1; -+ break; -+ } -+ if ((char *)(entry + 1) > item_end) { -+ *error = "CDE header is too high"; -+ result = -1; -+ break; -+ } -+ } -+ -+ return result; -+} -+#endif -+ -+/* ->init() method for this item plugin. 
*/ -+int init_cde(coord_t * coord /* coord of item */ , -+ coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */ -+ UNUSED_ARG) -+{ -+ put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries); -+ return 0; -+} -+ -+/* ->lookup() method for this item plugin. */ -+lookup_result lookup_cde(const reiser4_key * key /* key to search for */ , -+ lookup_bias bias /* search bias */ , -+ coord_t * coord /* coord of item to lookup in */ ) -+{ -+ cmp_t last_comp; -+ int pos; -+ -+ reiser4_key utmost_key; -+ -+ assert("nikita-1293", coord != NULL); -+ assert("nikita-1294", key != NULL); -+ -+ CHECKME(coord); -+ -+ if (keygt(item_key_by_coord(coord, &utmost_key), key)) { -+ coord->unit_pos = 0; -+ coord->between = BEFORE_UNIT; -+ return CBK_COORD_NOTFOUND; -+ } -+ pos = find(coord, key, &last_comp); -+ if (pos >= 0) { -+ coord->unit_pos = (int)pos; -+ switch (last_comp) { -+ case EQUAL_TO: -+ coord->between = AT_UNIT; -+ return CBK_COORD_FOUND; -+ case GREATER_THAN: -+ coord->between = BEFORE_UNIT; -+ return RETERR(-ENOENT); -+ case LESS_THAN: -+ default: -+ impossible("nikita-1298", "Broken find"); -+ return RETERR(-EIO); -+ } -+ } else { -+ coord->unit_pos = units(coord) - 1; -+ coord->between = AFTER_UNIT; -+ return (bias == -+ FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND : -+ CBK_COORD_NOTFOUND; -+ } -+} -+ -+/* ->paste() method for this item plugin. 
*/ -+int paste_cde(coord_t * coord /* coord of item */ , -+ reiser4_item_data * data /* parameters of new unit being -+ * inserted */ , -+ carry_plugin_info * info UNUSED_ARG /* todo carry queue */ ) -+{ -+ struct cde_entry_data *e; -+ int result; -+ int i; -+ -+ CHECKME(coord); -+ e = (struct cde_entry_data *) data->data; -+ -+ result = 0; -+ for (i = 0; i < e->num_of_entries; ++i) { -+ int pos; -+ int phantom_size; -+ -+ phantom_size = data->length; -+ if (units(coord) == 0) -+ phantom_size -= sizeof(cde_item_format); -+ -+ result = -+ expand(coord, e->entry + i, phantom_size, &pos, data->arg); -+ if (result != 0) -+ break; -+ result = paste_entry(coord, e->entry + i, pos, data->arg); -+ if (result != 0) -+ break; -+ } -+ CHECKME(coord); -+ return result; -+} -+ -+/* amount of space occupied by all entries starting from @idx both headers and -+ bodies. */ -+static unsigned int part_size(const coord_t * coord /* coord of item */ , -+ int idx /* index of unit */ ) -+{ -+ assert("nikita-1299", coord != NULL); -+ assert("nikita-1300", idx < (int)units(coord)); -+ -+ return sizeof(cde_item_format) + -+ (idx + 1) * sizeof(cde_unit_header) + offset_of(coord, -+ idx + 1) - -+ offset_of(coord, 0); -+} -+ -+/* how many but not more than @want units of @source can be merged with -+ item in @target node. If pend == append - we try to append last item -+ of @target by first units of @source. If pend == prepend - we try to -+ "prepend" first item in @target by last units of @source. @target -+ node has @free_space bytes of free space. 
Total size of those units -+ are returned via @size */ -+int can_shift_cde(unsigned free_space /* free space in item */ , -+ coord_t * coord /* coord of source item */ , -+ znode * target /* target node */ , -+ shift_direction pend /* shift direction */ , -+ unsigned *size /* resulting number of shifted bytes */ , -+ unsigned want /* maximal number of bytes to shift */ ) -+{ -+ int shift; -+ -+ CHECKME(coord); -+ if (want == 0) { -+ *size = 0; -+ return 0; -+ } -+ -+ /* pend == SHIFT_LEFT <==> shifting to the left */ -+ if (pend == SHIFT_LEFT) { -+ for (shift = min((int)want - 1, units(coord)); shift >= 0; -+ --shift) { -+ *size = part_size(coord, shift); -+ if (target != NULL) -+ *size -= sizeof(cde_item_format); -+ if (*size <= free_space) -+ break; -+ } -+ shift = shift + 1; -+ } else { -+ int total_size; -+ -+ assert("nikita-1301", pend == SHIFT_RIGHT); -+ -+ total_size = item_length_by_coord(coord); -+ for (shift = units(coord) - want - 1; shift < units(coord) - 1; -+ ++shift) { -+ *size = total_size - part_size(coord, shift); -+ if (target == NULL) -+ *size += sizeof(cde_item_format); -+ if (*size <= free_space) -+ break; -+ } -+ shift = units(coord) - shift - 1; -+ } -+ if (shift == 0) -+ *size = 0; -+ CHECKME(coord); -+ return shift; -+} -+ -+/* ->copy_units() method for this item plugin. 
*/ -+void copy_units_cde(coord_t * target /* coord of target item */ , -+ coord_t * source /* coord of source item */ , -+ unsigned from /* starting unit */ , -+ unsigned count /* how many units to copy */ , -+ shift_direction where_is_free_space /* shift direction */ , -+ unsigned free_space /* free space in item */ ) -+{ -+ char *header_from; -+ char *header_to; -+ -+ char *entry_from; -+ char *entry_to; -+ -+ int pos_in_target; -+ int data_size; -+ int data_delta; -+ int i; -+ -+ assert("nikita-1303", target != NULL); -+ assert("nikita-1304", source != NULL); -+ assert("nikita-1305", (int)from < units(source)); -+ assert("nikita-1307", (int)(from + count) <= units(source)); -+ -+ if (where_is_free_space == SHIFT_LEFT) { -+ assert("nikita-1453", from == 0); -+ pos_in_target = units(target); -+ } else { -+ assert("nikita-1309", (int)(from + count) == units(source)); -+ pos_in_target = 0; -+ memmove(item_body_by_coord(target), -+ (char *)item_body_by_coord(target) + free_space, -+ item_length_by_coord(target) - free_space); -+ } -+ -+ CHECKME(target); -+ CHECKME(source); -+ -+ /* expand @target */ -+ data_size = -+ offset_of(source, (int)(from + count)) - offset_of(source, -+ (int)from); -+ -+ if (units(target) == 0) -+ free_space -= sizeof(cde_item_format); -+ -+ expand_item(target, pos_in_target, (int)count, -+ (int)(item_length_by_coord(target) - free_space), -+ (unsigned)data_size); -+ -+ /* copy first @count units of @source into @target */ -+ data_delta = -+ offset_of(target, pos_in_target) - offset_of(source, (int)from); -+ -+ /* copy entries */ -+ entry_from = (char *)entry_at(source, (int)from); -+ entry_to = (char *)entry_at(source, (int)(from + count)); -+ memmove(entry_at(target, pos_in_target), entry_from, -+ (unsigned)(entry_to - entry_from)); -+ -+ /* copy headers */ -+ header_from = (char *)header_at(source, (int)from); -+ header_to = (char *)header_at(source, (int)(from + count)); -+ memmove(header_at(target, pos_in_target), header_from, -+ 
(unsigned)(header_to - header_from)); -+ -+ /* update offsets */ -+ for (i = pos_in_target; i < (int)(pos_in_target + count); ++i) -+ adj_offset(target, i, data_delta); -+ CHECKME(target); -+ CHECKME(source); -+} -+ -+/* ->cut_units() method for this item plugin. */ -+int cut_units_cde(coord_t * coord /* coord of item */ , -+ pos_in_node_t from /* start unit pos */ , -+ pos_in_node_t to /* stop unit pos */ , -+ struct carry_cut_data *cdata UNUSED_ARG, -+ reiser4_key * smallest_removed, reiser4_key * new_first) -+{ -+ char *header_from; -+ char *header_to; -+ -+ char *entry_from; -+ char *entry_to; -+ -+ int size; -+ int entry_delta; -+ int header_delta; -+ int i; -+ -+ unsigned count; -+ -+ CHECKME(coord); -+ -+ count = to - from + 1; -+ -+ assert("nikita-1454", coord != NULL); -+ assert("nikita-1455", (int)(from + count) <= units(coord)); -+ -+ if (smallest_removed) -+ unit_key_by_coord(coord, smallest_removed); -+ -+ if (new_first) { -+ coord_t next; -+ -+ /* not everything is cut from item head */ -+ assert("vs-1527", from == 0); -+ assert("vs-1528", to < units(coord) - 1); -+ -+ coord_dup(&next, coord); -+ next.unit_pos++; -+ unit_key_by_coord(&next, new_first); -+ } -+ -+ size = item_length_by_coord(coord); -+ if (count == (unsigned)units(coord)) { -+ return size; -+ } -+ -+ header_from = (char *)header_at(coord, (int)from); -+ header_to = (char *)header_at(coord, (int)(from + count)); -+ -+ entry_from = (char *)entry_at(coord, (int)from); -+ entry_to = (char *)entry_at(coord, (int)(from + count)); -+ -+ /* move headers */ -+ memmove(header_from, header_to, -+ (unsigned)(address(coord, size) - header_to)); -+ -+ header_delta = header_to - header_from; -+ -+ entry_from -= header_delta; -+ entry_to -= header_delta; -+ size -= header_delta; -+ -+ /* copy entries */ -+ memmove(entry_from, entry_to, -+ (unsigned)(address(coord, size) - entry_to)); -+ -+ entry_delta = entry_to - entry_from; -+ size -= entry_delta; -+ -+ /* update offsets */ -+ -+ for (i = 0; i < 
(int)from; ++i) -+ adj_offset(coord, i, -header_delta); -+ -+ for (i = from; i < units(coord) - (int)count; ++i) -+ adj_offset(coord, i, -header_delta - entry_delta); -+ -+ put_unaligned(cpu_to_le16((__u16) units(coord) - count), -+ &formatted_at(coord)->num_of_entries); -+ -+ if (from == 0) { -+ /* entries from head was removed - move remaining to right */ -+ memmove((char *)item_body_by_coord(coord) + -+ header_delta + entry_delta, item_body_by_coord(coord), -+ (unsigned)size); -+ if (REISER4_DEBUG) -+ memset(item_body_by_coord(coord), 0, -+ (unsigned)header_delta + entry_delta); -+ } else { -+ /* freed space is already at the end of item */ -+ if (REISER4_DEBUG) -+ memset((char *)item_body_by_coord(coord) + size, 0, -+ (unsigned)header_delta + entry_delta); -+ } -+ -+ return header_delta + entry_delta; -+} -+ -+int kill_units_cde(coord_t * coord /* coord of item */ , -+ pos_in_node_t from /* start unit pos */ , -+ pos_in_node_t to /* stop unit pos */ , -+ struct carry_kill_data *kdata UNUSED_ARG, -+ reiser4_key * smallest_removed, reiser4_key * new_first) -+{ -+ return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first); -+} -+ -+/* ->s.dir.extract_key() method for this item plugin. 
*/ -+int extract_key_cde(const coord_t * coord /* coord of item */ , -+ reiser4_key * key /* resulting key */ ) -+{ -+ directory_entry_format *dent; -+ -+ assert("nikita-1155", coord != NULL); -+ assert("nikita-1156", key != NULL); -+ -+ dent = entry_at(coord, idx_of(coord)); -+ return extract_key_from_id(&dent->id, key); -+} -+ -+int -+update_key_cde(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh UNUSED_ARG) -+{ -+ directory_entry_format *dent; -+ obj_key_id obj_id; -+ int result; -+ -+ assert("nikita-2344", coord != NULL); -+ assert("nikita-2345", key != NULL); -+ -+ dent = entry_at(coord, idx_of(coord)); -+ result = build_obj_key_id(key, &obj_id); -+ if (result == 0) { -+ dent->id = obj_id; -+ znode_make_dirty(coord->node); -+ } -+ return 0; -+} -+ -+/* ->s.dir.extract_name() method for this item plugin. */ -+char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf) -+{ -+ directory_entry_format *dent; -+ -+ assert("nikita-1157", coord != NULL); -+ -+ dent = entry_at(coord, idx_of(coord)); -+ return extract_dent_name(coord, dent, buf); -+} -+ -+static int cde_bytes(int pasting, const reiser4_item_data * data) -+{ -+ int result; -+ -+ result = data->length; -+ if (!pasting) -+ result -= sizeof(cde_item_format); -+ return result; -+} -+ -+/* ->s.dir.add_entry() method for this item plugin */ -+int add_entry_cde(struct inode *dir /* directory object */ , -+ coord_t * coord /* coord of item */ , -+ lock_handle * lh /* lock handle for insertion */ , -+ const struct dentry *name /* name to insert */ , -+ reiser4_dir_entry_desc * dir_entry /* parameters of new -+ * directory entry */ ) -+{ -+ reiser4_item_data data; -+ struct cde_entry entry; -+ struct cde_entry_data edata; -+ int result; -+ -+ assert("nikita-1656", coord->node == lh->node); -+ assert("nikita-1657", znode_is_write_locked(coord->node)); -+ -+ edata.num_of_entries = 1; -+ edata.entry = &entry; -+ -+ entry.dir = dir; -+ entry.obj = dir_entry->obj; -+ entry.name = 
&name->d_name; -+ -+ data.data = (char *)&edata; -+ data.user = 0; /* &edata is not user space */ -+ data.iplug = item_plugin_by_id(COMPOUND_DIR_ID); -+ data.arg = dir_entry; -+ assert("nikita-1302", data.iplug != NULL); -+ -+ result = is_dot_key(&dir_entry->key); -+ data.length = estimate_cde(result ? coord : NULL, &data); -+ -+ /* NOTE-NIKITA quota plugin? */ -+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data))) -+ return RETERR(-EDQUOT); -+ -+ if (result) -+ result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0); -+ else -+ result = reiser4_resize_item(coord, &data, &dir_entry->key, -+ lh, 0); -+ return result; -+} -+ -+/* ->s.dir.rem_entry() */ -+int rem_entry_cde(struct inode *dir /* directory of item */ , -+ const struct qstr *name, coord_t * coord /* coord of item */ , -+ lock_handle * lh UNUSED_ARG /* lock handle for -+ * removal */ , -+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of -+ * directory entry -+ * being removed */ ) -+{ -+ coord_t shadow; -+ int result; -+ int length; -+ ON_DEBUG(char buf[DE_NAME_BUF_LEN]); -+ -+ assert("nikita-2870", strlen(name->name) == name->len); -+ assert("nikita-2869", -+ !strcmp(name->name, extract_name_cde(coord, buf))); -+ -+ length = sizeof(directory_entry_format) + sizeof(cde_unit_header); -+ if (is_longname(name->name, name->len)) -+ length += name->len + 1; -+ -+ if (inode_get_bytes(dir) < length) { -+ warning("nikita-2628", "Dir is broke: %llu: %llu", -+ (unsigned long long)get_inode_oid(dir), -+ inode_get_bytes(dir)); -+ -+ return RETERR(-EIO); -+ } -+ -+ /* cut_node() is supposed to take pointers to _different_ -+ coords, because it will modify them without respect to -+ possible aliasing. To work around this, create temporary copy -+ of @coord. -+ */ -+ coord_dup(&shadow, coord); -+ result = -+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0); -+ if (result == 0) { -+ /* NOTE-NIKITA quota plugin? 
*/ -+ DQUOT_FREE_SPACE_NODIRTY(dir, length); -+ } -+ return result; -+} -+ -+/* ->s.dir.max_name_len() method for this item plugin */ -+int max_name_len_cde(const struct inode *dir /* directory */ ) -+{ -+ return -+ reiser4_tree_by_inode(dir)->nplug->max_item_size() - -+ sizeof(directory_entry_format) - sizeof(cde_item_format) - -+ sizeof(cde_unit_header) - 2; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/cde.h linux-2.6.24/fs/reiser4/plugin/item/cde.h ---- linux-2.6.24.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/cde.h 2008-01-25 11:39:07.004225206 +0300 -@@ -0,0 +1,87 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Compound directory item. See cde.c for description. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ ) -+#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ -+ -+#include "../../forward.h" -+#include "../../kassign.h" -+#include "../../dformat.h" -+ -+#include /* for struct inode */ -+#include /* for struct dentry, etc */ -+ -+typedef struct cde_unit_header { -+ de_id hash; -+ d16 offset; -+} cde_unit_header; -+ -+typedef struct cde_item_format { -+ d16 num_of_entries; -+ cde_unit_header entry[0]; -+} cde_item_format; -+ -+struct cde_entry { -+ const struct inode *dir; -+ const struct inode *obj; -+ const struct qstr *name; -+}; -+ -+struct cde_entry_data { -+ int num_of_entries; -+ struct cde_entry *entry; -+}; -+ -+/* plugin->item.b.* */ -+reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result); -+int can_contain_key_cde(const coord_t * coord, const reiser4_key * key, -+ const reiser4_item_data *); -+int mergeable_cde(const coord_t * p1, const coord_t * p2); -+pos_in_node_t nr_units_cde(const coord_t * coord); -+reiser4_key 
*unit_key_cde(const coord_t * coord, reiser4_key * key); -+int estimate_cde(const coord_t * coord, const reiser4_item_data * data); -+void print_cde(const char *prefix, coord_t * coord); -+int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data); -+lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias, -+ coord_t * coord); -+int paste_cde(coord_t * coord, reiser4_item_data * data, -+ carry_plugin_info * info UNUSED_ARG); -+int can_shift_cde(unsigned free_space, coord_t * coord, znode * target, -+ shift_direction pend, unsigned *size, unsigned want); -+void copy_units_cde(coord_t * target, coord_t * source, unsigned from, -+ unsigned count, shift_direction where_is_free_space, -+ unsigned free_space); -+int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+void print_cde(const char *prefix, coord_t * coord); -+int reiser4_check_cde(const coord_t * coord, const char **error); -+ -+/* plugin->u.item.s.dir.* */ -+int extract_key_cde(const coord_t * coord, reiser4_key * key); -+int update_key_cde(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh); -+char *extract_name_cde(const coord_t * coord, char *buf); -+int add_entry_cde(struct inode *dir, coord_t * coord, -+ lock_handle * lh, const struct dentry *name, -+ reiser4_dir_entry_desc * entry); -+int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord, -+ lock_handle * lh, reiser4_dir_entry_desc * entry); -+int max_name_len_cde(const struct inode *dir); -+ -+/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.24/fs/reiser4/plugin/item/ctail.c ---- linux-2.6.24.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/ctail.c 2008-01-25 11:39:07.008226236 +0300 -@@ -0,0 +1,1613 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* ctails (aka "clustered tails") are items for cryptcompress objects */ -+ -+/* DESCRIPTION: -+ -+Each cryptcompress object is stored on disk as a set of clusters sliced -+into ctails. -+ -+Internal on-disk structure: -+ -+ HEADER (1) Here stored disk cluster shift -+ BODY -+*/ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "item.h" -+#include "../node/node.h" -+#include "../plugin.h" -+#include "../object.h" -+#include "../../znode.h" -+#include "../../carry.h" -+#include "../../tree.h" -+#include "../../inode.h" -+#include "../../super.h" -+#include "../../context.h" -+#include "../../page_cache.h" -+#include "../cluster.h" -+#include "../../flush.h" -+#include "../../tree_walk.h" -+ -+#include -+#include -+#include -+ -+/* return body of ctail item at @coord */ -+static ctail_item_format *ctail_formatted_at(const coord_t * coord) -+{ -+ assert("edward-60", coord != NULL); -+ return item_body_by_coord(coord); -+} -+ -+static int cluster_shift_by_coord(const coord_t * coord) -+{ -+ return get_unaligned(&ctail_formatted_at(coord)->cluster_shift); -+} -+ -+static inline void dclust_set_extension_shift(hint_t * hint) -+{ -+ assert("edward-1270", -+ item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID); -+ hint->ext_coord.extension.ctail.shift = -+ cluster_shift_by_coord(&hint->ext_coord.coord); -+} -+ -+static 
loff_t off_by_coord(const coord_t * coord) -+{ -+ reiser4_key key; -+ return get_key_offset(item_key_by_coord(coord, &key)); -+} -+ -+int coord_is_unprepped_ctail(const coord_t * coord) -+{ -+ assert("edward-1233", coord != NULL); -+ assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID); -+ assert("edward-1235", -+ ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT, -+ nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS)); -+ -+ return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT; -+} -+ -+static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode) -+{ -+ int shift; -+ -+ if (inode != NULL) { -+ shift = inode_cluster_shift(inode); -+ assert("edward-1236", -+ ergo(!coord_is_unprepped_ctail(coord), -+ shift == cluster_shift_by_coord(coord))); -+ } else { -+ assert("edward-1237", !coord_is_unprepped_ctail(coord)); -+ shift = cluster_shift_by_coord(coord); -+ } -+ return off_by_coord(coord) >> shift; -+} -+ -+static int disk_cluster_size(const coord_t * coord) -+{ -+ assert("edward-1156", -+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID)); -+ /* calculation of disk cluster size -+ is meaninless if ctail is unprepped */ -+ assert("edward-1238", !coord_is_unprepped_ctail(coord)); -+ -+ return 1 << cluster_shift_by_coord(coord); -+} -+ -+/* true if the key is of first disk cluster item */ -+static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord) -+{ -+ assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID); -+ -+ return coord_is_unprepped_ctail(coord) || -+ ((get_key_offset(key) & -+ ((loff_t) disk_cluster_size(coord) - 1)) == 0); -+} -+ -+static char *first_unit(coord_t * coord) -+{ -+ /* FIXME: warning: pointer of type `void *' used in arithmetic */ -+ return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format); -+} -+ -+/* plugin->u.item.b.max_key_inside : -+ tail_max_key_inside */ -+ -+/* plugin->u.item.b.can_contain_key */ -+int -+can_contain_key_ctail(const coord_t * coord, 
const reiser4_key * key, -+ const reiser4_item_data * data) -+{ -+ reiser4_key item_key; -+ -+ if (item_plugin_by_coord(coord) != data->iplug) -+ return 0; -+ -+ item_key_by_coord(coord, &item_key); -+ if (get_key_locality(key) != get_key_locality(&item_key) || -+ get_key_objectid(key) != get_key_objectid(&item_key)) -+ return 0; -+ if (get_key_offset(&item_key) + nr_units_ctail(coord) != -+ get_key_offset(key)) -+ return 0; -+ if (is_disk_cluster_key(key, coord)) -+ return 0; -+ return 1; -+} -+ -+/* plugin->u.item.b.mergeable */ -+int mergeable_ctail(const coord_t * p1, const coord_t * p2) -+{ -+ reiser4_key key1, key2; -+ -+ assert("edward-62", item_id_by_coord(p1) == CTAIL_ID); -+ assert("edward-61", plugin_of_group(item_plugin_by_coord(p1), -+ UNIX_FILE_METADATA_ITEM_TYPE)); -+ -+ if (item_id_by_coord(p2) != CTAIL_ID) { -+ /* second item is of another type */ -+ return 0; -+ } -+ -+ item_key_by_coord(p1, &key1); -+ item_key_by_coord(p2, &key2); -+ if (get_key_locality(&key1) != get_key_locality(&key2) || -+ get_key_objectid(&key1) != get_key_objectid(&key2) || -+ get_key_type(&key1) != get_key_type(&key2)) { -+ /* items of different objects */ -+ return 0; -+ } -+ if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2)) -+ /* not adjacent items */ -+ return 0; -+ if (is_disk_cluster_key(&key2, p2)) -+ return 0; -+ return 1; -+} -+ -+/* plugin->u.item.b.nr_units */ -+pos_in_node_t nr_units_ctail(const coord_t * coord) -+{ -+ return (item_length_by_coord(coord) - -+ sizeof(ctail_formatted_at(coord)->cluster_shift)); -+} -+ -+/* plugin->u.item.b.estimate: -+ estimate how much space is needed to insert/paste @data->length bytes -+ into ctail at @coord */ -+int estimate_ctail(const coord_t * coord /* coord of item */ , -+ const reiser4_item_data * -+ data /* parameters for new item */ ) -+{ -+ if (coord == NULL) -+ /* insert */ -+ return (sizeof(ctail_item_format) + data->length); -+ else -+ /* paste */ -+ return data->length; -+} -+ -+/* ->init() 
method for this item plugin. */ -+int init_ctail(coord_t * to /* coord of item */ , -+ coord_t * from /* old_item */ , -+ reiser4_item_data * data /* structure used for insertion */ ) -+{ -+ int cluster_shift; /* cpu value to convert */ -+ -+ if (data) { -+ assert("edward-463", data->length > sizeof(ctail_item_format)); -+ cluster_shift = *((int *)(data->arg)); -+ data->length -= sizeof(ctail_item_format); -+ } else { -+ assert("edward-464", from != NULL); -+ assert("edward-855", ctail_ok(from)); -+ cluster_shift = (int)(cluster_shift_by_coord(from)); -+ } -+ put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift); -+ assert("edward-856", ctail_ok(to)); -+ return 0; -+} -+ -+/* plugin->u.item.b.lookup: -+ NULL: We are looking for item keys only */ -+ -+#if REISER4_DEBUG -+int ctail_ok(const coord_t * coord) -+{ -+ return coord_is_unprepped_ctail(coord) || -+ cluster_shift_ok(cluster_shift_by_coord(coord)); -+} -+ -+/* plugin->u.item.b.check */ -+int check_ctail(const coord_t * coord, const char **error) -+{ -+ if (!ctail_ok(coord)) { -+ if (error) -+ *error = "bad cluster shift in ctail"; -+ return 1; -+ } -+ return 0; -+} -+#endif -+ -+/* plugin->u.item.b.paste */ -+int -+paste_ctail(coord_t * coord, reiser4_item_data * data, -+ carry_plugin_info * info UNUSED_ARG) -+{ -+ unsigned old_nr_units; -+ -+ assert("edward-268", data->data != NULL); -+ /* copy only from kernel space */ -+ assert("edward-66", data->user == 0); -+ -+ old_nr_units = -+ item_length_by_coord(coord) - sizeof(ctail_item_format) - -+ data->length; -+ -+ /* ctail items never get pasted in the middle */ -+ -+ if (coord->unit_pos == 0 && coord->between == AT_UNIT) { -+ -+ /* paste at the beginning when create new item */ -+ assert("edward-450", -+ item_length_by_coord(coord) == -+ data->length + sizeof(ctail_item_format)); -+ assert("edward-451", old_nr_units == 0); -+ } else if (coord->unit_pos == old_nr_units - 1 -+ && coord->between == AFTER_UNIT) { -+ -+ /* paste at the end */ 
-+ coord->unit_pos++; -+ } else -+ impossible("edward-453", "bad paste position"); -+ -+ memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length); -+ -+ assert("edward-857", ctail_ok(coord)); -+ -+ return 0; -+} -+ -+/* plugin->u.item.b.fast_paste */ -+ -+/* plugin->u.item.b.can_shift -+ number of units is returned via return value, number of bytes via @size. For -+ ctail items they coincide */ -+int -+can_shift_ctail(unsigned free_space, coord_t * source, -+ znode * target, shift_direction direction UNUSED_ARG, -+ unsigned *size /* number of bytes */ , unsigned want) -+{ -+ /* make sure that that we do not want to shift more than we have */ -+ assert("edward-68", want > 0 && want <= nr_units_ctail(source)); -+ -+ *size = min(want, free_space); -+ -+ if (!target) { -+ /* new item will be created */ -+ if (*size <= sizeof(ctail_item_format)) { -+ *size = 0; -+ return 0; -+ } -+ return *size - sizeof(ctail_item_format); -+ } -+ return *size; -+} -+ -+/* plugin->u.item.b.copy_units -+ cooperates with ->can_shift() */ -+void -+copy_units_ctail(coord_t * target, coord_t * source, -+ unsigned from, unsigned count /* units */ , -+ shift_direction where_is_free_space, -+ unsigned free_space /* bytes */ ) -+{ -+ /* make sure that item @target is expanded already */ -+ assert("edward-69", (unsigned)item_length_by_coord(target) >= count); -+ assert("edward-70", free_space == count || free_space == count + 1); -+ -+ assert("edward-858", ctail_ok(source)); -+ -+ if (where_is_free_space == SHIFT_LEFT) { -+ /* append item @target with @count first bytes of @source: -+ this restriction came from ordinary tails */ -+ assert("edward-71", from == 0); -+ assert("edward-860", ctail_ok(target)); -+ -+ memcpy(first_unit(target) + nr_units_ctail(target) - count, -+ first_unit(source), count); -+ } else { -+ /* target item is moved to right already */ -+ reiser4_key key; -+ -+ assert("edward-72", nr_units_ctail(source) == from + count); -+ -+ if (free_space == count) { -+ 
init_ctail(target, source, NULL); -+ } else { -+ /* new item has been created */ -+ assert("edward-862", ctail_ok(target)); -+ } -+ memcpy(first_unit(target), first_unit(source) + from, count); -+ -+ assert("edward-863", ctail_ok(target)); -+ -+ /* new units are inserted before first unit in an item, -+ therefore, we have to update item key */ -+ item_key_by_coord(source, &key); -+ set_key_offset(&key, get_key_offset(&key) + from); -+ -+ node_plugin_by_node(target->node)->update_item_key(target, &key, -+ NULL /*info */); -+ } -+} -+ -+/* plugin->u.item.b.create_hook */ -+int create_hook_ctail(const coord_t * coord, void *arg) -+{ -+ assert("edward-864", znode_is_loaded(coord->node)); -+ -+ znode_set_convertible(coord->node); -+ return 0; -+} -+ -+/* plugin->u.item.b.kill_hook */ -+int kill_hook_ctail(const coord_t * coord, pos_in_node_t from, -+ pos_in_node_t count, carry_kill_data * kdata) -+{ -+ struct inode *inode; -+ -+ assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID); -+ assert("edward-291", znode_is_write_locked(coord->node)); -+ -+ inode = kdata->inode; -+ if (inode) { -+ reiser4_key key; -+ struct cryptcompress_info * info; -+ cloff_t index; -+ -+ item_key_by_coord(coord, &key); -+ info = cryptcompress_inode_data(inode); -+ index = off_to_clust(get_key_offset(&key), inode); -+ -+ if (from == 0) { -+ info->trunc_index = index; -+ if (is_disk_cluster_key(&key, coord)) { -+ /* -+ * first item of disk cluster is to be killed -+ */ -+ truncate_complete_page_cluster( -+ inode, index, kdata->params.truncate); -+ inode_sub_bytes(inode, -+ inode_cluster_size(inode)); -+ } -+ } -+ } -+ return 0; -+} -+ -+/* for shift_hook_ctail(), -+ return true if the first disk cluster item has dirty child -+*/ -+static int ctail_convertible(const coord_t * coord) -+{ -+ int result; -+ reiser4_key key; -+ jnode *child = NULL; -+ -+ assert("edward-477", coord != NULL); -+ assert("edward-478", item_id_by_coord(coord) == CTAIL_ID); -+ -+ if (coord_is_unprepped_ctail(coord)) 
-+ /* unprepped ctail should be converted */ -+ return 1; -+ -+ item_key_by_coord(coord, &key); -+ child = jlookup(current_tree, -+ get_key_objectid(&key), -+ off_to_pg(off_by_coord(coord))); -+ if (!child) -+ return 0; -+ result = JF_ISSET(child, JNODE_DIRTY); -+ jput(child); -+ return result; -+} -+ -+/* FIXME-EDWARD */ -+/* plugin->u.item.b.shift_hook */ -+int shift_hook_ctail(const coord_t * item /* coord of item */ , -+ unsigned from UNUSED_ARG /* start unit */ , -+ unsigned count UNUSED_ARG /* stop unit */ , -+ znode * old_node /* old parent */ ) -+{ -+ assert("edward-479", item != NULL); -+ assert("edward-480", item->node != old_node); -+ -+ if (!znode_convertible(old_node) || znode_convertible(item->node)) -+ return 0; -+ if (ctail_convertible(item)) -+ znode_set_convertible(item->node); -+ return 0; -+} -+ -+static int -+cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ int cut, void *p, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ pos_in_node_t count; /* number of units to cut */ -+ char *item; -+ -+ count = to - from + 1; -+ item = item_body_by_coord(coord); -+ -+ assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord))); -+ -+ if (smallest_removed) { -+ /* store smallest key removed */ -+ item_key_by_coord(coord, smallest_removed); -+ set_key_offset(smallest_removed, -+ get_key_offset(smallest_removed) + from); -+ } -+ -+ if (new_first) { -+ assert("vs-1531", from == 0); -+ -+ item_key_by_coord(coord, new_first); -+ set_key_offset(new_first, -+ get_key_offset(new_first) + from + count); -+ } -+ -+ if (!cut) -+ kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p); -+ -+ if (from == 0) { -+ if (count != nr_units_ctail(coord)) { -+ /* part of item is removed, so move free space at the beginning -+ of the item and update item key */ -+ reiser4_key key; -+ memcpy(item + to + 1, item, sizeof(ctail_item_format)); -+ item_key_by_coord(coord, &key); -+ set_key_offset(&key, 
get_key_offset(&key) + count); -+ node_plugin_by_node(coord->node)->update_item_key(coord, -+ &key, -+ NULL); -+ } else { -+ /* cut_units should not be called to cut evrything */ -+ assert("vs-1532", ergo(cut, 0)); -+ /* whole item is cut, so more then amount of space occupied -+ by units got freed */ -+ count += sizeof(ctail_item_format); -+ } -+ if (REISER4_DEBUG) -+ memset(item, 0, count); -+ } else if (REISER4_DEBUG) -+ memset(item + sizeof(ctail_item_format) + from, 0, count); -+ return count; -+} -+ -+/* plugin->u.item.b.cut_units */ -+int -+cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to, -+ carry_cut_data * cdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ return cut_or_kill_ctail_units(item, from, to, 1, NULL, -+ smallest_removed, new_first); -+} -+ -+/* plugin->u.item.b.kill_units */ -+int -+kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *kdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ return cut_or_kill_ctail_units(item, from, to, 0, kdata, -+ smallest_removed, new_first); -+} -+ -+/* plugin->u.item.s.file.read */ -+int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint) -+{ -+ uf_coord_t *uf_coord; -+ coord_t *coord; -+ -+ uf_coord = &hint->ext_coord; -+ coord = &uf_coord->coord; -+ assert("edward-127", f->user == 0); -+ assert("edward-129", coord && coord->node); -+ assert("edward-130", coord_is_existing_unit(coord)); -+ assert("edward-132", znode_is_loaded(coord->node)); -+ -+ /* start read only from the beginning of ctail */ -+ assert("edward-133", coord->unit_pos == 0); -+ /* read only whole ctails */ -+ assert("edward-135", nr_units_ctail(coord) <= f->length); -+ -+ assert("edward-136", reiser4_schedulable()); -+ assert("edward-886", ctail_ok(coord)); -+ -+ if (f->data) -+ memcpy(f->data, (char *)first_unit(coord), -+ (size_t) nr_units_ctail(coord)); -+ -+ dclust_set_extension_shift(hint); -+ 
mark_page_accessed(znode_page(coord->node)); -+ move_flow_forward(f, nr_units_ctail(coord)); -+ -+ return 0; -+} -+ -+/** -+ * Prepare transform stream with plain text for page -+ * @page taking into account synchronization issues. -+ */ -+static int ctail_read_disk_cluster(struct cluster_handle * clust, -+ struct inode * inode, struct page * page, -+ znode_lock_mode mode) -+{ -+ int result; -+ -+ assert("edward-1450", mode == ZNODE_READ_LOCK || ZNODE_WRITE_LOCK); -+ assert("edward-671", clust->hint != NULL); -+ assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER); -+ assert("edward-672", cryptcompress_inode_ok(inode)); -+ assert("edward-1527", PageLocked(page)); -+ -+ unlock_page(page); -+ -+ /* set input stream */ -+ result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM); -+ if (result) { -+ lock_page(page); -+ return result; -+ } -+ result = find_disk_cluster(clust, inode, 1 /* read items */, mode); -+ lock_page(page); -+ if (result) -+ return result; -+ /* -+ * at this point we have locked position in the tree -+ */ -+ assert("edward-1528", znode_is_any_locked(clust->hint->lh.node)); -+ -+ if (page->mapping != inode->i_mapping) { -+ /* page was truncated */ -+ reiser4_unset_hint(clust->hint); -+ reset_cluster_params(clust); -+ return AOP_TRUNCATED_PAGE; -+ } -+ if (PageUptodate(page)) { -+ /* disk cluster can be obsolete, don't use it! */ -+ reiser4_unset_hint(clust->hint); -+ reset_cluster_params(clust); -+ return 0; -+ } -+ if (clust->dstat == FAKE_DISK_CLUSTER || -+ clust->dstat == UNPR_DISK_CLUSTER || -+ clust->dstat == TRNC_DISK_CLUSTER) { -+ /* -+ * this information about disk cluster will be valid -+ * as long as we keep the position in the tree locked -+ */ -+ tfm_cluster_set_uptodate(&clust->tc); -+ return 0; -+ } -+ /* now prepare output stream.. 
*/ -+ result = grab_coa(&clust->tc, inode_compression_plugin(inode)); -+ if (result) -+ return result; -+ /* ..and fill this with plain text */ -+ result = reiser4_inflate_cluster(clust, inode); -+ if (result) -+ return result; -+ /* -+ * The stream is ready! It won't be obsolete as -+ * long as we keep last disk cluster item locked. -+ */ -+ tfm_cluster_set_uptodate(&clust->tc); -+ return 0; -+} -+ -+/* -+ * fill one page with plain text. -+ */ -+int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust, -+ struct page *page, znode_lock_mode mode) -+{ -+ int ret; -+ unsigned cloff; -+ char *data; -+ size_t to_page; -+ struct tfm_cluster * tc = &clust->tc; -+ -+ assert("edward-212", PageLocked(page)); -+ -+ if (unlikely(page->mapping != inode->i_mapping)) -+ return AOP_TRUNCATED_PAGE; -+ if (PageUptodate(page)) -+ goto exit; -+ to_page = pbytes(page_index(page), inode); -+ if (to_page == 0) { -+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); -+ SetPageUptodate(page); -+ goto exit; -+ } -+ if (!tfm_cluster_is_uptodate(&clust->tc)) { -+ clust->index = pg_to_clust(page->index, inode); -+ -+ /* this will unlock/lock the page */ -+ ret = ctail_read_disk_cluster(clust, inode, page, mode); -+ -+ assert("edward-212", PageLocked(page)); -+ if (ret) -+ return ret; -+ -+ /* refresh bytes */ -+ to_page = pbytes(page_index(page), inode); -+ if (to_page == 0) { -+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); -+ SetPageUptodate(page); -+ goto exit; -+ } -+ } -+ if (PageUptodate(page)) -+ /* somebody else fill it already */ -+ goto exit; -+ -+ assert("edward-119", tfm_cluster_is_uptodate(tc)); -+ assert("edward-1529", znode_is_any_locked(clust->hint->lh.node)); -+ -+ switch (clust->dstat) { -+ case UNPR_DISK_CLUSTER: -+ BUG_ON(1); -+ case TRNC_DISK_CLUSTER: -+ /* -+ * Race with truncate! 
-+ * We resolve it in favour of the last one (the only way, -+ * as in this case plain text is unrecoverable) -+ */ -+ case FAKE_DISK_CLUSTER: -+ /* fill the page by zeroes */ -+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); -+ SetPageUptodate(page); -+ break; -+ case PREP_DISK_CLUSTER: -+ /* fill page by transformed stream with plain text */ -+ assert("edward-1058", !PageUptodate(page)); -+ assert("edward-120", tc->len <= inode_cluster_size(inode)); -+ -+ /* page index in this logical cluster */ -+ cloff = pg_to_off_to_cloff(page->index, inode); -+ -+ data = kmap(page); -+ memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page); -+ memset(data + to_page, 0, (size_t) PAGE_CACHE_SIZE - to_page); -+ flush_dcache_page(page); -+ kunmap(page); -+ SetPageUptodate(page); -+ break; -+ default: -+ impossible("edward-1169", "bad disk cluster state"); -+ } -+ exit: -+ return 0; -+} -+ -+/* plugin->u.item.s.file.readpage */ -+int readpage_ctail(void *vp, struct page *page) -+{ -+ int result; -+ hint_t * hint; -+ struct cluster_handle * clust = vp; -+ -+ assert("edward-114", clust != NULL); -+ assert("edward-115", PageLocked(page)); -+ assert("edward-116", !PageUptodate(page)); -+ assert("edward-118", page->mapping && page->mapping->host); -+ assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc)); -+ -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) { -+ unlock_page(page); -+ return RETERR(-ENOMEM); -+ } -+ clust->hint = hint; -+ result = load_file_hint(clust->file, hint); -+ if (result) { -+ kfree(hint); -+ unlock_page(page); -+ return result; -+ } -+ assert("vs-25", hint->ext_coord.lh == &hint->lh); -+ -+ result = do_readpage_ctail(page->mapping->host, clust, page, -+ ZNODE_READ_LOCK); -+ assert("edward-213", PageLocked(page)); -+ assert("edward-1163", ergo(!result, PageUptodate(page))); -+ -+ unlock_page(page); -+ done_lh(&hint->lh); -+ hint->ext_coord.valid = 0; -+ save_file_hint(clust->file, hint); -+ kfree(hint); -+ 
tfm_cluster_clr_uptodate(&clust->tc); -+ -+ return result; -+} -+ -+/* Helper function for ->readpages() */ -+static int ctail_read_page_cluster(struct cluster_handle * clust, -+ struct inode *inode) -+{ -+ int i; -+ int result; -+ assert("edward-779", clust != NULL); -+ assert("edward-1059", clust->win == NULL); -+ assert("edward-780", inode != NULL); -+ -+ result = prepare_page_cluster(inode, clust, READ_OP); -+ if (result) -+ return result; -+ -+ assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc)); -+ -+ for (i = 0; i < clust->nr_pages; i++) { -+ struct page *page = clust->pages[i]; -+ lock_page(page); -+ result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK); -+ unlock_page(page); -+ if (result) -+ break; -+ } -+ tfm_cluster_clr_uptodate(&clust->tc); -+ put_page_cluster(clust, inode, READ_OP); -+ return result; -+} -+ -+/* filler for read_cache_pages() */ -+static int ctail_readpages_filler(void * data, struct page * page) -+{ -+ int ret = 0; -+ struct cluster_handle * clust = data; -+ struct inode * inode = clust->file->f_dentry->d_inode; -+ -+ assert("edward-1525", page->mapping == inode->i_mapping); -+ -+ if (PageUptodate(page)) { -+ unlock_page(page); -+ return 0; -+ } -+ if (pbytes(page_index(page), inode) == 0) { -+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); -+ SetPageUptodate(page); -+ unlock_page(page); -+ return 0; -+ } -+ move_cluster_forward(clust, inode, page->index); -+ unlock_page(page); -+ /* -+ * read the whole page cluster -+ */ -+ ret = ctail_read_page_cluster(clust, inode); -+ -+ assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc)); -+ return ret; -+} -+ -+/* -+ * We populate a bit more then upper readahead suggests: -+ * with each nominated page we read the whole page cluster -+ * this page belongs to. 
-+ */ -+int readpages_ctail(struct file *file, struct address_space *mapping, -+ struct list_head *pages) -+{ -+ int ret = 0; -+ hint_t *hint; -+ struct cluster_handle clust; -+ struct inode *inode = mapping->host; -+ -+ assert("edward-1521", inode == file->f_dentry->d_inode); -+ -+ cluster_init_read(&clust, NULL); -+ clust.file = file; -+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get()); -+ if (hint == NULL) { -+ warning("vs-28", "failed to allocate hint"); -+ ret = RETERR(-ENOMEM); -+ goto exit1; -+ } -+ clust.hint = hint; -+ ret = load_file_hint(clust.file, hint); -+ if (ret) { -+ warning("edward-1522", "failed to load hint"); -+ goto exit2; -+ } -+ assert("vs-26", hint->ext_coord.lh == &hint->lh); -+ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode)); -+ if (ret) { -+ warning("edward-1523", "failed to alloc pgset"); -+ goto exit3; -+ } -+ ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust); -+ -+ assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc)); -+ exit3: -+ done_lh(&hint->lh); -+ save_file_hint(file, hint); -+ hint->ext_coord.valid = 0; -+ exit2: -+ kfree(hint); -+ exit1: -+ put_cluster_handle(&clust); -+ return ret; -+} -+ -+/* -+ plugin->u.item.s.file.append_key -+ key of the first item of the next disk cluster -+*/ -+reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key) -+{ -+ assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID); -+ assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord))); -+ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1) -+ << cluster_shift_by_coord(coord)); -+ return key; -+} -+ -+static int insert_unprepped_ctail(struct cluster_handle * clust, -+ struct inode *inode) -+{ -+ int result; -+ char buf[UCTAIL_NR_UNITS]; -+ reiser4_item_data data; -+ reiser4_key key; -+ int shift = (int)UCTAIL_SHIFT; -+ -+ memset(buf, 0, (size_t) UCTAIL_NR_UNITS); -+ result = key_by_inode_cryptcompress(inode, -+ 
clust_to_off(clust->index, inode), -+ &key); -+ if (result) -+ return result; -+ data.user = 0; -+ data.iplug = item_plugin_by_id(CTAIL_ID); -+ data.arg = &shift; -+ data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS; -+ data.data = buf; -+ -+ result = insert_by_coord(&clust->hint->ext_coord.coord, -+ &data, &key, clust->hint->ext_coord.lh, 0); -+ return result; -+} -+ -+static int -+insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f, -+ int cluster_shift) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ reiser4_item_data *data; -+ carry_op *op; -+ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*data)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ data = (reiser4_item_data *) (lowest_level + 3); -+ -+ assert("edward-466", coord->between == AFTER_ITEM -+ || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM -+ || coord->between == EMPTY_NODE -+ || coord->between == BEFORE_UNIT); -+ -+ if (coord->between == AFTER_UNIT) { -+ coord->unit_pos = 0; -+ coord->between = AFTER_ITEM; -+ } -+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node, -+ 0 /* operate directly on coord -> node */); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_carry_pool(pool); -+ return RETERR(op ? 
PTR_ERR(op) : -EIO); -+ } -+ data->user = 0; -+ data->iplug = item_plugin_by_id(CTAIL_ID); -+ data->arg = &cluster_shift; -+ -+ data->length = 0; -+ data->data = NULL; -+ -+ op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT; -+ op->u.insert_flow.insert_point = coord; -+ op->u.insert_flow.flow = f; -+ op->u.insert_flow.data = data; -+ op->u.insert_flow.new_nodes = 0; -+ -+ lowest_level->track_type = CARRY_TRACK_CHANGE; -+ lowest_level->tracked = lh; -+ -+ result = reiser4_carry(lowest_level, NULL); -+ done_carry_pool(pool); -+ -+ return result; -+} -+ -+/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */ -+static int insert_cryptcompress_flow_in_place(coord_t * coord, -+ lock_handle * lh, flow_t * f, -+ int cluster_shift) -+{ -+ int ret; -+ coord_t pos; -+ lock_handle lock; -+ -+ assert("edward-484", -+ coord->between == AT_UNIT || coord->between == AFTER_ITEM); -+ assert("edward-485", item_id_by_coord(coord) == CTAIL_ID); -+ -+ coord_dup(&pos, coord); -+ pos.unit_pos = 0; -+ pos.between = AFTER_ITEM; -+ -+ init_lh(&lock); -+ copy_lh(&lock, lh); -+ -+ ret = insert_cryptcompress_flow(&pos, &lock, f, cluster_shift); -+ done_lh(&lock); -+ assert("edward-1347", znode_is_write_locked(lh->node)); -+ assert("edward-1228", !ret); -+ return ret; -+} -+ -+/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */ -+static int overwrite_ctail(coord_t * coord, flow_t * f) -+{ -+ unsigned count; -+ -+ assert("edward-269", f->user == 0); -+ assert("edward-270", f->data != NULL); -+ assert("edward-271", f->length > 0); -+ assert("edward-272", coord_is_existing_unit(coord)); -+ assert("edward-273", coord->unit_pos == 0); -+ assert("edward-274", znode_is_write_locked(coord->node)); -+ assert("edward-275", reiser4_schedulable()); -+ assert("edward-467", item_id_by_coord(coord) == CTAIL_ID); -+ assert("edward-1243", ctail_ok(coord)); -+ -+ count = nr_units_ctail(coord); -+ -+ if (count > f->length) -+ count = f->length; -+ 
memcpy(first_unit(coord), f->data, count); -+ move_flow_forward(f, count); -+ coord->unit_pos += count; -+ return 0; -+} -+ -+/* Implementation of CRC_CUT_ITEM mode of ctail conversion: -+ cut ctail (part or whole) starting from next unit position */ -+static int cut_ctail(coord_t * coord) -+{ -+ coord_t stop; -+ -+ assert("edward-435", coord->between == AT_UNIT && -+ coord->item_pos < coord_num_items(coord) && -+ coord->unit_pos <= coord_num_units(coord)); -+ -+ if (coord->unit_pos == coord_num_units(coord)) -+ /* nothing to cut */ -+ return 0; -+ coord_dup(&stop, coord); -+ stop.unit_pos = coord_last_unit_pos(coord); -+ -+ return cut_node_content(coord, &stop, NULL, NULL, NULL); -+} -+ -+int ctail_insert_unprepped_cluster(struct cluster_handle * clust, -+ struct inode * inode) -+{ -+ int result; -+ assert("edward-1244", inode != NULL); -+ assert("edward-1245", clust->hint != NULL); -+ assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER); -+ assert("edward-1247", clust->reserved == 1); -+ -+ result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK); -+ if (cbk_errored(result)) -+ return result; -+ assert("edward-1249", result == CBK_COORD_NOTFOUND); -+ assert("edward-1250", znode_is_write_locked(clust->hint->lh.node)); -+ -+ assert("edward-1295", -+ clust->hint->ext_coord.lh->node == -+ clust->hint->ext_coord.coord.node); -+ -+ coord_set_between_clusters(&clust->hint->ext_coord.coord); -+ -+ result = insert_unprepped_ctail(clust, inode); -+ all_grabbed2free(); -+ -+ assert("edward-1251", !result); -+ assert("edward-1252", cryptcompress_inode_ok(inode)); -+ assert("edward-1253", znode_is_write_locked(clust->hint->lh.node)); -+ assert("edward-1254", -+ reiser4_clustered_blocks(reiser4_get_current_sb())); -+ assert("edward-1255", -+ znode_convertible(clust->hint->ext_coord.coord.node)); -+ -+ return result; -+} -+ -+static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode) -+{ -+ int result = 0; -+ struct convert_item_info * info; -+ 
-+ assert("edward-468", pos != NULL); -+ assert("edward-469", pos->sq != NULL); -+ assert("edward-845", item_convert_data(pos) != NULL); -+ -+ info = item_convert_data(pos); -+ assert("edward-679", info->flow.data != NULL); -+ -+ switch (mode) { -+ case CRC_APPEND_ITEM: -+ assert("edward-1229", info->flow.length != 0); -+ assert("edward-1256", -+ cluster_shift_ok(cluster_shift_by_coord(&pos->coord))); -+ result = -+ insert_cryptcompress_flow_in_place(&pos->coord, -+ &pos->lock, -+ &info->flow, -+ info->cluster_shift); -+ break; -+ case CRC_OVERWRITE_ITEM: -+ assert("edward-1230", info->flow.length != 0); -+ overwrite_ctail(&pos->coord, &info->flow); -+ if (info->flow.length != 0) -+ break; -+ case CRC_CUT_ITEM: -+ assert("edward-1231", info->flow.length == 0); -+ result = cut_ctail(&pos->coord); -+ break; -+ default: -+ result = RETERR(-EIO); -+ impossible("edward-244", "bad convert mode"); -+ } -+ return result; -+} -+ -+/* plugin->u.item.f.scan */ -+int scan_ctail(flush_scan * scan) -+{ -+ int result = 0; -+ struct page *page; -+ struct inode *inode; -+ jnode *node = scan->node; -+ -+ assert("edward-227", scan->node != NULL); -+ assert("edward-228", jnode_is_cluster_page(scan->node)); -+ assert("edward-639", znode_is_write_locked(scan->parent_lock.node)); -+ -+ page = jnode_page(node); -+ inode = page->mapping->host; -+ -+ if (!reiser4_scanning_left(scan)) -+ return result; -+ if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY)) -+ znode_make_dirty(scan->parent_lock.node); -+ -+ if (!znode_convertible(scan->parent_lock.node)) { -+ if (JF_ISSET(scan->node, JNODE_DIRTY)) -+ znode_set_convertible(scan->parent_lock.node); -+ else { -+ warning("edward-681", -+ "cluster page is already processed"); -+ return -EAGAIN; -+ } -+ } -+ return result; -+} -+ -+/* If true, this function attaches children */ -+static int should_attach_convert_idata(flush_pos_t * pos) -+{ -+ int result; -+ assert("edward-431", pos != NULL); -+ assert("edward-432", pos->child == NULL); -+ 
assert("edward-619", znode_is_write_locked(pos->coord.node)); -+ assert("edward-470", -+ item_plugin_by_coord(&pos->coord) == -+ item_plugin_by_id(CTAIL_ID)); -+ -+ /* check for leftmost child */ -+ utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child); -+ -+ if (!pos->child) -+ return 0; -+ spin_lock_jnode(pos->child); -+ result = (JF_ISSET(pos->child, JNODE_DIRTY) && -+ pos->child->atom == ZJNODE(pos->coord.node)->atom); -+ spin_unlock_jnode(pos->child); -+ if (!result && pos->child) { -+ /* existing child isn't to attach, clear up this one */ -+ jput(pos->child); -+ pos->child = NULL; -+ } -+ return result; -+} -+ -+/** -+ * Collect all needed information about the object here, -+ * as in-memory inode can be evicted from memory before -+ * disk update completion. -+ */ -+static int init_convert_data_ctail(struct convert_item_info * idata, -+ struct inode *inode) -+{ -+ assert("edward-813", idata != NULL); -+ assert("edward-814", inode != NULL); -+ -+ idata->cluster_shift = inode_cluster_shift(inode); -+ idata->d_cur = DC_FIRST_ITEM; -+ idata->d_next = DC_INVALID_STATE; -+ -+ return 0; -+} -+ -+static int alloc_item_convert_data(struct convert_info * sq) -+{ -+ assert("edward-816", sq != NULL); -+ assert("edward-817", sq->itm == NULL); -+ -+ sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get()); -+ if (sq->itm == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+static void free_item_convert_data(struct convert_info * sq) -+{ -+ assert("edward-818", sq != NULL); -+ assert("edward-819", sq->itm != NULL); -+ assert("edward-820", sq->iplug != NULL); -+ -+ kfree(sq->itm); -+ sq->itm = NULL; -+ return; -+} -+ -+static int alloc_convert_data(flush_pos_t * pos) -+{ -+ assert("edward-821", pos != NULL); -+ assert("edward-822", pos->sq == NULL); -+ -+ pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get()); -+ if (!pos->sq) -+ return RETERR(-ENOMEM); -+ memset(pos->sq, 0, sizeof(*pos->sq)); -+ cluster_init_write(&pos->sq->clust, NULL); -+ return 
0; -+} -+ -+void free_convert_data(flush_pos_t * pos) -+{ -+ struct convert_info *sq; -+ -+ assert("edward-823", pos != NULL); -+ assert("edward-824", pos->sq != NULL); -+ -+ sq = pos->sq; -+ if (sq->itm) -+ free_item_convert_data(sq); -+ put_cluster_handle(&sq->clust); -+ kfree(pos->sq); -+ pos->sq = NULL; -+ return; -+} -+ -+static int init_item_convert_data(flush_pos_t * pos, struct inode *inode) -+{ -+ struct convert_info *sq; -+ -+ assert("edward-825", pos != NULL); -+ assert("edward-826", pos->sq != NULL); -+ assert("edward-827", item_convert_data(pos) != NULL); -+ assert("edward-828", inode != NULL); -+ -+ sq = pos->sq; -+ -+ memset(sq->itm, 0, sizeof(*sq->itm)); -+ -+ /* iplug->init_convert_data() */ -+ return init_convert_data_ctail(sq->itm, inode); -+} -+ -+/* create and attach disk cluster info used by 'convert' phase of the flush -+ squalloc() */ -+static int attach_convert_idata(flush_pos_t * pos, struct inode *inode) -+{ -+ int ret = 0; -+ struct convert_item_info *info; -+ struct cluster_handle *clust; -+ file_plugin *fplug = inode_file_plugin(inode); -+ compression_plugin *cplug = inode_compression_plugin(inode); -+ -+ assert("edward-248", pos != NULL); -+ assert("edward-249", pos->child != NULL); -+ assert("edward-251", inode != NULL); -+ assert("edward-682", cryptcompress_inode_ok(inode)); -+ assert("edward-252", -+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); -+ assert("edward-473", -+ item_plugin_by_coord(&pos->coord) == -+ item_plugin_by_id(CTAIL_ID)); -+ -+ if (!pos->sq) { -+ ret = alloc_convert_data(pos); -+ if (ret) -+ return ret; -+ } -+ clust = &pos->sq->clust; -+ ret = grab_coa(&clust->tc, cplug); -+ if (ret) -+ goto err; -+ ret = set_cluster_by_page(clust, -+ jnode_page(pos->child), -+ MAX_CLUSTER_NRPAGES); -+ if (ret) -+ goto err; -+ -+ assert("edward-829", pos->sq != NULL); -+ assert("edward-250", item_convert_data(pos) == NULL); -+ -+ pos->sq->iplug = item_plugin_by_id(CTAIL_ID); -+ -+ ret = 
alloc_item_convert_data(pos->sq); -+ if (ret) -+ goto err; -+ ret = init_item_convert_data(pos, inode); -+ if (ret) -+ goto err; -+ info = item_convert_data(pos); -+ -+ ret = checkout_logical_cluster(clust, pos->child, inode); -+ if (ret) -+ goto err; -+ -+ reiser4_deflate_cluster(clust, inode); -+ inc_item_convert_count(pos); -+ -+ /* prepare flow for insertion */ -+ fplug->flow_by_inode(inode, -+ (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM), -+ 0 /* kernel space */ , -+ clust->tc.len, -+ clust_to_off(clust->index, inode), -+ WRITE_OP, &info->flow); -+ jput(pos->child); -+ return 0; -+ err: -+ jput(pos->child); -+ free_convert_data(pos); -+ return ret; -+} -+ -+/* clear up disk cluster info */ -+static void detach_convert_idata(struct convert_info * sq) -+{ -+ struct convert_item_info *info; -+ -+ assert("edward-253", sq != NULL); -+ assert("edward-840", sq->itm != NULL); -+ -+ info = sq->itm; -+ assert("edward-1212", info->flow.length == 0); -+ -+ free_item_convert_data(sq); -+ return; -+} -+ -+/* plugin->u.item.f.utmost_child */ -+ -+/* This function sets leftmost child for a first cluster item, -+ if the child exists, and NULL in other cases. -+ NOTE-EDWARD: Do not call this for RIGHT_SIDE */ -+ -+int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child) -+{ -+ reiser4_key key; -+ -+ item_key_by_coord(coord, &key); -+ -+ assert("edward-257", coord != NULL); -+ assert("edward-258", child != NULL); -+ assert("edward-259", side == LEFT_SIDE); -+ assert("edward-260", -+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID)); -+ -+ if (!is_disk_cluster_key(&key, coord)) -+ *child = NULL; -+ else -+ *child = jlookup(current_tree, -+ get_key_objectid(item_key_by_coord -+ (coord, &key)), -+ off_to_pg(get_key_offset(&key))); -+ return 0; -+} -+ -+/* Returns true if @p2 is the next item to @p1 -+ in the _same_ disk cluster. -+ Disk cluster is a set of items. 
If ->clustered() != NULL, -+ with each item the whole disk cluster should be read/modified -+*/ -+ -+/* Go rightward and check for next disk cluster item, set -+ * d_next to DC_CHAINED_ITEM, if the last one exists. -+ * If the current position is last item, go to right neighbor. -+ * Skip empty nodes. Note, that right neighbors may be not in -+ * the slum because of races. If so, make it dirty and -+ * convertible. -+ */ -+static int next_item_dc_stat(flush_pos_t * pos) -+{ -+ int ret = 0; -+ int stop = 0; -+ znode *cur; -+ coord_t coord; -+ lock_handle lh; -+ lock_handle right_lock; -+ -+ assert("edward-1232", !node_is_empty(pos->coord.node)); -+ assert("edward-1014", -+ pos->coord.item_pos < coord_num_items(&pos->coord)); -+ assert("edward-1015", chaining_data_present(pos)); -+ assert("edward-1017", -+ item_convert_data(pos)->d_next == DC_INVALID_STATE); -+ -+ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER; -+ -+ if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER) -+ return ret; -+ if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1) -+ return ret; -+ -+ /* Check next slum item. -+ * Note, that it can not be killed by concurrent truncate, -+ * as the last one will want the lock held by us. 
-+ */ -+ init_lh(&right_lock); -+ cur = pos->coord.node; -+ -+ while (!stop) { -+ init_lh(&lh); -+ ret = reiser4_get_right_neighbor(&lh, -+ cur, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (ret) -+ break; -+ ret = zload(lh.node); -+ if (ret) { -+ done_lh(&lh); -+ break; -+ } -+ coord_init_before_first_item(&coord, lh.node); -+ -+ if (node_is_empty(lh.node)) { -+ znode_make_dirty(lh.node); -+ znode_set_convertible(lh.node); -+ stop = 0; -+ } else if (same_disk_cluster(&pos->coord, &coord)) { -+ -+ item_convert_data(pos)->d_next = DC_CHAINED_ITEM; -+ -+ if (!ZF_ISSET(lh.node, JNODE_DIRTY)) { -+ /* -+ warning("edward-1024", -+ "next slum item mergeable, " -+ "but znode %p isn't dirty\n", -+ lh.node); -+ */ -+ znode_make_dirty(lh.node); -+ } -+ if (!znode_convertible(lh.node)) { -+ /* -+ warning("edward-1272", -+ "next slum item mergeable, " -+ "but znode %p isn't convertible\n", -+ lh.node); -+ */ -+ znode_set_convertible(lh.node); -+ } -+ stop = 1; -+ } else -+ stop = 1; -+ zrelse(lh.node); -+ done_lh(&right_lock); -+ copy_lh(&right_lock, &lh); -+ done_lh(&lh); -+ cur = right_lock.node; -+ } -+ done_lh(&right_lock); -+ -+ if (ret == -E_NO_NEIGHBOR) -+ ret = 0; -+ return ret; -+} -+ -+static int -+assign_convert_mode(struct convert_item_info * idata, -+ cryptcompress_write_mode_t * mode) -+{ -+ int result = 0; -+ -+ assert("edward-1025", idata != NULL); -+ -+ if (idata->flow.length) { -+ /* append or overwrite */ -+ switch (idata->d_cur) { -+ case DC_FIRST_ITEM: -+ case DC_CHAINED_ITEM: -+ *mode = CRC_OVERWRITE_ITEM; -+ break; -+ case DC_AFTER_CLUSTER: -+ *mode = CRC_APPEND_ITEM; -+ break; -+ default: -+ impossible("edward-1018", "wrong current item state"); -+ } -+ } else { -+ /* cut or invalidate */ -+ switch (idata->d_cur) { -+ case DC_FIRST_ITEM: -+ case DC_CHAINED_ITEM: -+ *mode = CRC_CUT_ITEM; -+ break; -+ case DC_AFTER_CLUSTER: -+ result = 1; -+ break; -+ default: -+ impossible("edward-1019", "wrong current item state"); -+ } -+ } -+ return result; 
-+} -+ -+/* plugin->u.item.f.convert */ -+/* write ctail in guessed mode */ -+int convert_ctail(flush_pos_t * pos) -+{ -+ int result; -+ int nr_items; -+ cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM; -+ -+ assert("edward-1020", pos != NULL); -+ assert("edward-1213", coord_num_items(&pos->coord) != 0); -+ assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID); -+ assert("edward-1258", ctail_ok(&pos->coord)); -+ assert("edward-261", pos->coord.node != NULL); -+ -+ nr_items = coord_num_items(&pos->coord); -+ if (!chaining_data_present(pos)) { -+ if (should_attach_convert_idata(pos)) { -+ /* attach convert item info */ -+ struct inode *inode; -+ -+ assert("edward-264", pos->child != NULL); -+ assert("edward-265", jnode_page(pos->child) != NULL); -+ assert("edward-266", -+ jnode_page(pos->child)->mapping != NULL); -+ -+ inode = jnode_page(pos->child)->mapping->host; -+ -+ assert("edward-267", inode != NULL); -+ -+ /* attach item convert info by child and put the last one */ -+ result = attach_convert_idata(pos, inode); -+ pos->child = NULL; -+ if (result == -E_REPEAT) { -+ /* jnode became clean, or there is no dirty -+ pages (nothing to update in disk cluster) */ -+ warning("edward-1021", -+ "convert_ctail: nothing to attach"); -+ return 0; -+ } -+ if (result != 0) -+ return result; -+ } else -+ /* unconvertible */ -+ return 0; -+ } else { -+ /* use old convert info */ -+ -+ struct convert_item_info *idata; -+ -+ idata = item_convert_data(pos); -+ -+ result = assign_convert_mode(idata, &mode); -+ if (result) { -+ /* disk cluster is over, -+ nothing to update anymore */ -+ detach_convert_idata(pos->sq); -+ return 0; -+ } -+ } -+ -+ assert("edward-433", chaining_data_present(pos)); -+ assert("edward-1022", -+ pos->coord.item_pos < coord_num_items(&pos->coord)); -+ -+ /* check if next item is of current disk cluster */ -+ result = next_item_dc_stat(pos); -+ if (result) { -+ detach_convert_idata(pos->sq); -+ return result; -+ } -+ result = 
do_convert_ctail(pos, mode); -+ if (result) { -+ detach_convert_idata(pos->sq); -+ return result; -+ } -+ switch (mode) { -+ case CRC_CUT_ITEM: -+ assert("edward-1214", item_convert_data(pos)->flow.length == 0); -+ assert("edward-1215", -+ coord_num_items(&pos->coord) == nr_items || -+ coord_num_items(&pos->coord) == nr_items - 1); -+ if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM) -+ break; -+ if (coord_num_items(&pos->coord) != nr_items) { -+ /* the item was killed, no more chained items */ -+ detach_convert_idata(pos->sq); -+ if (!node_is_empty(pos->coord.node)) -+ /* make sure the next item will be scanned */ -+ coord_init_before_item(&pos->coord); -+ break; -+ } -+ case CRC_APPEND_ITEM: -+ assert("edward-434", item_convert_data(pos)->flow.length == 0); -+ detach_convert_idata(pos->sq); -+ break; -+ case CRC_OVERWRITE_ITEM: -+ if (coord_is_unprepped_ctail(&pos->coord)) { -+ /* convert unpprepped ctail to prepped one */ -+ assert("edward-1259", -+ cluster_shift_ok(item_convert_data(pos)-> -+ cluster_shift)); -+ put_unaligned((d8)item_convert_data(pos)->cluster_shift, -+ &ctail_formatted_at(&pos->coord)-> -+ cluster_shift); -+ } -+ break; -+ } -+ return result; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.24/fs/reiser4/plugin/item/ctail.h ---- linux-2.6.24.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/ctail.h 2008-01-25 11:39:07.008226236 +0300 -@@ -0,0 +1,102 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Ctail items are fragments (or bodies) of special tipe to provide -+ optimal storage of encrypted and(or) compressed files. 
*/ -+ -+ -+#if !defined( __FS_REISER4_CTAIL_H__ ) -+#define __FS_REISER4_CTAIL_H__ -+ -+/* Disk format of ctail item */ -+typedef struct ctail_item_format { -+ /* packed shift; -+ if its value is different from UCTAIL_SHIFT (see below), then -+ size of disk cluster is calculated as (1 << cluster_shift) */ -+ d8 cluster_shift; -+ /* ctail body */ -+ d8 body[0]; -+} __attribute__ ((packed)) ctail_item_format; -+ -+/* "Unprepped" disk cluster is represented by a single ctail item -+ with the following "magic" attributes: */ -+/* "magic" cluster_shift */ -+#define UCTAIL_SHIFT 0xff -+/* How many units unprepped ctail item has */ -+#define UCTAIL_NR_UNITS 1 -+ -+/* The following is a set of various item states in a disk cluster. -+ Disk cluster is a set of items whose keys belong to the interval -+ [dc_key , dc_key + disk_cluster_size - 1] */ -+typedef enum { -+ DC_INVALID_STATE = 0, -+ DC_FIRST_ITEM = 1, -+ DC_CHAINED_ITEM = 2, -+ DC_AFTER_CLUSTER = 3 -+} dc_item_stat; -+ -+/* ctail-specific extension. 
-+ In particular this describes parameters of disk cluster an item belongs to */ -+struct ctail_coord_extension { -+ int shift; /* this contains cluster_shift extracted from -+ ctail_item_format (above), or UCTAIL_SHIFT -+ (the last one is the "magic" of unprepped disk clusters)*/ -+ int dsize; /* size of a prepped disk cluster */ -+ int ncount; /* count of nodes occupied by a disk cluster */ -+}; -+ -+struct cut_list; -+ -+/* plugin->item.b.* */ -+int can_contain_key_ctail(const coord_t *, const reiser4_key *, -+ const reiser4_item_data *); -+int mergeable_ctail(const coord_t * p1, const coord_t * p2); -+pos_in_node_t nr_units_ctail(const coord_t * coord); -+int estimate_ctail(const coord_t * coord, const reiser4_item_data * data); -+void print_ctail(const char *prefix, coord_t * coord); -+lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *); -+ -+int paste_ctail(coord_t * coord, reiser4_item_data * data, -+ carry_plugin_info * info UNUSED_ARG); -+int init_ctail(coord_t *, coord_t *, reiser4_item_data *); -+int can_shift_ctail(unsigned free_space, coord_t * coord, -+ znode * target, shift_direction pend, unsigned *size, -+ unsigned want); -+void copy_units_ctail(coord_t * target, coord_t * source, unsigned from, -+ unsigned count, shift_direction where_is_free_space, -+ unsigned free_space); -+int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ carry_cut_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ carry_kill_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int ctail_ok(const coord_t * coord); -+int check_ctail(const coord_t * coord, const char **error); -+ -+/* plugin->u.item.s.* */ -+int read_ctail(struct file *, flow_t *, hint_t *); -+int readpage_ctail(void *, struct page *); -+int readpages_ctail(struct file *, struct address_space *, struct list_head *); -+reiser4_key 
*append_key_ctail(const coord_t *, reiser4_key *); -+int create_hook_ctail(const coord_t * coord, void *arg); -+int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t, -+ carry_kill_data *); -+int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *); -+ -+/* plugin->u.item.f */ -+int utmost_child_ctail(const coord_t *, sideof, jnode **); -+int scan_ctail(flush_scan *); -+int convert_ctail(flush_pos_t *); -+size_t inode_scaled_cluster_size(struct inode *); -+ -+#endif /* __FS_REISER4_CTAIL_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/extent.c linux-2.6.24/fs/reiser4/plugin/item/extent.c ---- linux-2.6.24.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/extent.c 2008-01-25 11:39:07.008226236 +0300 -@@ -0,0 +1,197 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../key.h" -+#include "../../super.h" -+#include "../../carry.h" -+#include "../../inode.h" -+#include "../../page_cache.h" -+#include "../../flush.h" -+#include "../object.h" -+ -+/* prepare structure reiser4_item_data. 
It is used to put one extent unit into tree */ -+/* Audited by: green(2002.06.13) */ -+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit, -+ int nr_extents) -+{ -+ data->data = ext_unit; -+ /* data->data is kernel space */ -+ data->user = 0; -+ data->length = sizeof(reiser4_extent) * nr_extents; -+ data->arg = NULL; -+ data->iplug = item_plugin_by_id(EXTENT_POINTER_ID); -+ return data; -+} -+ -+/* how many bytes are addressed by @nr first extents of the extent item */ -+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr) -+{ -+ pos_in_node_t i; -+ reiser4_block_nr blocks; -+ reiser4_extent *ext; -+ -+ ext = item_body_by_coord(coord); -+ assert("vs-263", nr <= nr_units_extent(coord)); -+ -+ blocks = 0; -+ for (i = 0; i < nr; i++, ext++) { -+ blocks += extent_get_width(ext); -+ } -+ -+ return blocks * current_blocksize; -+} -+ -+extent_state state_of_extent(reiser4_extent * ext) -+{ -+ switch ((int)extent_get_start(ext)) { -+ case 0: -+ return HOLE_EXTENT; -+ case 1: -+ return UNALLOCATED_EXTENT; -+ default: -+ break; -+ } -+ return ALLOCATED_EXTENT; -+} -+ -+int extent_is_unallocated(const coord_t * item) -+{ -+ assert("jmacd-5133", item_is_extent(item)); -+ -+ return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT; -+} -+ -+/* set extent's start and width */ -+void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start, -+ reiser4_block_nr width) -+{ -+ extent_set_start(ext, start); -+ extent_set_width(ext, width); -+} -+ -+/** -+ * reiser4_replace_extent - replace extent and paste 1 or 2 after it -+ * @un_extent: coordinate of extent to be overwritten -+ * @lh: need better comment -+ * @key: need better comment -+ * @exts_to_add: data prepared for insertion into tree -+ * @replace: need better comment -+ * @flags: need better comment -+ * @return_insert_position: need better comment -+ * -+ * Overwrites one extent, pastes 1 or 2 more ones after overwritten one. 
If -+ * @return_inserted_position is 1 - @un_extent and @lh are returned set to -+ * first of newly inserted units, if it is 0 - @un_extent and @lh are returned -+ * set to extent which was overwritten. -+ */ -+int reiser4_replace_extent(struct replace_handle *h, -+ int return_inserted_position) -+{ -+ int result; -+ znode *orig_znode; -+ /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */ -+ -+ assert("vs-990", coord_is_existing_unit(h->coord)); -+ assert("vs-1375", znode_is_write_locked(h->coord->node)); -+ assert("vs-1426", extent_get_width(&h->overwrite) != 0); -+ assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0); -+ assert("vs-1427", ergo(h->nr_new_extents == 2, -+ extent_get_width(&h->new_extents[1]) != 0)); -+ -+ /* compose structure for paste */ -+ init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents); -+ -+ coord_dup(&h->coord_after, h->coord); -+ init_lh(&h->lh_after); -+ copy_lh(&h->lh_after, h->lh); -+ reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK); -+ reiser4_tap_monitor(&h->watch); -+ -+ ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord)); -+ orig_znode = h->coord->node; -+ -+#if REISER4_DEBUG -+ /* make sure that key is set properly */ -+ unit_key_by_coord(h->coord, &h->tmp); -+ set_key_offset(&h->tmp, -+ get_key_offset(&h->tmp) + -+ extent_get_width(&h->overwrite) * current_blocksize); -+ assert("vs-1080", keyeq(&h->tmp, &h->paste_key)); -+#endif -+ -+ /* set insert point after unit to be replaced */ -+ h->coord->between = AFTER_UNIT; -+ -+ result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL, -+ &h->paste_key, &h->item, h->flags); -+ if (!result) { -+ /* now we have to replace the unit after which new units were -+ inserted. 
Its position is tracked by @watch */ -+ reiser4_extent *ext; -+ znode *node; -+ -+ node = h->coord_after.node; -+ if (node != orig_znode) { -+ coord_clear_iplug(&h->coord_after); -+ result = zload(node); -+ } -+ -+ if (likely(!result)) { -+ ext = extent_by_coord(&h->coord_after); -+ -+ assert("vs-987", znode_is_loaded(node)); -+ assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext))); -+ -+ /* overwrite extent unit */ -+ memcpy(ext, &h->overwrite, sizeof(reiser4_extent)); -+ znode_make_dirty(node); -+ -+ if (node != orig_znode) -+ zrelse(node); -+ -+ if (return_inserted_position == 0) { -+ /* coord and lh are to be set to overwritten -+ extent */ -+ assert("vs-1662", -+ WITH_DATA(node, !memcmp(&h->overwrite, -+ extent_by_coord( -+ &h->coord_after), -+ sizeof(reiser4_extent)))); -+ -+ *h->coord = h->coord_after; -+ done_lh(h->lh); -+ copy_lh(h->lh, &h->lh_after); -+ } else { -+ /* h->coord and h->lh are to be set to first of -+ inserted units */ -+ assert("vs-1663", -+ WITH_DATA(h->coord->node, -+ !memcmp(&h->new_extents[0], -+ extent_by_coord(h->coord), -+ sizeof(reiser4_extent)))); -+ assert("vs-1664", h->lh->node == h->coord->node); -+ } -+ } -+ } -+ reiser4_tap_done(&h->watch); -+ -+ return result; -+} -+ -+lock_handle *znode_lh(znode *node) -+{ -+ assert("vs-1371", znode_is_write_locked(node)); -+ assert("vs-1372", znode_is_wlocked_once(node)); -+ return list_entry(node->lock.owners.next, lock_handle, owners_link); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.24/fs/reiser4/plugin/item/extent_file_ops.c ---- linux-2.6.24.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/extent_file_ops.c 2008-01-25 11:40:16.698169785 +0300 -@@ -0,0 +1,1450 @@ -+/* COPYRIGHT 2001, 2002, 
2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../inode.h" -+#include "../../page_cache.h" -+#include "../object.h" -+ -+#include -+#include -+ -+static inline reiser4_extent *ext_by_offset(const znode *node, int offset) -+{ -+ reiser4_extent *ext; -+ -+ ext = (reiser4_extent *) (zdata(node) + offset); -+ return ext; -+} -+ -+/** -+ * check_uf_coord - verify coord extension -+ * @uf_coord: -+ * @key: -+ * -+ * Makes sure that all fields of @uf_coord are set properly. If @key is -+ * specified - check whether @uf_coord is set correspondingly. -+ */ -+static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key) -+{ -+#if REISER4_DEBUG -+ const coord_t *coord; -+ const struct extent_coord_extension *ext_coord; -+ reiser4_extent *ext; -+ -+ coord = &uf_coord->coord; -+ ext_coord = &uf_coord->extension.extent; -+ ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset); -+ -+ assert("", -+ WITH_DATA(coord->node, -+ (uf_coord->valid == 1 && -+ coord_is_iplug_set(coord) && -+ item_is_extent(coord) && -+ ext_coord->nr_units == nr_units_extent(coord) && -+ ext == extent_by_coord(coord) && -+ ext_coord->width == extent_get_width(ext) && -+ coord->unit_pos < ext_coord->nr_units && -+ ext_coord->pos_in_unit < ext_coord->width && -+ memcmp(ext, &ext_coord->extent, -+ sizeof(reiser4_extent)) == 0))); -+ if (key) { -+ reiser4_key coord_key; -+ -+ unit_key_by_coord(&uf_coord->coord, &coord_key); -+ set_key_offset(&coord_key, -+ get_key_offset(&coord_key) + -+ (uf_coord->extension.extent. 
-+ pos_in_unit << PAGE_CACHE_SHIFT)); -+ assert("", keyeq(key, &coord_key)); -+ } -+#endif -+} -+ -+static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord) -+{ -+ check_uf_coord(uf_coord, NULL); -+ -+ return ext_by_offset(uf_coord->coord.node, -+ uf_coord->extension.extent.ext_offset); -+} -+ -+#if REISER4_DEBUG -+ -+/** -+ * offset_is_in_unit -+ * -+ * -+ * -+ */ -+/* return 1 if offset @off is inside of extent unit pointed to by @coord. Set -+ pos_in_unit inside of unit correspondingly */ -+static int offset_is_in_unit(const coord_t *coord, loff_t off) -+{ -+ reiser4_key unit_key; -+ __u64 unit_off; -+ reiser4_extent *ext; -+ -+ ext = extent_by_coord(coord); -+ -+ unit_key_extent(coord, &unit_key); -+ unit_off = get_key_offset(&unit_key); -+ if (off < unit_off) -+ return 0; -+ if (off >= (unit_off + (current_blocksize * extent_get_width(ext)))) -+ return 0; -+ return 1; -+} -+ -+static int -+coord_matches_key_extent(const coord_t * coord, const reiser4_key * key) -+{ -+ reiser4_key item_key; -+ -+ assert("vs-771", coord_is_existing_unit(coord)); -+ assert("vs-1258", keylt(key, append_key_extent(coord, &item_key))); -+ assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key))); -+ -+ return offset_is_in_unit(coord, get_key_offset(key)); -+} -+ -+#endif -+ -+/** -+ * can_append - -+ * @key: -+ * @coord: -+ * -+ * Returns 1 if @key is equal to an append key of item @coord is set to -+ */ -+static int can_append(const reiser4_key *key, const coord_t *coord) -+{ -+ reiser4_key append_key; -+ -+ return keyeq(key, append_key_extent(coord, &append_key)); -+} -+ -+/** -+ * append_hole -+ * @coord: -+ * @lh: -+ * @key: -+ * -+ */ -+static int append_hole(coord_t *coord, lock_handle *lh, -+ const reiser4_key *key) -+{ -+ reiser4_key append_key; -+ reiser4_block_nr hole_width; -+ reiser4_extent *ext, new_ext; -+ reiser4_item_data idata; -+ -+ /* last item of file may have to be appended with hole */ -+ assert("vs-708", znode_get_level(coord->node) 
== TWIG_LEVEL); -+ assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID); -+ -+ /* key of first byte which is not addressed by this extent */ -+ append_key_extent(coord, &append_key); -+ -+ assert("", keyle(&append_key, key)); -+ -+ /* -+ * extent item has to be appended with hole. Calculate length of that -+ * hole -+ */ -+ hole_width = ((get_key_offset(key) - get_key_offset(&append_key) + -+ current_blocksize - 1) >> current_blocksize_bits); -+ assert("vs-954", hole_width > 0); -+ -+ /* set coord after last unit */ -+ coord_init_after_item_end(coord); -+ -+ /* get last extent in the item */ -+ ext = extent_by_coord(coord); -+ if (state_of_extent(ext) == HOLE_EXTENT) { -+ /* -+ * last extent of a file is hole extent. Widen that extent by -+ * @hole_width blocks. Note that we do not worry about -+ * overflowing - extent width is 64 bits -+ */ -+ reiser4_set_extent(ext, HOLE_EXTENT_START, -+ extent_get_width(ext) + hole_width); -+ znode_make_dirty(coord->node); -+ return 0; -+ } -+ -+ /* append last item of the file with hole extent unit */ -+ assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT || -+ state_of_extent(ext) == UNALLOCATED_EXTENT)); -+ -+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width); -+ init_new_extent(&idata, &new_ext, 1); -+ return insert_into_item(coord, lh, &append_key, &idata, 0); -+} -+ -+/** -+ * check_jnodes -+ * @twig: longterm locked twig node -+ * @key: -+ * -+ */ -+static void check_jnodes(znode *twig, const reiser4_key *key, int count) -+{ -+#if REISER4_DEBUG -+ coord_t c; -+ reiser4_key node_key, jnode_key; -+ -+ jnode_key = *key; -+ -+ assert("", twig != NULL); -+ assert("", znode_get_level(twig) == TWIG_LEVEL); -+ assert("", znode_is_write_locked(twig)); -+ -+ zload(twig); -+ /* get the smallest key in twig node */ -+ coord_init_first_unit(&c, twig); -+ unit_key_by_coord(&c, &node_key); -+ assert("", keyle(&node_key, &jnode_key)); -+ -+ coord_init_last_unit(&c, twig); -+ unit_key_by_coord(&c, &node_key); -+ 
if (item_plugin_by_coord(&c)->s.file.append_key) -+ item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key); -+ set_key_offset(&jnode_key, -+ get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1); -+ assert("", keylt(&jnode_key, &node_key)); -+ zrelse(twig); -+#endif -+} -+ -+/** -+ * append_last_extent - append last file item -+ * @uf_coord: coord to start insertion from -+ * @jnodes: array of jnodes -+ * @count: number of jnodes in the array -+ * -+ * There is already at least one extent item of file @inode in the tree. Append -+ * the last of them with unallocated extent unit of width @count. Assign -+ * fake block numbers to jnodes corresponding to the inserted extent. -+ */ -+static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key, -+ jnode **jnodes, int count) -+{ -+ int result; -+ reiser4_extent new_ext; -+ reiser4_item_data idata; -+ coord_t *coord; -+ struct extent_coord_extension *ext_coord; -+ reiser4_extent *ext; -+ reiser4_block_nr block; -+ jnode *node; -+ int i; -+ -+ coord = &uf_coord->coord; -+ ext_coord = &uf_coord->extension.extent; -+ ext = ext_by_ext_coord(uf_coord); -+ -+ /* check correctness of position in the item */ -+ assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord)); -+ assert("vs-1311", coord->between == AFTER_UNIT); -+ assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1); -+ -+ if (!can_append(key, coord)) { -+ /* hole extent has to be inserted */ -+ result = append_hole(coord, uf_coord->lh, key); -+ uf_coord->valid = 0; -+ return result; -+ } -+ -+ if (count == 0) -+ return 0; -+ -+ assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE); -+ -+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, -+ count); -+ BUG_ON(result != 0); -+ -+ switch (state_of_extent(ext)) { -+ case UNALLOCATED_EXTENT: -+ /* -+ * last extent unit of the file is unallocated one. 
Increase -+ * its width by @count -+ */ -+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, -+ extent_get_width(ext) + count); -+ znode_make_dirty(coord->node); -+ -+ /* update coord extension */ -+ ext_coord->width += count; -+ ON_DEBUG(extent_set_width -+ (&uf_coord->extension.extent.extent, -+ ext_coord->width)); -+ break; -+ -+ case HOLE_EXTENT: -+ case ALLOCATED_EXTENT: -+ /* -+ * last extent unit of the file is either hole or allocated -+ * one. Append one unallocated extent of width @count -+ */ -+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count); -+ init_new_extent(&idata, &new_ext, 1); -+ result = insert_into_item(coord, uf_coord->lh, key, &idata, 0); -+ uf_coord->valid = 0; -+ if (result) -+ return result; -+ break; -+ -+ default: -+ return RETERR(-EIO); -+ } -+ -+ /* -+ * make sure that we hold long term locked twig node containing all -+ * jnodes we are about to capture -+ */ -+ check_jnodes(uf_coord->lh->node, key, count); -+ -+ /* -+ * assign fake block numbers to all jnodes. 
FIXME: make sure whether -+ * twig node containing inserted extent item is locked -+ */ -+ block = fake_blocknr_unformatted(count); -+ for (i = 0; i < count; i ++, block ++) { -+ node = jnodes[i]; -+ spin_lock_jnode(node); -+ JF_SET(node, JNODE_CREATED); -+ jnode_set_block(node, &block); -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ BUG_ON(result != 0); -+ jnode_make_dirty_locked(node); -+ spin_unlock_jnode(node); -+ } -+ return count; -+} -+ -+/** -+ * insert_first_hole - inser hole extent into tree -+ * @coord: -+ * @lh: -+ * @key: -+ * -+ * -+ */ -+static int insert_first_hole(coord_t *coord, lock_handle *lh, -+ const reiser4_key *key) -+{ -+ reiser4_extent new_ext; -+ reiser4_item_data idata; -+ reiser4_key item_key; -+ reiser4_block_nr hole_width; -+ -+ /* @coord must be set for inserting of new item */ -+ assert("vs-711", coord_is_between_items(coord)); -+ -+ item_key = *key; -+ set_key_offset(&item_key, 0ull); -+ -+ hole_width = ((get_key_offset(key) + current_blocksize - 1) >> -+ current_blocksize_bits); -+ assert("vs-710", hole_width > 0); -+ -+ /* compose body of hole extent and insert item into tree */ -+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width); -+ init_new_extent(&idata, &new_ext, 1); -+ return insert_extent_by_coord(coord, &idata, &item_key, lh); -+} -+ -+ -+/** -+ * insert_first_extent - insert first file item -+ * @inode: inode of file -+ * @uf_coord: coord to start insertion from -+ * @jnodes: array of jnodes -+ * @count: number of jnodes in the array -+ * @inode: -+ * -+ * There are no items of file @inode in the tree yet. Insert unallocated extent -+ * of width @count into tree or hole extent if writing not to the -+ * beginning. Assign fake block numbers to jnodes corresponding to the inserted -+ * unallocated extent. Returns number of jnodes or error code. 
-+ */ -+static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key, -+ jnode **jnodes, int count, -+ struct inode *inode) -+{ -+ int result; -+ int i; -+ reiser4_extent new_ext; -+ reiser4_item_data idata; -+ reiser4_block_nr block; -+ struct unix_file_info *uf_info; -+ jnode *node; -+ -+ /* first extent insertion starts at leaf level */ -+ assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL); -+ assert("vs-711", coord_is_between_items(&uf_coord->coord)); -+ -+ if (get_key_offset(key) != 0) { -+ result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key); -+ uf_coord->valid = 0; -+ uf_info = unix_file_inode_data(inode); -+ -+ /* -+ * first item insertion is only possible when writing to empty -+ * file or performing tail conversion -+ */ -+ assert("", (uf_info->container == UF_CONTAINER_EMPTY || -+ (reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED) && -+ reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)))); -+ /* if file was empty - update its state */ -+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY) -+ uf_info->container = UF_CONTAINER_EXTENTS; -+ return result; -+ } -+ -+ if (count == 0) -+ return 0; -+ -+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count); -+ BUG_ON(result != 0); -+ -+ /* -+ * prepare for tree modification: compose body of item and item data -+ * structure needed for insertion -+ */ -+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count); -+ init_new_extent(&idata, &new_ext, 1); -+ -+ /* insert extent item into the tree */ -+ result = insert_extent_by_coord(&uf_coord->coord, &idata, key, -+ uf_coord->lh); -+ if (result) -+ return result; -+ -+ /* -+ * make sure that we hold long term locked twig node containing all -+ * jnodes we are about to capture -+ */ -+ check_jnodes(uf_coord->lh->node, key, count); -+ /* -+ * assign fake block numbers to all jnodes, capture and mark them dirty -+ */ -+ block = fake_blocknr_unformatted(count); -+ for (i = 0; i < 
count; i ++, block ++) { -+ node = jnodes[i]; -+ spin_lock_jnode(node); -+ JF_SET(node, JNODE_CREATED); -+ jnode_set_block(node, &block); -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ BUG_ON(result != 0); -+ jnode_make_dirty_locked(node); -+ spin_unlock_jnode(node); -+ } -+ -+ /* -+ * invalidate coordinate, research must be performed to continue -+ * because write will continue on twig level -+ */ -+ uf_coord->valid = 0; -+ return count; -+} -+ -+/** -+ * plug_hole - replace hole extent with unallocated and holes -+ * @uf_coord: -+ * @key: -+ * @node: -+ * @h: structure containing coordinate, lock handle, key, etc -+ * -+ * Creates an unallocated extent of width 1 within a hole. In worst case two -+ * additional extents can be created. -+ */ -+static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how) -+{ -+ struct replace_handle rh; -+ reiser4_extent *ext; -+ reiser4_block_nr width, pos_in_unit; -+ coord_t *coord; -+ struct extent_coord_extension *ext_coord; -+ int return_inserted_position; -+ -+ check_uf_coord(uf_coord, key); -+ -+ rh.coord = coord_by_uf_coord(uf_coord); -+ rh.lh = uf_coord->lh; -+ rh.flags = 0; -+ -+ coord = coord_by_uf_coord(uf_coord); -+ ext_coord = ext_coord_by_uf_coord(uf_coord); -+ ext = ext_by_ext_coord(uf_coord); -+ -+ width = ext_coord->width; -+ pos_in_unit = ext_coord->pos_in_unit; -+ -+ *how = 0; -+ if (width == 1) { -+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1); -+ znode_make_dirty(coord->node); -+ /* update uf_coord */ -+ ON_DEBUG(ext_coord->extent = *ext); -+ *how = 1; -+ return 0; -+ } else if (pos_in_unit == 0) { -+ /* we deal with first element of extent */ -+ if (coord->unit_pos) { -+ /* there is an extent to the left */ -+ if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) { -+ /* -+ * left neighboring unit is an unallocated -+ * extent. 
Increase its width and decrease -+ * width of hole -+ */ -+ extent_set_width(ext - 1, -+ extent_get_width(ext - 1) + 1); -+ extent_set_width(ext, width - 1); -+ znode_make_dirty(coord->node); -+ -+ /* update coord extension */ -+ coord->unit_pos--; -+ ext_coord->width = extent_get_width(ext - 1); -+ ext_coord->pos_in_unit = ext_coord->width - 1; -+ ext_coord->ext_offset -= sizeof(reiser4_extent); -+ ON_DEBUG(ext_coord->extent = -+ *extent_by_coord(coord)); -+ *how = 2; -+ return 0; -+ } -+ } -+ /* extent for replace */ -+ reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1); -+ /* extent to be inserted */ -+ reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START, -+ width - 1); -+ rh.nr_new_extents = 1; -+ -+ /* have reiser4_replace_extent to return with @coord and -+ @uf_coord->lh set to unit which was replaced */ -+ return_inserted_position = 0; -+ *how = 3; -+ } else if (pos_in_unit == width - 1) { -+ /* we deal with last element of extent */ -+ if (coord->unit_pos < nr_units_extent(coord) - 1) { -+ /* there is an extent unit to the right */ -+ if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) { -+ /* -+ * right neighboring unit is an unallocated -+ * extent. 
Increase its width and decrease -+ * width of hole -+ */ -+ extent_set_width(ext + 1, -+ extent_get_width(ext + 1) + 1); -+ extent_set_width(ext, width - 1); -+ znode_make_dirty(coord->node); -+ -+ /* update coord extension */ -+ coord->unit_pos++; -+ ext_coord->width = extent_get_width(ext + 1); -+ ext_coord->pos_in_unit = 0; -+ ext_coord->ext_offset += sizeof(reiser4_extent); -+ ON_DEBUG(ext_coord->extent = -+ *extent_by_coord(coord)); -+ *how = 4; -+ return 0; -+ } -+ } -+ /* extent for replace */ -+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1); -+ /* extent to be inserted */ -+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, -+ 1); -+ rh.nr_new_extents = 1; -+ -+ /* have reiser4_replace_extent to return with @coord and -+ @uf_coord->lh set to unit which was inserted */ -+ return_inserted_position = 1; -+ *how = 5; -+ } else { -+ /* extent for replace */ -+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, -+ pos_in_unit); -+ /* extents to be inserted */ -+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, -+ 1); -+ reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START, -+ width - pos_in_unit - 1); -+ rh.nr_new_extents = 2; -+ -+ /* have reiser4_replace_extent to return with @coord and -+ @uf_coord->lh set to first of units which were inserted */ -+ return_inserted_position = 1; -+ *how = 6; -+ } -+ unit_key_by_coord(coord, &rh.paste_key); -+ set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) + -+ extent_get_width(&rh.overwrite) * current_blocksize); -+ -+ uf_coord->valid = 0; -+ return reiser4_replace_extent(&rh, return_inserted_position); -+} -+ -+/** -+ * overwrite_one_block - -+ * @uf_coord: -+ * @key: -+ * @node: -+ * -+ * If @node corresponds to hole extent - create unallocated extent for it and -+ * assign fake block number. 
If @node corresponds to allocated extent - assign -+ * block number of jnode -+ */ -+static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key, -+ jnode *node, int *hole_plugged) -+{ -+ int result; -+ struct extent_coord_extension *ext_coord; -+ reiser4_extent *ext; -+ reiser4_block_nr block; -+ int how; -+ -+ assert("vs-1312", uf_coord->coord.between == AT_UNIT); -+ -+ result = 0; -+ ext_coord = ext_coord_by_uf_coord(uf_coord); -+ ext = ext_by_ext_coord(uf_coord); -+ assert("", state_of_extent(ext) != UNALLOCATED_EXTENT); -+ -+ switch (state_of_extent(ext)) { -+ case ALLOCATED_EXTENT: -+ block = extent_get_start(ext) + ext_coord->pos_in_unit; -+ break; -+ -+ case HOLE_EXTENT: -+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1); -+ BUG_ON(result != 0); -+ result = plug_hole(uf_coord, key, &how); -+ if (result) -+ return result; -+ block = fake_blocknr_unformatted(1); -+ if (hole_plugged) -+ *hole_plugged = 1; -+ JF_SET(node, JNODE_CREATED); -+ break; -+ -+ default: -+ return RETERR(-EIO); -+ } -+ -+ jnode_set_block(node, &block); -+ return 0; -+} -+ -+/** -+ * move_coord - move coordinate forward -+ * @uf_coord: -+ * -+ * Move coordinate one data block pointer forward. Return 1 if coord is set to -+ * the last one already or is invalid. -+ */ -+static int move_coord(uf_coord_t *uf_coord) -+{ -+ struct extent_coord_extension *ext_coord; -+ -+ if (uf_coord->valid == 0) -+ return 1; -+ ext_coord = &uf_coord->extension.extent; -+ ext_coord->pos_in_unit ++; -+ if (ext_coord->pos_in_unit < ext_coord->width) -+ /* coordinate moved within the unit */ -+ return 0; -+ -+ /* end of unit is reached. 
Try to move to next unit */ -+ ext_coord->pos_in_unit = 0; -+ uf_coord->coord.unit_pos ++; -+ if (uf_coord->coord.unit_pos < ext_coord->nr_units) { -+ /* coordinate moved to next unit */ -+ ext_coord->ext_offset += sizeof(reiser4_extent); -+ ext_coord->width = -+ extent_get_width(ext_by_offset -+ (uf_coord->coord.node, -+ ext_coord->ext_offset)); -+ ON_DEBUG(ext_coord->extent = -+ *ext_by_offset(uf_coord->coord.node, -+ ext_coord->ext_offset)); -+ return 0; -+ } -+ /* end of item is reached */ -+ uf_coord->valid = 0; -+ return 1; -+} -+ -+/** -+ * overwrite_extent - -+ * @inode: -+ * -+ * Returns number of handled jnodes. -+ */ -+static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key, -+ jnode **jnodes, int count, int *plugged_hole) -+{ -+ int result; -+ reiser4_key k; -+ int i; -+ jnode *node; -+ -+ k = *key; -+ for (i = 0; i < count; i ++) { -+ node = jnodes[i]; -+ if (*jnode_get_block(node) == 0) { -+ result = overwrite_one_block(uf_coord, &k, node, plugged_hole); -+ if (result) -+ return result; -+ } -+ /* -+ * make sure that we hold long term locked twig node containing -+ * all jnodes we are about to capture -+ */ -+ check_jnodes(uf_coord->lh->node, &k, 1); -+ /* -+ * assign fake block numbers to all jnodes, capture and mark -+ * them dirty -+ */ -+ spin_lock_jnode(node); -+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ BUG_ON(result != 0); -+ jnode_make_dirty_locked(node); -+ spin_unlock_jnode(node); -+ -+ if (uf_coord->valid == 0) -+ return i + 1; -+ -+ check_uf_coord(uf_coord, &k); -+ -+ if (move_coord(uf_coord)) { -+ /* -+ * failed to move to the next node pointer. Either end -+ * of file or end of twig node is reached. In the later -+ * case we might go to the right neighbor. 
-+ */ -+ uf_coord->valid = 0; -+ return i + 1; -+ } -+ set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE); -+ } -+ -+ return count; -+} -+ -+/** -+ * reiser4_update_extent -+ * @file: -+ * @jnodes: -+ * @count: -+ * @off: -+ * -+ */ -+int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos, -+ int *plugged_hole) -+{ -+ int result; -+ znode *loaded; -+ uf_coord_t uf_coord; -+ coord_t *coord; -+ lock_handle lh; -+ reiser4_key key; -+ -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ -+ key_by_inode_and_offset_common(inode, pos, &key); -+ -+ init_uf_coord(&uf_coord, &lh); -+ coord = &uf_coord.coord; -+ result = find_file_item_nohint(coord, &lh, &key, -+ ZNODE_WRITE_LOCK, inode); -+ if (IS_CBKERR(result)) { -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ return result; -+ } -+ -+ result = zload(coord->node); -+ BUG_ON(result != 0); -+ loaded = coord->node; -+ -+ if (coord->between == AFTER_UNIT) { -+ /* -+ * append existing extent item with unallocated extent of width -+ * nr_jnodes -+ */ -+ init_coord_extension_extent(&uf_coord, -+ get_key_offset(&key)); -+ result = append_last_extent(&uf_coord, &key, -+ &node, 1); -+ } else if (coord->between == AT_UNIT) { -+ /* -+ * overwrite -+ * not optimal yet. Will be optimized if new write will show -+ * performance win. -+ */ -+ init_coord_extension_extent(&uf_coord, -+ get_key_offset(&key)); -+ result = overwrite_extent(&uf_coord, &key, -+ &node, 1, plugged_hole); -+ } else { -+ /* -+ * there are no items of this file in the tree yet. Create -+ * first item of the file inserting one unallocated extent of -+ * width nr_jnodes -+ */ -+ result = insert_first_extent(&uf_coord, &key, &node, 1, inode); -+ } -+ assert("", result == 1 || result < 0); -+ zrelse(loaded); -+ done_lh(&lh); -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ return (result == 1) ? 
0 : result; -+} -+ -+/** -+ * update_extents -+ * @file: -+ * @jnodes: -+ * @count: -+ * @off: -+ * -+ */ -+static int update_extents(struct file *file, struct inode *inode, -+ jnode **jnodes, int count, loff_t pos) -+{ -+ struct hint hint; -+ reiser4_key key; -+ int result; -+ znode *loaded; -+ -+ result = load_file_hint(file, &hint); -+ BUG_ON(result != 0); -+ -+ if (count != 0) -+ /* -+ * count == 0 is special case: expanding truncate -+ */ -+ pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT; -+ key_by_inode_and_offset_common(inode, pos, &key); -+ -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ -+ do { -+ result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode); -+ if (IS_CBKERR(result)) { -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ return result; -+ } -+ -+ result = zload(hint.ext_coord.coord.node); -+ BUG_ON(result != 0); -+ loaded = hint.ext_coord.coord.node; -+ -+ if (hint.ext_coord.coord.between == AFTER_UNIT) { -+ /* -+ * append existing extent item with unallocated extent -+ * of width nr_jnodes -+ */ -+ if (hint.ext_coord.valid == 0) -+ /* NOTE: get statistics on this */ -+ init_coord_extension_extent(&hint.ext_coord, -+ get_key_offset(&key)); -+ result = append_last_extent(&hint.ext_coord, &key, -+ jnodes, count); -+ } else if (hint.ext_coord.coord.between == AT_UNIT) { -+ /* -+ * overwrite -+ * not optimal yet. Will be optimized if new write will -+ * show performance win. -+ */ -+ if (hint.ext_coord.valid == 0) -+ /* NOTE: get statistics on this */ -+ init_coord_extension_extent(&hint.ext_coord, -+ get_key_offset(&key)); -+ result = overwrite_extent(&hint.ext_coord, &key, -+ jnodes, count, NULL); -+ } else { -+ /* -+ * there are no items of this file in the tree -+ * yet. 
Create first item of the file inserting one -+ * unallocated extent of * width nr_jnodes -+ */ -+ result = insert_first_extent(&hint.ext_coord, &key, -+ jnodes, count, inode); -+ } -+ zrelse(loaded); -+ if (result < 0) { -+ done_lh(hint.ext_coord.lh); -+ break; -+ } -+ -+ jnodes += result; -+ count -= result; -+ set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE); -+ -+ /* seal and unlock znode */ -+ if (hint.ext_coord.valid) -+ reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK); -+ else -+ reiser4_unset_hint(&hint); -+ -+ } while (count > 0); -+ -+ save_file_hint(file, &hint); -+ assert("", reiser4_lock_counters()->d_refs == 0); -+ return result; -+} -+ -+/** -+ * write_extent_reserve_space - reserve space for extent write operation -+ * @inode: -+ * -+ * Estimates and reserves space which may be required for writing -+ * WRITE_GRANULARITY pages of file. -+ */ -+static int write_extent_reserve_space(struct inode *inode) -+{ -+ __u64 count; -+ reiser4_tree *tree; -+ -+ /* -+ * to write WRITE_GRANULARITY pages to a file by extents we have to -+ * reserve disk space for: -+ -+ * 1. find_file_item may have to insert empty node to the tree (empty -+ * leaf node between two extent items). This requires 1 block and -+ * number of blocks which are necessary to perform insertion of an -+ * internal item into twig level. -+ -+ * 2. for each of written pages there might be needed 1 block and -+ * number of blocks which might be necessary to perform insertion of or -+ * paste to an extent item. -+ -+ * 3. stat data update -+ */ -+ tree = reiser4_tree_by_inode(inode); -+ count = estimate_one_insert_item(tree) + -+ WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) + -+ estimate_one_insert_item(tree); -+ grab_space_enable(); -+ return reiser4_grab_space(count, 0 /* flags */); -+} -+ -+/* -+ * filemap_copy_from_user no longer exists in generic code, because it -+ * is deadlocky (copying from user while holding the page lock is bad). 
-+ * As a temporary fix for reiser4, just define it here. -+ */ -+static inline size_t -+filemap_copy_from_user(struct page *page, unsigned long offset, -+ const char __user *buf, unsigned bytes) -+{ -+ char *kaddr; -+ int left; -+ -+ kaddr = kmap_atomic(page, KM_USER0); -+ left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); -+ kunmap_atomic(kaddr, KM_USER0); -+ -+ if (left != 0) { -+ /* Do it the slow way */ -+ kaddr = kmap(page); -+ left = __copy_from_user_nocache(kaddr + offset, buf, bytes); -+ kunmap(page); -+ } -+ return bytes - left; -+} -+ -+/** -+ * reiser4_write_extent - write method of extent item plugin -+ * @file: file to write to -+ * @buf: address of user-space buffer -+ * @count: number of bytes to write -+ * @pos: position in file to write to -+ * -+ */ -+ssize_t reiser4_write_extent(struct file *file, struct inode * inode, -+ const char __user *buf, size_t count, loff_t *pos) -+{ -+ int have_to_update_extent; -+ int nr_pages, nr_dirty; -+ struct page *page; -+ jnode *jnodes[WRITE_GRANULARITY + 1]; -+ unsigned long index; -+ unsigned long end; -+ int i; -+ int to_page, page_off; -+ size_t left, written; -+ int result = 0; -+ -+ if (write_extent_reserve_space(inode)) -+ return RETERR(-ENOSPC); -+ -+ if (count == 0) { -+ /* truncate case */ -+ update_extents(file, inode, jnodes, 0, *pos); -+ return 0; -+ } -+ -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ -+ left = count; -+ index = *pos >> PAGE_CACHE_SHIFT; -+ /* calculate number of pages which are to be written */ -+ end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT); -+ nr_pages = end - index + 1; -+ nr_dirty = 0; -+ assert("", nr_pages <= WRITE_GRANULARITY + 1); -+ -+ /* get pages and jnodes */ -+ for (i = 0; i < nr_pages; i ++) { -+ page = find_or_create_page(inode->i_mapping, index + i, -+ reiser4_ctx_gfp_mask_get()); -+ if (page == NULL) { -+ nr_pages = i; -+ result = RETERR(-ENOMEM); -+ goto out; -+ } -+ -+ jnodes[i] = jnode_of_page(page); -+ if (IS_ERR(jnodes[i])) { 
-+ unlock_page(page); -+ page_cache_release(page); -+ nr_pages = i; -+ result = RETERR(-ENOMEM); -+ goto out; -+ } -+ /* prevent jnode and page from disconnecting */ -+ JF_SET(jnodes[i], JNODE_WRITE_PREPARED); -+ unlock_page(page); -+ } -+ -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ -+ have_to_update_extent = 0; -+ -+ page_off = (*pos & (PAGE_CACHE_SIZE - 1)); -+ for (i = 0; i < nr_pages; i ++) { -+ to_page = PAGE_CACHE_SIZE - page_off; -+ if (to_page > left) -+ to_page = left; -+ page = jnode_page(jnodes[i]); -+ if (page_offset(page) < inode->i_size && -+ !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) { -+ /* -+ * the above is not optimal for partial write to last -+ * page of file when file size is not at boundary of -+ * page -+ */ -+ lock_page(page); -+ if (!PageUptodate(page)) { -+ result = readpage_unix_file(NULL, page); -+ BUG_ON(result != 0); -+ /* wait for read completion */ -+ lock_page(page); -+ BUG_ON(!PageUptodate(page)); -+ } else -+ result = 0; -+ unlock_page(page); -+ } -+ -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ fault_in_pages_readable(buf, to_page); -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ -+ lock_page(page); -+ if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE) -+ simple_prepare_write(file, page, page_off, -+ page_off + to_page); -+ -+ written = filemap_copy_from_user(page, page_off, buf, to_page); -+ if (unlikely(written != to_page)) { -+ unlock_page(page); -+ result = RETERR(-EFAULT); -+ break; -+ } -+ -+ flush_dcache_page(page); -+ reiser4_set_page_dirty_internal(page); -+ unlock_page(page); -+ nr_dirty++; -+ -+ mark_page_accessed(page); -+ SetPageUptodate(page); -+ -+ if (jnodes[i]->blocknr == 0) -+ have_to_update_extent ++; -+ -+ page_off = 0; -+ buf += to_page; -+ left -= to_page; -+ BUG_ON(get_current_context()->trans->atom != NULL); -+ } -+ -+ if (have_to_update_extent) { -+ update_extents(file, inode, jnodes, nr_dirty, *pos); -+ } else { -+ for (i = 0; i < nr_dirty; i ++) { -+ int 
ret; -+ spin_lock_jnode(jnodes[i]); -+ ret = reiser4_try_capture(jnodes[i], -+ ZNODE_WRITE_LOCK, 0); -+ BUG_ON(ret != 0); -+ jnode_make_dirty_locked(jnodes[i]); -+ spin_unlock_jnode(jnodes[i]); -+ } -+ } -+out: -+ for (i = 0; i < nr_pages; i ++) { -+ page_cache_release(jnode_page(jnodes[i])); -+ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED); -+ jput(jnodes[i]); -+ } -+ -+ /* the only errors handled so far is ENOMEM and -+ EFAULT on copy_from_user */ -+ -+ return (count - left) ? (count - left) : result; -+} -+ -+int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos, -+ struct page *page) -+{ -+ jnode *j; -+ struct address_space *mapping; -+ unsigned long index; -+ oid_t oid; -+ reiser4_block_nr block; -+ -+ mapping = page->mapping; -+ oid = get_inode_oid(mapping->host); -+ index = page->index; -+ -+ switch (state_of_extent(ext)) { -+ case HOLE_EXTENT: -+ /* -+ * it is possible to have hole page with jnode, if page was -+ * eflushed previously. -+ */ -+ j = jfind(mapping, index); -+ if (j == NULL) { -+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); -+ SetPageUptodate(page); -+ unlock_page(page); -+ return 0; -+ } -+ spin_lock_jnode(j); -+ if (!jnode_page(j)) { -+ jnode_attach_page(j, page); -+ } else { -+ BUG_ON(jnode_page(j) != page); -+ assert("vs-1504", jnode_page(j) == page); -+ } -+ block = *jnode_get_io_block(j); -+ spin_unlock_jnode(j); -+ if (block == 0) { -+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); -+ SetPageUptodate(page); -+ unlock_page(page); -+ jput(j); -+ return 0; -+ } -+ break; -+ -+ case ALLOCATED_EXTENT: -+ j = jnode_of_page(page); -+ if (IS_ERR(j)) -+ return PTR_ERR(j); -+ if (*jnode_get_block(j) == 0) { -+ reiser4_block_nr blocknr; -+ -+ blocknr = extent_get_start(ext) + pos; -+ jnode_set_block(j, &blocknr); -+ } else -+ assert("vs-1403", -+ j->blocknr == extent_get_start(ext) + pos); -+ break; -+ -+ case UNALLOCATED_EXTENT: -+ j = jfind(mapping, index); -+ assert("nikita-2688", j); -+ assert("vs-1426", jnode_page(j) 
== NULL); -+ -+ spin_lock_jnode(j); -+ jnode_attach_page(j, page); -+ spin_unlock_jnode(j); -+ break; -+ -+ default: -+ warning("vs-957", "wrong extent\n"); -+ return RETERR(-EIO); -+ } -+ -+ BUG_ON(j == 0); -+ reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get()); -+ jput(j); -+ return 0; -+} -+ -+/* Implements plugin->u.item.s.file.read operation for extent items. */ -+int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint) -+{ -+ int result; -+ struct page *page; -+ unsigned long cur_page, next_page; -+ unsigned long page_off, count; -+ struct address_space *mapping; -+ loff_t file_off; -+ uf_coord_t *uf_coord; -+ coord_t *coord; -+ struct extent_coord_extension *ext_coord; -+ unsigned long nr_pages; -+ char *kaddr; -+ -+ assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE); -+ assert("vs-572", flow->user == 1); -+ assert("vs-1351", flow->length > 0); -+ -+ uf_coord = &hint->ext_coord; -+ -+ check_uf_coord(uf_coord, NULL); -+ assert("vs-33", uf_coord->lh == &hint->lh); -+ -+ coord = &uf_coord->coord; -+ assert("vs-1119", znode_is_rlocked(coord->node)); -+ assert("vs-1120", znode_is_loaded(coord->node)); -+ assert("vs-1256", coord_matches_key_extent(coord, &flow->key)); -+ -+ mapping = file->f_dentry->d_inode->i_mapping; -+ ext_coord = &uf_coord->extension.extent; -+ -+ /* offset in a file to start read from */ -+ file_off = get_key_offset(&flow->key); -+ /* offset within the page to start read from */ -+ page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1)); -+ /* bytes which can be read from the page which contains file_off */ -+ count = PAGE_CACHE_SIZE - page_off; -+ -+ /* index of page containing offset read is to start from */ -+ cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT); -+ next_page = cur_page; -+ /* number of pages flow spans over */ -+ nr_pages = -+ ((file_off + flow->length + PAGE_CACHE_SIZE - -+ 1) >> PAGE_CACHE_SHIFT) - cur_page; -+ -+ /* we start having twig node read locked. 
However, we do not want to -+ keep that lock all the time readahead works. So, set a sel and -+ release twig node. */ -+ reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK); -+ /* &hint->lh is done-ed */ -+ -+ do { -+ reiser4_txn_restart_current(); -+ page = read_mapping_page(mapping, cur_page, file); -+ if (IS_ERR(page)) -+ return PTR_ERR(page); -+ lock_page(page); -+ if (!PageUptodate(page)) { -+ unlock_page(page); -+ page_cache_release(page); -+ warning("jmacd-97178", "extent_read: page is not up to date"); -+ return RETERR(-EIO); -+ } -+ mark_page_accessed(page); -+ unlock_page(page); -+ -+ /* If users can be writing to this page using arbitrary virtual -+ addresses, take care about potential aliasing before reading -+ the page on the kernel side. -+ */ -+ if (mapping_writably_mapped(mapping)) -+ flush_dcache_page(page); -+ -+ assert("nikita-3034", reiser4_schedulable()); -+ -+ /* number of bytes which are to be read from the page */ -+ if (count > flow->length) -+ count = flow->length; -+ -+ result = fault_in_pages_writeable(flow->data, count); -+ if (result) { -+ page_cache_release(page); -+ return RETERR(-EFAULT); -+ } -+ -+ kaddr = kmap_atomic(page, KM_USER0); -+ result = __copy_to_user_inatomic(flow->data, -+ kaddr + page_off, count); -+ kunmap_atomic(kaddr, KM_USER0); -+ if (result != 0) { -+ kaddr = kmap(page); -+ result = __copy_to_user(flow->data, kaddr + page_off, count); -+ kunmap(page); -+ if (unlikely(result)) -+ return RETERR(-EFAULT); -+ } -+ -+ page_cache_release(page); -+ -+ /* increase key (flow->key), update user area pointer (flow->data) */ -+ move_flow_forward(flow, count); -+ -+ page_off = 0; -+ cur_page ++; -+ count = PAGE_CACHE_SIZE; -+ nr_pages--; -+ } while (flow->length); -+ -+ return 0; -+} -+ -+/* -+ plugin->s.file.readpage -+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage -+ or -+ filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_extent -+ -+ At the 
beginning: coord->node is read locked, zloaded, page is -+ locked, coord is set to existing unit inside of extent item (it is not necessary that coord matches to page->index) -+*/ -+int reiser4_readpage_extent(void *vp, struct page *page) -+{ -+ uf_coord_t *uf_coord = vp; -+ ON_DEBUG(coord_t * coord = &uf_coord->coord); -+ ON_DEBUG(reiser4_key key); -+ -+ assert("vs-1040", PageLocked(page)); -+ assert("vs-1050", !PageUptodate(page)); -+ assert("vs-1039", page->mapping && page->mapping->host); -+ -+ assert("vs-1044", znode_is_loaded(coord->node)); -+ assert("vs-758", item_is_extent(coord)); -+ assert("vs-1046", coord_is_existing_unit(coord)); -+ assert("vs-1045", znode_is_rlocked(coord->node)); -+ assert("vs-1047", -+ page->mapping->host->i_ino == -+ get_key_objectid(item_key_by_coord(coord, &key))); -+ check_uf_coord(uf_coord, NULL); -+ -+ return reiser4_do_readpage_extent( -+ ext_by_ext_coord(uf_coord), -+ uf_coord->extension.extent.pos_in_unit, page); -+} -+ -+/** -+ * get_block_address_extent -+ * @coord: -+ * @block: -+ * @result: -+ * -+ * -+ */ -+int get_block_address_extent(const coord_t *coord, sector_t block, -+ sector_t *result) -+{ -+ reiser4_extent *ext; -+ -+ if (!coord_is_existing_unit(coord)) -+ return RETERR(-EINVAL); -+ -+ ext = extent_by_coord(coord); -+ -+ if (state_of_extent(ext) != ALLOCATED_EXTENT) -+ /* FIXME: bad things may happen if it is unallocated extent */ -+ *result = 0; -+ else { -+ reiser4_key key; -+ -+ unit_key_by_coord(coord, &key); -+ assert("vs-1645", -+ block >= get_key_offset(&key) >> current_blocksize_bits); -+ assert("vs-1646", -+ block < -+ (get_key_offset(&key) >> current_blocksize_bits) + -+ extent_get_width(ext)); -+ *result = -+ extent_get_start(ext) + (block - -+ (get_key_offset(&key) >> -+ current_blocksize_bits)); -+ } -+ return 0; -+} -+ -+/* -+ plugin->u.item.s.file.append_key -+ key of first byte which is the next to last byte by addressed by this extent -+*/ -+reiser4_key *append_key_extent(const coord_t * coord, 
reiser4_key * key) -+{ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, -+ get_key_offset(key) + reiser4_extent_size(coord, -+ nr_units_extent -+ (coord))); -+ -+ assert("vs-610", get_key_offset(key) -+ && (get_key_offset(key) & (current_blocksize - 1)) == 0); -+ return key; -+} -+ -+/* plugin->u.item.s.file.init_coord_extension */ -+void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped) -+{ -+ coord_t *coord; -+ struct extent_coord_extension *ext_coord; -+ reiser4_key key; -+ loff_t offset; -+ -+ assert("vs-1295", uf_coord->valid == 0); -+ -+ coord = &uf_coord->coord; -+ assert("vs-1288", coord_is_iplug_set(coord)); -+ assert("vs-1327", znode_is_loaded(coord->node)); -+ -+ if (coord->between != AFTER_UNIT && coord->between != AT_UNIT) -+ return; -+ -+ ext_coord = &uf_coord->extension.extent; -+ ext_coord->nr_units = nr_units_extent(coord); -+ ext_coord->ext_offset = -+ (char *)extent_by_coord(coord) - zdata(coord->node); -+ ext_coord->width = extent_get_width(extent_by_coord(coord)); -+ ON_DEBUG(ext_coord->extent = *extent_by_coord(coord)); -+ uf_coord->valid = 1; -+ -+ /* pos_in_unit is the only uninitialized field in extended coord */ -+ if (coord->between == AFTER_UNIT) { -+ assert("vs-1330", -+ coord->unit_pos == nr_units_extent(coord) - 1); -+ -+ ext_coord->pos_in_unit = ext_coord->width - 1; -+ } else { -+ /* AT_UNIT */ -+ unit_key_by_coord(coord, &key); -+ offset = get_key_offset(&key); -+ -+ assert("vs-1328", offset <= lookuped); -+ assert("vs-1329", -+ lookuped < -+ offset + ext_coord->width * current_blocksize); -+ ext_coord->pos_in_unit = -+ ((lookuped - offset) >> current_blocksize_bits); -+ } -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.24/fs/reiser4/plugin/item/extent_flush_ops.c ---- 
linux-2.6.24.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/extent_flush_ops.c 2008-01-25 11:39:07.016228297 +0300 -@@ -0,0 +1,1028 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../tree.h" -+#include "../../jnode.h" -+#include "../../super.h" -+#include "../../flush.h" -+#include "../../carry.h" -+#include "../object.h" -+ -+#include -+ -+static reiser4_block_nr extent_unit_start(const coord_t * item); -+ -+/* Return either first or last extent (depending on @side) of the item -+ @coord is set to. Set @pos_in_unit either to first or to last block -+ of extent. */ -+static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side, -+ reiser4_block_nr * pos_in_unit) -+{ -+ reiser4_extent *ext; -+ -+ if (side == LEFT_SIDE) { -+ /* get first extent of item */ -+ ext = extent_item(coord); -+ *pos_in_unit = 0; -+ } else { -+ /* get last extent of item and last position within it */ -+ assert("vs-363", side == RIGHT_SIDE); -+ ext = extent_item(coord) + coord_last_unit_pos(coord); -+ *pos_in_unit = extent_get_width(ext) - 1; -+ } -+ -+ return ext; -+} -+ -+/* item_plugin->f.utmost_child */ -+/* Return the child. Coord is set to extent item. 
Find jnode corresponding -+ either to first or to last unformatted node pointed by the item */ -+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp) -+{ -+ reiser4_extent *ext; -+ reiser4_block_nr pos_in_unit; -+ -+ ext = extent_utmost_ext(coord, side, &pos_in_unit); -+ -+ switch (state_of_extent(ext)) { -+ case HOLE_EXTENT: -+ *childp = NULL; -+ return 0; -+ case ALLOCATED_EXTENT: -+ case UNALLOCATED_EXTENT: -+ break; -+ default: -+ /* this should never happen */ -+ assert("vs-1417", 0); -+ } -+ -+ { -+ reiser4_key key; -+ reiser4_tree *tree; -+ unsigned long index; -+ -+ if (side == LEFT_SIDE) { -+ /* get key of first byte addressed by the extent */ -+ item_key_by_coord(coord, &key); -+ } else { -+ /* get key of byte which next after last byte addressed by the extent */ -+ append_key_extent(coord, &key); -+ } -+ -+ assert("vs-544", -+ (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul); -+ /* index of first or last (depending on @side) page addressed -+ by the extent */ -+ index = -+ (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT); -+ if (side == RIGHT_SIDE) -+ index--; -+ -+ tree = coord->node->zjnode.tree; -+ *childp = jlookup(tree, get_key_objectid(&key), index); -+ } -+ -+ return 0; -+} -+ -+/* item_plugin->f.utmost_child_real_block */ -+/* Return the child's block, if allocated. */ -+int -+utmost_child_real_block_extent(const coord_t * coord, sideof side, -+ reiser4_block_nr * block) -+{ -+ reiser4_extent *ext; -+ -+ ext = extent_by_coord(coord); -+ -+ switch (state_of_extent(ext)) { -+ case ALLOCATED_EXTENT: -+ *block = extent_get_start(ext); -+ if (side == RIGHT_SIDE) -+ *block += extent_get_width(ext) - 1; -+ break; -+ case HOLE_EXTENT: -+ case UNALLOCATED_EXTENT: -+ *block = 0; -+ break; -+ default: -+ /* this should never happen */ -+ assert("vs-1418", 0); -+ } -+ -+ return 0; -+} -+ -+/* item_plugin->f.scan */ -+/* Performs leftward scanning starting from an unformatted node and its parent coordinate. 
-+ This scan continues, advancing the parent coordinate, until either it encounters a -+ formatted child or it finishes scanning this node. -+ -+ If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm -+ not sure this is last property (same atom) is enforced, but it should be the case since -+ one atom must write the parent and the others must read the parent, thus fusing?). In -+ any case, the code below asserts this case for unallocated extents. Unallocated -+ extents are thus optimized because we can skip to the endpoint when scanning. -+ -+ It returns control to reiser4_scan_extent, handles these terminating conditions, -+ e.g., by loading the next twig. -+*/ -+int reiser4_scan_extent(flush_scan * scan) -+{ -+ coord_t coord; -+ jnode *neighbor; -+ unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist; -+ reiser4_block_nr unit_start; -+ __u64 oid; -+ reiser4_key key; -+ int ret = 0, allocated, incr; -+ reiser4_tree *tree; -+ -+ if (!JF_ISSET(scan->node, JNODE_DIRTY)) { -+ scan->stop = 1; -+ return 0; /* Race with truncate, this node is already -+ * truncated. */ -+ } -+ -+ coord_dup(&coord, &scan->parent_coord); -+ -+ assert("jmacd-1404", !reiser4_scan_finished(scan)); -+ assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL); -+ assert("jmacd-1406", jnode_is_unformatted(scan->node)); -+ -+ /* The scan_index variable corresponds to the current page index of the -+ unformatted block scan position. 
*/ -+ scan_index = index_jnode(scan->node); -+ -+ assert("jmacd-7889", item_is_extent(&coord)); -+ -+ repeat: -+ /* objectid of file */ -+ oid = get_key_objectid(item_key_by_coord(&coord, &key)); -+ -+ allocated = !extent_is_unallocated(&coord); -+ /* Get the values of this extent unit: */ -+ unit_index = extent_unit_index(&coord); -+ unit_width = extent_unit_width(&coord); -+ unit_start = extent_unit_start(&coord); -+ -+ assert("jmacd-7187", unit_width > 0); -+ assert("jmacd-7188", scan_index >= unit_index); -+ assert("jmacd-7189", scan_index <= unit_index + unit_width - 1); -+ -+ /* Depending on the scan direction, we set different maximum values for scan_index -+ (scan_max) and the number of nodes that would be passed if the scan goes the -+ entire way (scan_dist). Incr is an integer reflecting the incremental -+ direction of scan_index. */ -+ if (reiser4_scanning_left(scan)) { -+ scan_max = unit_index; -+ scan_dist = scan_index - unit_index; -+ incr = -1; -+ } else { -+ scan_max = unit_index + unit_width - 1; -+ scan_dist = scan_max - unit_index; -+ incr = +1; -+ } -+ -+ tree = coord.node->zjnode.tree; -+ -+ /* If the extent is allocated we have to check each of its blocks. If the extent -+ is unallocated we can skip to the scan_max. */ -+ if (allocated) { -+ do { -+ neighbor = jlookup(tree, oid, scan_index); -+ if (neighbor == NULL) -+ goto stop_same_parent; -+ -+ if (scan->node != neighbor -+ && !reiser4_scan_goto(scan, neighbor)) { -+ /* @neighbor was jput() by reiser4_scan_goto */ -+ goto stop_same_parent; -+ } -+ -+ ret = scan_set_current(scan, neighbor, 1, &coord); -+ if (ret != 0) { -+ goto exit; -+ } -+ -+ /* reference to @neighbor is stored in @scan, no need -+ to jput(). */ -+ scan_index += incr; -+ -+ } while (incr + scan_max != scan_index); -+ -+ } else { -+ /* Optimized case for unallocated extents, skip to the end. 
*/ -+ neighbor = jlookup(tree, oid, scan_max /*index */ ); -+ if (neighbor == NULL) { -+ /* Race with truncate */ -+ scan->stop = 1; -+ ret = 0; -+ goto exit; -+ } -+ -+ assert("zam-1043", -+ reiser4_blocknr_is_fake(jnode_get_block(neighbor))); -+ -+ ret = scan_set_current(scan, neighbor, scan_dist, &coord); -+ if (ret != 0) { -+ goto exit; -+ } -+ } -+ -+ if (coord_sideof_unit(&coord, scan->direction) == 0 -+ && item_is_extent(&coord)) { -+ /* Continue as long as there are more extent units. */ -+ -+ scan_index = -+ extent_unit_index(&coord) + -+ (reiser4_scanning_left(scan) ? -+ extent_unit_width(&coord) - 1 : 0); -+ goto repeat; -+ } -+ -+ if (0) { -+ stop_same_parent: -+ -+ /* If we are scanning left and we stop in the middle of an allocated -+ extent, we know the preceder immediately.. */ -+ /* middle of extent is (scan_index - unit_index) != 0. */ -+ if (reiser4_scanning_left(scan) && -+ (scan_index - unit_index) != 0) { -+ /* FIXME(B): Someone should step-through and verify that this preceder -+ calculation is indeed correct. */ -+ /* @unit_start is starting block (number) of extent -+ unit. Flush stopped at the @scan_index block from -+ the beginning of the file, which is (scan_index - -+ unit_index) block within extent. -+ */ -+ if (unit_start) { -+ /* skip preceder update when we are at hole */ -+ scan->preceder_blk = -+ unit_start + scan_index - unit_index; -+ check_preceder(scan->preceder_blk); -+ } -+ } -+ -+ /* In this case, we leave coord set to the parent of scan->node. */ -+ scan->stop = 1; -+ -+ } else { -+ /* In this case, we are still scanning, coord is set to the next item which is -+ either off-the-end of the node or not an extent. 
*/ -+ assert("jmacd-8912", scan->stop == 0); -+ assert("jmacd-7812", -+ (coord_is_after_sideof_unit(&coord, scan->direction) -+ || !item_is_extent(&coord))); -+ } -+ -+ ret = 0; -+ exit: -+ return ret; -+} -+ -+/* ask block allocator for some blocks */ -+static void extent_allocate_blocks(reiser4_blocknr_hint *preceder, -+ reiser4_block_nr wanted_count, -+ reiser4_block_nr *first_allocated, -+ reiser4_block_nr *allocated, -+ block_stage_t block_stage) -+{ -+ *allocated = wanted_count; -+ preceder->max_dist = 0; /* scan whole disk, if needed */ -+ -+ /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */ -+ preceder->block_stage = block_stage; -+ -+ /* FIXME: we do not handle errors here now */ -+ check_me("vs-420", -+ reiser4_alloc_blocks(preceder, first_allocated, allocated, -+ BA_PERMANENT) == 0); -+ /* update flush_pos's preceder to last allocated block number */ -+ preceder->blk = *first_allocated + *allocated - 1; -+} -+ -+/* when on flush time unallocated extent is to be replaced with allocated one it may happen that one unallocated extent -+ will have to be replaced with set of allocated extents. In this case insert_into_item will be called which may have -+ to add new nodes into tree. Space for that is taken from inviolable reserve (5%). 
*/ -+static reiser4_block_nr reserve_replace(void) -+{ -+ reiser4_block_nr grabbed, needed; -+ -+ grabbed = get_current_context()->grabbed_blocks; -+ needed = estimate_one_insert_into_item(current_tree); -+ check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED)); -+ return grabbed; -+} -+ -+static void free_replace_reserved(reiser4_block_nr grabbed) -+{ -+ reiser4_context *ctx; -+ -+ ctx = get_current_context(); -+ grabbed2free(ctx, get_super_private(ctx->super), -+ ctx->grabbed_blocks - grabbed); -+} -+ -+/* Block offset of first block addressed by unit */ -+__u64 extent_unit_index(const coord_t * item) -+{ -+ reiser4_key key; -+ -+ assert("vs-648", coord_is_existing_unit(item)); -+ unit_key_by_coord(item, &key); -+ return get_key_offset(&key) >> current_blocksize_bits; -+} -+ -+/* AUDIT shouldn't return value be of reiser4_block_nr type? -+ Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */ -+__u64 extent_unit_width(const coord_t * item) -+{ -+ assert("vs-649", coord_is_existing_unit(item)); -+ return width_by_coord(item); -+} -+ -+/* Starting block location of this unit */ -+static reiser4_block_nr extent_unit_start(const coord_t * item) -+{ -+ return extent_get_start(extent_by_coord(item)); -+} -+ -+/** -+ * split_allocated_extent - -+ * @coord: -+ * @pos_in_unit: -+ * -+ * replace allocated extent with two allocated extents -+ */ -+static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit) -+{ -+ int result; -+ struct replace_handle *h; -+ reiser4_extent *ext; -+ reiser4_block_nr grabbed; -+ -+ ext = extent_by_coord(coord); -+ assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT); -+ assert("vs-1411", extent_get_width(ext) > pos_in_unit); -+ -+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get()); -+ if (h == NULL) -+ return RETERR(-ENOMEM); -+ h->coord = coord; -+ h->lh = znode_lh(coord->node); -+ h->pkey = &h->key; -+ unit_key_by_coord(coord, h->pkey); -+ set_key_offset(h->pkey, -+ 
(get_key_offset(h->pkey) + -+ pos_in_unit * current_blocksize)); -+ reiser4_set_extent(&h->overwrite, extent_get_start(ext), -+ pos_in_unit); -+ reiser4_set_extent(&h->new_extents[0], -+ extent_get_start(ext) + pos_in_unit, -+ extent_get_width(ext) - pos_in_unit); -+ h->nr_new_extents = 1; -+ h->flags = COPI_DONT_SHIFT_LEFT; -+ h->paste_key = h->key; -+ -+ /* reserve space for extent unit paste, @grabbed is reserved before */ -+ grabbed = reserve_replace(); -+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten -+ extent */); -+ /* restore reserved */ -+ free_replace_reserved(grabbed); -+ kfree(h); -+ return result; -+} -+ -+/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is -+ one). Return 1 if it succeeded, 0 - otherwise */ -+static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext, -+ reiser4_extent *replace) -+{ -+ assert("vs-1415", extent_by_coord(coord) == ext); -+ -+ if (coord->unit_pos == 0 -+ || state_of_extent(ext - 1) != ALLOCATED_EXTENT) -+ /* @ext either does not exist or is not allocated extent */ -+ return 0; -+ if (extent_get_start(ext - 1) + extent_get_width(ext - 1) != -+ extent_get_start(replace)) -+ return 0; -+ -+ /* we can glue, widen previous unit */ -+ extent_set_width(ext - 1, -+ extent_get_width(ext - 1) + extent_get_width(replace)); -+ -+ if (extent_get_width(ext) != extent_get_width(replace)) { -+ /* make current extent narrower */ -+ if (state_of_extent(ext) == ALLOCATED_EXTENT) -+ extent_set_start(ext, -+ extent_get_start(ext) + -+ extent_get_width(replace)); -+ extent_set_width(ext, -+ extent_get_width(ext) - -+ extent_get_width(replace)); -+ } else { -+ /* current extent completely glued with its left neighbor, remove it */ -+ coord_t from, to; -+ -+ coord_dup(&from, coord); -+ from.unit_pos = nr_units_extent(coord) - 1; -+ coord_dup(&to, &from); -+ -+ /* currently cut from extent can cut either from the beginning or from the end. 
Move place which got -+ freed after unit removal to end of item */ -+ memmove(ext, ext + 1, -+ (from.unit_pos - -+ coord->unit_pos) * sizeof(reiser4_extent)); -+ /* wipe part of item which is going to be cut, so that node_check will not be confused */ -+ cut_node_content(&from, &to, NULL, NULL, NULL); -+ } -+ znode_make_dirty(coord->node); -+ /* move coord back */ -+ coord->unit_pos--; -+ return 1; -+} -+ -+/** -+ * conv_extent - replace extent with 2 ones -+ * @coord: coordinate of extent to be replaced -+ * @replace: extent to overwrite the one @coord is set to -+ * -+ * Overwrites extent @coord is set to and paste one extent unit after -+ * overwritten one if @replace is shorter than initial extent -+ */ -+static int conv_extent(coord_t *coord, reiser4_extent *replace) -+{ -+ int result; -+ struct replace_handle *h; -+ reiser4_extent *ext; -+ reiser4_block_nr start, width, new_width; -+ reiser4_block_nr grabbed; -+ extent_state state; -+ -+ ext = extent_by_coord(coord); -+ state = state_of_extent(ext); -+ start = extent_get_start(ext); -+ width = extent_get_width(ext); -+ new_width = extent_get_width(replace); -+ -+ assert("vs-1458", (state == UNALLOCATED_EXTENT || -+ state == ALLOCATED_EXTENT)); -+ assert("vs-1459", width >= new_width); -+ -+ if (try_to_merge_with_left(coord, ext, replace)) { -+ /* merged @replace with left neighbor. 
Current unit is either -+ removed or narrowed */ -+ return 0; -+ } -+ -+ if (width == new_width) { -+ /* replace current extent with @replace */ -+ *ext = *replace; -+ znode_make_dirty(coord->node); -+ return 0; -+ } -+ -+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get()); -+ if (h == NULL) -+ return RETERR(-ENOMEM); -+ h->coord = coord; -+ h->lh = znode_lh(coord->node); -+ h->pkey = &h->key; -+ unit_key_by_coord(coord, h->pkey); -+ set_key_offset(h->pkey, -+ (get_key_offset(h->pkey) + new_width * current_blocksize)); -+ h->overwrite = *replace; -+ -+ /* replace @ext with @replace and padding extent */ -+ reiser4_set_extent(&h->new_extents[0], -+ (state == ALLOCATED_EXTENT) ? -+ (start + new_width) : -+ UNALLOCATED_EXTENT_START, -+ width - new_width); -+ h->nr_new_extents = 1; -+ h->flags = COPI_DONT_SHIFT_LEFT; -+ h->paste_key = h->key; -+ -+ /* reserve space for extent unit paste, @grabbed is reserved before */ -+ grabbed = reserve_replace(); -+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten -+ extent */); -+ -+ /* restore reserved */ -+ free_replace_reserved(grabbed); -+ kfree(h); -+ return result; -+} -+ -+/** -+ * assign_real_blocknrs -+ * @flush_pos: -+ * @oid: objectid of file jnodes to assign block number to belongs to -+ * @index: first jnode on the range -+ * @count: number of jnodes to assign block numbers to -+ * @first: start of allocated block range -+ * -+ * Assigns block numbers to each of @count jnodes. Index of first jnode is -+ * @index. Jnodes get lookuped with jlookup. 
-+ */ -+static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid, -+ unsigned long index, reiser4_block_nr count, -+ reiser4_block_nr first) -+{ -+ unsigned long i; -+ reiser4_tree *tree; -+ txn_atom *atom; -+ int nr; -+ -+ atom = atom_locked_by_fq(flush_pos->fq); -+ assert("vs-1468", atom); -+ BUG_ON(atom == NULL); -+ -+ nr = 0; -+ tree = current_tree; -+ for (i = 0; i < count; ++i, ++index) { -+ jnode *node; -+ -+ node = jlookup(tree, oid, index); -+ assert("", node != NULL); -+ BUG_ON(node == NULL); -+ -+ spin_lock_jnode(node); -+ assert("", !jnode_is_flushprepped(node)); -+ assert("vs-1475", node->atom == atom); -+ assert("vs-1476", atomic_read(&node->x_count) > 0); -+ -+ JF_CLR(node, JNODE_FLUSH_RESERVED); -+ jnode_set_block(node, &first); -+ unformatted_make_reloc(node, flush_pos->fq); -+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node), -+ FQ_LIST, 0)); -+ spin_unlock_jnode(node); -+ first++; -+ -+ atomic_dec(&node->x_count); -+ nr ++; -+ } -+ -+ spin_unlock_atom(atom); -+ return; -+} -+ -+/** -+ * make_node_ovrwr - assign node to overwrite set -+ * @jnodes: overwrite set list head -+ * @node: jnode to belong to overwrite set -+ * -+ * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes -+ * which is an accumulator for nodes before they get to overwrite set list of -+ * atom. 
-+ */ -+static void make_node_ovrwr(struct list_head *jnodes, jnode *node) -+{ -+ spin_lock_jnode(node); -+ -+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC)); -+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR)); -+ -+ JF_SET(node, JNODE_OVRWR); -+ list_move_tail(&node->capture_link, jnodes); -+ ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0)); -+ -+ spin_unlock_jnode(node); -+} -+ -+/** -+ * mark_jnodes_overwrite - put bunch of jnodes to overwrite set -+ * @flush_pos: flush position -+ * @oid: objectid of file jnodes belong to -+ * @index: starting index -+ * @width: extent width -+ * -+ * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's -+ * overwrite set. Starting from the one with index @index. If end of slum is -+ * detected (node is not found or flushprepped) - stop iterating and set flush -+ * position's state to POS_INVALID. -+ */ -+static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid, -+ unsigned long index, reiser4_block_nr width) -+{ -+ unsigned long i; -+ reiser4_tree *tree; -+ jnode *node; -+ txn_atom *atom; -+ LIST_HEAD(jnodes); -+ -+ tree = current_tree; -+ -+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos)); -+ assert("vs-1478", atom); -+ -+ for (i = flush_pos->pos_in_unit; i < width; i++, index++) { -+ node = jlookup(tree, oid, index); -+ if (!node) { -+ flush_pos->state = POS_INVALID; -+ break; -+ } -+ if (jnode_check_flushprepped(node)) { -+ flush_pos->state = POS_INVALID; -+ atomic_dec(&node->x_count); -+ break; -+ } -+ if (node->atom != atom) { -+ flush_pos->state = POS_INVALID; -+ atomic_dec(&node->x_count); -+ break; -+ } -+ make_node_ovrwr(&jnodes, node); -+ atomic_dec(&node->x_count); -+ } -+ -+ list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev); -+ spin_unlock_atom(atom); -+} -+ -+/** -+ * allocated_extent_slum_size -+ * @flush_pos: -+ * @oid: -+ * @index: -+ * @count: -+ * -+ * -+ */ -+static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid, -+ unsigned 
long index, unsigned long count) -+{ -+ unsigned long i; -+ reiser4_tree *tree; -+ txn_atom *atom; -+ int nr; -+ -+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos)); -+ assert("vs-1468", atom); -+ -+ nr = 0; -+ tree = current_tree; -+ for (i = 0; i < count; ++i, ++index) { -+ jnode *node; -+ -+ node = jlookup(tree, oid, index); -+ if (!node) -+ break; -+ -+ if (jnode_check_flushprepped(node)) { -+ atomic_dec(&node->x_count); -+ break; -+ } -+ -+ if (node->atom != atom) { -+ /* -+ * this is possible on overwrite: extent_write may -+ * capture several unformatted nodes without capturing -+ * any formatted nodes. -+ */ -+ atomic_dec(&node->x_count); -+ break; -+ } -+ -+ assert("vs-1476", atomic_read(&node->x_count) > 1); -+ atomic_dec(&node->x_count); -+ nr ++; -+ } -+ -+ spin_unlock_atom(atom); -+ return nr; -+} -+ -+/** -+ * alloc_extent -+ * @flush_pos: -+ * -+ * -+ * this is called by handle_pos_on_twig to proceed extent unit flush_pos->coord -+ * is set to. It is to prepare for flushing sequence of not flushprepped nodes -+ * (slum). It supposes that slum starts at flush_pos->pos_in_unit position -+ * within the extent. 
Slum gets to relocate set if flush_pos->leaf_relocate is -+ * set to 1 and to overwrite set otherwise -+ */ -+int reiser4_alloc_extent(flush_pos_t *flush_pos) -+{ -+ coord_t *coord; -+ reiser4_extent *ext; -+ reiser4_extent replace_ext; -+ oid_t oid; -+ reiser4_block_nr protected; -+ reiser4_block_nr start; -+ __u64 index; -+ __u64 width; -+ extent_state state; -+ int result; -+ reiser4_block_nr first_allocated; -+ __u64 allocated; -+ reiser4_key key; -+ block_stage_t block_stage; -+ -+ assert("vs-1468", flush_pos->state == POS_ON_EPOINT); -+ assert("vs-1469", coord_is_existing_unit(&flush_pos->coord) -+ && item_is_extent(&flush_pos->coord)); -+ -+ coord = &flush_pos->coord; -+ -+ ext = extent_by_coord(coord); -+ state = state_of_extent(ext); -+ if (state == HOLE_EXTENT) { -+ flush_pos->state = POS_INVALID; -+ return 0; -+ } -+ -+ item_key_by_coord(coord, &key); -+ oid = get_key_objectid(&key); -+ index = extent_unit_index(coord) + flush_pos->pos_in_unit; -+ start = extent_get_start(ext); -+ width = extent_get_width(ext); -+ -+ assert("vs-1457", width > flush_pos->pos_in_unit); -+ -+ if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) { -+ /* relocate */ -+ if (flush_pos->pos_in_unit) { -+ /* split extent unit into two */ -+ result = -+ split_allocated_extent(coord, -+ flush_pos->pos_in_unit); -+ flush_pos->pos_in_unit = 0; -+ return result; -+ } -+ -+ /* limit number of nodes to allocate */ -+ if (flush_pos->nr_to_write < width) -+ width = flush_pos->nr_to_write; -+ -+ if (state == ALLOCATED_EXTENT) { -+ /* -+ * all protected nodes are not flushprepped, therefore -+ * they are counted as flush_reserved -+ */ -+ block_stage = BLOCK_FLUSH_RESERVED; -+ protected = allocated_extent_slum_size(flush_pos, oid, -+ index, width); -+ if (protected == 0) { -+ flush_pos->state = POS_INVALID; -+ flush_pos->pos_in_unit = 0; -+ return 0; -+ } -+ } else { -+ block_stage = BLOCK_UNALLOCATED; -+ protected = width; -+ } -+ -+ /* -+ * look at previous unit if possible. 
If it is allocated, make -+ * preceder more precise -+ */ -+ if (coord->unit_pos && -+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT)) -+ reiser4_pos_hint(flush_pos)->blk = -+ extent_get_start(ext - 1) + -+ extent_get_width(ext - 1); -+ -+ /* allocate new block numbers for protected nodes */ -+ extent_allocate_blocks(reiser4_pos_hint(flush_pos), -+ protected, -+ &first_allocated, &allocated, -+ block_stage); -+ -+ if (state == ALLOCATED_EXTENT) -+ /* -+ * on relocating - free nodes which are going to be -+ * relocated -+ */ -+ reiser4_dealloc_blocks(&start, &allocated, -+ BLOCK_ALLOCATED, BA_DEFER); -+ -+ /* assign new block numbers to protected nodes */ -+ assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated); -+ -+ /* prepare extent which will replace current one */ -+ reiser4_set_extent(&replace_ext, first_allocated, allocated); -+ -+ /* adjust extent item */ -+ result = conv_extent(coord, &replace_ext); -+ if (result != 0 && result != -ENOMEM) { -+ warning("vs-1461", -+ "Failed to allocate extent. Should not happen\n"); -+ return result; -+ } -+ -+ /* -+ * break flush: we prepared for flushing as many blocks as we -+ * were asked for -+ */ -+ if (flush_pos->nr_to_write == allocated) -+ flush_pos->state = POS_INVALID; -+ } else { -+ /* overwrite */ -+ mark_jnodes_overwrite(flush_pos, oid, index, width); -+ } -+ flush_pos->pos_in_unit = 0; -+ return 0; -+} -+ -+/* if @key is glueable to the item @coord is set to */ -+static int must_insert(const coord_t *coord, const reiser4_key *key) -+{ -+ reiser4_key last; -+ -+ if (item_id_by_coord(coord) == EXTENT_POINTER_ID -+ && keyeq(append_key_extent(coord, &last), key)) -+ return 0; -+ return 1; -+} -+ -+/* copy extent @copy to the end of @node. 
It may have to either insert new item after the last one, or append last item, -+ or modify last unit of last item to have greater width */ -+static int put_unit_to_end(znode *node, const reiser4_key *key, -+ reiser4_extent *copy_ext) -+{ -+ int result; -+ coord_t coord; -+ cop_insert_flag flags; -+ reiser4_extent *last_ext; -+ reiser4_item_data data; -+ -+ /* set coord after last unit in an item */ -+ coord_init_last_unit(&coord, node); -+ coord.between = AFTER_UNIT; -+ -+ flags = -+ COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE; -+ if (must_insert(&coord, key)) { -+ result = -+ insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1), -+ key, NULL /*lh */ , flags); -+ -+ } else { -+ /* try to glue with last unit */ -+ last_ext = extent_by_coord(&coord); -+ if (state_of_extent(last_ext) && -+ extent_get_start(last_ext) + extent_get_width(last_ext) == -+ extent_get_start(copy_ext)) { -+ /* widen last unit of node */ -+ extent_set_width(last_ext, -+ extent_get_width(last_ext) + -+ extent_get_width(copy_ext)); -+ znode_make_dirty(node); -+ return 0; -+ } -+ -+ /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */ -+ result = -+ insert_into_item(&coord, NULL /*lh */ , key, -+ init_new_extent(&data, copy_ext, 1), -+ flags); -+ } -+ -+ assert("vs-438", result == 0 || result == -E_NODE_FULL); -+ return result; -+} -+ -+/* @coord is set to extent unit */ -+squeeze_result squalloc_extent(znode *left, const coord_t *coord, -+ flush_pos_t *flush_pos, -+ reiser4_key *stop_key) -+{ -+ reiser4_extent *ext; -+ __u64 index; -+ __u64 width; -+ reiser4_block_nr start; -+ extent_state state; -+ oid_t oid; -+ reiser4_block_nr first_allocated; -+ __u64 allocated; -+ __u64 protected; -+ reiser4_extent copy_extent; -+ reiser4_key key; -+ int result; -+ block_stage_t block_stage; -+ -+ assert("vs-1457", flush_pos->pos_in_unit == 0); -+ assert("vs-1467", coord_is_leftmost_unit(coord)); -+ assert("vs-1467", item_is_extent(coord)); 
-+ -+ ext = extent_by_coord(coord); -+ index = extent_unit_index(coord); -+ start = extent_get_start(ext); -+ width = extent_get_width(ext); -+ state = state_of_extent(ext); -+ unit_key_by_coord(coord, &key); -+ oid = get_key_objectid(&key); -+ -+ if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) || -+ (state == UNALLOCATED_EXTENT)) { -+ /* relocate */ -+ if (state == ALLOCATED_EXTENT) { -+ /* all protected nodes are not flushprepped, therefore -+ * they are counted as flush_reserved */ -+ block_stage = BLOCK_FLUSH_RESERVED; -+ protected = allocated_extent_slum_size(flush_pos, oid, -+ index, width); -+ if (protected == 0) { -+ flush_pos->state = POS_INVALID; -+ flush_pos->pos_in_unit = 0; -+ return 0; -+ } -+ } else { -+ block_stage = BLOCK_UNALLOCATED; -+ protected = width; -+ } -+ -+ /* -+ * look at previous unit if possible. If it is allocated, make -+ * preceder more precise -+ */ -+ if (coord->unit_pos && -+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT)) -+ reiser4_pos_hint(flush_pos)->blk = -+ extent_get_start(ext - 1) + -+ extent_get_width(ext - 1); -+ -+ /* allocate new block numbers for protected nodes */ -+ extent_allocate_blocks(reiser4_pos_hint(flush_pos), -+ protected, -+ &first_allocated, &allocated, -+ block_stage); -+ -+ /* prepare extent which will be copied to left */ -+ reiser4_set_extent(©_extent, first_allocated, allocated); -+ -+ result = put_unit_to_end(left, &key, ©_extent); -+ if (result == -E_NODE_FULL) { -+ int target_block_stage; -+ -+ /* free blocks which were just allocated */ -+ target_block_stage = -+ (state == -+ ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED : -+ BLOCK_UNALLOCATED; -+ reiser4_dealloc_blocks(&first_allocated, &allocated, -+ target_block_stage, -+ BA_PERMANENT); -+ -+ /* rewind the preceder. 
*/ -+ flush_pos->preceder.blk = first_allocated; -+ check_preceder(flush_pos->preceder.blk); -+ -+ return SQUEEZE_TARGET_FULL; -+ } -+ -+ if (state == ALLOCATED_EXTENT) { -+ /* free nodes which were relocated */ -+ reiser4_dealloc_blocks(&start, &allocated, -+ BLOCK_ALLOCATED, BA_DEFER); -+ } -+ -+ /* assign new block numbers to protected nodes */ -+ assign_real_blocknrs(flush_pos, oid, index, allocated, -+ first_allocated); -+ -+ set_key_offset(&key, -+ get_key_offset(&key) + -+ (allocated << current_blocksize_bits)); -+ } else { -+ /* -+ * overwrite: try to copy unit as it is to left neighbor and -+ * make all first not flushprepped nodes overwrite nodes -+ */ -+ reiser4_set_extent(©_extent, start, width); -+ result = put_unit_to_end(left, &key, ©_extent); -+ if (result == -E_NODE_FULL) -+ return SQUEEZE_TARGET_FULL; -+ -+ if (state != HOLE_EXTENT) -+ mark_jnodes_overwrite(flush_pos, oid, index, width); -+ set_key_offset(&key, -+ get_key_offset(&key) + -+ (width << current_blocksize_bits)); -+ } -+ *stop_key = key; -+ return SQUEEZE_CONTINUE; -+} -+ -+int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key) -+{ -+ return key_by_inode_and_offset_common(inode, off, key); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/extent.h linux-2.6.24/fs/reiser4/plugin/item/extent.h ---- linux-2.6.24.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/extent.h 2008-01-25 11:40:16.698169785 +0300 -@@ -0,0 +1,231 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#ifndef __REISER4_EXTENT_H__ -+#define __REISER4_EXTENT_H__ -+ -+/* on disk extent */ -+typedef struct { -+ reiser4_dblock_nr start; -+ reiser4_dblock_nr width; -+} reiser4_extent; -+ -+struct extent_stat { -+ 
int unallocated_units; -+ int unallocated_blocks; -+ int allocated_units; -+ int allocated_blocks; -+ int hole_units; -+ int hole_blocks; -+}; -+ -+/* extents in an extent item can be either holes, or unallocated or allocated -+ extents */ -+typedef enum { -+ HOLE_EXTENT, -+ UNALLOCATED_EXTENT, -+ ALLOCATED_EXTENT -+} extent_state; -+ -+#define HOLE_EXTENT_START 0 -+#define UNALLOCATED_EXTENT_START 1 -+#define UNALLOCATED_EXTENT_START2 2 -+ -+struct extent_coord_extension { -+ reiser4_block_nr pos_in_unit; -+ reiser4_block_nr width; /* width of current unit */ -+ pos_in_node_t nr_units; /* number of units */ -+ int ext_offset; /* offset from the beginning of zdata() */ -+ unsigned long expected_page; -+#if REISER4_DEBUG -+ reiser4_extent extent; -+#endif -+}; -+ -+/* macros to set/get fields of on-disk extent */ -+static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext) -+{ -+ return le64_to_cpu(ext->start); -+} -+ -+static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext) -+{ -+ return le64_to_cpu(ext->width); -+} -+ -+extern __u64 reiser4_current_block_count(void); -+ -+static inline void -+extent_set_start(reiser4_extent * ext, reiser4_block_nr start) -+{ -+ cassert(sizeof(ext->start) == 8); -+ assert("nikita-2510", -+ ergo(start > 1, start < reiser4_current_block_count())); -+ put_unaligned(cpu_to_le64(start), &ext->start); -+} -+ -+static inline void -+extent_set_width(reiser4_extent * ext, reiser4_block_nr width) -+{ -+ cassert(sizeof(ext->width) == 8); -+ assert("", width > 0); -+ put_unaligned(cpu_to_le64(width), &ext->width); -+ assert("nikita-2511", -+ ergo(extent_get_start(ext) > 1, -+ extent_get_start(ext) + width <= -+ reiser4_current_block_count())); -+} -+ -+#define extent_item(coord) \ -+({ \ -+ assert("nikita-3143", item_is_extent(coord)); \ -+ ((reiser4_extent *)item_body_by_coord (coord)); \ -+}) -+ -+#define extent_by_coord(coord) \ -+({ \ -+ assert("nikita-3144", item_is_extent(coord)); \ -+ (extent_item 
(coord) + (coord)->unit_pos); \ -+}) -+ -+#define width_by_coord(coord) \ -+({ \ -+ assert("nikita-3145", item_is_extent(coord)); \ -+ extent_get_width (extent_by_coord(coord)); \ -+}) -+ -+struct carry_cut_data; -+struct carry_kill_data; -+ -+/* plugin->u.item.b.* */ -+reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *); -+int can_contain_key_extent(const coord_t * coord, const reiser4_key * key, -+ const reiser4_item_data *); -+int mergeable_extent(const coord_t * p1, const coord_t * p2); -+pos_in_node_t nr_units_extent(const coord_t *); -+lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *); -+void init_coord_extent(coord_t *); -+int init_extent(coord_t *, reiser4_item_data *); -+int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *); -+int can_shift_extent(unsigned free_space, -+ coord_t * source, znode * target, shift_direction, -+ unsigned *size, unsigned want); -+void copy_units_extent(coord_t * target, coord_t * source, unsigned from, -+ unsigned count, shift_direction where_is_free_space, -+ unsigned free_space); -+int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count, -+ struct carry_kill_data *); -+int create_hook_extent(const coord_t * coord, void *arg); -+int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+reiser4_key *unit_key_extent(const coord_t *, reiser4_key *); -+reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *); -+void print_extent(const char *, coord_t *); -+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child); -+int utmost_child_real_block_extent(const coord_t * coord, sideof side, -+ reiser4_block_nr * block); -+void item_stat_extent(const coord_t * 
coord, void *vp); -+int reiser4_check_extent(const coord_t * coord, const char **error); -+ -+/* plugin->u.item.s.file.* */ -+ssize_t reiser4_write_extent(struct file *, struct inode * inode, -+ const char __user *, size_t, loff_t *); -+int reiser4_read_extent(struct file *, flow_t *, hint_t *); -+int reiser4_readpage_extent(void *, struct page *); -+int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*); -+reiser4_key *append_key_extent(const coord_t *, reiser4_key *); -+void init_coord_extension_extent(uf_coord_t *, loff_t offset); -+int get_block_address_extent(const coord_t *, sector_t block, -+ sector_t * result); -+ -+/* these are used in flush.c -+ FIXME-VS: should they be somewhere in item_plugin? */ -+int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos); -+int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos, -+ reiser4_key * stop_key); -+ -+int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */ -+__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */ -+__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */ -+ -+/* plugin->u.item.f. 
*/ -+int reiser4_scan_extent(flush_scan * scan); -+extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *); -+ -+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit, -+ int nr_extents); -+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr); -+extent_state state_of_extent(reiser4_extent * ext); -+void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start, -+ reiser4_block_nr width); -+int reiser4_update_extent(struct inode *, jnode *, loff_t pos, -+ int *plugged_hole); -+ -+#include "../../coord.h" -+#include "../../lock.h" -+#include "../../tap.h" -+ -+struct replace_handle { -+ /* these are to be set before calling reiser4_replace_extent */ -+ coord_t *coord; -+ lock_handle *lh; -+ reiser4_key key; -+ reiser4_key *pkey; -+ reiser4_extent overwrite; -+ reiser4_extent new_extents[2]; -+ int nr_new_extents; -+ unsigned flags; -+ -+ /* these are used by reiser4_replace_extent */ -+ reiser4_item_data item; -+ coord_t coord_after; -+ lock_handle lh_after; -+ tap_t watch; -+ reiser4_key paste_key; -+#if REISER4_DEBUG -+ reiser4_extent orig_ext; -+ reiser4_key tmp; -+#endif -+}; -+ -+/* this structure is kmalloced before calling make_extent to avoid excessive -+ stack consumption on plug_hole->reiser4_replace_extent */ -+struct make_extent_handle { -+ uf_coord_t *uf_coord; -+ reiser4_block_nr blocknr; -+ int created; -+ struct inode *inode; -+ union { -+ struct { -+ } append; -+ struct replace_handle replace; -+ } u; -+}; -+ -+int reiser4_replace_extent(struct replace_handle *, -+ int return_inserted_position); -+lock_handle *znode_lh(znode *); -+ -+/* the reiser4 repacker support */ -+struct repacker_cursor; -+extern int process_extent_backward_for_repacking(tap_t *, -+ struct repacker_cursor *); -+extern int mark_extent_for_repacking(tap_t *, int); -+ -+#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord)) -+#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent)) -+ -+/* 
__REISER4_EXTENT_H__ */ -+#endif -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.24/fs/reiser4/plugin/item/extent_item_ops.c ---- linux-2.6.24.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/extent_item_ops.c 2008-01-25 11:39:07.016228297 +0300 -@@ -0,0 +1,889 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../inode.h" -+#include "../../tree_walk.h" /* check_sibling_list() */ -+#include "../../page_cache.h" -+#include "../../carry.h" -+ -+#include -+ -+/* item_plugin->b.max_key_inside */ -+reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key) -+{ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, get_key_offset(reiser4_max_key())); -+ return key; -+} -+ -+/* item_plugin->b.can_contain_key -+ this checks whether @key of @data is matching to position set by @coord */ -+int -+can_contain_key_extent(const coord_t * coord, const reiser4_key * key, -+ const reiser4_item_data * data) -+{ -+ reiser4_key item_key; -+ -+ if (item_plugin_by_coord(coord) != data->iplug) -+ return 0; -+ -+ item_key_by_coord(coord, &item_key); -+ if (get_key_locality(key) != get_key_locality(&item_key) || -+ get_key_objectid(key) != get_key_objectid(&item_key) || -+ get_key_ordering(key) != get_key_ordering(&item_key)) -+ return 0; -+ -+ return 1; -+} -+ -+/* item_plugin->b.mergeable -+ first item is of extent type */ -+/* Audited by: green(2002.06.13) */ -+int mergeable_extent(const coord_t * p1, const coord_t * p2) -+{ -+ reiser4_key key1, key2; -+ -+ assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID); -+ /* FIXME-VS: Which is it? 
Assert or return 0 */ -+ if (item_id_by_coord(p2) != EXTENT_POINTER_ID) { -+ return 0; -+ } -+ -+ item_key_by_coord(p1, &key1); -+ item_key_by_coord(p2, &key2); -+ if (get_key_locality(&key1) != get_key_locality(&key2) || -+ get_key_objectid(&key1) != get_key_objectid(&key2) || -+ get_key_ordering(&key1) != get_key_ordering(&key2) || -+ get_key_type(&key1) != get_key_type(&key2)) -+ return 0; -+ if (get_key_offset(&key1) + -+ reiser4_extent_size(p1, nr_units_extent(p1)) != -+ get_key_offset(&key2)) -+ return 0; -+ return 1; -+} -+ -+/* item_plugin->b.nr_units */ -+pos_in_node_t nr_units_extent(const coord_t * coord) -+{ -+ /* length of extent item has to be multiple of extent size */ -+ assert("vs-1424", -+ (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0); -+ return item_length_by_coord(coord) / sizeof(reiser4_extent); -+} -+ -+/* item_plugin->b.lookup */ -+lookup_result -+lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG, -+ coord_t * coord) -+{ /* znode and item_pos are -+ set to an extent item to -+ look through */ -+ reiser4_key item_key; -+ reiser4_block_nr lookuped, offset; -+ unsigned i, nr_units; -+ reiser4_extent *ext; -+ unsigned blocksize; -+ unsigned char blocksize_bits; -+ -+ item_key_by_coord(coord, &item_key); -+ offset = get_key_offset(&item_key); -+ -+ /* key we are looking for must be greater than key of item @coord */ -+ assert("vs-414", keygt(key, &item_key)); -+ -+ assert("umka-99945", -+ !keygt(key, max_key_inside_extent(coord, &item_key))); -+ -+ ext = extent_item(coord); -+ assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset)); -+ -+ blocksize = current_blocksize; -+ blocksize_bits = current_blocksize_bits; -+ -+ /* offset we are looking for */ -+ lookuped = get_key_offset(key); -+ -+ nr_units = nr_units_extent(coord); -+ /* go through all extents until the one which address given offset */ -+ for (i = 0; i < nr_units; i++, ext++) { -+ offset += (extent_get_width(ext) << blocksize_bits); -+ if 
(offset > lookuped) { -+ /* desired byte is somewhere in this extent */ -+ coord->unit_pos = i; -+ coord->between = AT_UNIT; -+ return CBK_COORD_FOUND; -+ } -+ } -+ -+ /* set coord after last unit */ -+ coord->unit_pos = nr_units - 1; -+ coord->between = AFTER_UNIT; -+ return CBK_COORD_FOUND; -+} -+ -+/* item_plugin->b.paste -+ item @coord is set to has been appended with @data->length of free -+ space. data->data contains data to be pasted into the item in position -+ @coord->in_item.unit_pos. It must fit into that free space. -+ @coord must be set between units. -+*/ -+int -+paste_extent(coord_t * coord, reiser4_item_data * data, -+ carry_plugin_info * info UNUSED_ARG) -+{ -+ unsigned old_nr_units; -+ reiser4_extent *ext; -+ int item_length; -+ -+ ext = extent_item(coord); -+ item_length = item_length_by_coord(coord); -+ old_nr_units = (item_length - data->length) / sizeof(reiser4_extent); -+ -+ /* this is also used to copy extent into newly created item, so -+ old_nr_units could be 0 */ -+ assert("vs-260", item_length >= data->length); -+ -+ /* make sure that coord is set properly */ -+ assert("vs-35", -+ ((!coord_is_existing_unit(coord)) -+ || (!old_nr_units && !coord->unit_pos))); -+ -+ /* first unit to be moved */ -+ switch (coord->between) { -+ case AFTER_UNIT: -+ coord->unit_pos++; -+ case BEFORE_UNIT: -+ coord->between = AT_UNIT; -+ break; -+ case AT_UNIT: -+ assert("vs-331", !old_nr_units && !coord->unit_pos); -+ break; -+ default: -+ impossible("vs-330", "coord is set improperly"); -+ } -+ -+ /* prepare space for new units */ -+ memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent), -+ ext + coord->unit_pos, -+ (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent)); -+ -+ /* copy new data from kernel space */ -+ assert("vs-556", data->user == 0); -+ memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length); -+ -+ /* after paste @coord is set to first of pasted units */ -+ assert("vs-332", coord_is_existing_unit(coord)); -+ 
assert("vs-333", -+ !memcmp(data->data, extent_by_coord(coord), -+ (unsigned)data->length)); -+ return 0; -+} -+ -+/* item_plugin->b.can_shift */ -+int -+can_shift_extent(unsigned free_space, coord_t * source, -+ znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG, -+ unsigned *size, unsigned want) -+{ -+ *size = item_length_by_coord(source); -+ if (*size > free_space) -+ /* never split a unit of extent item */ -+ *size = free_space - free_space % sizeof(reiser4_extent); -+ -+ /* we can shift *size bytes, calculate how many do we want to shift */ -+ if (*size > want * sizeof(reiser4_extent)) -+ *size = want * sizeof(reiser4_extent); -+ -+ if (*size % sizeof(reiser4_extent) != 0) -+ impossible("vs-119", "Wrong extent size: %i %zd", *size, -+ sizeof(reiser4_extent)); -+ return *size / sizeof(reiser4_extent); -+ -+} -+ -+/* item_plugin->b.copy_units */ -+void -+copy_units_extent(coord_t * target, coord_t * source, -+ unsigned from, unsigned count, -+ shift_direction where_is_free_space, unsigned free_space) -+{ -+ char *from_ext, *to_ext; -+ -+ assert("vs-217", free_space == count * sizeof(reiser4_extent)); -+ -+ from_ext = item_body_by_coord(source); -+ to_ext = item_body_by_coord(target); -+ -+ if (where_is_free_space == SHIFT_LEFT) { -+ assert("vs-215", from == 0); -+ -+ /* At this moment, item length was already updated in the item -+ header by shifting code, hence nr_units_extent() will -+ return "new" number of units---one we obtain after copying -+ units. 
-+ */ -+ to_ext += -+ (nr_units_extent(target) - count) * sizeof(reiser4_extent); -+ } else { -+ reiser4_key key; -+ coord_t coord; -+ -+ assert("vs-216", -+ from + count == coord_last_unit_pos(source) + 1); -+ -+ from_ext += item_length_by_coord(source) - free_space; -+ -+ /* new units are inserted before first unit in an item, -+ therefore, we have to update item key */ -+ coord = *source; -+ coord.unit_pos = from; -+ unit_key_extent(&coord, &key); -+ -+ node_plugin_by_node(target->node)->update_item_key(target, &key, -+ NULL /*info */); -+ } -+ -+ memcpy(to_ext, from_ext, free_space); -+} -+ -+/* item_plugin->b.create_hook -+ @arg is znode of leaf node for which we need to update right delimiting key */ -+int create_hook_extent(const coord_t * coord, void *arg) -+{ -+ coord_t *child_coord; -+ znode *node; -+ reiser4_key key; -+ reiser4_tree *tree; -+ -+ if (!arg) -+ return 0; -+ -+ child_coord = arg; -+ tree = znode_get_tree(coord->node); -+ -+ assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL); -+ -+ write_lock_tree(tree); -+ write_lock_dk(tree); -+ /* find a node on the left level for which right delimiting key has to -+ be updated */ -+ if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) { -+ assert("vs-411", znode_is_left_connected(child_coord->node)); -+ node = child_coord->node->left; -+ } else { -+ assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT); -+ node = child_coord->node; -+ assert("nikita-3314", node != NULL); -+ } -+ -+ if (node != NULL) { -+ znode_set_rd_key(node, item_key_by_coord(coord, &key)); -+ -+ assert("nikita-3282", check_sibling_list(node)); -+ /* break sibling links */ -+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) { -+ ON_DEBUG(node->right->left_version = -+ atomic_inc_return(&delim_key_version); -+ node->right_version = -+ atomic_inc_return(&delim_key_version);); -+ -+ node->right->left = NULL; -+ node->right = NULL; -+ } -+ } -+ write_unlock_dk(tree); -+ write_unlock_tree(tree); -+ return 
0; -+} -+ -+#define ITEM_TAIL_KILLED 0 -+#define ITEM_HEAD_KILLED 1 -+#define ITEM_KILLED 2 -+ -+/* item_plugin->b.kill_hook -+ this is called when @count units starting from @from-th one are going to be removed -+ */ -+int -+kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count, -+ struct carry_kill_data *kdata) -+{ -+ reiser4_extent *ext; -+ reiser4_block_nr start, length; -+ const reiser4_key *pfrom_key, *pto_key; -+ struct inode *inode; -+ reiser4_tree *tree; -+ pgoff_t from_off, to_off, offset, skip; -+ int retval; -+ -+ /* these are located in memory kmalloc-ed by kill_node_content */ -+ reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key; -+ coord_t *dup, *next; -+ -+ assert("zam-811", znode_is_write_locked(coord->node)); -+ assert("nikita-3315", kdata != NULL); -+ assert("vs-34", kdata->buf != NULL); -+ -+ /* map structures to kdata->buf */ -+ min_item_key = (reiser4_key *) (kdata->buf); -+ max_item_key = min_item_key + 1; -+ from_key = max_item_key + 1; -+ to_key = from_key + 1; -+ key = to_key + 1; -+ dup = (coord_t *) (key + 1); -+ next = dup + 1; -+ -+ item_key_by_coord(coord, min_item_key); -+ max_item_key_by_coord(coord, max_item_key); -+ -+ if (kdata->params.from_key) { -+ pfrom_key = kdata->params.from_key; -+ pto_key = kdata->params.to_key; -+ } else { -+ assert("vs-1549", from == coord->unit_pos); -+ unit_key_by_coord(coord, from_key); -+ pfrom_key = from_key; -+ -+ coord_dup(dup, coord); -+ dup->unit_pos = from + count - 1; -+ max_unit_key_by_coord(dup, to_key); -+ pto_key = to_key; -+ } -+ -+ if (!keylt(pto_key, max_item_key)) { -+ if (!keygt(pfrom_key, min_item_key)) { -+ znode *left, *right; -+ -+ /* item is to be removed completely */ -+ assert("nikita-3316", kdata->left != NULL -+ && kdata->right != NULL); -+ -+ left = kdata->left->node; -+ right = kdata->right->node; -+ -+ tree = current_tree; -+ /* we have to do two things: -+ * -+ * 1. 
link left and right formatted neighbors of -+ * extent being removed, and -+ * -+ * 2. update their delimiting keys. -+ * -+ * atomicity of these operations is protected by -+ * taking dk-lock and tree-lock. -+ */ -+ /* if neighbors of item being removed are znodes - -+ * link them */ -+ write_lock_tree(tree); -+ write_lock_dk(tree); -+ link_left_and_right(left, right); -+ if (left) { -+ /* update right delimiting key of left -+ * neighbor of extent item */ -+ /*coord_t next; -+ reiser4_key key; */ -+ -+ coord_dup(next, coord); -+ -+ if (coord_next_item(next)) -+ *key = *znode_get_rd_key(coord->node); -+ else -+ item_key_by_coord(next, key); -+ znode_set_rd_key(left, key); -+ } -+ write_unlock_dk(tree); -+ write_unlock_tree(tree); -+ -+ from_off = -+ get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT; -+ to_off = -+ (get_key_offset(max_item_key) + -+ 1) >> PAGE_CACHE_SHIFT; -+ retval = ITEM_KILLED; -+ } else { -+ /* tail of item is to be removed */ -+ from_off = -+ (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE - -+ 1) >> PAGE_CACHE_SHIFT; -+ to_off = -+ (get_key_offset(max_item_key) + -+ 1) >> PAGE_CACHE_SHIFT; -+ retval = ITEM_TAIL_KILLED; -+ } -+ } else { -+ /* head of item is to be removed */ -+ assert("vs-1571", keyeq(pfrom_key, min_item_key)); -+ assert("vs-1572", -+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == -+ 0); -+ assert("vs-1573", -+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - -+ 1)) == 0); -+ -+ if (kdata->left->node) { -+ /* update right delimiting key of left neighbor of extent item */ -+ /*reiser4_key key; */ -+ -+ *key = *pto_key; -+ set_key_offset(key, get_key_offset(pto_key) + 1); -+ -+ write_lock_dk(current_tree); -+ znode_set_rd_key(kdata->left->node, key); -+ write_unlock_dk(current_tree); -+ } -+ -+ from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT; -+ to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT; -+ retval = ITEM_HEAD_KILLED; -+ } -+ -+ inode = kdata->inode; -+ assert("vs-1545", inode != NULL); -+ if 
(inode != NULL) -+ /* take care of pages and jnodes corresponding to part of item being killed */ -+ reiser4_invalidate_pages(inode->i_mapping, from_off, -+ to_off - from_off, -+ kdata->params.truncate); -+ -+ ext = extent_item(coord) + from; -+ offset = -+ (get_key_offset(min_item_key) + -+ reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT; -+ -+ assert("vs-1551", from_off >= offset); -+ assert("vs-1552", from_off - offset <= extent_get_width(ext)); -+ skip = from_off - offset; -+ offset = from_off; -+ -+ while (offset < to_off) { -+ length = extent_get_width(ext) - skip; -+ if (state_of_extent(ext) == HOLE_EXTENT) { -+ skip = 0; -+ offset += length; -+ ext++; -+ continue; -+ } -+ -+ if (offset + length > to_off) { -+ length = to_off - offset; -+ } -+ -+ DQUOT_FREE_BLOCK_NODIRTY(inode, length); -+ -+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) { -+ /* some jnodes corresponding to this unallocated extent */ -+ fake_allocated2free(length, 0 /* unformatted */ ); -+ -+ skip = 0; -+ offset += length; -+ ext++; -+ continue; -+ } -+ -+ assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT); -+ -+ if (length != 0) { -+ start = extent_get_start(ext) + skip; -+ -+ /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed -+ immediately */ -+ reiser4_dealloc_blocks(&start, &length, -+ 0 /* not used */ , -+ BA_DEFER -+ /* unformatted with defer */ ); -+ } -+ skip = 0; -+ offset += length; -+ ext++; -+ } -+ return retval; -+} -+ -+/* item_plugin->b.kill_units */ -+int -+kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *kdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ reiser4_extent *ext; -+ reiser4_key item_key; -+ pos_in_node_t count; -+ reiser4_key from_key, to_key; -+ const reiser4_key *pfrom_key, *pto_key; -+ loff_t off; -+ int result; -+ -+ assert("vs-1541", -+ ((kdata->params.from_key == NULL && kdata->params.to_key == NULL) -+ || 
(kdata->params.from_key != NULL -+ && kdata->params.to_key != NULL))); -+ -+ if (kdata->params.from_key) { -+ pfrom_key = kdata->params.from_key; -+ pto_key = kdata->params.to_key; -+ } else { -+ coord_t dup; -+ -+ /* calculate key range of kill */ -+ assert("vs-1549", from == coord->unit_pos); -+ unit_key_by_coord(coord, &from_key); -+ pfrom_key = &from_key; -+ -+ coord_dup(&dup, coord); -+ dup.unit_pos = to; -+ max_unit_key_by_coord(&dup, &to_key); -+ pto_key = &to_key; -+ } -+ -+ item_key_by_coord(coord, &item_key); -+ -+#if REISER4_DEBUG -+ { -+ reiser4_key max_item_key; -+ -+ max_item_key_by_coord(coord, &max_item_key); -+ -+ if (new_first) { -+ /* head of item is to be cut */ -+ assert("vs-1542", keyeq(pfrom_key, &item_key)); -+ assert("vs-1538", keylt(pto_key, &max_item_key)); -+ } else { -+ /* tail of item is to be cut */ -+ assert("vs-1540", keygt(pfrom_key, &item_key)); -+ assert("vs-1543", !keylt(pto_key, &max_item_key)); -+ } -+ } -+#endif -+ -+ if (smallest_removed) -+ *smallest_removed = *pfrom_key; -+ -+ if (new_first) { -+ /* item head is cut. Item key will change. This new key is calculated here */ -+ assert("vs-1556", -+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == -+ (PAGE_CACHE_SIZE - 1)); -+ *new_first = *pto_key; -+ set_key_offset(new_first, get_key_offset(new_first) + 1); -+ } -+ -+ count = to - from + 1; -+ result = kill_hook_extent(coord, from, count, kdata); -+ if (result == ITEM_TAIL_KILLED) { -+ assert("vs-1553", -+ get_key_offset(pfrom_key) >= -+ get_key_offset(&item_key) + -+ reiser4_extent_size(coord, from)); -+ off = -+ get_key_offset(pfrom_key) - -+ (get_key_offset(&item_key) + -+ reiser4_extent_size(coord, from)); -+ if (off) { -+ /* unit @from is to be cut partially. 
Its width decreases */ -+ ext = extent_item(coord) + from; -+ extent_set_width(ext, -+ (off + PAGE_CACHE_SIZE - -+ 1) >> PAGE_CACHE_SHIFT); -+ count--; -+ } -+ } else { -+ __u64 max_to_offset; -+ __u64 rest; -+ -+ assert("vs-1575", result == ITEM_HEAD_KILLED); -+ assert("", from == 0); -+ assert("", -+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - -+ 1)) == 0); -+ assert("", -+ get_key_offset(pto_key) + 1 > -+ get_key_offset(&item_key) + -+ reiser4_extent_size(coord, to)); -+ max_to_offset = -+ get_key_offset(&item_key) + -+ reiser4_extent_size(coord, to + 1) - 1; -+ assert("", get_key_offset(pto_key) <= max_to_offset); -+ -+ rest = -+ (max_to_offset - -+ get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT; -+ if (rest) { -+ /* unit @to is to be cut partially */ -+ ext = extent_item(coord) + to; -+ -+ assert("", extent_get_width(ext) > rest); -+ -+ if (state_of_extent(ext) == ALLOCATED_EXTENT) -+ extent_set_start(ext, -+ extent_get_start(ext) + -+ (extent_get_width(ext) - -+ rest)); -+ -+ extent_set_width(ext, rest); -+ count--; -+ } -+ } -+ return count * sizeof(reiser4_extent); -+} -+ -+/* item_plugin->b.cut_units -+ this is too similar to kill_units_extent */ -+int -+cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *cdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ reiser4_extent *ext; -+ reiser4_key item_key; -+ pos_in_node_t count; -+ reiser4_key from_key, to_key; -+ const reiser4_key *pfrom_key, *pto_key; -+ loff_t off; -+ -+ assert("vs-1541", -+ ((cdata->params.from_key == NULL && cdata->params.to_key == NULL) -+ || (cdata->params.from_key != NULL -+ && cdata->params.to_key != NULL))); -+ -+ if (cdata->params.from_key) { -+ pfrom_key = cdata->params.from_key; -+ pto_key = cdata->params.to_key; -+ } else { -+ coord_t dup; -+ -+ /* calculate key range of kill */ -+ coord_dup(&dup, coord); -+ dup.unit_pos = from; -+ unit_key_by_coord(&dup, &from_key); -+ -+ dup.unit_pos = to; -+ 
max_unit_key_by_coord(&dup, &to_key); -+ -+ pfrom_key = &from_key; -+ pto_key = &to_key; -+ } -+ -+ assert("vs-1555", -+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0); -+ assert("vs-1556", -+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == -+ (PAGE_CACHE_SIZE - 1)); -+ -+ item_key_by_coord(coord, &item_key); -+ -+#if REISER4_DEBUG -+ { -+ reiser4_key max_item_key; -+ -+ assert("vs-1584", -+ get_key_locality(pfrom_key) == -+ get_key_locality(&item_key)); -+ assert("vs-1585", -+ get_key_type(pfrom_key) == get_key_type(&item_key)); -+ assert("vs-1586", -+ get_key_objectid(pfrom_key) == -+ get_key_objectid(&item_key)); -+ assert("vs-1587", -+ get_key_ordering(pfrom_key) == -+ get_key_ordering(&item_key)); -+ -+ max_item_key_by_coord(coord, &max_item_key); -+ -+ if (new_first != NULL) { -+ /* head of item is to be cut */ -+ assert("vs-1542", keyeq(pfrom_key, &item_key)); -+ assert("vs-1538", keylt(pto_key, &max_item_key)); -+ } else { -+ /* tail of item is to be cut */ -+ assert("vs-1540", keygt(pfrom_key, &item_key)); -+ assert("vs-1543", keyeq(pto_key, &max_item_key)); -+ } -+ } -+#endif -+ -+ if (smallest_removed) -+ *smallest_removed = *pfrom_key; -+ -+ if (new_first) { -+ /* item head is cut. Item key will change. This new key is calculated here */ -+ *new_first = *pto_key; -+ set_key_offset(new_first, get_key_offset(new_first) + 1); -+ } -+ -+ count = to - from + 1; -+ -+ assert("vs-1553", -+ get_key_offset(pfrom_key) >= -+ get_key_offset(&item_key) + reiser4_extent_size(coord, from)); -+ off = -+ get_key_offset(pfrom_key) - (get_key_offset(&item_key) + -+ reiser4_extent_size(coord, from)); -+ if (off) { -+ /* tail of unit @from is to be cut partially. 
Its width decreases */ -+ assert("vs-1582", new_first == NULL); -+ ext = extent_item(coord) + from; -+ extent_set_width(ext, off >> PAGE_CACHE_SHIFT); -+ count--; -+ } -+ -+ assert("vs-1554", -+ get_key_offset(pto_key) <= -+ get_key_offset(&item_key) + -+ reiser4_extent_size(coord, to + 1) - 1); -+ off = -+ (get_key_offset(&item_key) + -+ reiser4_extent_size(coord, to + 1) - 1) - -+ get_key_offset(pto_key); -+ if (off) { -+ /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased -+ and width decreased. */ -+ assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0); -+ ext = extent_item(coord) + to; -+ if (state_of_extent(ext) == ALLOCATED_EXTENT) -+ extent_set_start(ext, -+ extent_get_start(ext) + -+ (extent_get_width(ext) - -+ (off >> PAGE_CACHE_SHIFT))); -+ -+ extent_set_width(ext, (off >> PAGE_CACHE_SHIFT)); -+ count--; -+ } -+ return count * sizeof(reiser4_extent); -+} -+ -+/* item_plugin->b.unit_key */ -+reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key) -+{ -+ assert("vs-300", coord_is_existing_unit(coord)); -+ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, -+ (get_key_offset(key) + -+ reiser4_extent_size(coord, coord->unit_pos))); -+ -+ return key; -+} -+ -+/* item_plugin->b.max_unit_key */ -+reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key) -+{ -+ assert("vs-300", coord_is_existing_unit(coord)); -+ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, -+ (get_key_offset(key) + -+ reiser4_extent_size(coord, coord->unit_pos + 1) - 1)); -+ return key; -+} -+ -+/* item_plugin->b.estimate -+ item_plugin->b.item_data_by_flow */ -+ -+#if REISER4_DEBUG -+ -+/* item_plugin->b.check -+ used for debugging, every item should have here the most complete -+ possible check of the consistency of the item that the inventor can -+ construct -+*/ -+int reiser4_check_extent(const coord_t * coord /* coord of item to check */, -+ const char **error /* where to store error message 
*/) -+{ -+ reiser4_extent *ext, *first; -+ unsigned i, j; -+ reiser4_block_nr start, width, blk_cnt; -+ unsigned num_units; -+ reiser4_tree *tree; -+ oid_t oid; -+ reiser4_key key; -+ coord_t scan; -+ -+ assert("vs-933", REISER4_DEBUG); -+ -+ if (znode_get_level(coord->node) != TWIG_LEVEL) { -+ *error = "Extent on the wrong level"; -+ return -1; -+ } -+ if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) { -+ *error = "Wrong item size"; -+ return -1; -+ } -+ ext = first = extent_item(coord); -+ blk_cnt = reiser4_block_count(reiser4_get_current_sb()); -+ num_units = coord_num_units(coord); -+ tree = znode_get_tree(coord->node); -+ item_key_by_coord(coord, &key); -+ oid = get_key_objectid(&key); -+ coord_dup(&scan, coord); -+ -+ for (i = 0; i < num_units; ++i, ++ext) { -+ __u64 index; -+ -+ scan.unit_pos = i; -+ index = extent_unit_index(&scan); -+ -+#if 0 -+ /* check that all jnodes are present for the unallocated -+ * extent */ -+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) { -+ for (j = 0; j < extent_get_width(ext); j++) { -+ jnode *node; -+ -+ node = jlookup(tree, oid, index + j); -+ if (node == NULL) { -+ print_coord("scan", &scan, 0); -+ *error = "Jnode missing"; -+ return -1; -+ } -+ jput(node); -+ } -+ } -+#endif -+ -+ start = extent_get_start(ext); -+ if (start < 2) -+ continue; -+ /* extent is allocated one */ -+ width = extent_get_width(ext); -+ if (start >= blk_cnt) { -+ *error = "Start too large"; -+ return -1; -+ } -+ if (start + width > blk_cnt) { -+ *error = "End too large"; -+ return -1; -+ } -+ /* make sure that this extent does not overlap with other -+ allocated extents extents */ -+ for (j = 0; j < i; j++) { -+ if (state_of_extent(first + j) != ALLOCATED_EXTENT) -+ continue; -+ if (! 
-+ ((extent_get_start(ext) >= -+ extent_get_start(first + j) + -+ extent_get_width(first + j)) -+ || (extent_get_start(ext) + -+ extent_get_width(ext) <= -+ extent_get_start(first + j)))) { -+ *error = "Extent overlaps with others"; -+ return -1; -+ } -+ } -+ -+ } -+ -+ return 0; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/internal.c linux-2.6.24/fs/reiser4/plugin/item/internal.c ---- linux-2.6.24.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/internal.c 2008-01-25 11:39:07.020229327 +0300 -@@ -0,0 +1,396 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Implementation of internal-item plugin methods. */ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "internal.h" -+#include "item.h" -+#include "../node/node.h" -+#include "../plugin.h" -+#include "../../jnode.h" -+#include "../../znode.h" -+#include "../../tree_walk.h" -+#include "../../tree_mod.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../../block_alloc.h" -+ -+/* see internal.h for explanation */ -+ -+/* plugin->u.item.b.mergeable */ -+int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ , -+ const coord_t * p2 UNUSED_ARG /* second item */ ) -+{ -+ /* internal items are not mergeable */ -+ return 0; -+} -+ -+/* ->lookup() method for internal items */ -+lookup_result lookup_internal(const reiser4_key * key /* key to look up */ , -+ lookup_bias bias UNUSED_ARG /* lookup bias */ , -+ coord_t * coord /* coord of item */ ) -+{ -+ reiser4_key ukey; -+ -+ switch (keycmp(unit_key_by_coord(coord, &ukey), key)) { -+ default: -+ impossible("", "keycmp()?!"); 
-+ case LESS_THAN: -+ /* FIXME-VS: AFTER_ITEM used to be here. But with new coord -+ item plugin can not be taken using coord set this way */ -+ assert("vs-681", coord->unit_pos == 0); -+ coord->between = AFTER_UNIT; -+ case EQUAL_TO: -+ return CBK_COORD_FOUND; -+ case GREATER_THAN: -+ return CBK_COORD_NOTFOUND; -+ } -+} -+ -+/* return body of internal item at @coord */ -+static internal_item_layout *internal_at(const coord_t * coord /* coord of -+ * item */ ) -+{ -+ assert("nikita-607", coord != NULL); -+ assert("nikita-1650", -+ item_plugin_by_coord(coord) == -+ item_plugin_by_id(NODE_POINTER_ID)); -+ return (internal_item_layout *) item_body_by_coord(coord); -+} -+ -+void reiser4_update_internal(const coord_t * coord, -+ const reiser4_block_nr * blocknr) -+{ -+ internal_item_layout *item = internal_at(coord); -+ assert("nikita-2959", reiser4_blocknr_is_sane(blocknr)); -+ -+ put_unaligned(cpu_to_le64(*blocknr), &item->pointer); -+} -+ -+/* return child block number stored in the internal item at @coord */ -+static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ ) -+{ -+ assert("nikita-608", coord != NULL); -+ return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer)); -+} -+ -+/* get znode pointed to by internal @item */ -+static znode *znode_at(const coord_t * item /* coord of item */ , -+ znode * parent /* parent node */ ) -+{ -+ return child_znode(item, parent, 1, 0); -+} -+ -+/* store pointer from internal item into "block". 
Implementation of -+ ->down_link() method */ -+void down_link_internal(const coord_t * coord /* coord of item */ , -+ const reiser4_key * key UNUSED_ARG /* key to get -+ * pointer for */ , -+ reiser4_block_nr * block /* resulting block number */ ) -+{ -+ ON_DEBUG(reiser4_key item_key); -+ -+ assert("nikita-609", coord != NULL); -+ assert("nikita-611", block != NULL); -+ assert("nikita-612", (key == NULL) || -+ /* twig horrors */ -+ (znode_get_level(coord->node) == TWIG_LEVEL) -+ || keyle(item_key_by_coord(coord, &item_key), key)); -+ -+ *block = pointer_at(coord); -+ assert("nikita-2960", reiser4_blocknr_is_sane(block)); -+} -+ -+/* Get the child's block number, or 0 if the block is unallocated. */ -+int -+utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG, -+ reiser4_block_nr * block) -+{ -+ assert("jmacd-2059", coord != NULL); -+ -+ *block = pointer_at(coord); -+ assert("nikita-2961", reiser4_blocknr_is_sane(block)); -+ -+ if (reiser4_blocknr_is_fake(block)) { -+ *block = 0; -+ } -+ -+ return 0; -+} -+ -+/* Return the child. 
*/ -+int -+utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG, -+ jnode ** childp) -+{ -+ reiser4_block_nr block = pointer_at(coord); -+ znode *child; -+ -+ assert("jmacd-2059", childp != NULL); -+ assert("nikita-2962", reiser4_blocknr_is_sane(&block)); -+ -+ child = zlook(znode_get_tree(coord->node), &block); -+ -+ if (IS_ERR(child)) { -+ return PTR_ERR(child); -+ } -+ -+ *childp = ZJNODE(child); -+ -+ return 0; -+} -+ -+#if REISER4_DEBUG -+ -+static void check_link(znode * left, znode * right) -+{ -+ znode *scan; -+ -+ for (scan = left; scan != right; scan = scan->right) { -+ if (ZF_ISSET(scan, JNODE_RIP)) -+ break; -+ if (znode_is_right_connected(scan) && scan->right != NULL) { -+ if (ZF_ISSET(scan->right, JNODE_RIP)) -+ break; -+ assert("nikita-3285", -+ znode_is_left_connected(scan->right)); -+ assert("nikita-3265", -+ ergo(scan != left, -+ ZF_ISSET(scan, JNODE_HEARD_BANSHEE))); -+ assert("nikita-3284", scan->right->left == scan); -+ } else -+ break; -+ } -+} -+ -+int check__internal(const coord_t * coord, const char **error) -+{ -+ reiser4_block_nr blk; -+ znode *child; -+ coord_t cpy; -+ -+ blk = pointer_at(coord); -+ if (!reiser4_blocknr_is_sane(&blk)) { -+ *error = "Invalid pointer"; -+ return -1; -+ } -+ coord_dup(&cpy, coord); -+ child = znode_at(&cpy, cpy.node); -+ if (child != NULL) { -+ znode *left_child; -+ znode *right_child; -+ -+ left_child = right_child = NULL; -+ -+ assert("nikita-3256", znode_invariant(child)); -+ if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) { -+ left_child = znode_at(&cpy, cpy.node); -+ if (left_child != NULL) { -+ read_lock_tree(znode_get_tree(child)); -+ check_link(left_child, child); -+ read_unlock_tree(znode_get_tree(child)); -+ zput(left_child); -+ } -+ } -+ coord_dup(&cpy, coord); -+ if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) { -+ right_child = znode_at(&cpy, cpy.node); -+ if (right_child != NULL) { -+ read_lock_tree(znode_get_tree(child)); -+ check_link(child, right_child); 
-+ read_unlock_tree(znode_get_tree(child)); -+ zput(right_child); -+ } -+ } -+ zput(child); -+ } -+ return 0; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/* return true only if this item really points to "block" */ -+/* Audited by: green(2002.06.14) */ -+int has_pointer_to_internal(const coord_t * coord /* coord of item */ , -+ const reiser4_block_nr * block /* block number to -+ * check */ ) -+{ -+ assert("nikita-613", coord != NULL); -+ assert("nikita-614", block != NULL); -+ -+ return pointer_at(coord) == *block; -+} -+ -+/* hook called by ->create_item() method of node plugin after new internal -+ item was just created. -+ -+ This is point where pointer to new node is inserted into tree. Initialize -+ parent pointer in child znode, insert child into sibling list and slum. -+ -+*/ -+int create_hook_internal(const coord_t * item /* coord of item */ , -+ void *arg /* child's left neighbor, if any */ ) -+{ -+ znode *child; -+ __u64 child_ptr; -+ -+ assert("nikita-1252", item != NULL); -+ assert("nikita-1253", item->node != NULL); -+ assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL); -+ assert("nikita-1450", item->unit_pos == 0); -+ -+ /* -+ * preparing to item insertion build_child_ptr_data sets pointer to -+ * data to be inserted to jnode's blocknr which is in cpu byte -+ * order. Node's create_item simply copied those data. As result we -+ * have child pointer in cpu's byte order. Convert content of internal -+ * item to little endian byte order. 
-+ */ -+ child_ptr = get_unaligned((__u64 *)item_body_by_coord(item)); -+ reiser4_update_internal(item, &child_ptr); -+ -+ child = znode_at(item, item->node); -+ if (child != NULL && !IS_ERR(child)) { -+ znode *left; -+ int result = 0; -+ reiser4_tree *tree; -+ -+ left = arg; -+ tree = znode_get_tree(item->node); -+ write_lock_tree(tree); -+ write_lock_dk(tree); -+ assert("nikita-1400", (child->in_parent.node == NULL) -+ || (znode_above_root(child->in_parent.node))); -+ ++item->node->c_count; -+ coord_to_parent_coord(item, &child->in_parent); -+ sibling_list_insert_nolock(child, left); -+ -+ assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN)); -+ ZF_CLR(child, JNODE_ORPHAN); -+ -+ if ((left != NULL) && !keyeq(znode_get_rd_key(left), -+ znode_get_rd_key(child))) { -+ znode_set_rd_key(child, znode_get_rd_key(left)); -+ } -+ write_unlock_dk(tree); -+ write_unlock_tree(tree); -+ zput(child); -+ return result; -+ } else { -+ if (child == NULL) -+ child = ERR_PTR(-EIO); -+ return PTR_ERR(child); -+ } -+} -+ -+/* hook called by ->cut_and_kill() method of node plugin just before internal -+ item is removed. -+ -+ This is point where empty node is removed from the tree. Clear parent -+ pointer in child, and mark node for pending deletion. -+ -+ Node will be actually deleted later and in several installations: -+ -+ . when last lock on this node will be released, node will be removed from -+ the sibling list and its lock will be invalidated -+ -+ . when last reference to this node will be dropped, bitmap will be updated -+ and node will be actually removed from the memory. 
-+ -+*/ -+int kill_hook_internal(const coord_t * item /* coord of item */ , -+ pos_in_node_t from UNUSED_ARG /* start unit */ , -+ pos_in_node_t count UNUSED_ARG /* stop unit */ , -+ struct carry_kill_data *p UNUSED_ARG) -+{ -+ znode *child; -+ -+ assert("nikita-1222", item != NULL); -+ assert("nikita-1224", from == 0); -+ assert("nikita-1225", count == 1); -+ -+ child = znode_at(item, item->node); -+ if (IS_ERR(child)) -+ return PTR_ERR(child); -+ else if (node_is_empty(child)) { -+ reiser4_tree *tree; -+ -+ assert("nikita-1397", znode_is_write_locked(child)); -+ assert("nikita-1398", child->c_count == 0); -+ assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE)); -+ -+ tree = znode_get_tree(item->node); -+ write_lock_tree(tree); -+ init_parent_coord(&child->in_parent, NULL); -+ --item->node->c_count; -+ write_unlock_tree(tree); -+ zput(child); -+ return 0; -+ } else { -+ warning("nikita-1223", -+ "Cowardly refuse to remove link to non-empty node"); -+ zput(child); -+ return RETERR(-EIO); -+ } -+} -+ -+/* hook called by ->shift() node plugin method when iternal item was just -+ moved from one node to another. 
-+ -+ Update parent pointer in child and c_counts in old and new parent -+ -+*/ -+int shift_hook_internal(const coord_t * item /* coord of item */ , -+ unsigned from UNUSED_ARG /* start unit */ , -+ unsigned count UNUSED_ARG /* stop unit */ , -+ znode * old_node /* old parent */ ) -+{ -+ znode *child; -+ znode *new_node; -+ reiser4_tree *tree; -+ -+ assert("nikita-1276", item != NULL); -+ assert("nikita-1277", from == 0); -+ assert("nikita-1278", count == 1); -+ assert("nikita-1451", item->unit_pos == 0); -+ -+ new_node = item->node; -+ assert("nikita-2132", new_node != old_node); -+ tree = znode_get_tree(item->node); -+ child = child_znode(item, old_node, 1, 0); -+ if (child == NULL) -+ return 0; -+ if (!IS_ERR(child)) { -+ write_lock_tree(tree); -+ ++new_node->c_count; -+ assert("nikita-1395", znode_parent(child) == old_node); -+ assert("nikita-1396", old_node->c_count > 0); -+ coord_to_parent_coord(item, &child->in_parent); -+ assert("nikita-1781", znode_parent(child) == new_node); -+ assert("nikita-1782", -+ check_tree_pointer(item, child) == NS_FOUND); -+ --old_node->c_count; -+ write_unlock_tree(tree); -+ zput(child); -+ return 0; -+ } else -+ return PTR_ERR(child); -+} -+ -+/* plugin->u.item.b.max_key_inside - not defined */ -+ -+/* plugin->u.item.b.nr_units - item.c:single_unit */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/internal.h linux-2.6.24/fs/reiser4/plugin/item/internal.h ---- linux-2.6.24.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/internal.h 2008-01-25 11:39:07.020229327 +0300 -@@ -0,0 +1,57 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* Internal item contains down-link to the child of the internal/twig -+ node in a tree. 
It is internal items that are actually used during -+ tree traversal. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ ) -+#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+ -+/* on-disk layout of internal item */ -+typedef struct internal_item_layout { -+ /* 0 */ reiser4_dblock_nr pointer; -+ /* 4 */ -+} internal_item_layout; -+ -+struct cut_list; -+ -+int mergeable_internal(const coord_t * p1, const coord_t * p2); -+lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias, -+ coord_t * coord); -+/* store pointer from internal item into "block". Implementation of -+ ->down_link() method */ -+extern void down_link_internal(const coord_t * coord, const reiser4_key * key, -+ reiser4_block_nr * block); -+extern int has_pointer_to_internal(const coord_t * coord, -+ const reiser4_block_nr * block); -+extern int create_hook_internal(const coord_t * item, void *arg); -+extern int kill_hook_internal(const coord_t * item, pos_in_node_t from, -+ pos_in_node_t count, struct carry_kill_data *); -+extern int shift_hook_internal(const coord_t * item, unsigned from, -+ unsigned count, znode * old_node); -+extern void reiser4_print_internal(const char *prefix, coord_t * coord); -+ -+extern int utmost_child_internal(const coord_t * coord, sideof side, -+ jnode ** child); -+int utmost_child_real_block_internal(const coord_t * coord, sideof side, -+ reiser4_block_nr * block); -+ -+extern void reiser4_update_internal(const coord_t * coord, -+ const reiser4_block_nr * blocknr); -+/* FIXME: reiserfs has check_internal */ -+extern int check__internal(const coord_t * coord, const char **error); -+ -+/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/item.c linux-2.6.24/fs/reiser4/plugin/item/item.c ---- linux-2.6.24.orig/fs/reiser4/plugin/item/item.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/item.c 2008-01-25 11:39:07.020229327 +0300 -@@ -0,0 +1,719 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* definition of item plugins. */ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "../plugin_header.h" -+#include "sde.h" -+#include "internal.h" -+#include "item.h" -+#include "static_stat.h" -+#include "../plugin.h" -+#include "../../znode.h" -+#include "../../tree.h" -+#include "../../context.h" -+#include "ctail.h" -+ -+/* return pointer to item body */ -+void item_body_by_coord_hard(coord_t * coord /* coord to query */ ) -+{ -+ assert("nikita-324", coord != NULL); -+ assert("nikita-325", coord->node != NULL); -+ assert("nikita-326", znode_is_loaded(coord->node)); -+ assert("nikita-3200", coord->offset == INVALID_OFFSET); -+ -+ coord->offset = -+ node_plugin_by_node(coord->node)->item_by_coord(coord) - -+ zdata(coord->node); -+ ON_DEBUG(coord->body_v = coord->node->times_locked); -+} -+ -+void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ ) -+{ -+ return zdata(coord->node) + coord->offset; -+} -+ -+#if REISER4_DEBUG -+ -+int item_body_is_valid(const coord_t * coord) -+{ -+ return -+ coord->offset == -+ node_plugin_by_node(coord->node)->item_by_coord(coord) - -+ zdata(coord->node); -+} -+ -+#endif -+ -+/* return length of item at @coord */ -+pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ ) -+{ -+ int len; -+ -+ assert("nikita-327", coord != NULL); -+ assert("nikita-328", coord->node != NULL); -+ assert("nikita-329", 
znode_is_loaded(coord->node)); -+ -+ len = node_plugin_by_node(coord->node)->length_by_coord(coord); -+ return len; -+} -+ -+void obtain_item_plugin(const coord_t * coord) -+{ -+ assert("nikita-330", coord != NULL); -+ assert("nikita-331", coord->node != NULL); -+ assert("nikita-332", znode_is_loaded(coord->node)); -+ -+ coord_set_iplug((coord_t *) coord, -+ node_plugin_by_node(coord->node)-> -+ plugin_by_coord(coord)); -+ assert("nikita-2479", -+ coord_iplug(coord) == -+ node_plugin_by_node(coord->node)->plugin_by_coord(coord)); -+} -+ -+/* return id of item */ -+/* Audited by: green(2002.06.15) */ -+item_id item_id_by_coord(const coord_t * coord /* coord to query */ ) -+{ -+ assert("vs-539", coord != NULL); -+ assert("vs-538", coord->node != NULL); -+ assert("vs-537", znode_is_loaded(coord->node)); -+ assert("vs-536", item_plugin_by_coord(coord) != NULL); -+ assert("vs-540", -+ item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID); -+ -+ return item_id_by_plugin(item_plugin_by_coord(coord)); -+} -+ -+/* return key of item at @coord */ -+/* Audited by: green(2002.06.15) */ -+reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ , -+ reiser4_key * key /* result */ ) -+{ -+ assert("nikita-338", coord != NULL); -+ assert("nikita-339", coord->node != NULL); -+ assert("nikita-340", znode_is_loaded(coord->node)); -+ -+ return node_plugin_by_node(coord->node)->key_at(coord, key); -+} -+ -+/* this returns max key in the item */ -+reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ , -+ reiser4_key * key /* result */ ) -+{ -+ coord_t last; -+ -+ assert("nikita-338", coord != NULL); -+ assert("nikita-339", coord->node != NULL); -+ assert("nikita-340", znode_is_loaded(coord->node)); -+ -+ /* make coord pointing to last item's unit */ -+ coord_dup(&last, coord); -+ last.unit_pos = coord_num_units(&last) - 1; -+ assert("vs-1560", coord_is_existing_unit(&last)); -+ -+ max_unit_key_by_coord(&last, key); -+ return key; -+} -+ 
-+/* return key of unit at @coord */ -+reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ , -+ reiser4_key * key /* result */ ) -+{ -+ assert("nikita-772", coord != NULL); -+ assert("nikita-774", coord->node != NULL); -+ assert("nikita-775", znode_is_loaded(coord->node)); -+ -+ if (item_plugin_by_coord(coord)->b.unit_key != NULL) -+ return item_plugin_by_coord(coord)->b.unit_key(coord, key); -+ else -+ return item_key_by_coord(coord, key); -+} -+ -+/* return the biggest key contained the unit @coord */ -+reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ , -+ reiser4_key * key /* result */ ) -+{ -+ assert("nikita-772", coord != NULL); -+ assert("nikita-774", coord->node != NULL); -+ assert("nikita-775", znode_is_loaded(coord->node)); -+ -+ if (item_plugin_by_coord(coord)->b.max_unit_key != NULL) -+ return item_plugin_by_coord(coord)->b.max_unit_key(coord, key); -+ else -+ return unit_key_by_coord(coord, key); -+} -+ -+/* ->max_key_inside() method for items consisting of exactly one key (like -+ stat-data) */ -+static reiser4_key *max_key_inside_single_key(const coord_t * -+ coord /* coord of item */ , -+ reiser4_key * -+ result /* resulting key */ ) -+{ -+ assert("nikita-604", coord != NULL); -+ -+ /* coord -> key is starting key of this item and it has to be already -+ filled in */ -+ return unit_key_by_coord(coord, result); -+} -+ -+/* ->nr_units() method for items consisting of exactly one unit always */ -+pos_in_node_t -+nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ ) -+{ -+ return 1; -+} -+ -+static int -+paste_no_paste(coord_t * coord UNUSED_ARG, -+ reiser4_item_data * data UNUSED_ARG, -+ carry_plugin_info * info UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* default ->fast_paste() method */ -+static int -+agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ ) -+{ -+ return 1; -+} -+ -+int item_can_contain_key(const coord_t * item /* coord of item */ , -+ const 
reiser4_key * key /* key to check */ , -+ const reiser4_item_data * data /* parameters of item -+ * being created */ ) -+{ -+ item_plugin *iplug; -+ reiser4_key min_key_in_item; -+ reiser4_key max_key_in_item; -+ -+ assert("nikita-1658", item != NULL); -+ assert("nikita-1659", key != NULL); -+ -+ iplug = item_plugin_by_coord(item); -+ if (iplug->b.can_contain_key != NULL) -+ return iplug->b.can_contain_key(item, key, data); -+ else { -+ assert("nikita-1681", iplug->b.max_key_inside != NULL); -+ item_key_by_coord(item, &min_key_in_item); -+ iplug->b.max_key_inside(item, &max_key_in_item); -+ -+ /* can contain key if -+ min_key_in_item <= key && -+ key <= max_key_in_item -+ */ -+ return keyle(&min_key_in_item, key) -+ && keyle(key, &max_key_in_item); -+ } -+} -+ -+/* mergeable method for non mergeable items */ -+static int -+not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG) -+{ -+ return 0; -+} -+ -+/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */ -+int are_items_mergeable(const coord_t * i1 /* coord of first item */ , -+ const coord_t * i2 /* coord of second item */ ) -+{ -+ item_plugin *iplug; -+ reiser4_key k1; -+ reiser4_key k2; -+ -+ assert("nikita-1336", i1 != NULL); -+ assert("nikita-1337", i2 != NULL); -+ -+ iplug = item_plugin_by_coord(i1); -+ assert("nikita-1338", iplug != NULL); -+ -+ /* NOTE-NIKITA are_items_mergeable() is also called by assertions in -+ shifting code when nodes are in "suspended" state. 
*/ -+ assert("nikita-1663", -+ keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2))); -+ -+ if (iplug->b.mergeable != NULL) { -+ return iplug->b.mergeable(i1, i2); -+ } else if (iplug->b.max_key_inside != NULL) { -+ iplug->b.max_key_inside(i1, &k1); -+ item_key_by_coord(i2, &k2); -+ -+ /* mergeable if ->max_key_inside() >= key of i2; */ -+ return keyge(iplug->b.max_key_inside(i1, &k1), -+ item_key_by_coord(i2, &k2)); -+ } else { -+ item_key_by_coord(i1, &k1); -+ item_key_by_coord(i2, &k2); -+ -+ return -+ (get_key_locality(&k1) == get_key_locality(&k2)) && -+ (get_key_objectid(&k1) == get_key_objectid(&k2)) -+ && (iplug == item_plugin_by_coord(i2)); -+ } -+} -+ -+int item_is_extent(const coord_t * item) -+{ -+ assert("vs-482", coord_is_existing_item(item)); -+ return item_id_by_coord(item) == EXTENT_POINTER_ID; -+} -+ -+int item_is_tail(const coord_t * item) -+{ -+ assert("vs-482", coord_is_existing_item(item)); -+ return item_id_by_coord(item) == FORMATTING_ID; -+} -+ -+#if REISER4_DEBUG -+ -+int item_is_statdata(const coord_t * item) -+{ -+ assert("vs-516", coord_is_existing_item(item)); -+ return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE); -+} -+ -+int item_is_ctail(const coord_t * item) -+{ -+ assert("edward-xx", coord_is_existing_item(item)); -+ return item_id_by_coord(item) == CTAIL_ID; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+static int change_item(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ /* cannot change constituent item (sd, or dir_item) */ -+ return RETERR(-EINVAL); -+} -+ -+static reiser4_plugin_ops item_plugin_ops = { -+ .init = NULL, -+ .load = NULL, -+ .save_len = NULL, -+ .save = NULL, -+ .change = change_item -+}; -+ -+item_plugin item_plugins[LAST_ITEM_ID] = { -+ [STATIC_STAT_DATA_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = STATIC_STAT_DATA_ID, -+ .groups = (1 << STAT_DATA_ITEM_TYPE), -+ .pops = &item_plugin_ops, -+ .label = "sd", -+ .desc = "stat-data", -+ 
.linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_single_key, -+ .can_contain_key = NULL, -+ .mergeable = not_mergeable, -+ .nr_units = nr_units_single_unit, -+ .lookup = NULL, -+ .init = NULL, -+ .paste = paste_no_paste, -+ .fast_paste = NULL, -+ .can_shift = NULL, -+ .copy_units = NULL, -+ .create_hook = NULL, -+ .kill_hook = NULL, -+ .shift_hook = NULL, -+ .cut_units = NULL, -+ .kill_units = NULL, -+ .unit_key = NULL, -+ .max_unit_key = NULL, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = NULL -+#endif -+ }, -+ .f = { -+ .utmost_child = NULL, -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .sd = { -+ .init_inode = init_inode_static_sd, -+ .save_len = save_len_static_sd, -+ .save = save_static_sd -+ } -+ } -+ }, -+ [SIMPLE_DIR_ENTRY_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = SIMPLE_DIR_ENTRY_ID, -+ .groups = (1 << DIR_ENTRY_ITEM_TYPE), -+ .pops = &item_plugin_ops, -+ .label = "de", -+ .desc = "directory entry", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_single_key, -+ .can_contain_key = NULL, -+ .mergeable = NULL, -+ .nr_units = nr_units_single_unit, -+ .lookup = NULL, -+ .init = NULL, -+ .paste = NULL, -+ .fast_paste = NULL, -+ .can_shift = NULL, -+ .copy_units = NULL, -+ .create_hook = NULL, -+ .kill_hook = NULL, -+ .shift_hook = NULL, -+ .cut_units = NULL, -+ .kill_units = NULL, -+ .unit_key = NULL, -+ .max_unit_key = NULL, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = NULL -+#endif -+ }, -+ .f = { -+ .utmost_child = NULL, -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .dir = { -+ .extract_key = extract_key_de, -+ .update_key = update_key_de, -+ .extract_name = extract_name_de, -+ .extract_file_type = extract_file_type_de, -+ .add_entry = add_entry_de, -+ .rem_entry = rem_entry_de, 
-+ .max_name_len = max_name_len_de -+ } -+ } -+ }, -+ [COMPOUND_DIR_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = COMPOUND_DIR_ID, -+ .groups = (1 << DIR_ENTRY_ITEM_TYPE), -+ .pops = &item_plugin_ops, -+ .label = "cde", -+ .desc = "compressed directory entry", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_cde, -+ .can_contain_key = can_contain_key_cde, -+ .mergeable = mergeable_cde, -+ .nr_units = nr_units_cde, -+ .lookup = lookup_cde, -+ .init = init_cde, -+ .paste = paste_cde, -+ .fast_paste = agree_to_fast_op, -+ .can_shift = can_shift_cde, -+ .copy_units = copy_units_cde, -+ .create_hook = NULL, -+ .kill_hook = NULL, -+ .shift_hook = NULL, -+ .cut_units = cut_units_cde, -+ .kill_units = kill_units_cde, -+ .unit_key = unit_key_cde, -+ .max_unit_key = unit_key_cde, -+ .estimate = estimate_cde, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = reiser4_check_cde -+#endif -+ }, -+ .f = { -+ .utmost_child = NULL, -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .dir = { -+ .extract_key = extract_key_cde, -+ .update_key = update_key_cde, -+ .extract_name = extract_name_cde, -+ .extract_file_type = extract_file_type_de, -+ .add_entry = add_entry_cde, -+ .rem_entry = rem_entry_cde, -+ .max_name_len = max_name_len_cde -+ } -+ } -+ }, -+ [NODE_POINTER_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = NODE_POINTER_ID, -+ .groups = (1 << INTERNAL_ITEM_TYPE), -+ .pops = NULL, -+ .label = "internal", -+ .desc = "internal item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = NULL, -+ .can_contain_key = NULL, -+ .mergeable = mergeable_internal, -+ .nr_units = nr_units_single_unit, -+ .lookup = lookup_internal, -+ .init = NULL, -+ .paste = NULL, -+ .fast_paste = NULL, -+ .can_shift = NULL, -+ .copy_units = NULL, -+ .create_hook = create_hook_internal, -+ .kill_hook = kill_hook_internal, -+ .shift_hook = 
shift_hook_internal, -+ .cut_units = NULL, -+ .kill_units = NULL, -+ .unit_key = NULL, -+ .max_unit_key = NULL, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = check__internal -+#endif -+ }, -+ .f = { -+ .utmost_child = utmost_child_internal, -+ .utmost_child_real_block = -+ utmost_child_real_block_internal, -+ .update = reiser4_update_internal, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .internal = { -+ .down_link = down_link_internal, -+ .has_pointer_to = has_pointer_to_internal -+ } -+ } -+ }, -+ [EXTENT_POINTER_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = EXTENT_POINTER_ID, -+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE), -+ .pops = NULL, -+ .label = "extent", -+ .desc = "extent item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_extent, -+ .can_contain_key = can_contain_key_extent, -+ .mergeable = mergeable_extent, -+ .nr_units = nr_units_extent, -+ .lookup = lookup_extent, -+ .init = NULL, -+ .paste = paste_extent, -+ .fast_paste = agree_to_fast_op, -+ .can_shift = can_shift_extent, -+ .create_hook = create_hook_extent, -+ .copy_units = copy_units_extent, -+ .kill_hook = kill_hook_extent, -+ .shift_hook = NULL, -+ .cut_units = cut_units_extent, -+ .kill_units = kill_units_extent, -+ .unit_key = unit_key_extent, -+ .max_unit_key = max_unit_key_extent, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = reiser4_check_extent -+#endif -+ }, -+ .f = { -+ .utmost_child = utmost_child_extent, -+ .utmost_child_real_block = -+ utmost_child_real_block_extent, -+ .update = NULL, -+ .scan = reiser4_scan_extent, -+ .convert = NULL, -+ .key_by_offset = key_by_offset_extent -+ }, -+ .s = { -+ .file = { -+ .write = reiser4_write_extent, -+ .read = reiser4_read_extent, -+ .readpage = reiser4_readpage_extent, -+ .get_block = get_block_address_extent, -+ .append_key = append_key_extent, -+ .init_coord_extension = -+ init_coord_extension_extent 
-+ } -+ } -+ }, -+ [FORMATTING_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = FORMATTING_ID, -+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE), -+ .pops = NULL, -+ .label = "body", -+ .desc = "body (or tail?) item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_tail, -+ .can_contain_key = can_contain_key_tail, -+ .mergeable = mergeable_tail, -+ .nr_units = nr_units_tail, -+ .lookup = lookup_tail, -+ .init = NULL, -+ .paste = paste_tail, -+ .fast_paste = agree_to_fast_op, -+ .can_shift = can_shift_tail, -+ .create_hook = NULL, -+ .copy_units = copy_units_tail, -+ .kill_hook = kill_hook_tail, -+ .shift_hook = NULL, -+ .cut_units = cut_units_tail, -+ .kill_units = kill_units_tail, -+ .unit_key = unit_key_tail, -+ .max_unit_key = unit_key_tail, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = NULL -+#endif -+ }, -+ .f = { -+ .utmost_child = NULL, -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = NULL, -+ .convert = NULL -+ }, -+ .s = { -+ .file = { -+ .write = reiser4_write_tail, -+ .read = reiser4_read_tail, -+ .readpage = readpage_tail, -+ .get_block = get_block_address_tail, -+ .append_key = append_key_tail, -+ .init_coord_extension = -+ init_coord_extension_tail -+ } -+ } -+ }, -+ [CTAIL_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = CTAIL_ID, -+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE), -+ .pops = NULL, -+ .label = "ctail", -+ .desc = "cryptcompress tail item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = max_key_inside_tail, -+ .can_contain_key = can_contain_key_ctail, -+ .mergeable = mergeable_ctail, -+ .nr_units = nr_units_ctail, -+ .lookup = NULL, -+ .init = init_ctail, -+ .paste = paste_ctail, -+ .fast_paste = agree_to_fast_op, -+ .can_shift = can_shift_ctail, -+ .create_hook = create_hook_ctail, -+ .copy_units = copy_units_ctail, -+ .kill_hook = kill_hook_ctail, -+ .shift_hook = shift_hook_ctail, -+ .cut_units = 
cut_units_ctail, -+ .kill_units = kill_units_ctail, -+ .unit_key = unit_key_tail, -+ .max_unit_key = unit_key_tail, -+ .estimate = estimate_ctail, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = check_ctail -+#endif -+ }, -+ .f = { -+ .utmost_child = utmost_child_ctail, -+ /* FIXME-EDWARD: write this */ -+ .utmost_child_real_block = NULL, -+ .update = NULL, -+ .scan = scan_ctail, -+ .convert = convert_ctail -+ }, -+ .s = { -+ .file = { -+ .write = NULL, -+ .read = read_ctail, -+ .readpage = readpage_ctail, -+ .get_block = get_block_address_tail, -+ .append_key = append_key_ctail, -+ .init_coord_extension = -+ init_coord_extension_tail -+ } -+ } -+ }, -+ [BLACK_BOX_ID] = { -+ .h = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .id = BLACK_BOX_ID, -+ .groups = (1 << OTHER_ITEM_TYPE), -+ .pops = NULL, -+ .label = "blackbox", -+ .desc = "black box item", -+ .linkage = {NULL, NULL} -+ }, -+ .b = { -+ .max_key_inside = NULL, -+ .can_contain_key = NULL, -+ .mergeable = not_mergeable, -+ .nr_units = nr_units_single_unit, -+ /* to need for ->lookup method */ -+ .lookup = NULL, -+ .init = NULL, -+ .paste = NULL, -+ .fast_paste = NULL, -+ .can_shift = NULL, -+ .copy_units = NULL, -+ .create_hook = NULL, -+ .kill_hook = NULL, -+ .shift_hook = NULL, -+ .cut_units = NULL, -+ .kill_units = NULL, -+ .unit_key = NULL, -+ .max_unit_key = NULL, -+ .estimate = NULL, -+ .item_data_by_flow = NULL, -+#if REISER4_DEBUG -+ .check = NULL -+#endif -+ } -+ } -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/item.h linux-2.6.24/fs/reiser4/plugin/item/item.h ---- linux-2.6.24.orig/fs/reiser4/plugin/item/item.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/item.h 2008-01-25 11:40:16.698169785 +0300 -@@ -0,0 +1,398 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* first read balance.c comments before reading this */ -+ -+/* An item_plugin implements all of the operations required for -+ balancing that are item specific. */ -+ -+/* an item plugin also implements other operations that are specific to that -+ item. These go into the item specific operations portion of the item -+ handler, and all of the item specific portions of the item handler are put -+ into a union. */ -+ -+#if !defined( __REISER4_ITEM_H__ ) -+#define __REISER4_ITEM_H__ -+ -+#include "../../forward.h" -+#include "../plugin_header.h" -+#include "../../dformat.h" -+#include "../../seal.h" -+#include "../../plugin/file/file.h" -+ -+#include /* for struct file, struct inode */ -+#include /* for struct page */ -+#include /* for struct dentry */ -+ -+typedef enum { -+ STAT_DATA_ITEM_TYPE, -+ DIR_ENTRY_ITEM_TYPE, -+ INTERNAL_ITEM_TYPE, -+ UNIX_FILE_METADATA_ITEM_TYPE, -+ OTHER_ITEM_TYPE -+} item_type_id; -+ -+/* this is the part of each item plugin that all items are expected to -+ support or at least explicitly fail to support by setting the -+ pointer to null. */ -+struct balance_ops { -+ /* operations called by balancing -+ -+ It is interesting to consider that some of these item -+ operations could be given sources or targets that are not -+ really items in nodes. This could be ok/useful. 
-+ -+ */ -+ /* maximal key that can _possibly_ be occupied by this item -+ -+ When inserting, and node ->lookup() method (called by -+ coord_by_key()) reaches an item after binary search, -+ the ->max_key_inside() item plugin method is used to determine -+ whether new item should pasted into existing item -+ (new_key<=max_key_inside()) or new item has to be created -+ (new_key>max_key_inside()). -+ -+ For items that occupy exactly one key (like stat-data) -+ this method should return this key. For items that can -+ grow indefinitely (extent, directory item) this should -+ return reiser4_max_key(). -+ -+ For example extent with the key -+ -+ (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks, -+ -+ ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff), and -+ */ -+ reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *); -+ -+ /* true if item @coord can merge data at @key. */ -+ int (*can_contain_key) (const coord_t *, const reiser4_key *, -+ const reiser4_item_data *); -+ /* mergeable() - check items for mergeability -+ -+ Optional method. Returns true if two items can be merged. -+ -+ */ -+ int (*mergeable) (const coord_t *, const coord_t *); -+ -+ /* number of atomic things in an item. -+ NOTE FOR CONTRIBUTORS: use a generic method -+ nr_units_single_unit() for solid (atomic) items, as -+ tree operations use it as a criterion of solidness -+ (see is_solid_item macro) */ -+ pos_in_node_t(*nr_units) (const coord_t *); -+ -+ /* search within item for a unit within the item, and return a -+ pointer to it. This can be used to calculate how many -+ bytes to shrink an item if you use pointer arithmetic and -+ compare to the start of the item body if the item's data -+ are continuous in the node, if the item's data are not -+ continuous in the node, all sorts of other things are maybe -+ going to break as well. 
*/ -+ lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *); -+ /* method called by ode_plugin->create_item() to initialise new -+ item */ -+ int (*init) (coord_t * target, coord_t * from, -+ reiser4_item_data * data); -+ /* method called (e.g., by reiser4_resize_item()) to place new data -+ into item when it grows */ -+ int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *); -+ /* return true if paste into @coord is allowed to skip -+ carry. That is, if such paste would require any changes -+ at the parent level -+ */ -+ int (*fast_paste) (const coord_t *); -+ /* how many but not more than @want units of @source can be -+ shifted into @target node. If pend == append - we try to -+ append last item of @target by first units of @source. If -+ pend == prepend - we try to "prepend" first item in @target -+ by last units of @source. @target node has @free_space -+ bytes of free space. Total size of those units are returned -+ via @size. -+ -+ @target is not NULL if shifting to the mergeable item and -+ NULL is new item will be created during shifting. -+ */ -+ int (*can_shift) (unsigned free_space, coord_t *, -+ znode *, shift_direction, unsigned *size, -+ unsigned want); -+ -+ /* starting off @from-th unit of item @source append or -+ prepend @count units to @target. @target has been already -+ expanded by @free_space bytes. That must be exactly what is -+ needed for those items in @target. If @where_is_free_space -+ == SHIFT_LEFT - free space is at the end of @target item, -+ othersize - it is in the beginning of it. 
*/ -+ void (*copy_units) (coord_t *, coord_t *, -+ unsigned from, unsigned count, -+ shift_direction where_is_free_space, -+ unsigned free_space); -+ -+ int (*create_hook) (const coord_t *, void *); -+ /* do whatever is necessary to do when @count units starting -+ from @from-th one are removed from the tree */ -+ /* FIXME-VS: this is used to be here for, in particular, -+ extents and items of internal type to free blocks they point -+ to at the same time with removing items from a -+ tree. Problems start, however, when dealloc_block fails due -+ to some reason. Item gets removed, but blocks it pointed to -+ are not freed. It is not clear how to fix this for items of -+ internal type because a need to remove internal item may -+ appear in the middle of balancing, and there is no way to -+ undo changes made. OTOH, if space allocator involves -+ balancing to perform dealloc_block - this will probably -+ break balancing due to deadlock issues -+ */ -+ int (*kill_hook) (const coord_t *, pos_in_node_t from, -+ pos_in_node_t count, struct carry_kill_data *); -+ int (*shift_hook) (const coord_t *, unsigned from, unsigned count, -+ znode * _node); -+ -+ /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key -+ including boundaries. When units are cut from item beginning - move space which gets freed to head of -+ item. When units are cut from item end - move freed space to item end. When units are cut from the middle of -+ item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in -+ @smallest_removed if it is not 0. 
Save new first item key in @new_first_key if it is not 0 -+ */ -+ int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *, -+ reiser4_key * smallest_removed, -+ reiser4_key * new_first_key); -+ -+ /* like cut_units, except that these units are removed from the -+ tree, not only from a node */ -+ int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *, -+ reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+ -+ /* if @key_of_coord == 1 - returned key of coord, otherwise - -+ key of unit is returned. If @coord is not set to certain -+ unit - ERR_PTR(-ENOENT) is returned */ -+ reiser4_key *(*unit_key) (const coord_t *, reiser4_key *); -+ reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *); -+ /* estimate how much space is needed for paste @data into item at -+ @coord. if @coord==0 - estimate insertion, otherwise - estimate -+ pasting -+ */ -+ int (*estimate) (const coord_t *, const reiser4_item_data *); -+ -+ /* converts flow @f to item data. @coord == 0 on insert */ -+ int (*item_data_by_flow) (const coord_t *, const flow_t *, -+ reiser4_item_data *); -+ -+ /*void (*show) (struct seq_file *, coord_t *); */ -+ -+#if REISER4_DEBUG -+ /* used for debugging, every item should have here the most -+ complete possible check of the consistency of the item that -+ the inventor can construct */ -+ int (*check) (const coord_t *, const char **error); -+#endif -+ -+}; -+ -+struct flush_ops { -+ /* return the right or left child of @coord, only if it is in memory */ -+ int (*utmost_child) (const coord_t *, sideof side, jnode ** child); -+ -+ /* return whether the right or left child of @coord has a non-fake -+ block number. 
*/ -+ int (*utmost_child_real_block) (const coord_t *, sideof side, -+ reiser4_block_nr *); -+ /* relocate child at @coord to the @block */ -+ void (*update) (const coord_t *, const reiser4_block_nr *); -+ /* count unformatted nodes per item for leave relocation policy, etc.. */ -+ int (*scan) (flush_scan * scan); -+ /* convert item by flush */ -+ int (*convert) (flush_pos_t * pos); -+ /* backward mapping from jnode offset to a key. */ -+ int (*key_by_offset) (struct inode *, loff_t, reiser4_key *); -+}; -+ -+/* operations specific to the directory item */ -+struct dir_entry_iops { -+ /* extract stat-data key from directory entry at @coord and place it -+ into @key. */ -+ int (*extract_key) (const coord_t *, reiser4_key * key); -+ /* update object key in item. */ -+ int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *); -+ /* extract name from directory entry at @coord and return it */ -+ char *(*extract_name) (const coord_t *, char *buf); -+ /* extract file type (DT_* stuff) from directory entry at @coord and -+ return it */ -+ unsigned (*extract_file_type) (const coord_t *); -+ int (*add_entry) (struct inode * dir, -+ coord_t *, lock_handle *, -+ const struct dentry * name, -+ reiser4_dir_entry_desc * entry); -+ int (*rem_entry) (struct inode * dir, const struct qstr * name, -+ coord_t *, lock_handle *, -+ reiser4_dir_entry_desc * entry); -+ int (*max_name_len) (const struct inode * dir); -+}; -+ -+/* operations specific to items regular (unix) file metadata are built of */ -+struct file_iops{ -+ int (*write) (struct file *, struct inode *, -+ const char __user *, size_t, loff_t *pos); -+ int (*read) (struct file *, flow_t *, hint_t *); -+ int (*readpage) (void *, struct page *); -+ int (*get_block) (const coord_t *, sector_t, sector_t *); -+ /* -+ * key of first byte which is not addressed by the item @coord is set -+ * to. 
-+ * For example, for extent item with the key -+ * -+ * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks, -+ * -+ * ->append_key is -+ * -+ * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size) -+ */ -+ reiser4_key *(*append_key) (const coord_t *, reiser4_key *); -+ -+ void (*init_coord_extension) (uf_coord_t *, loff_t); -+}; -+ -+/* operations specific to items of stat data type */ -+struct sd_iops { -+ int (*init_inode) (struct inode * inode, char *sd, int len); -+ int (*save_len) (struct inode * inode); -+ int (*save) (struct inode * inode, char **area); -+}; -+ -+/* operations specific to internal item */ -+struct internal_iops{ -+ /* all tree traversal want to know from internal item is where -+ to go next. */ -+ void (*down_link) (const coord_t * coord, -+ const reiser4_key * key, reiser4_block_nr * block); -+ /* check that given internal item contains given pointer. */ -+ int (*has_pointer_to) (const coord_t * coord, -+ const reiser4_block_nr * block); -+}; -+ -+struct item_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* methods common for all item types */ -+ struct balance_ops b; /* balance operations */ -+ struct flush_ops f; /* flush operates with items via this methods */ -+ -+ /* methods specific to particular type of item */ -+ union { -+ struct dir_entry_iops dir; -+ struct file_iops file; -+ struct sd_iops sd; -+ struct internal_iops internal; -+ } s; -+}; -+ -+#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit) -+ -+static inline item_id item_id_by_plugin(item_plugin * plugin) -+{ -+ return plugin->h.id; -+} -+ -+static inline char get_iplugid(item_plugin * iplug) -+{ -+ assert("nikita-2838", iplug != NULL); -+ assert("nikita-2839", iplug->h.id < 0xff); -+ return (char)item_id_by_plugin(iplug); -+} -+ -+extern unsigned long znode_times_locked(const znode * z); -+ -+static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug) -+{ -+ assert("nikita-2837", coord != NULL); -+ 
assert("nikita-2838", iplug != NULL); -+ coord->iplugid = get_iplugid(iplug); -+ ON_DEBUG(coord->plug_v = znode_times_locked(coord->node)); -+} -+ -+static inline item_plugin *coord_iplug(const coord_t * coord) -+{ -+ assert("nikita-2833", coord != NULL); -+ assert("nikita-2834", coord->iplugid != INVALID_PLUGID); -+ assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node)); -+ return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE, -+ coord->iplugid); -+} -+ -+extern int item_can_contain_key(const coord_t * item, const reiser4_key * key, -+ const reiser4_item_data *); -+extern int are_items_mergeable(const coord_t * i1, const coord_t * i2); -+extern int item_is_extent(const coord_t *); -+extern int item_is_tail(const coord_t *); -+extern int item_is_statdata(const coord_t * item); -+extern int item_is_ctail(const coord_t *); -+ -+extern pos_in_node_t item_length_by_coord(const coord_t * coord); -+extern pos_in_node_t nr_units_single_unit(const coord_t * coord); -+extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ ); -+extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key); -+extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *); -+extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key); -+extern reiser4_key *max_unit_key_by_coord(const coord_t * coord, -+ reiser4_key * key); -+extern void obtain_item_plugin(const coord_t * coord); -+ -+#if defined(REISER4_DEBUG) -+extern int znode_is_loaded(const znode * node); -+#endif -+ -+/* return plugin of item at @coord */ -+static inline item_plugin *item_plugin_by_coord(const coord_t * -+ coord /* coord to query */ ) -+{ -+ assert("nikita-330", coord != NULL); -+ assert("nikita-331", coord->node != NULL); -+ assert("nikita-332", znode_is_loaded(coord->node)); -+ -+ if (unlikely(!coord_is_iplug_set(coord))) -+ obtain_item_plugin(coord); -+ return coord_iplug(coord); -+} -+ -+/* this returns true if item is of 
internal type */ -+static inline int item_is_internal(const coord_t * item) -+{ -+ assert("vs-483", coord_is_existing_item(item)); -+ return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE); -+} -+ -+extern void item_body_by_coord_hard(coord_t * coord); -+extern void *item_body_by_coord_easy(const coord_t * coord); -+#if REISER4_DEBUG -+extern int item_body_is_valid(const coord_t * coord); -+#endif -+ -+/* return pointer to item body */ -+static inline void *item_body_by_coord(const coord_t * -+ coord /* coord to query */ ) -+{ -+ assert("nikita-324", coord != NULL); -+ assert("nikita-325", coord->node != NULL); -+ assert("nikita-326", znode_is_loaded(coord->node)); -+ -+ if (coord->offset == INVALID_OFFSET) -+ item_body_by_coord_hard((coord_t *) coord); -+ assert("nikita-3201", item_body_is_valid(coord)); -+ assert("nikita-3550", coord->body_v == znode_times_locked(coord->node)); -+ return item_body_by_coord_easy(coord); -+} -+ -+/* __REISER4_ITEM_H__ */ -+#endif -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/Makefile linux-2.6.24/fs/reiser4/plugin/item/Makefile ---- linux-2.6.24.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/Makefile 2008-01-25 11:39:07.024230357 +0300 -@@ -0,0 +1,18 @@ -+obj-$(CONFIG_REISER4_FS) += item_plugins.o -+ -+item_plugins-objs := \ -+ item.o \ -+ static_stat.o \ -+ sde.o \ -+ cde.o \ -+ blackbox.o \ -+ internal.o \ -+ tail.o \ -+ ctail.o \ -+ extent.o \ -+ extent_item_ops.o \ -+ extent_file_ops.o \ -+ extent_flush_ops.o -+ -+ -+ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/sde.c linux-2.6.24/fs/reiser4/plugin/item/sde.c ---- linux-2.6.24.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/sde.c 2008-01-25 11:39:07.024230357 +0300 -@@ -0,0 +1,190 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Directory entry implementation */ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../coord.h" -+#include "sde.h" -+#include "item.h" -+#include "../plugin.h" -+#include "../../znode.h" -+#include "../../carry.h" -+#include "../../tree.h" -+#include "../../inode.h" -+ -+#include /* for struct inode */ -+#include /* for struct dentry */ -+#include -+ -+/* ->extract_key() method of simple directory item plugin. 
*/ -+int extract_key_de(const coord_t * coord /* coord of item */ , -+ reiser4_key * key /* resulting key */ ) -+{ -+ directory_entry_format *dent; -+ -+ assert("nikita-1458", coord != NULL); -+ assert("nikita-1459", key != NULL); -+ -+ dent = (directory_entry_format *) item_body_by_coord(coord); -+ assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent); -+ return extract_key_from_id(&dent->id, key); -+} -+ -+int -+update_key_de(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh UNUSED_ARG) -+{ -+ directory_entry_format *dent; -+ obj_key_id obj_id; -+ int result; -+ -+ assert("nikita-2342", coord != NULL); -+ assert("nikita-2343", key != NULL); -+ -+ dent = (directory_entry_format *) item_body_by_coord(coord); -+ result = build_obj_key_id(key, &obj_id); -+ if (result == 0) { -+ dent->id = obj_id; -+ znode_make_dirty(coord->node); -+ } -+ return 0; -+} -+ -+char *extract_dent_name(const coord_t * coord, directory_entry_format * dent, -+ char *buf) -+{ -+ reiser4_key key; -+ -+ unit_key_by_coord(coord, &key); -+ if (get_key_type(&key) != KEY_FILE_NAME_MINOR) -+ reiser4_print_address("oops", znode_get_block(coord->node)); -+ if (!is_longname_key(&key)) { -+ if (is_dot_key(&key)) -+ return (char *)"."; -+ else -+ return extract_name_from_key(&key, buf); -+ } else -+ return (char *)dent->name; -+} -+ -+/* ->extract_name() method of simple directory item plugin. */ -+char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf) -+{ -+ directory_entry_format *dent; -+ -+ assert("nikita-1460", coord != NULL); -+ -+ dent = (directory_entry_format *) item_body_by_coord(coord); -+ return extract_dent_name(coord, dent, buf); -+} -+ -+/* ->extract_file_type() method of simple directory item plugin. */ -+unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of -+ * item */ ) -+{ -+ assert("nikita-1764", coord != NULL); -+ /* we don't store file type in the directory entry yet. 
-+ -+ But see comments at kassign.h:obj_key_id -+ */ -+ return DT_UNKNOWN; -+} -+ -+int add_entry_de(struct inode *dir /* directory of item */ , -+ coord_t * coord /* coord of item */ , -+ lock_handle * lh /* insertion lock handle */ , -+ const struct dentry *de /* name to add */ , -+ reiser4_dir_entry_desc * entry /* parameters of new directory -+ * entry */ ) -+{ -+ reiser4_item_data data; -+ directory_entry_format *dent; -+ int result; -+ const char *name; -+ int len; -+ int longname; -+ -+ name = de->d_name.name; -+ len = de->d_name.len; -+ assert("nikita-1163", strlen(name) == len); -+ -+ longname = is_longname(name, len); -+ -+ data.length = sizeof *dent; -+ if (longname) -+ data.length += len + 1; -+ data.data = NULL; -+ data.user = 0; -+ data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID); -+ -+ /* NOTE-NIKITA quota plugin */ -+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length)) -+ return -EDQUOT; -+ -+ result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ ); -+ if (result != 0) -+ return result; -+ -+ dent = (directory_entry_format *) item_body_by_coord(coord); -+ build_inode_key_id(entry->obj, &dent->id); -+ if (longname) { -+ memcpy(dent->name, name, len); -+ put_unaligned(0, &dent->name[len]); -+ } -+ return 0; -+} -+ -+int rem_entry_de(struct inode *dir /* directory of item */ , -+ const struct qstr *name UNUSED_ARG, -+ coord_t * coord /* coord of item */ , -+ lock_handle * lh UNUSED_ARG /* lock handle for -+ * removal */ , -+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of -+ * directory entry -+ * being removed */ ) -+{ -+ coord_t shadow; -+ int result; -+ int length; -+ -+ length = item_length_by_coord(coord); -+ if (inode_get_bytes(dir) < length) { -+ warning("nikita-2627", "Dir is broke: %llu: %llu", -+ (unsigned long long)get_inode_oid(dir), -+ inode_get_bytes(dir)); -+ -+ return RETERR(-EIO); -+ } -+ -+ /* cut_node() is supposed to take pointers to _different_ -+ coords, because it will modify them without respect to -+ 
possible aliasing. To work around this, create temporary copy -+ of @coord. -+ */ -+ coord_dup(&shadow, coord); -+ result = -+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0); -+ if (result == 0) { -+ /* NOTE-NIKITA quota plugin */ -+ DQUOT_FREE_SPACE_NODIRTY(dir, length); -+ } -+ return result; -+} -+ -+int max_name_len_de(const struct inode *dir) -+{ -+ return reiser4_tree_by_inode(dir)->nplug->max_item_size() - -+ sizeof(directory_entry_format) - 2; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/sde.h linux-2.6.24/fs/reiser4/plugin/item/sde.h ---- linux-2.6.24.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/sde.h 2008-01-25 11:39:07.024230357 +0300 -@@ -0,0 +1,66 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Directory entry. */ -+ -+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ) -+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+#include "../../kassign.h" -+#include "../../key.h" -+ -+#include -+#include /* for struct dentry */ -+ -+typedef struct directory_entry_format { -+ /* key of object stat-data. It's not necessary to store whole -+ key here, because it's always key of stat-data, so minor -+ packing locality and offset can be omitted here. But this -+ relies on particular key allocation scheme for stat-data, so, -+ for extensibility sake, whole key can be stored here. -+ -+ We store key as array of bytes, because we don't want 8-byte -+ alignment of dir entries. -+ */ -+ obj_key_id id; -+ /* file name. Null terminated string. 
*/ -+ d8 name[0]; -+} directory_entry_format; -+ -+void print_de(const char *prefix, coord_t * coord); -+int extract_key_de(const coord_t * coord, reiser4_key * key); -+int update_key_de(const coord_t * coord, const reiser4_key * key, -+ lock_handle * lh); -+char *extract_name_de(const coord_t * coord, char *buf); -+unsigned extract_file_type_de(const coord_t * coord); -+int add_entry_de(struct inode *dir, coord_t * coord, -+ lock_handle * lh, const struct dentry *name, -+ reiser4_dir_entry_desc * entry); -+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord, -+ lock_handle * lh, reiser4_dir_entry_desc * entry); -+int max_name_len_de(const struct inode *dir); -+ -+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length); -+ -+char *extract_dent_name(const coord_t * coord, -+ directory_entry_format * dent, char *buf); -+ -+#if REISER4_LARGE_KEY -+#define DE_NAME_BUF_LEN (24) -+#else -+#define DE_NAME_BUF_LEN (16) -+#endif -+ -+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.24/fs/reiser4/plugin/item/static_stat.c ---- linux-2.6.24.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/static_stat.c 2008-01-25 11:39:07.024230357 +0300 -@@ -0,0 +1,1107 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* stat data manipulation. 
*/ -+ -+#include "../../forward.h" -+#include "../../super.h" -+#include "../../vfs_ops.h" -+#include "../../inode.h" -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../object.h" -+#include "../plugin.h" -+#include "../plugin_header.h" -+#include "static_stat.h" -+#include "item.h" -+ -+#include -+#include -+ -+/* see static_stat.h for explanation */ -+ -+/* helper function used while we are dumping/loading inode/plugin state -+ to/from the stat-data. */ -+ -+static void move_on(int *length /* space remaining in stat-data */ , -+ char **area /* current coord in stat data */ , -+ int size_of /* how many bytes to move forward */ ) -+{ -+ assert("nikita-615", length != NULL); -+ assert("nikita-616", area != NULL); -+ -+ *length -= size_of; -+ *area += size_of; -+ -+ assert("nikita-617", *length >= 0); -+} -+ -+/* helper function used while loading inode/plugin state from stat-data. -+ Complain if there is less space in stat-data than was expected. -+ Can only happen on disk corruption. */ -+static int not_enough_space(struct inode *inode /* object being processed */ , -+ const char *where /* error message */ ) -+{ -+ assert("nikita-618", inode != NULL); -+ -+ warning("nikita-619", "Not enough space in %llu while loading %s", -+ (unsigned long long)get_inode_oid(inode), where); -+ -+ return RETERR(-EINVAL); -+} -+ -+/* helper function used while loading inode/plugin state from -+ stat-data. Call it if invalid plugin id was found. */ -+static int unknown_plugin(reiser4_plugin_id id /* invalid id */ , -+ struct inode *inode /* object being processed */ ) -+{ -+ warning("nikita-620", "Unknown plugin %i in %llu", -+ id, (unsigned long long)get_inode_oid(inode)); -+ -+ return RETERR(-EINVAL); -+} -+ -+/* this is installed as ->init_inode() method of -+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). -+ Copies data from on-disk stat-data format into inode. -+ Handles stat-data extensions. 
*/ -+/* was sd_load */ -+int init_inode_static_sd(struct inode *inode /* object being processed */ , -+ char *sd /* stat-data body */ , -+ int len /* length of stat-data */ ) -+{ -+ int result; -+ int bit; -+ int chunk; -+ __u16 mask; -+ __u64 bigmask; -+ reiser4_stat_data_base *sd_base; -+ reiser4_inode *state; -+ -+ assert("nikita-625", inode != NULL); -+ assert("nikita-626", sd != NULL); -+ -+ result = 0; -+ sd_base = (reiser4_stat_data_base *) sd; -+ state = reiser4_inode_data(inode); -+ mask = le16_to_cpu(get_unaligned(&sd_base->extmask)); -+ bigmask = mask; -+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN); -+ -+ move_on(&len, &sd, sizeof *sd_base); -+ for (bit = 0, chunk = 0; -+ mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION; -+ ++bit, mask >>= 1) { -+ if (((bit + 1) % 16) != 0) { -+ /* handle extension */ -+ sd_ext_plugin *sdplug; -+ -+ if (bit >= LAST_SD_EXTENSION) { -+ warning("vpf-1904", -+ "No such extension %i in inode %llu", -+ bit, -+ (unsigned long long) -+ get_inode_oid(inode)); -+ -+ result = RETERR(-EINVAL); -+ break; -+ } -+ -+ sdplug = sd_ext_plugin_by_id(bit); -+ if (sdplug == NULL) { -+ warning("nikita-627", -+ "No such extension %i in inode %llu", -+ bit, -+ (unsigned long long) -+ get_inode_oid(inode)); -+ -+ result = RETERR(-EINVAL); -+ break; -+ } -+ if (mask & 1) { -+ assert("nikita-628", sdplug->present); -+ /* alignment is not supported in node layout -+ plugin yet. 
-+ result = align( inode, &len, &sd, -+ sdplug -> alignment ); -+ if( result != 0 ) -+ return result; */ -+ result = sdplug->present(inode, &sd, &len); -+ } else if (sdplug->absent != NULL) -+ result = sdplug->absent(inode); -+ if (result) -+ break; -+ /* else, we are looking at the last bit in 16-bit -+ portion of bitmask */ -+ } else if (mask & 1) { -+ /* next portion of bitmask */ -+ if (len < (int)sizeof(d16)) { -+ warning("nikita-629", -+ "No space for bitmap in inode %llu", -+ (unsigned long long) -+ get_inode_oid(inode)); -+ -+ result = RETERR(-EINVAL); -+ break; -+ } -+ mask = le16_to_cpu(get_unaligned((d16 *)sd)); -+ bigmask <<= 16; -+ bigmask |= mask; -+ move_on(&len, &sd, sizeof(d16)); -+ ++chunk; -+ if (chunk == 3) { -+ if (!(mask & 0x8000)) { -+ /* clear last bit */ -+ mask &= ~0x8000; -+ continue; -+ } -+ /* too much */ -+ warning("nikita-630", -+ "Too many extensions in %llu", -+ (unsigned long long) -+ get_inode_oid(inode)); -+ -+ result = RETERR(-EINVAL); -+ break; -+ } -+ } else -+ /* bitmask exhausted */ -+ break; -+ } -+ state->extmask = bigmask; -+ /* common initialisations */ -+ if (len - (bit / 16 * sizeof(d16)) > 0) { -+ /* alignment in save_len_static_sd() is taken into account -+ -edward */ -+ warning("nikita-631", "unused space in inode %llu", -+ (unsigned long long)get_inode_oid(inode)); -+ } -+ -+ return result; -+} -+ -+/* estimates size of stat-data required to store inode. -+ Installed as ->save_len() method of -+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). 
*/ -+/* was sd_len */ -+int save_len_static_sd(struct inode *inode /* object being processed */ ) -+{ -+ unsigned int result; -+ __u64 mask; -+ int bit; -+ -+ assert("nikita-632", inode != NULL); -+ -+ result = sizeof(reiser4_stat_data_base); -+ mask = reiser4_inode_data(inode)->extmask; -+ for (bit = 0; mask != 0; ++bit, mask >>= 1) { -+ if (mask & 1) { -+ sd_ext_plugin *sdplug; -+ -+ sdplug = sd_ext_plugin_by_id(bit); -+ assert("nikita-633", sdplug != NULL); -+ /* no aligment support -+ result += -+ round_up( result, sdplug -> alignment ) - result; */ -+ result += sdplug->save_len(inode); -+ } -+ } -+ result += bit / 16 * sizeof(d16); -+ return result; -+} -+ -+/* saves inode into stat-data. -+ Installed as ->save() method of -+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */ -+/* was sd_save */ -+int save_static_sd(struct inode *inode /* object being processed */ , -+ char **area /* where to save stat-data */ ) -+{ -+ int result; -+ __u64 emask; -+ int bit; -+ unsigned int len; -+ reiser4_stat_data_base *sd_base; -+ -+ assert("nikita-634", inode != NULL); -+ assert("nikita-635", area != NULL); -+ -+ result = 0; -+ emask = reiser4_inode_data(inode)->extmask; -+ sd_base = (reiser4_stat_data_base *) * area; -+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask); -+ /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/ -+ -+ *area += sizeof *sd_base; -+ len = 0xffffffffu; -+ for (bit = 0; emask != 0; ++bit, emask >>= 1) { -+ if (emask & 1) { -+ if ((bit + 1) % 16 != 0) { -+ sd_ext_plugin *sdplug; -+ sdplug = sd_ext_plugin_by_id(bit); -+ assert("nikita-636", sdplug != NULL); -+ /* no alignment support yet -+ align( inode, &len, area, -+ sdplug -> alignment ); */ -+ result = sdplug->save(inode, area); -+ if (result) -+ break; -+ } else { -+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), -+ (d16 *)(*area)); -+ /*cputod16((unsigned)(emask & 0xffff), -+ (d16 *) * area);*/ -+ *area += sizeof(d16); -+ } -+ } -+ } -+ 
return result; -+} -+ -+/* stat-data extension handling functions. */ -+ -+static int present_lw_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */ ) -+{ -+ if (*len >= (int)sizeof(reiser4_light_weight_stat)) { -+ reiser4_light_weight_stat *sd_lw; -+ -+ sd_lw = (reiser4_light_weight_stat *) * area; -+ -+ inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode)); -+ inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink)); -+ inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size)); -+ if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) { -+ inode->i_mode &= ~S_IFIFO; -+ warning("", "partially converted file is encountered"); -+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED); -+ } -+ move_on(len, area, sizeof *sd_lw); -+ return 0; -+ } else -+ return not_enough_space(inode, "lw sd"); -+} -+ -+static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being -+ * processed */ ) -+{ -+ return sizeof(reiser4_light_weight_stat); -+} -+ -+static int save_lw_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ ) -+{ -+ reiser4_light_weight_stat *sd; -+ mode_t delta; -+ -+ assert("nikita-2705", inode != NULL); -+ assert("nikita-2706", area != NULL); -+ assert("nikita-2707", *area != NULL); -+ -+ sd = (reiser4_light_weight_stat *) * area; -+ -+ delta = (reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED) ? 
S_IFIFO : 0); -+ put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode); -+ put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink); -+ put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size); -+ *area += sizeof *sd; -+ return 0; -+} -+ -+static int present_unix_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */ ) -+{ -+ assert("nikita-637", inode != NULL); -+ assert("nikita-638", area != NULL); -+ assert("nikita-639", *area != NULL); -+ assert("nikita-640", len != NULL); -+ assert("nikita-641", *len > 0); -+ -+ if (*len >= (int)sizeof(reiser4_unix_stat)) { -+ reiser4_unix_stat *sd; -+ -+ sd = (reiser4_unix_stat *) * area; -+ -+ inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid)); -+ inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid)); -+ inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime)); -+ inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime)); -+ inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime)); -+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) -+ inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev)); -+ else -+ inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes))); -+ move_on(len, area, sizeof *sd); -+ return 0; -+ } else -+ return not_enough_space(inode, "unix sd"); -+} -+ -+static int absent_unix_sd(struct inode *inode /* object being processed */ ) -+{ -+ inode->i_uid = get_super_private(inode->i_sb)->default_uid; -+ inode->i_gid = get_super_private(inode->i_sb)->default_gid; -+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; -+ inode_set_bytes(inode, inode->i_size); -+ /* mark inode as lightweight, so that caller (lookup_common) will -+ complete initialisation by copying [ug]id from a parent. 
*/ -+ reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT); -+ return 0; -+} -+ -+/* Audited by: green(2002.06.14) */ -+/* ->save_len() for UNIX_STAT: the record has a fixed size. */ static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being -+ * processed */ ) -+{ -+ return sizeof(reiser4_unix_stat); -+} -+ -+/* ->save() for UNIX_STAT: store uid, gid, timestamps and rdev-or-bytes at *area in little-endian form, then advance *area by the record size. */ static int save_unix_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ ) -+{ -+ reiser4_unix_stat *sd; -+ -+ assert("nikita-642", inode != NULL); -+ assert("nikita-643", area != NULL); -+ assert("nikita-644", *area != NULL); -+ -+ sd = (reiser4_unix_stat *) * area; -+ put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid); -+ put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid); -+ /* only the low 32 bits of each tv_sec are stored */ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime); -+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime); -+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime); -+ /* same rdev-or-bytes choice as in present_unix_sd() */ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) -+ put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev); -+ else -+ put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes); -+ *area += sizeof *sd; -+ return 0; -+} -+ -+/* ->present() for LARGE_TIMES_STAT: fill in the tv_nsec part of a/m/ctime; the tv_sec part is loaded by present_unix_sd(). */ static int -+present_large_times_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */ ) -+{ -+ if (*len >= (int)sizeof(reiser4_large_times_stat)) { -+ reiser4_large_times_stat *sd_lt; -+ -+ sd_lt = (reiser4_large_times_stat *) * area; -+ -+ inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime)); -+ inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime)); -+ inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime)); -+ -+ move_on(len, area, sizeof *sd_lt); -+ return 0; -+ } else -+ return not_enough_space(inode, "large times sd"); -+} -+ -+/* ->save_len() for LARGE_TIMES_STAT: the record has a fixed size. */ static int -+save_len_large_times_sd(struct inode *inode UNUSED_ARG -+ /* object being processed */ ) -+{ -+ return sizeof(reiser4_large_times_stat); -+} -+ -+/* ->save() for LARGE_TIMES_STAT: store the tv_nsec part of a/m/ctime at *area and advance *area. */ static int
-+save_large_times_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ ) -+{ -+ reiser4_large_times_stat *sd; -+ -+ assert("nikita-2817", inode != NULL); -+ assert("nikita-2818", area != NULL); -+ assert("nikita-2819", *area != NULL); -+ -+ sd = (reiser4_large_times_stat *) * area; -+ -+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime); -+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime); -+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime); -+ -+ *area += sizeof *sd; -+ return 0; -+} -+ -+/* symlink stat data extension */ -+ -+/* allocate a NUL-terminated copy of the first @len bytes of @target, attach it to inode->i_private and set REISER4_GENERIC_PTR_USED. Returns 0, or -ENOMEM if the allocation fails. @inode must have neither i_private nor that flag set on entry (asserted). */ -+static int -+symlink_target_to_inode(struct inode *inode, const char *target, int len) -+{ -+ assert("vs-845", inode->i_private == NULL); -+ assert("vs-846", !reiser4_inode_get_flag(inode, -+ REISER4_GENERIC_PTR_USED)); -+ /* FIXME-VS: this is prone to deadlock. Not more than other similar -+ places, though */ -+ /* one extra byte for the terminating NUL written below */ inode->i_private = kmalloc((size_t) len + 1, -+ reiser4_ctx_gfp_mask_get()); -+ if (!inode->i_private) -+ return RETERR(-ENOMEM); -+ -+ memcpy((char *)(inode->i_private), target, (size_t) len); -+ ((char *)(inode->i_private))[len] = 0; -+ reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED); -+ return 0; -+} -+ -+/* this is called on read_inode: after sanity checks the symlink target stored -+ in the stat data is copied into inode->i_private */ -+static int present_symlink_sd(struct inode *inode, char **area, int *len) -+{ -+ int result; -+ int length; -+ reiser4_symlink_stat *sd; -+ -+ /* NOTE(review): i_size is narrowed to int with no range check here; confirm that a negative or oversized on-disk size is rejected elsewhere */ length = (int)inode->i_size; -+ /* -+ * *len is number of bytes in stat data item from *area to the end of -+ * item.
It must be not less than size of symlink + 1 for ending 0 -+ */ -+ /* NOTE(review): this test only rejects length > *len, so length == *len passes and the terminator check below then reads (*area)[length], one byte past the bytes counted by *len; per the comment above, length >= *len looks like the intended condition -- confirm */ if (length > *len) -+ return not_enough_space(inode, "symlink"); -+ -+ if (*(*area + length) != 0) { -+ warning("vs-840", "Symlink is not zero terminated"); -+ return RETERR(-EIO); -+ } -+ -+ sd = (reiser4_symlink_stat *) * area; -+ result = symlink_target_to_inode(inode, sd->body, length); -+ -+ /* skip the target and its terminating NUL even when the copy failed */ move_on(len, area, length + 1); -+ return result; -+} -+ -+/* ->save_len() for SYMLINK_STAT: target length (i_size) plus one byte for the terminating NUL. */ static int save_len_symlink_sd(struct inode *inode) -+{ -+ return inode->i_size + 1; -+} -+ -+/* this is called on create and update stat data. Do nothing on update but -+ update @area */ -+static int save_symlink_sd(struct inode *inode, char **area) -+{ -+ int result; -+ int length; -+ reiser4_symlink_stat *sd; -+ -+ length = (int)inode->i_size; -+ /* inode->i_size must be set already */ -+ assert("vs-841", length); -+ -+ result = 0; -+ sd = (reiser4_symlink_stat *) * area; -+ if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) { -+ const char *target; -+ -+ /* flag not set yet: i_private still holds the target string handed in on create. It is detached first because symlink_target_to_inode() asserts i_private == NULL, then replaced by a private copy */ target = (const char *)(inode->i_private); -+ inode->i_private = NULL; -+ -+ result = symlink_target_to_inode(inode, target, length); -+ /* NOTE(review): result is not checked before the copy below; on -ENOMEM the target is still written to the stat data while i_private stays NULL */ -+ /* copy symlink to stat data */ -+ memcpy(sd->body, target, (size_t) length); -+ (*area)[length] = 0; -+ } else { -+ /* there is nothing to do in update but move area */ -+ assert("vs-844", -+ !memcmp(inode->i_private, sd->body, -+ (size_t) length + 1)); -+ } -+ -+ *area += (length + 1); -+ return result; -+} -+ -+/* ->present() for FLAGS_STAT: load the 32-bit flag word into inode->i_flags. */ static int present_flags_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */ ) -+{ -+ assert("nikita-645", inode != NULL); -+ assert("nikita-646", area != NULL); -+ assert("nikita-647", *area != NULL); -+ assert("nikita-648", len != NULL); -+ assert("nikita-649", *len > 0); -+ -+ if (*len >= (int)sizeof(reiser4_flags_stat)) { -+ reiser4_flags_stat *sd; -+ -+ sd = (reiser4_flags_stat *) * area; -+ inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags)); -+
move_on(len, area, sizeof *sd); -+ return 0; -+ } else -+ /* NOTE(review): the label passed here says "generation and attrs" although this is the flags extension */ return not_enough_space(inode, "generation and attrs"); -+} -+ -+/* Audited by: green(2002.06.14) */ -+/* ->save_len() for FLAGS_STAT: the record has a fixed size. */ static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being -+ * processed */ ) -+{ -+ return sizeof(reiser4_flags_stat); -+} -+ -+/* ->save() for FLAGS_STAT: store inode->i_flags as a little-endian 32-bit word and advance *area. */ static int save_flags_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ ) -+{ -+ reiser4_flags_stat *sd; -+ -+ assert("nikita-650", inode != NULL); -+ assert("nikita-651", area != NULL); -+ assert("nikita-652", *area != NULL); -+ -+ sd = (reiser4_flags_stat *) * area; -+ put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags); -+ *area += sizeof *sd; -+ return 0; -+} -+ -+static int absent_plugin_sd(struct inode *inode); -+/* Common loader for the plugin-set (pset) and heir-set (hset) extensions. The extension is a reiser4_plugin_stat header holding the slot count, followed by that many reiser4_plugin_slot records; each slot names a set member and a plugin id and may be followed by plugin-private data that the plugin's ->load() consumes. Every member seen is recorded in a bitmask that ends up in plugin_mask or heir_mask. Returns 0 or a negative error code. */ static int present_plugin_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */ , -+ int *len /* remaining length */, -+ int is_pset /* 1 if plugin set, 0 if heir set. */) -+{ -+ reiser4_plugin_stat *sd; -+ reiser4_plugin *plugin; -+ reiser4_inode *info; -+ int i; -+ __u16 mask; -+ int result; -+ int num_of_plugins; -+ -+ assert("nikita-653", inode != NULL); -+ assert("nikita-654", area != NULL); -+ assert("nikita-655", *area != NULL); -+ assert("nikita-656", len != NULL); -+ assert("nikita-657", *len > 0); -+ -+ if (*len < (int)sizeof(reiser4_plugin_stat)) -+ return not_enough_space(inode, "plugin"); -+ -+ sd = (reiser4_plugin_stat *) * area; -+ info = reiser4_inode_data(inode); -+ -+ mask = 0; -+ num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no)); -+ move_on(len, area, sizeof *sd); -+ result = 0; -+ for (i = 0; i < num_of_plugins; ++i) { -+ reiser4_plugin_slot *slot; -+ reiser4_plugin_type type; -+ pset_member memb; -+ -+ slot = (reiser4_plugin_slot *) * area; -+ /* the remaining length is checked before any slot field is read */ if (*len < (int)sizeof *slot) -+ return not_enough_space(inode, "additional plugin"); -+ -+ memb = le16_to_cpu(get_unaligned(&slot->pset_memb)); -+ type =
aset_member_to_type_unsafe(memb); -+ -+ /* presumably aset_member_to_type_unsafe() yields REISER4_PLUGIN_TYPES for a member index it does not know -- TODO confirm against its definition */ if (type == REISER4_PLUGIN_TYPES) { -+ warning("nikita-3502", -+ "wrong %s member (%i) for %llu", is_pset ? -+ "pset" : "hset", memb, -+ (unsigned long long)get_inode_oid(inode)); -+ return RETERR(-EINVAL); -+ } -+ plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode), -+ type, &slot->id); -+ if (plugin == NULL) -+ return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode); -+ -+ /* plugin is loaded into inode, mark this into inode's -+ bitmask of loaded non-standard plugins */ -+ /* a member that appears twice in the same extension is rejected */ if (!(mask & (1 << memb))) { -+ mask |= (1 << memb); -+ } else { -+ warning("nikita-658", "duplicate plugin for %llu", -+ (unsigned long long)get_inode_oid(inode)); -+ return RETERR(-EINVAL); -+ } -+ move_on(len, area, sizeof *slot); -+ /* load plugin data, if any */ -+ /* without a ->load() method the plugin is stored directly into the pset or hset */ if (plugin->h.pops != NULL && plugin->h.pops->load) -+ result = plugin->h.pops->load(inode, plugin, area, len); -+ else -+ result = aset_set_unsafe(is_pset ? &info->pset : -+ &info->hset, memb, plugin); -+ if (result) -+ return result; -+ } -+ if (is_pset) { -+ /* if object plugin wasn't loaded from stat-data, guess it by -+ mode bits */ -+ plugin = file_plugin_to_plugin(inode_file_plugin(inode)); -+ if (plugin == NULL) -+ result = absent_plugin_sd(inode); -+ info->plugin_mask = mask; -+ } else -+ info->heir_mask = mask; -+ -+ return result; -+} -+ -+/* ->present() for PLUGIN_STAT: load slots into the main plugin set (pset). */ static int present_pset_sd(struct inode *inode, char **area, int *len) { -+ return present_plugin_sd(inode, area, len, 1 /* pset */); -+} -+ -+/* Determine object plugin for @inode based on i_mode. -+ -+ Many objects in reiser4 file system are controlled by standard object -+ plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on. -+ -+ For such files we don't explicitly store plugin id in object stat -+ data. Rather required plugin is guessed from mode bits, where file "type" -+ is encoded (see stat(2)).
-+*/ -+static int -+guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ ) -+{ -+ int fplug_id; -+ int dplug_id; -+ reiser4_inode *info; -+ -+ assert("nikita-736", inode != NULL); -+ -+ /* -1 stands for "no plugin of this kind" */ dplug_id = fplug_id = -1; -+ -+ switch (inode->i_mode & S_IFMT) { -+ case S_IFSOCK: -+ case S_IFBLK: -+ case S_IFCHR: -+ case S_IFIFO: -+ fplug_id = SPECIAL_FILE_PLUGIN_ID; -+ break; -+ case S_IFLNK: -+ fplug_id = SYMLINK_FILE_PLUGIN_ID; -+ break; -+ case S_IFDIR: -+ fplug_id = DIRECTORY_FILE_PLUGIN_ID; -+ dplug_id = HASHED_DIR_PLUGIN_ID; -+ break; -+ default: -+ /* unknown type bits: fail before the plugin set is touched; because of the return, control never falls into the S_IFREG case below */ warning("nikita-737", "wrong file mode: %o", inode->i_mode); -+ return RETERR(-EIO); -+ case S_IFREG: -+ fplug_id = UNIX_FILE_PLUGIN_ID; -+ break; -+ } -+ info = reiser4_inode_data(inode); -+ /* only directories get a dir plugin; every other type stores NULL in PSET_DIR */ set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ? -+ plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL); -+ set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ? -+ plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL); -+ return 0; -+} -+ -+/* Audited by: green(2002.06.14) */ -+/* ->absent() for PLUGIN_STAT: no plugin extension on disk, so derive the file (and directory) plugin from i_mode. */ static int absent_plugin_sd(struct inode *inode /* object being processed */ ) -+{ -+ int result; -+ -+ assert("nikita-659", inode != NULL); -+ -+ result = guess_plugin_by_mode(inode); -+ /* if mode was wrong, guess_plugin_by_mode() returns "regular file", -+ but setup_inode_ops() will call make_bad_inode(). -+ Another, more logical but bit more complex solution is to add -+ "bad-file plugin". NOTE(review): as written, guess_plugin_by_mode() returns -EIO for an unknown mode without setting any plugin, so the "regular file" wording above looks stale -- confirm.
*/ -+ /* FIXME-VS: activate was called here */ -+ return result; -+} -+ -+/* helper function for save_len_plugin_sd(): add to @len the space needed to -+ save @plugin as member @memb and return the new total. Nothing is added when @plugin is NULL or when the member's bit is not set in plugin_mask (pset) or heir_mask (hset); otherwise one slot plus whatever the plugin's ->save_len() reports */ -+/* Audited by: green(2002.06.14) */ -+static int len_for(reiser4_plugin * plugin /* plugin to save */ , -+ struct inode *inode /* object being processed */ , -+ pset_member memb, -+ int len, int is_pset) -+{ -+ reiser4_inode *info; -+ assert("nikita-661", inode != NULL); -+ -+ if (plugin == NULL) -+ return len; -+ -+ info = reiser4_inode_data(inode); -+ if (is_pset ? -+ info->plugin_mask & (1 << memb) : -+ info->heir_mask & (1 << memb)) { -+ len += sizeof(reiser4_plugin_slot); -+ if (plugin->h.pops && plugin->h.pops->save_len != NULL) { -+ /* non-standard plugin, call method */ -+ /* commented as it is incompatible with alignment -+ * policy in save_plug() -edward */ -+ /* len = round_up(len, plugin->h.pops->alignment); */ -+ len += plugin->h.pops->save_len(inode, plugin); -+ } -+ } -+ return len; -+} -+ -+/* calculate how much space is required to save state of all plugins, -+ associated with inode; returns 0 when the relevant mask (plugin_mask for pset, heir_mask for hset) is empty */ -+static int save_len_plugin_sd(struct inode *inode /* object being processed */, -+ int is_pset) -+{ -+ int len; -+ int last; -+ reiser4_inode *state; -+ pset_member memb; -+ -+ assert("nikita-663", inode != NULL); -+ -+ state = reiser4_inode_data(inode); -+ -+ /* common case: no non-standard plugins */ -+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0) -+ return 0; -+ len = sizeof(reiser4_plugin_stat); -+ last = PSET_LAST; -+ -+ for (memb = 0; memb < last; ++memb) { -+ len = len_for(aset_get(is_pset ? state->pset : state->hset, memb), -+ inode, memb, len, is_pset); -+ } -+ /* a non-empty mask is expected to have contributed at least one slot */ assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat)); -+ return len; -+} -+ -+/* ->save_len() for PLUGIN_STAT. */ static int save_len_pset_sd(struct inode *inode) { -+ return save_len_plugin_sd(inode, 1 /* pset */); -+} -+ -+/* helper function for save_plugin_sd(): save plugin, associated with -+ inode.
*/ -+static int save_plug(reiser4_plugin * plugin /* plugin to save */ , -+ struct inode *inode /* object being processed */ , -+ int memb /* what element of pset (or hset) is saved */ , -+ char **area /* position in stat-data */ , -+ int *count /* incremented if plugin was actually saved. */, -+ int is_pset /* 1 for plugin set, 0 for heir set */) -+{ -+ reiser4_plugin_slot *slot; -+ int fake_len; -+ int result; -+ -+ assert("nikita-665", inode != NULL); -+ assert("nikita-666", area != NULL); -+ assert("nikita-667", *area != NULL); -+ -+ if (plugin == NULL) -+ return 0; -+ -+ /* members whose bit is not set in the relevant mask are not written */ if (is_pset ? -+ !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) : -+ !(reiser4_inode_data(inode)->heir_mask & (1 << memb))) -+ return 0; -+ slot = (reiser4_plugin_slot *) * area; -+ put_unaligned(cpu_to_le16(memb), &slot->pset_memb); -+ put_unaligned(cpu_to_le16(plugin->h.id), &slot->id); -+ /* dummy remaining-length counter so that move_on() can be used to step *area; its value is not read afterwards */ fake_len = (int)0xffff; -+ move_on(&fake_len, area, sizeof *slot); -+ ++*count; -+ result = 0; -+ /* plugin-private state, if the plugin has a ->save() method, follows the slot */ if (plugin->h.pops != NULL) { -+ if (plugin->h.pops->save != NULL) -+ result = plugin->h.pops->save(inode, plugin, area); -+ } -+ return result; -+} -+ -+/* save state of all non-standard plugins associated with inode */ -+static int save_plugin_sd(struct inode *inode /* object being processed */ , -+ char **area /* position in stat-data */, -+ int is_pset /* 1 for pset, 0 for hset */) -+{ -+ int fake_len; -+ int result = 0; -+ int num_of_plugins; -+ reiser4_plugin_stat *sd; -+ reiser4_inode *state; -+ pset_member memb; -+ -+ assert("nikita-669", inode != NULL); -+ assert("nikita-670", area != NULL); -+ assert("nikita-671", *area != NULL); -+ -+ state = reiser4_inode_data(inode); -+ /* empty mask: nothing is written and *area is left untouched */ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0) -+ return 0; -+ /* keep a pointer to the header: plugins_no is filled in after the loop, once the number of saved slots is known */ sd = (reiser4_plugin_stat *) * area; -+ fake_len = (int)0xffff; -+ move_on(&fake_len, area, sizeof *sd); -+ -+ num_of_plugins = 0; -+ for (memb = 0; memb < PSET_LAST; ++memb) { -+ result = save_plug(aset_get(is_pset ?
state->pset : state->hset, -+ memb), -+ inode, memb, area, &num_of_plugins, is_pset); -+ if (result != 0) -+ break; -+ } -+ -+ /* the slot count is written even when the loop stopped early on an error */ put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no); -+ return result; -+} -+ -+/* The wrappers below bind is_pset: 1 selects the main plugin set (pset), 0 the heir set (hset). */ static int save_pset_sd(struct inode *inode, char **area) { -+ return save_plugin_sd(inode, area, 1 /* pset */); -+} -+ -+static int present_hset_sd(struct inode *inode, char **area, int *len) { -+ return present_plugin_sd(inode, area, len, 0 /* hset */); -+} -+ -+static int save_len_hset_sd(struct inode *inode) { -+ return save_len_plugin_sd(inode, 0 /* hset */); -+} -+ -+static int save_hset_sd(struct inode *inode, char **area) { -+ return save_plugin_sd(inode, area, 0 /* hset */); -+} -+ -+/* helper function for present_crypto_sd(). -+ Extract crypto info (key size and key id) from stat-data and attach it to inode */ -+static int extract_crypto_info (struct inode * inode, -+ reiser4_crypto_stat * sd) -+{ -+ struct reiser4_crypto_info * info; -+ assert("edward-11", !inode_crypto_info(inode)); -+ assert("edward-1413", -+ !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)); -+ /* create and attach a crypto-stat without secret key loaded */ -+ info = reiser4_alloc_crypto_info(inode); -+ if (IS_ERR(info)) -+ return PTR_ERR(info); -+ info->keysize = le16_to_cpu(get_unaligned(&sd->keysize)); -+ /* the key id is fipsize bytes long, fipsize being taken from the inode's digest plugin */ memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize); -+ reiser4_attach_crypto_info(inode, info); -+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED); -+ return 0; -+} -+ -+/* crypto stat-data extension */ -+ -+/* ->present() for CRYPTO_STAT: load key size and key id into a crypto info attached to @inode and step over the extension. */ static int present_crypto_sd(struct inode *inode, char **area, int *len) -+{ -+ int result; -+ reiser4_crypto_stat *sd; -+ digest_plugin *dplug = inode_digest_plugin(inode); -+ -+ assert("edward-06", dplug != NULL); -+ assert("edward-684", dplug->fipsize); -+ assert("edward-07", area != NULL); -+ assert("edward-08", *area != NULL); -+ assert("edward-09", len != NULL); -+ assert("edward-10", *len > 0); -+ -+ if (*len <
(int)sizeof(reiser4_crypto_stat)) { -+ return not_enough_space(inode, "crypto-sd"); -+ } -+ /* *len is number of bytes in stat data item from *area to the end of -+ item. It must be not less than size of this extension */ -+ /* NOTE(review): the runtime check above covers only the fixed header; the fipsize bytes of key id are guarded by this assert alone -- confirm that a short item cannot reach the memcpy in extract_crypto_info() */ assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len); -+ -+ sd = (reiser4_crypto_stat *) * area; -+ result = extract_crypto_info(inode, sd); -+ move_on(len, area, sizeof(*sd) + dplug->fipsize); -+ -+ return result; -+} -+ -+/* ->save_len() for CRYPTO_STAT: fixed header plus fipsize bytes of key id. */ static int save_len_crypto_sd(struct inode *inode) -+{ -+ return sizeof(reiser4_crypto_stat) + -+ inode_digest_plugin(inode)->fipsize; -+} -+ -+/* ->save() for CRYPTO_STAT: key size and key id are written only while REISER4_CRYPTO_STAT_LOADED is clear (file just created); *area is advanced past the extension in either case. Always returns 0. */ static int save_crypto_sd(struct inode *inode, char **area) -+{ -+ int result = 0; -+ reiser4_crypto_stat *sd; -+ struct reiser4_crypto_info * info = inode_crypto_info(inode); -+ digest_plugin *dplug = inode_digest_plugin(inode); -+ -+ assert("edward-12", dplug != NULL); -+ assert("edward-13", area != NULL); -+ assert("edward-14", *area != NULL); -+ assert("edward-15", info != NULL); -+ assert("edward-1414", info->keyid != NULL); -+ assert("edward-1415", info->keysize != 0); -+ assert("edward-76", reiser4_inode_data(inode) != NULL); -+ -+ if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) { -+ /* file is just created */ -+ sd = (reiser4_crypto_stat *) *area; -+ /* copy everything but private key to the disk stat-data */ -+ put_unaligned(cpu_to_le16(info->keysize), &sd->keysize); -+ memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize); -+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED); -+ } -+ *area += (sizeof(*sd) + dplug->fipsize); -+ return result; -+} -+ -+/* ->present() stub that unconditionally fails with -EIO. */ static int eio(struct inode *inode, char **area, int *len) -+{ -+ return RETERR(-EIO); -+} -+ -+/* Method table of the stat-data extension plugins, indexed by extension id. */ sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = { -+ [LIGHT_WEIGHT_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = LIGHT_WEIGHT_STAT, -+ .pops = NULL, -+ .label = "light-weight sd", -+ .desc = "sd for light-weight files", -+ .linkage = {NULL,NULL} -+ }, -+ .present =
present_lw_sd, -+ .absent = NULL, -+ .save_len = save_len_lw_sd, -+ .save = save_lw_sd, -+ .alignment = 8 -+ }, -+ [UNIX_STAT] = { -+ /* a missing extension is handled: ->absent() supplies default ownership and times */ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = UNIX_STAT, -+ .pops = NULL, -+ .label = "unix-sd", -+ .desc = "unix stat-data fields", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_unix_sd, -+ .absent = absent_unix_sd, -+ .save_len = save_len_unix_sd, -+ .save = save_unix_sd, -+ .alignment = 8 -+ }, -+ [LARGE_TIMES_STAT] = { -+ /* tv_nsec parts of a/m/ctime; despite the "64time-sd" label the record holds three 32-bit fields */ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = LARGE_TIMES_STAT, -+ .pops = NULL, -+ .label = "64time-sd", -+ .desc = "nanosecond resolution for times", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_large_times_sd, -+ .absent = NULL, -+ .save_len = save_len_large_times_sd, -+ .save = save_large_times_sd, -+ .alignment = 8 -+ }, -+ [SYMLINK_STAT] = { -+ /* stat data of symlink has this extension */ -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = SYMLINK_STAT, -+ .pops = NULL, -+ .label = "symlink-sd", -+ .desc = -+ "stat data is appended with symlink name", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_symlink_sd, -+ .absent = NULL, -+ .save_len = save_len_symlink_sd, -+ .save = save_symlink_sd, -+ .alignment = 8 -+ }, -+ [PLUGIN_STAT] = { -+ /* non-standard plugins of the main plugin set; when the extension is absent the plugins are guessed from i_mode */ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = PLUGIN_STAT, -+ .pops = NULL, -+ .label = "plugin-sd", -+ .desc = "plugin stat-data fields", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_pset_sd, -+ .absent = absent_plugin_sd, -+ .save_len = save_len_pset_sd, -+ .save = save_pset_sd, -+ .alignment = 8 -+ }, -+ [HEIR_STAT] = { -+ /* same on-disk layout as PLUGIN_STAT, but loaded into and saved from the heir set */ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = HEIR_STAT, -+ .pops = NULL, -+ .label = "heir-plugin-sd", -+ .desc = "heir plugin stat-data fields", -+ .linkage = {NULL,NULL} -+ }, -+ .present = present_hset_sd, -+ .absent = NULL, -+ .save_len = save_len_hset_sd, -+ .save = save_hset_sd, -+ .alignment = 8 -+ }, -+ [FLAGS_STAT] = { -+ .h = { -+ .type_id =
REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = FLAGS_STAT, -+ .pops = NULL, -+ .label = "flags-sd", -+ .desc = "inode bit flags", -+ .linkage = {NULL, NULL} -+ }, -+ .present = present_flags_sd, -+ .absent = NULL, -+ .save_len = save_len_flags_sd, -+ .save = save_flags_sd, -+ .alignment = 8 -+ }, -+ [CAPABILITIES_STAT] = { -+ /* NOTE(review): placeholder entry -- loading always fails with -EIO, while ->save_len and ->save are borrowed from the flags extension and would therefore write inode->i_flags; confirm this bit is never set in an extension mask */ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = CAPABILITIES_STAT, -+ .pops = NULL, -+ .label = "capabilities-sd", -+ .desc = "capabilities", -+ .linkage = {NULL, NULL} -+ }, -+ .present = eio, -+ .absent = NULL, -+ .save_len = save_len_flags_sd, -+ .save = save_flags_sd, -+ .alignment = 8 -+ }, -+ [CRYPTO_STAT] = { -+ .h = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .id = CRYPTO_STAT, -+ .pops = NULL, -+ .label = "crypto-sd", -+ .desc = "secret key size and id", -+ .linkage = {NULL, NULL} -+ }, -+ .present = present_crypto_sd, -+ .absent = NULL, -+ .save_len = save_len_crypto_sd, -+ .save = save_crypto_sd, -+ .alignment = 8 -+ } -+}; -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.24/fs/reiser4/plugin/item/static_stat.h ---- linux-2.6.24.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/static_stat.h 2008-01-25 11:39:07.028231388 +0300 -@@ -0,0 +1,224 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* This describes the static_stat item, used to hold all information needed by the stat() syscall. -+ -+In the case where each file has not less than the fields needed by the -+stat() syscall, it is more compact to store those fields in this -+struct. -+ -+If this item does not exist, then all stats are dynamically resolved. -+At the moment, we either resolve all stats dynamically or all of them -+statically.
If you think this is not fully optimal, and the rest of -+reiser4 is working, then fix it...:-) -+ -+*/ -+ -+#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ ) -+#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+ -+#include /* for struct inode; NOTE(review): the header name between angle brackets is missing in this copy (presumably linux/fs.h) -- restore it from the original patch */ -+ -+/* Stat data layout: goals and implementation. -+ -+ We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to -+ them, including not having semantic metadata attached to them. -+ -+ There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you -+ want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically -+ sized structure because the statically sized structure knows without recording it what the names and lengths of the -+ attributes are. -+ -+ This leads to a natural compromise, which is to special case those files which have simply the standard unix file -+ attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix -+ file in their use of file attributes. -+ -+ Yet this compromise deserves to be compromised a little. -+ -+ We accommodate the case where you have no more than the standard unix file attributes by using an "extension -+ bitmask": each bit in it indicates presence or absence of a particular stat data extension (see sd_ext_bits enum). -+ -+ If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited -+ from parent directory (as uid, gid) or initialised to some sane values. -+ -+ To capitalize on existing code infrastructure, extensions are -+ implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
-+ Each stat-data extension plugin implements four methods: -+ -+ ->present() called by sd_load() when this extension is found in stat-data -+ ->absent() called by sd_load() when this extension is not found in stat-data -+ ->save_len() called by sd_len() to calculate total length of stat-data -+ ->save() called by sd_save() to store extension data into stat-data -+ -+ Implementation is in fs/reiser4/plugin/item/static_stat.c -+*/ -+ -+/* stat-data extensions. Please order this by presumed frequency of use */ -+typedef enum { -+ /* support for light-weight files */ -+ LIGHT_WEIGHT_STAT, -+ /* data required to implement unix stat(2) call. Layout is in -+ reiser4_unix_stat. If this is not present, file is light-weight */ -+ UNIX_STAT, -+ /* this contains additional set of 32bit [amc]time fields to implement -+ nanosecond resolution. Layout is in reiser4_large_times_stat. Usage -+ of this extension is governed by 32bittimes mount option. */ -+ LARGE_TIMES_STAT, -+ /* stat data has link name included */ -+ SYMLINK_STAT, -+ /* on-disk slots of non-standard plugins for main plugin table -+ (@reiser4_inode->pset), that is, plugins that cannot be deduced -+ from file mode bits, for example, aggregation, interpolation etc. */ -+ PLUGIN_STAT, -+ /* this extension contains persistent inode flags. These flags are -+ single bits: immutable, append-only, etc. Layout is in -+ reiser4_flags_stat. */ -+ FLAGS_STAT, -+ /* this extension contains capabilities sets, associated with this -+ file. Layout is in reiser4_capabilities_stat */ -+ CAPABILITIES_STAT, -+ /* this extension contains size and public id of the secret key. -+ Layout is in reiser4_crypto_stat */ -+ CRYPTO_STAT, -+ /* on-disk slots of non-default plugins for inheritance, which -+ are extracted to special plugin table (@reiser4_inode->hset). -+ By default, children of the object will inherit plugins from -+ its main plugin table (pset).
*/ -+ HEIR_STAT, -+ LAST_SD_EXTENSION, -+ /* -+ * init_inode_static_sd() iterates over extension mask until all -+ * non-zero bits are processed. This means that neither ->present(), -+ * nor ->absent() methods will be called for stat-data extensions that -+ * go after last present extension. But for some basic extensions we want -+ * either ->absent() or ->present() method to be called, because these -+ * extensions set up something in inode even when they are not -+ * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all -+ * extensions before and including LAST_IMPORTANT_SD_EXTENSION either -+ * ->present(), or ->absent() method will be called, independently of -+ * what other extensions are present. -+ */ -+ LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT -+} sd_ext_bits; -+ -+/* minimal stat-data: just the extension bitmask. This allows supporting light-weight files. */ -+typedef struct reiser4_stat_data_base { -+ /* 0 */ __le16 extmask; -+ /* 2 */ -+} PACKED reiser4_stat_data_base; -+ -+/* on-disk layout of the light-weight extension; the numbers in the comments of this and the following structures are byte offsets within the packed record */ typedef struct reiser4_light_weight_stat { -+ /* 0 */ __le16 mode; -+ /* 2 */ __le32 nlink; -+ /* 6 */ __le64 size; -+ /* size in bytes */ -+ /* 14 */ -+} PACKED reiser4_light_weight_stat; -+ -+typedef struct reiser4_unix_stat { -+ /* owner id */ -+ /* 0 */ __le32 uid; -+ /* group id */ -+ /* 4 */ __le32 gid; -+ /* access time */ -+ /* 8 */ __le32 atime; -+ /* modification time */ -+ /* 12 */ __le32 mtime; -+ /* change time */ -+ /* 16 */ __le32 ctime; -+ union { -+ /* minor:major for device files */ -+ /* 20 */ __le64 rdev; -+ /* bytes used by file */ -+ /* 20 */ __le64 bytes; -+ } u; -+ /* 28 */ -+} PACKED reiser4_unix_stat; -+ -+/* symlink stored as part of inode: the target bytes followed by a terminating 0 */ -+typedef struct reiser4_symlink_stat { -+ char body[0]; -+} PACKED reiser4_symlink_stat; -+ -+/* one on-disk plugin record: which set member it fills and the id of the plugin */ typedef struct reiser4_plugin_slot { -+ /* 0 */ __le16 pset_memb; -+ /* 2 */ __le16 id; -+ /* 4 *//* here plugin stores its persistent state */ -+} PACKED reiser4_plugin_slot; -+ -+/* stat-data extension for files with non-standard plugin.
*/ -+typedef struct reiser4_plugin_stat { -+ /* number of additional plugins, associated with this object */ -+ /* 0 */ __le16 plugins_no; -+ /* 2 */ reiser4_plugin_slot slot[0]; -+ /* 2 */ -+} PACKED reiser4_plugin_stat; -+ -+/* stat-data extension for inode flags. Currently it is just fixed-width 32 -+ * bit mask. If the need arises, this can be replaced with variable width -+ * bitmask. */ -+typedef struct reiser4_flags_stat { -+ /* 0 */ __le32 flags; -+ /* 4 */ -+} PACKED reiser4_flags_stat; -+ -+typedef struct reiser4_capabilities_stat { -+ /* 0 */ __le32 effective; -+ /* 4 */ __le32 permitted; -+ /* 8 */ -+} PACKED reiser4_capabilities_stat; -+ -+typedef struct reiser4_cluster_stat { -+/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */ -+ /* 0 */ d8 cluster_shift; -+ /* 1 */ -+} PACKED reiser4_cluster_stat; -+ -+typedef struct reiser4_crypto_stat { -+ /* secret key size, bits */ -+ /* 0 */ d16 keysize; -+ /* secret key id */ -+ /* 2 */ d8 keyid[0]; -+ /* 2 */ -+} PACKED reiser4_crypto_stat; -+ -+typedef struct reiser4_large_times_stat { -+ /* access time, nanosecond part */ -+ /* 0 */ d32 atime; -+ /* modification time, nanosecond part */ -+ /* 4 */ d32 mtime; -+ /* change time, nanosecond part */ -+ /* 8 */ d32 ctime; -+ /* 12 */ -+} PACKED reiser4_large_times_stat; -+ -+/* this structure is filled by sd_item_stat */ -+typedef struct sd_stat { -+ int dirs; -+ int files; -+ int others; -+} sd_stat; -+ -+/* plugin->item.common.* */ -+extern void print_sd(const char *prefix, coord_t * coord); -+extern void item_stat_static_sd(const coord_t * coord, void *vp); -+ -+/* plugin->item.s.sd.* */ -+extern int init_inode_static_sd(struct inode *inode, char *sd, int len); -+extern int save_len_static_sd(struct inode *inode); -+extern int save_static_sd(struct inode *inode, char **area); -+ -+/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */ -+#endif -+ -+/* Make Linus happy.
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/tail.c linux-2.6.24/fs/reiser4/plugin/item/tail.c ---- linux-2.6.24.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/tail.c 2008-01-25 11:40:16.698169785 +0300 -@@ -0,0 +1,808 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "item.h" -+#include "../../inode.h" -+#include "../../page_cache.h" -+#include "../../carry.h" -+#include "../../vfs_ops.h" -+ -+/* NOTE(review): the four #include lines below have lost their angle-bracket header names in this copy -- restore them from the original patch */ #include -+#include -+#include -+#include -+ -+/* plugin->u.item.b.max_key_inside: store in @key the key of the item at @coord with its offset replaced by the offset of reiser4_max_key(), and return @key */ -+reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key) -+{ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, get_key_offset(reiser4_max_key())); -+ return key; -+} -+ -+/* plugin->u.item.b.can_contain_key: return 1 when @data is for the same item plugin as the item at @coord and @key has that item's locality and objectid, 0 otherwise */ -+int can_contain_key_tail(const coord_t *coord, const reiser4_key *key, -+ const reiser4_item_data *data) -+{ -+ reiser4_key item_key; -+ -+ if (item_plugin_by_coord(coord) != data->iplug) -+ return 0; -+ -+ item_key_by_coord(coord, &item_key); -+ if (get_key_locality(key) != get_key_locality(&item_key) || -+ get_key_objectid(key) != get_key_objectid(&item_key)) -+ return 0; -+ -+ return 1; -+} -+ -+/* plugin->u.item.b.mergeable -+ first item is of tail type; returns 1 when the item at @p2 is also a tail item with the same locality, objectid and key type whose offset directly follows the last byte of the item at @p1, 0 otherwise */ -+/* Audited by: green(2002.06.14) */ -+int mergeable_tail(const coord_t *p1, const coord_t *p2) -+{ -+ reiser4_key key1, key2; -+ -+ assert("vs-535", plugin_of_group(item_plugin_by_coord(p1), -+ UNIX_FILE_METADATA_ITEM_TYPE)); -+ assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID); -+ -+ if (item_id_by_coord(p2) != FORMATTING_ID) { -+ /* second item is of another type */ -+ return 0; -+ } -+ -+ item_key_by_coord(p1, &key1); -+ item_key_by_coord(p2, &key2); -+ if (get_key_locality(&key1) != get_key_locality(&key2) || -+
get_key_objectid(&key1) != get_key_objectid(&key2) -+ || get_key_type(&key1) != get_key_type(&key2)) { -+ /* items of different objects */ -+ return 0; -+ } -+ if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) { -+ /* not adjacent items */ -+ return 0; -+ } -+ return 1; -+} -+ -+/* plugin->u.item.b.print -+ plugin->u.item.b.check */ -+ -+/* plugin->u.item.b.nr_units */ -+pos_in_node_t nr_units_tail(const coord_t * coord) -+{ -+ return item_length_by_coord(coord); -+} -+ -+/* plugin->u.item.b.lookup */ -+lookup_result -+lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord) -+{ -+ reiser4_key item_key; -+ __u64 lookuped, offset; -+ unsigned nr_units; -+ -+ item_key_by_coord(coord, &item_key); -+ offset = get_key_offset(item_key_by_coord(coord, &item_key)); -+ nr_units = nr_units_tail(coord); -+ -+ /* key we are looking for must be greater than key of item @coord */ -+ assert("vs-416", keygt(key, &item_key)); -+ -+ /* offset we are looking for */ -+ lookuped = get_key_offset(key); -+ -+ if (lookuped >= offset && lookuped < offset + nr_units) { -+ /* byte we are looking for is in this item */ -+ coord->unit_pos = lookuped - offset; -+ coord->between = AT_UNIT; -+ return CBK_COORD_FOUND; -+ } -+ -+ /* set coord after last unit */ -+ coord->unit_pos = nr_units - 1; -+ coord->between = AFTER_UNIT; -+ return bias == -+ FIND_MAX_NOT_MORE_THAN ? 
CBK_COORD_FOUND : CBK_COORD_NOTFOUND; -+} -+ -+/* plugin->u.item.b.paste */ -+int -+paste_tail(coord_t *coord, reiser4_item_data *data, -+ carry_plugin_info *info UNUSED_ARG) -+{ -+ unsigned old_item_length; -+ char *item; -+ -+ /* length the item had before resizing has been performed */ -+ old_item_length = item_length_by_coord(coord) - data->length; -+ -+ /* tail items never get pasted in the middle */ -+ assert("vs-363", -+ (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) || -+ (coord->unit_pos == old_item_length - 1 && -+ coord->between == AFTER_UNIT) || -+ (coord->unit_pos == 0 && old_item_length == 0 -+ && coord->between == AT_UNIT)); -+ -+ item = item_body_by_coord(coord); -+ if (coord->unit_pos == 0) -+ /* make space for pasted data when pasting at the beginning of -+ the item */ -+ memmove(item + data->length, item, old_item_length); -+ -+ if (coord->between == AFTER_UNIT) -+ coord->unit_pos++; -+ -+ if (data->data) { -+ assert("vs-554", data->user == 0 || data->user == 1); -+ if (data->user) { -+ assert("nikita-3035", reiser4_schedulable()); -+ /* copy from user space */ -+ if (__copy_from_user(item + coord->unit_pos, -+ (const char __user *)data->data, -+ (unsigned)data->length)) -+ return RETERR(-EFAULT); -+ } else -+ /* copy from kernel space */ -+ memcpy(item + coord->unit_pos, data->data, -+ (unsigned)data->length); -+ } else { -+ memset(item + coord->unit_pos, 0, (unsigned)data->length); -+ } -+ return 0; -+} -+ -+/* plugin->u.item.b.fast_paste */ -+ -+/* plugin->u.item.b.can_shift -+ number of units is returned via return value, number of bytes via @size. 
For -+ tail items they coincide */ -+int -+can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG, -+ znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG, -+ unsigned *size, unsigned want) -+{ -+ /* make sure that that we do not want to shift more than we have */ -+ assert("vs-364", want > 0 -+ && want <= (unsigned)item_length_by_coord(source)); -+ -+ *size = min(want, free_space); -+ return *size; -+} -+ -+/* plugin->u.item.b.copy_units */ -+void -+copy_units_tail(coord_t * target, coord_t * source, -+ unsigned from, unsigned count, -+ shift_direction where_is_free_space, -+ unsigned free_space UNUSED_ARG) -+{ -+ /* make sure that item @target is expanded already */ -+ assert("vs-366", (unsigned)item_length_by_coord(target) >= count); -+ assert("vs-370", free_space >= count); -+ -+ if (where_is_free_space == SHIFT_LEFT) { -+ /* append item @target with @count first bytes of @source */ -+ assert("vs-365", from == 0); -+ -+ memcpy((char *)item_body_by_coord(target) + -+ item_length_by_coord(target) - count, -+ (char *)item_body_by_coord(source), count); -+ } else { -+ /* target item is moved to right already */ -+ reiser4_key key; -+ -+ assert("vs-367", -+ (unsigned)item_length_by_coord(source) == from + count); -+ -+ memcpy((char *)item_body_by_coord(target), -+ (char *)item_body_by_coord(source) + from, count); -+ -+ /* new units are inserted before first unit in an item, -+ therefore, we have to update item key */ -+ item_key_by_coord(source, &key); -+ set_key_offset(&key, get_key_offset(&key) + from); -+ -+ node_plugin_by_node(target->node)->update_item_key(target, &key, -+ NULL /*info */); -+ } -+} -+ -+/* plugin->u.item.b.create_hook */ -+ -+/* item_plugin->b.kill_hook -+ this is called when @count units starting from @from-th one are going to be removed -+ */ -+int -+kill_hook_tail(const coord_t * coord, pos_in_node_t from, -+ pos_in_node_t count, struct carry_kill_data *kdata) -+{ -+ reiser4_key key; -+ loff_t start, end; -+ -+ 
assert("vs-1577", kdata); -+ assert("vs-1579", kdata->inode); -+ -+ item_key_by_coord(coord, &key); -+ start = get_key_offset(&key) + from; -+ end = start + count; -+ fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate); -+ return 0; -+} -+ -+/* plugin->u.item.b.shift_hook */ -+ -+/* helper for kill_units_tail and cut_units_tail */ -+static int -+do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ reiser4_key * smallest_removed, reiser4_key * new_first) -+{ -+ pos_in_node_t count; -+ -+ /* this method is only called to remove part of item */ -+ assert("vs-374", (to - from + 1) < item_length_by_coord(coord)); -+ /* tails items are never cut from the middle of an item */ -+ assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord))); -+ assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord))); -+ -+ count = to - from + 1; -+ -+ if (smallest_removed) { -+ /* store smallest key removed */ -+ item_key_by_coord(coord, smallest_removed); -+ set_key_offset(smallest_removed, -+ get_key_offset(smallest_removed) + from); -+ } -+ if (new_first) { -+ /* head of item is cut */ -+ assert("vs-1529", from == 0); -+ -+ item_key_by_coord(coord, new_first); -+ set_key_offset(new_first, -+ get_key_offset(new_first) + from + count); -+ } -+ -+ if (REISER4_DEBUG) -+ memset((char *)item_body_by_coord(coord) + from, 0, count); -+ return count; -+} -+ -+/* plugin->u.item.b.cut_units */ -+int -+cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *cdata UNUSED_ARG, -+ reiser4_key * smallest_removed, reiser4_key * new_first) -+{ -+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first); -+} -+ -+/* plugin->u.item.b.kill_units */ -+int -+kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *kdata, reiser4_key * smallest_removed, -+ reiser4_key * new_first) -+{ -+ kill_hook_tail(coord, from, to - from + 1, kdata); -+ return 
do_cut_or_kill(coord, from, to, smallest_removed, new_first); -+} -+ -+/* plugin->u.item.b.unit_key */ -+reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key) -+{ -+ assert("vs-375", coord_is_existing_unit(coord)); -+ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, (get_key_offset(key) + coord->unit_pos)); -+ -+ return key; -+} -+ -+/* plugin->u.item.b.estimate -+ plugin->u.item.b.item_data_by_flow */ -+ -+/* tail redpage function. It is called from readpage_tail(). */ -+static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page) -+{ -+ tap_t tap; -+ int result; -+ coord_t coord; -+ lock_handle lh; -+ int count, mapped; -+ struct inode *inode; -+ char *pagedata; -+ -+ /* saving passed coord in order to do not move it by tap. */ -+ init_lh(&lh); -+ copy_lh(&lh, uf_coord->lh); -+ inode = page->mapping->host; -+ coord_dup(&coord, &uf_coord->coord); -+ -+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK); -+ -+ if ((result = reiser4_tap_load(&tap))) -+ goto out_tap_done; -+ -+ /* lookup until page is filled up. */ -+ for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) { -+ /* number of bytes to be copied to page */ -+ count = item_length_by_coord(&coord) - coord.unit_pos; -+ if (count > PAGE_CACHE_SIZE - mapped) -+ count = PAGE_CACHE_SIZE - mapped; -+ -+ /* attach @page to address space and get data address */ -+ pagedata = kmap_atomic(page, KM_USER0); -+ -+ /* copy tail item to page */ -+ memcpy(pagedata + mapped, -+ ((char *)item_body_by_coord(&coord) + coord.unit_pos), -+ count); -+ mapped += count; -+ -+ flush_dcache_page(page); -+ -+ /* dettach page from address space */ -+ kunmap_atomic(pagedata, KM_USER0); -+ -+ /* Getting next tail item. */ -+ if (mapped < PAGE_CACHE_SIZE) { -+ /* -+ * unlock page in order to avoid keep it locked -+ * during tree lookup, which takes long term locks -+ */ -+ unlock_page(page); -+ -+ /* getting right neighbour. 
*/ -+ result = go_dir_el(&tap, RIGHT_SIDE, 0); -+ -+ /* lock page back */ -+ lock_page(page); -+ if (PageUptodate(page)) { -+ /* -+ * another thread read the page, we have -+ * nothing to do -+ */ -+ result = 0; -+ goto out_unlock_page; -+ } -+ -+ if (result) { -+ if (result == -E_NO_NEIGHBOR) { -+ /* -+ * rigth neighbor is not a formatted -+ * node -+ */ -+ result = 0; -+ goto done; -+ } else { -+ goto out_tap_relse; -+ } -+ } else { -+ if (!inode_file_plugin(inode)-> -+ owns_item(inode, &coord)) { -+ /* item of another file is found */ -+ result = 0; -+ goto done; -+ } -+ } -+ } -+ } -+ -+ done: -+ if (mapped != PAGE_CACHE_SIZE) -+ zero_user_page(page, mapped, PAGE_CACHE_SIZE - mapped, -+ KM_USER0); -+ SetPageUptodate(page); -+ out_unlock_page: -+ unlock_page(page); -+ out_tap_relse: -+ reiser4_tap_relse(&tap); -+ out_tap_done: -+ reiser4_tap_done(&tap); -+ return result; -+} -+ -+/* -+ plugin->s.file.readpage -+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail -+ or -+ filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_tail -+ -+ At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail -+ item. 
*/ -+int readpage_tail(void *vp, struct page *page) -+{ -+ uf_coord_t *uf_coord = vp; -+ ON_DEBUG(coord_t * coord = &uf_coord->coord); -+ ON_DEBUG(reiser4_key key); -+ -+ assert("umka-2515", PageLocked(page)); -+ assert("umka-2516", !PageUptodate(page)); -+ assert("umka-2517", !jprivate(page) && !PagePrivate(page)); -+ assert("umka-2518", page->mapping && page->mapping->host); -+ -+ assert("umka-2519", znode_is_loaded(coord->node)); -+ assert("umka-2520", item_is_tail(coord)); -+ assert("umka-2521", coord_is_existing_unit(coord)); -+ assert("umka-2522", znode_is_rlocked(coord->node)); -+ assert("umka-2523", -+ page->mapping->host->i_ino == -+ get_key_objectid(item_key_by_coord(coord, &key))); -+ -+ return do_readpage_tail(uf_coord, page); -+} -+ -+/** -+ * overwrite_tail -+ * @flow: -+ * @coord: -+ * -+ * Overwrites tail item or its part by user data. Returns number of bytes -+ * written or error code. -+ */ -+static int overwrite_tail(flow_t *flow, coord_t *coord) -+{ -+ unsigned count; -+ -+ assert("vs-570", flow->user == 1); -+ assert("vs-946", flow->data); -+ assert("vs-947", coord_is_existing_unit(coord)); -+ assert("vs-948", znode_is_write_locked(coord->node)); -+ assert("nikita-3036", reiser4_schedulable()); -+ -+ count = item_length_by_coord(coord) - coord->unit_pos; -+ if (count > flow->length) -+ count = flow->length; -+ -+ if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos, -+ (const char __user *)flow->data, count)) -+ return RETERR(-EFAULT); -+ -+ znode_make_dirty(coord->node); -+ return count; -+} -+ -+/** -+ * insert_first_tail -+ * @inode: -+ * @flow: -+ * @coord: -+ * @lh: -+ * -+ * Returns number of bytes written or error code. 
-+ */ -+static ssize_t insert_first_tail(struct inode *inode, flow_t *flow, -+ coord_t *coord, lock_handle *lh) -+{ -+ int result; -+ loff_t to_write; -+ struct unix_file_info *uf_info; -+ -+ if (get_key_offset(&flow->key) != 0) { -+ /* -+ * file is empty and we have to write not to the beginning of -+ * file. Create a hole at the beginning of file. On success -+ * insert_flow returns 0 as number of written bytes which is -+ * what we have to return on padding a file with holes -+ */ -+ flow->data = NULL; -+ flow->length = get_key_offset(&flow->key); -+ set_key_offset(&flow->key, 0); -+ /* -+ * holes in files built of tails are stored just like if there -+ * were real data which are all zeros. Therefore we have to -+ * allocate quota here as well -+ */ -+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length)) -+ return RETERR(-EDQUOT); -+ result = reiser4_insert_flow(coord, lh, flow); -+ if (flow->length) -+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); -+ -+ uf_info = unix_file_inode_data(inode); -+ -+ /* -+ * first item insertion is only possible when writing to empty -+ * file or performing tail conversion -+ */ -+ assert("", (uf_info->container == UF_CONTAINER_EMPTY || -+ (reiser4_inode_get_flag(inode, -+ REISER4_PART_MIXED) && -+ reiser4_inode_get_flag(inode, -+ REISER4_PART_IN_CONV)))); -+ /* if file was empty - update its state */ -+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY) -+ uf_info->container = UF_CONTAINER_TAILS; -+ return result; -+ } -+ -+ /* check quota before appending data */ -+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length)) -+ return RETERR(-EDQUOT); -+ -+ to_write = flow->length; -+ result = reiser4_insert_flow(coord, lh, flow); -+ if (flow->length) -+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); -+ return (to_write - flow->length) ? (to_write - flow->length) : result; -+} -+ -+/** -+ * append_tail -+ * @inode: -+ * @flow: -+ * @coord: -+ * @lh: -+ * -+ * Returns number of bytes written or error code. 
-+ */ -+static ssize_t append_tail(struct inode *inode, -+ flow_t *flow, coord_t *coord, lock_handle *lh) -+{ -+ int result; -+ reiser4_key append_key; -+ loff_t to_write; -+ -+ if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) { -+ flow->data = NULL; -+ flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key); -+ set_key_offset(&flow->key, get_key_offset(&append_key)); -+ /* -+ * holes in files built of tails are stored just like if there -+ * were real data which are all zeros. Therefore we have to -+ * allocate quota here as well -+ */ -+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length)) -+ return RETERR(-EDQUOT); -+ result = reiser4_insert_flow(coord, lh, flow); -+ if (flow->length) -+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); -+ return result; -+ } -+ -+ /* check quota before appending data */ -+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length)) -+ return RETERR(-EDQUOT); -+ -+ to_write = flow->length; -+ result = reiser4_insert_flow(coord, lh, flow); -+ if (flow->length) -+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); -+ return (to_write - flow->length) ? (to_write - flow->length) : result; -+} -+ -+/** -+ * write_tail_reserve_space - reserve space for tail write operation -+ * @inode: -+ * -+ * Estimates and reserves space which may be required for writing one flow to a -+ * file -+ */ -+static int write_extent_reserve_space(struct inode *inode) -+{ -+ __u64 count; -+ reiser4_tree *tree; -+ -+ /* -+ * to write one flow to a file by tails we have to reserve disk space for: -+ -+ * 1. find_file_item may have to insert empty node to the tree (empty -+ * leaf node between two extent items). This requires 1 block and -+ * number of blocks which are necessary to perform insertion of an -+ * internal item into twig level. -+ * -+ * 2. flow insertion -+ * -+ * 3. 
stat data update -+ */ -+ tree = reiser4_tree_by_inode(inode); -+ count = estimate_one_insert_item(tree) + -+ estimate_insert_flow(tree->height) + -+ estimate_one_insert_item(tree); -+ grab_space_enable(); -+ return reiser4_grab_space(count, 0 /* flags */); -+} -+ -+#define PAGE_PER_FLOW 4 -+ -+static loff_t faultin_user_pages(const char __user *buf, size_t count) -+{ -+ loff_t faulted; -+ int to_fault; -+ -+ if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE) -+ count = PAGE_PER_FLOW * PAGE_CACHE_SIZE; -+ faulted = 0; -+ while (count > 0) { -+ to_fault = PAGE_CACHE_SIZE; -+ if (count < to_fault) -+ to_fault = count; -+ fault_in_pages_readable(buf + faulted, to_fault); -+ count -= to_fault; -+ faulted += to_fault; -+ } -+ return faulted; -+} -+ -+/** -+ * reiser4_write_tail - write method of tail item plugin -+ * @file: file to write to -+ * @buf: address of user-space buffer -+ * @count: number of bytes to write -+ * @pos: position in file to write to -+ * -+ * Returns number of written bytes or error code. 
-+ */ -+ssize_t reiser4_write_tail(struct file *file, struct inode * inode, -+ const char __user *buf, size_t count, loff_t *pos) -+{ -+ struct hint hint; -+ int result; -+ flow_t flow; -+ coord_t *coord; -+ lock_handle *lh; -+ znode *loaded; -+ -+ assert("edward-1548", inode != NULL); -+ -+ if (write_extent_reserve_space(inode)) -+ return RETERR(-ENOSPC); -+ -+ result = load_file_hint(file, &hint); -+ BUG_ON(result != 0); -+ -+ flow.length = faultin_user_pages(buf, count); -+ flow.user = 1; -+ memcpy(&flow.data, &buf, sizeof(buf)); -+ flow.op = WRITE_OP; -+ key_by_inode_and_offset_common(inode, *pos, &flow.key); -+ -+ result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode); -+ if (IS_CBKERR(result)) -+ return result; -+ -+ coord = &hint.ext_coord.coord; -+ lh = hint.ext_coord.lh; -+ -+ result = zload(coord->node); -+ BUG_ON(result != 0); -+ loaded = coord->node; -+ -+ if (coord->between == AFTER_UNIT) { -+ /* append with data or hole */ -+ result = append_tail(inode, &flow, coord, lh); -+ } else if (coord->between == AT_UNIT) { -+ /* overwrite */ -+ result = overwrite_tail(&flow, coord); -+ } else { -+ /* no items of this file yet. 
insert data or hole */ -+ result = insert_first_tail(inode, &flow, coord, lh); -+ } -+ zrelse(loaded); -+ if (result < 0) { -+ done_lh(lh); -+ return result; -+ } -+ -+ /* seal and unlock znode */ -+ hint.ext_coord.valid = 0; -+ if (hint.ext_coord.valid) -+ reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK); -+ else -+ reiser4_unset_hint(&hint); -+ -+ save_file_hint(file, &hint); -+ return result; -+} -+ -+#if REISER4_DEBUG -+ -+static int -+coord_matches_key_tail(const coord_t * coord, const reiser4_key * key) -+{ -+ reiser4_key item_key; -+ -+ assert("vs-1356", coord_is_existing_unit(coord)); -+ assert("vs-1354", keylt(key, append_key_tail(coord, &item_key))); -+ assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key))); -+ return get_key_offset(key) == -+ get_key_offset(&item_key) + coord->unit_pos; -+ -+} -+ -+#endif -+ -+/* plugin->u.item.s.file.read */ -+int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint) -+{ -+ unsigned count; -+ int item_length; -+ coord_t *coord; -+ uf_coord_t *uf_coord; -+ -+ uf_coord = &hint->ext_coord; -+ coord = &uf_coord->coord; -+ -+ assert("vs-571", f->user == 1); -+ assert("vs-571", f->data); -+ assert("vs-967", coord && coord->node); -+ assert("vs-1117", znode_is_rlocked(coord->node)); -+ assert("vs-1118", znode_is_loaded(coord->node)); -+ -+ assert("nikita-3037", reiser4_schedulable()); -+ assert("vs-1357", coord_matches_key_tail(coord, &f->key)); -+ -+ /* calculate number of bytes to read off the item */ -+ item_length = item_length_by_coord(coord); -+ count = item_length_by_coord(coord) - coord->unit_pos; -+ if (count > f->length) -+ count = f->length; -+ -+ /* user page has to be brought in so that major page fault does not -+ * occur here when longtem lock is held */ -+ if (__copy_to_user((char __user *)f->data, -+ ((char *)item_body_by_coord(coord) + coord->unit_pos), -+ count)) -+ return RETERR(-EFAULT); -+ -+ /* probably mark_page_accessed() should only be called if -+ * coord->unit_pos 
is zero. */ -+ mark_page_accessed(znode_page(coord->node)); -+ move_flow_forward(f, count); -+ -+ coord->unit_pos += count; -+ if (item_length == coord->unit_pos) { -+ coord->unit_pos--; -+ coord->between = AFTER_UNIT; -+ } -+ reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK); -+ return 0; -+} -+ -+/* -+ plugin->u.item.s.file.append_key -+ key of first byte which is the next to last byte by addressed by this item -+*/ -+reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key) -+{ -+ item_key_by_coord(coord, key); -+ set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord)); -+ return key; -+} -+ -+/* plugin->u.item.s.file.init_coord_extension */ -+void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped) -+{ -+ uf_coord->valid = 1; -+} -+ -+/* -+ plugin->u.item.s.file.get_block -+*/ -+int -+get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block) -+{ -+ assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL); -+ -+ if (reiser4_blocknr_is_fake(znode_get_block(coord->node))) -+ /* if node has'nt obtainet its block number yet, return 0. 
-+ * Lets avoid upsetting users with some cosmic numbers beyond -+ * the device capacity.*/ -+ *block = 0; -+ else -+ *block = *znode_get_block(coord->node); -+ return 0; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/item/tail.h linux-2.6.24/fs/reiser4/plugin/item/tail.h ---- linux-2.6.24.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/item/tail.h 2008-01-25 11:40:16.702170815 +0300 -@@ -0,0 +1,58 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined( __REISER4_TAIL_H__ ) -+#define __REISER4_TAIL_H__ -+ -+struct tail_coord_extension { -+ int not_used; -+}; -+ -+struct cut_list; -+ -+/* plugin->u.item.b.* */ -+reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *); -+int can_contain_key_tail(const coord_t * coord, const reiser4_key * key, -+ const reiser4_item_data *); -+int mergeable_tail(const coord_t * p1, const coord_t * p2); -+pos_in_node_t nr_units_tail(const coord_t *); -+lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *); -+int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *); -+int can_shift_tail(unsigned free_space, coord_t * source, -+ znode * target, shift_direction, unsigned *size, -+ unsigned want); -+void copy_units_tail(coord_t * target, coord_t * source, unsigned from, -+ unsigned count, shift_direction, unsigned free_space); -+int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count, -+ struct carry_kill_data *); -+int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to, -+ struct carry_cut_data *, reiser4_key * smallest_removed, -+ reiser4_key * new_first); -+int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to, -+ struct carry_kill_data *, reiser4_key * 
smallest_removed, -+ reiser4_key * new_first); -+reiser4_key *unit_key_tail(const coord_t *, reiser4_key *); -+ -+/* plugin->u.item.s.* */ -+ssize_t reiser4_write_tail(struct file *file, struct inode * inode, -+ const char __user *buf, size_t count, loff_t *pos); -+int reiser4_read_tail(struct file *, flow_t *, hint_t *); -+int readpage_tail(void *vp, struct page *page); -+reiser4_key *append_key_tail(const coord_t *, reiser4_key *); -+void init_coord_extension_tail(uf_coord_t *, loff_t offset); -+int get_block_address_tail(const coord_t *, sector_t, sector_t *); -+int item_balance_dirty_pages(struct address_space *, const flow_t *, -+ hint_t *, int back_to_dirty, int set_hint); -+ -+/* __REISER4_TAIL_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/Makefile linux-2.6.24/fs/reiser4/plugin/Makefile ---- linux-2.6.24.orig/fs/reiser4/plugin/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/Makefile 2008-01-25 11:39:07.032232418 +0300 -@@ -0,0 +1,26 @@ -+obj-$(CONFIG_REISER4_FS) += plugins.o -+ -+plugins-objs := \ -+ plugin.o \ -+ plugin_set.o \ -+ object.o \ -+ inode_ops.o \ -+ inode_ops_rename.o \ -+ file_ops.o \ -+ file_ops_readdir.o \ -+ file_plugin_common.o \ -+ dir_plugin_common.o \ -+ digest.o \ -+ hash.o \ -+ fibration.o \ -+ tail_policy.o \ -+ regular.o -+ -+obj-$(CONFIG_REISER4_FS) += item/ -+obj-$(CONFIG_REISER4_FS) += file/ -+obj-$(CONFIG_REISER4_FS) += dir/ -+obj-$(CONFIG_REISER4_FS) += node/ -+obj-$(CONFIG_REISER4_FS) += compress/ -+obj-$(CONFIG_REISER4_FS) += space/ -+obj-$(CONFIG_REISER4_FS) += disk_format/ -+obj-$(CONFIG_REISER4_FS) += security/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/node/Makefile linux-2.6.24/fs/reiser4/plugin/node/Makefile ---- linux-2.6.24.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 
03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/node/Makefile 2008-01-25 11:39:07.032232418 +0300 -@@ -0,0 +1,5 @@ -+obj-$(CONFIG_REISER4_FS) += node_plugins.o -+ -+node_plugins-objs := \ -+ node.o \ -+ node40.o -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/node/node40.c linux-2.6.24/fs/reiser4/plugin/node/node40.c ---- linux-2.6.24.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/node/node40.c 2008-01-25 11:39:07.036233449 +0300 -@@ -0,0 +1,2924 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../debug.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "../plugin_header.h" -+#include "../item/item.h" -+#include "node.h" -+#include "node40.h" -+#include "../plugin.h" -+#include "../../jnode.h" -+#include "../../znode.h" -+#include "../../pool.h" -+#include "../../carry.h" -+#include "../../tap.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../../reiser4.h" -+ -+#include -+#include -+#include -+ -+/* leaf 40 format: -+ -+ [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ] -+ plugin_id (16) key -+ free_space (16) pluginid (16) -+ free_space_start (16) offset (16) -+ level (8) -+ num_items (16) -+ magic (32) -+ flush_time (32) -+*/ -+/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". 
*/ -+/* magic number that is stored in ->magic field of node header */ -+static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */ -+ -+static int prepare_for_update(znode * left, znode * right, -+ carry_plugin_info * info); -+ -+/* header of node of reiser40 format is at the beginning of node */ -+static inline node40_header *node40_node_header(const znode * node /* node to -+ * query */ ) -+{ -+ assert("nikita-567", node != NULL); -+ assert("nikita-568", znode_page(node) != NULL); -+ assert("nikita-569", zdata(node) != NULL); -+ return (node40_header *) zdata(node); -+} -+ -+/* functions to get/set fields of node40_header */ -+#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic)) -+#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space)) -+#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start)) -+#define nh40_get_level(nh) get_unaligned(&(nh)->level) -+#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items)) -+#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id)) -+ -+#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic) -+#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space) -+#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start) -+#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level) -+#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items) -+#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id) -+ -+/* plugin field of node header should be read/set by -+ plugin_by_disk_id/save_disk_plugin */ -+ -+/* array of item headers is at the end of node */ -+static inline item_header40 *node40_ih_at(const znode * node, unsigned pos) -+{ -+ return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1; -+} -+ -+/* ( page_address( node -> pg ) + 
PAGE_CACHE_SIZE ) - pos - 1 -+ */ -+static inline item_header40 *node40_ih_at_coord(const coord_t * coord) -+{ -+ return (item_header40 *) (zdata(coord->node) + -+ znode_size(coord->node)) - (coord->item_pos) - -+ 1; -+} -+ -+/* functions to get/set fields of item_header40 */ -+#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset)) -+ -+#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset) -+ -+/* plugin field of item header should be read/set by -+ plugin_by_disk_id/save_disk_plugin */ -+ -+/* plugin methods */ -+ -+/* plugin->u.node.item_overhead -+ look for description of this method in plugin/node/node.h */ -+size_t -+item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG) -+{ -+ return sizeof(item_header40); -+} -+ -+/* plugin->u.node.free_space -+ look for description of this method in plugin/node/node.h */ -+size_t free_space_node40(znode * node) -+{ -+ assert("nikita-577", node != NULL); -+ assert("nikita-578", znode_is_loaded(node)); -+ assert("nikita-579", zdata(node) != NULL); -+ -+ return nh40_get_free_space(node40_node_header(node)); -+} -+ -+/* private inline version of node40_num_of_items() for use in this file. This -+ is necessary, because address of node40_num_of_items() is taken and it is -+ never inlined as a result. 
*/ -+static inline short node40_num_of_items_internal(const znode * node) -+{ -+ return nh40_get_num_items(node40_node_header(node)); -+} -+ -+#if REISER4_DEBUG -+static inline void check_num_items(const znode * node) -+{ -+ assert("nikita-2749", -+ node40_num_of_items_internal(node) == node->nr_items); -+ assert("nikita-2746", znode_is_write_locked(node)); -+} -+#else -+#define check_num_items(node) noop -+#endif -+ -+/* plugin->u.node.num_of_items -+ look for description of this method in plugin/node/node.h */ -+int num_of_items_node40(const znode * node) -+{ -+ return node40_num_of_items_internal(node); -+} -+ -+static void -+node40_set_num_items(znode * node, node40_header * nh, unsigned value) -+{ -+ assert("nikita-2751", node != NULL); -+ assert("nikita-2750", nh == node40_node_header(node)); -+ -+ check_num_items(node); -+ nh40_set_num_items(nh, value); -+ node->nr_items = value; -+ check_num_items(node); -+} -+ -+/* plugin->u.node.item_by_coord -+ look for description of this method in plugin/node/node.h */ -+char *item_by_coord_node40(const coord_t * coord) -+{ -+ item_header40 *ih; -+ char *p; -+ -+ /* @coord is set to existing item */ -+ assert("nikita-596", coord != NULL); -+ assert("vs-255", coord_is_existing_item(coord)); -+ -+ ih = node40_ih_at_coord(coord); -+ p = zdata(coord->node) + ih40_get_offset(ih); -+ return p; -+} -+ -+/* plugin->u.node.length_by_coord -+ look for description of this method in plugin/node/node.h */ -+int length_by_coord_node40(const coord_t * coord) -+{ -+ item_header40 *ih; -+ int result; -+ -+ /* @coord is set to existing item */ -+ assert("vs-256", coord != NULL); -+ assert("vs-257", coord_is_existing_item(coord)); -+ -+ ih = node40_ih_at_coord(coord); -+ if ((int)coord->item_pos == -+ node40_num_of_items_internal(coord->node) - 1) -+ result = -+ nh40_get_free_space_start(node40_node_header(coord->node)) - -+ ih40_get_offset(ih); -+ else -+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih); -+ -+ return result; -+} 
-+ -+static pos_in_node_t -+node40_item_length(const znode * node, pos_in_node_t item_pos) -+{ -+ item_header40 *ih; -+ pos_in_node_t result; -+ -+ /* @coord is set to existing item */ -+ assert("vs-256", node != NULL); -+ assert("vs-257", node40_num_of_items_internal(node) > item_pos); -+ -+ ih = node40_ih_at(node, item_pos); -+ if (item_pos == node40_num_of_items_internal(node) - 1) -+ result = -+ nh40_get_free_space_start(node40_node_header(node)) - -+ ih40_get_offset(ih); -+ else -+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih); -+ -+ return result; -+} -+ -+/* plugin->u.node.plugin_by_coord -+ look for description of this method in plugin/node/node.h */ -+item_plugin *plugin_by_coord_node40(const coord_t * coord) -+{ -+ item_header40 *ih; -+ item_plugin *result; -+ -+ /* @coord is set to existing item */ -+ assert("vs-258", coord != NULL); -+ assert("vs-259", coord_is_existing_item(coord)); -+ -+ ih = node40_ih_at_coord(coord); -+ /* pass NULL in stead of current tree. This is time critical call. 
*/ -+ result = item_plugin_by_disk_id(NULL, &ih->plugin_id); -+ return result; -+} -+ -+/* plugin->u.node.key_at -+ look for description of this method in plugin/node/node.h */ -+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key) -+{ -+ item_header40 *ih; -+ -+ assert("nikita-1765", coord_is_existing_item(coord)); -+ -+ /* @coord is set to existing item */ -+ ih = node40_ih_at_coord(coord); -+ memcpy(key, &ih->key, sizeof(reiser4_key)); -+ return key; -+} -+ -+/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */ -+ -+#define NODE_INCSTAT(n, counter) \ -+ reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter) -+ -+#define NODE_ADDSTAT(n, counter, val) \ -+ reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val) -+ -+/* plugin->u.node.lookup -+ look for description of this method in plugin/node/node.h */ -+node_search_result lookup_node40(znode * node /* node to query */ , -+ const reiser4_key * key /* key to look for */ , -+ lookup_bias bias /* search bias */ , -+ coord_t * coord /* resulting coord */ ) -+{ -+ int left; -+ int right; -+ int found; -+ int items; -+ -+ item_header40 *lefth; -+ item_header40 *righth; -+ -+ item_plugin *iplug; -+ item_header40 *bstop; -+ item_header40 *ih; -+ cmp_t order; -+ -+ assert("nikita-583", node != NULL); -+ assert("nikita-584", key != NULL); -+ assert("nikita-585", coord != NULL); -+ assert("nikita-2693", znode_is_any_locked(node)); -+ cassert(REISER4_SEQ_SEARCH_BREAK > 2); -+ -+ items = node_num_items(node); -+ -+ if (unlikely(items == 0)) { -+ coord_init_first_unit(coord, node); -+ return NS_NOT_FOUND; -+ } -+ -+ /* binary search for item that can contain given key */ -+ left = 0; -+ right = items - 1; -+ coord->node = node; -+ coord_clear_iplug(coord); -+ found = 0; -+ -+ lefth = node40_ih_at(node, left); -+ righth = node40_ih_at(node, right); -+ -+ /* It is known that for small arrays sequential search is on average -+ more 
efficient than binary. This is because sequential search is -+ coded as tight loop that can be better optimized by compilers and -+ for small array size gain from this optimization makes sequential -+ search the winner. Another, maybe more important, reason for this, -+ is that sequential array is more CPU cache friendly, whereas binary -+ search effectively destroys CPU caching. -+ -+ Critical here is the notion of "smallness". Reasonable value of -+ REISER4_SEQ_SEARCH_BREAK can be found by playing with code in -+ fs/reiser4/ulevel/ulevel.c:test_search(). -+ -+ Don't try to further optimize sequential search by scanning from -+ right to left in attempt to use more efficient loop termination -+ condition (comparison with 0). This doesn't work. -+ -+ */ -+ -+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) { -+ int median; -+ item_header40 *medianh; -+ -+ median = (left + right) / 2; -+ medianh = node40_ih_at(node, median); -+ -+ assert("nikita-1084", median >= 0); -+ assert("nikita-1085", median < items); -+ switch (keycmp(key, &medianh->key)) { -+ case LESS_THAN: -+ right = median; -+ righth = medianh; -+ break; -+ default: -+ wrong_return_value("nikita-586", "keycmp"); -+ case GREATER_THAN: -+ left = median; -+ lefth = medianh; -+ break; -+ case EQUAL_TO: -+ do { -+ --median; -+ /* headers are ordered from right to left */ -+ ++medianh; -+ } while (median >= 0 && keyeq(key, &medianh->key)); -+ right = left = median + 1; -+ ih = lefth = righth = medianh - 1; -+ found = 1; -+ break; -+ } -+ } -+ /* sequential scan. Item headers, and, therefore, keys are stored at -+ the rightmost part of a node from right to left. We are trying to -+ access memory from left to right, and hence, scan in _descending_ -+ order of item numbers. 
-+ */ -+ if (!found) { -+ for (left = right, ih = righth; left >= 0; ++ih, --left) { -+ cmp_t comparison; -+ -+ prefetchkey(&(ih + 1)->key); -+ comparison = keycmp(&ih->key, key); -+ if (comparison == GREATER_THAN) -+ continue; -+ if (comparison == EQUAL_TO) { -+ found = 1; -+ do { -+ --left; -+ ++ih; -+ } while (left >= 0 && keyeq(&ih->key, key)); -+ ++left; -+ --ih; -+ } else { -+ assert("nikita-1256", comparison == LESS_THAN); -+ } -+ break; -+ } -+ if (unlikely(left < 0)) -+ left = 0; -+ } -+ -+ assert("nikita-3212", right >= left); -+ assert("nikita-3214", -+ equi(found, keyeq(&node40_ih_at(node, left)->key, key))); -+ -+ coord_set_item_pos(coord, left); -+ coord->unit_pos = 0; -+ coord->between = AT_UNIT; -+ -+ /* key < leftmost key in a mode or node is corrupted and keys -+ are not sorted */ -+ bstop = node40_ih_at(node, (unsigned)left); -+ order = keycmp(&bstop->key, key); -+ if (unlikely(order == GREATER_THAN)) { -+ if (unlikely(left != 0)) { -+ /* screw up */ -+ warning("nikita-587", "Key less than %i key in a node", -+ left); -+ reiser4_print_key("key", key); -+ reiser4_print_key("min", &bstop->key); -+ print_coord_content("coord", coord); -+ return RETERR(-EIO); -+ } else { -+ coord->between = BEFORE_UNIT; -+ return NS_NOT_FOUND; -+ } -+ } -+ /* left <= key, ok */ -+ iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id); -+ -+ if (unlikely(iplug == NULL)) { -+ warning("nikita-588", "Unknown plugin %i", -+ le16_to_cpu(get_unaligned(&bstop->plugin_id))); -+ reiser4_print_key("key", key); -+ print_coord_content("coord", coord); -+ return RETERR(-EIO); -+ } -+ -+ coord_set_iplug(coord, iplug); -+ -+ /* if exact key from item header was found by binary search, no -+ further checks are necessary. 
*/ -+ if (found) { -+ assert("nikita-1259", order == EQUAL_TO); -+ return NS_FOUND; -+ } -+ if (iplug->b.max_key_inside != NULL) { -+ reiser4_key max_item_key; -+ -+ /* key > max_item_key --- outside of an item */ -+ if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) { -+ coord->unit_pos = 0; -+ coord->between = AFTER_ITEM; -+ /* FIXME-VS: key we are looking for does not fit into -+ found item. Return NS_NOT_FOUND then. Without that -+ the following case does not work: there is extent of -+ file 10000, 10001. File 10000, 10002 has been just -+ created. When writing to position 0 in that file - -+ traverse_tree will stop here on twig level. When we -+ want it to go down to leaf level -+ */ -+ return NS_NOT_FOUND; -+ } -+ } -+ -+ if (iplug->b.lookup != NULL) { -+ return iplug->b.lookup(key, bias, coord); -+ } else { -+ assert("nikita-1260", order == LESS_THAN); -+ coord->between = AFTER_UNIT; -+ return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND; -+ } -+} -+ -+#undef NODE_ADDSTAT -+#undef NODE_INCSTAT -+ -+/* plugin->u.node.estimate -+ look for description of this method in plugin/node/node.h */ -+size_t estimate_node40(znode * node) -+{ -+ size_t result; -+ -+ assert("nikita-597", node != NULL); -+ -+ result = free_space_node40(node) - sizeof(item_header40); -+ -+ return (result > 0) ? 
result : 0; -+} -+ -+/* plugin->u.node.check -+ look for description of this method in plugin/node/node.h */ -+int check_node40(const znode * node /* node to check */ , -+ __u32 flags /* check flags */ , -+ const char **error /* where to store error message */ ) -+{ -+ int nr_items; -+ int i; -+ reiser4_key prev; -+ unsigned old_offset; -+ tree_level level; -+ coord_t coord; -+ int result; -+ -+ assert("nikita-580", node != NULL); -+ assert("nikita-581", error != NULL); -+ assert("nikita-2948", znode_is_loaded(node)); -+ -+ if (ZF_ISSET(node, JNODE_HEARD_BANSHEE)) -+ return 0; -+ -+ assert("nikita-582", zdata(node) != NULL); -+ -+ nr_items = node40_num_of_items_internal(node); -+ if (nr_items < 0) { -+ *error = "Negative number of items"; -+ return -1; -+ } -+ -+ if (flags & REISER4_NODE_DKEYS) -+ prev = *znode_get_ld_key((znode *) node); -+ else -+ prev = *reiser4_min_key(); -+ -+ old_offset = 0; -+ coord_init_zero(&coord); -+ coord.node = (znode *) node; -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ level = znode_get_level(node); -+ for (i = 0; i < nr_items; i++) { -+ item_header40 *ih; -+ reiser4_key unit_key; -+ unsigned j; -+ -+ ih = node40_ih_at(node, (unsigned)i); -+ coord_set_item_pos(&coord, i); -+ if ((ih40_get_offset(ih) >= -+ znode_size(node) - nr_items * sizeof(item_header40)) || -+ (ih40_get_offset(ih) < sizeof(node40_header))) { -+ *error = "Offset is out of bounds"; -+ return -1; -+ } -+ if (ih40_get_offset(ih) <= old_offset) { -+ *error = "Offsets are in wrong order"; -+ return -1; -+ } -+ if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) { -+ *error = "Wrong offset of first item"; -+ return -1; -+ } -+ old_offset = ih40_get_offset(ih); -+ -+ if (keygt(&prev, &ih->key)) { -+ *error = "Keys are in wrong order"; -+ return -1; -+ } -+ if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) { -+ *error = "Wrong key of first unit"; -+ return -1; -+ } -+ prev = ih->key; -+ for (j = 0; j < coord_num_units(&coord); ++j) { -+ 
coord.unit_pos = j; -+ unit_key_by_coord(&coord, &unit_key); -+ if (keygt(&prev, &unit_key)) { -+ *error = "Unit keys are in wrong order"; -+ return -1; -+ } -+ prev = unit_key; -+ } -+ coord.unit_pos = 0; -+ if (level != TWIG_LEVEL && item_is_extent(&coord)) { -+ *error = "extent on the wrong level"; -+ return -1; -+ } -+ if (level == LEAF_LEVEL && item_is_internal(&coord)) { -+ *error = "internal item on the wrong level"; -+ return -1; -+ } -+ if (level != LEAF_LEVEL && -+ !item_is_internal(&coord) && !item_is_extent(&coord)) { -+ *error = "wrong item on the internal level"; -+ return -1; -+ } -+ if (level > TWIG_LEVEL && !item_is_internal(&coord)) { -+ *error = "non-internal item on the internal level"; -+ return -1; -+ } -+#if REISER4_DEBUG -+ if (item_plugin_by_coord(&coord)->b.check -+ && item_plugin_by_coord(&coord)->b.check(&coord, error)) -+ return -1; -+#endif -+ if (i) { -+ coord_t prev_coord; -+ /* two neighboring items can not be mergeable */ -+ coord_dup(&prev_coord, &coord); -+ coord_prev_item(&prev_coord); -+ if (are_items_mergeable(&prev_coord, &coord)) { -+ *error = "mergeable items in one node"; -+ return -1; -+ } -+ -+ } -+ } -+ -+ if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) { -+ coord_t coord; -+ item_plugin *iplug; -+ -+ coord_init_last_unit(&coord, node); -+ iplug = item_plugin_by_coord(&coord); -+ if ((item_is_extent(&coord) || item_is_tail(&coord)) && -+ iplug->s.file.append_key != NULL) { -+ reiser4_key mkey; -+ -+ iplug->s.file.append_key(&coord, &mkey); -+ set_key_offset(&mkey, get_key_offset(&mkey) - 1); -+ read_lock_dk(current_tree); -+ result = keygt(&mkey, znode_get_rd_key((znode *) node)); -+ read_unlock_dk(current_tree); -+ if (result) { -+ *error = "key of rightmost item is too large"; -+ return -1; -+ } -+ } -+ } -+ if (flags & REISER4_NODE_DKEYS) { -+ read_lock_tree(current_tree); -+ read_lock_dk(current_tree); -+ -+ flags |= REISER4_NODE_TREE_STABLE; -+ -+ if (keygt(&prev, znode_get_rd_key((znode *) node))) { -+ 
if (flags & REISER4_NODE_TREE_STABLE) { -+ *error = "Last key is greater than rdkey"; -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ return -1; -+ } -+ } -+ if (keygt -+ (znode_get_ld_key((znode *) node), -+ znode_get_rd_key((znode *) node))) { -+ *error = "ldkey is greater than rdkey"; -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ return -1; -+ } -+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && -+ (node->left != NULL) && -+ !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) && -+ ergo(flags & REISER4_NODE_TREE_STABLE, -+ !keyeq(znode_get_rd_key(node->left), -+ znode_get_ld_key((znode *) node))) -+ && ergo(!(flags & REISER4_NODE_TREE_STABLE), -+ keygt(znode_get_rd_key(node->left), -+ znode_get_ld_key((znode *) node)))) { -+ *error = "left rdkey or ldkey is wrong"; -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ return -1; -+ } -+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && -+ (node->right != NULL) && -+ !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) && -+ ergo(flags & REISER4_NODE_TREE_STABLE, -+ !keyeq(znode_get_rd_key((znode *) node), -+ znode_get_ld_key(node->right))) -+ && ergo(!(flags & REISER4_NODE_TREE_STABLE), -+ keygt(znode_get_rd_key((znode *) node), -+ znode_get_ld_key(node->right)))) { -+ *error = "rdkey or right ldkey is wrong"; -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ return -1; -+ } -+ -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.node.parse -+ look for description of this method in plugin/node/node.h */ -+int parse_node40(znode * node /* node to parse */ ) -+{ -+ node40_header *header; -+ int result; -+ d8 level; -+ -+ header = node40_node_header((znode *) node); -+ result = -EIO; -+ level = nh40_get_level(header); -+ if (unlikely(((__u8) znode_get_level(node)) != level)) -+ warning("nikita-494", "Wrong level found in node: %i != %i", -+ znode_get_level(node), level); -+ else if 
(unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC)) -+ warning("nikita-495", -+ "Wrong magic in tree node: want %x, got %x", -+ REISER4_NODE_MAGIC, nh40_get_magic(header)); -+ else { -+ node->nr_items = node40_num_of_items_internal(node); -+ result = 0; -+ } -+ return RETERR(result); -+} -+ -+/* plugin->u.node.init -+ look for description of this method in plugin/node/node.h */ -+int init_node40(znode * node /* node to initialise */ ) -+{ -+ node40_header *header; -+ -+ assert("nikita-570", node != NULL); -+ assert("nikita-572", zdata(node) != NULL); -+ -+ header = node40_node_header(node); -+ memset(header, 0, sizeof(node40_header)); -+ nh40_set_free_space(header, znode_size(node) - sizeof(node40_header)); -+ nh40_set_free_space_start(header, sizeof(node40_header)); -+ /* sane hypothesis: 0 in CPU format is 0 in disk format */ -+ /* items: 0 */ -+ save_plugin_id(node_plugin_to_plugin(node->nplug), -+ &header->common_header.plugin_id); -+ nh40_set_level(header, znode_get_level(node)); -+ nh40_set_magic(header, REISER4_NODE_MAGIC); -+ node->nr_items = 0; -+ nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb())); -+ -+ /* flags: 0 */ -+ return 0; -+} -+ -+#ifdef GUESS_EXISTS -+int guess_node40(const znode * node /* node to guess plugin of */ ) -+{ -+ node40_header *nethack; -+ -+ assert("nikita-1058", node != NULL); -+ nethack = node40_node_header(node); -+ return -+ (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) && -+ (plugin_by_disk_id(znode_get_tree(node), -+ REISER4_NODE_PLUGIN_TYPE, -+ &nethack->common_header.plugin_id)->h.id == -+ NODE40_ID); -+} -+#endif -+ -+/* plugin->u.node.chage_item_size -+ look for description of this method in plugin/node/node.h */ -+void change_item_size_node40(coord_t * coord, int by) -+{ -+ node40_header *nh; -+ item_header40 *ih; -+ char *item_data; -+ int item_length; -+ unsigned i; -+ -+ /* make sure that @item is coord of existing item */ -+ assert("vs-210", coord_is_existing_item(coord)); -+ -+ nh = 
node40_node_header(coord->node); -+ -+ item_data = item_by_coord_node40(coord); -+ item_length = length_by_coord_node40(coord); -+ -+ /* move item bodies */ -+ ih = node40_ih_at_coord(coord); -+ memmove(item_data + item_length + by, item_data + item_length, -+ nh40_get_free_space_start(node40_node_header(coord->node)) - -+ (ih40_get_offset(ih) + item_length)); -+ -+ /* update offsets of moved items */ -+ for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) { -+ ih = node40_ih_at(coord->node, i); -+ ih40_set_offset(ih, ih40_get_offset(ih) + by); -+ } -+ -+ /* update node header */ -+ nh40_set_free_space(nh, nh40_get_free_space(nh) - by); -+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by); -+} -+ -+static int should_notify_parent(const znode * node) -+{ -+ /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */ -+ return !disk_addr_eq(znode_get_block(node), -+ &znode_get_tree(node)->root_block); -+} -+ -+/* plugin->u.node.create_item -+ look for description of this method in plugin/node/node.h */ -+int -+create_item_node40(coord_t *target, const reiser4_key *key, -+ reiser4_item_data *data, carry_plugin_info *info) -+{ -+ node40_header *nh; -+ item_header40 *ih; -+ unsigned offset; -+ unsigned i; -+ -+ nh = node40_node_header(target->node); -+ -+ assert("vs-212", coord_is_between_items(target)); -+ /* node must have enough free space */ -+ assert("vs-254", -+ free_space_node40(target->node) >= -+ data->length + sizeof(item_header40)); -+ assert("vs-1410", data->length >= 0); -+ -+ if (coord_set_to_right(target)) -+ /* there are not items to the right of @target, so, new item -+ will be inserted after last one */ -+ coord_set_item_pos(target, nh40_get_num_items(nh)); -+ -+ if (target->item_pos < nh40_get_num_items(nh)) { -+ /* there are items to be moved to prepare space for new -+ item */ -+ ih = node40_ih_at_coord(target); -+ /* new item will start at this offset */ -+ offset = ih40_get_offset(ih); -+ -+ 
memmove(zdata(target->node) + offset + data->length, -+ zdata(target->node) + offset, -+ nh40_get_free_space_start(nh) - offset); -+ /* update headers of moved items */ -+ for (i = target->item_pos; i < nh40_get_num_items(nh); i++) { -+ ih = node40_ih_at(target->node, i); -+ ih40_set_offset(ih, ih40_get_offset(ih) + data->length); -+ } -+ -+ /* @ih is set to item header of the last item, move item headers */ -+ memmove(ih - 1, ih, -+ sizeof(item_header40) * (nh40_get_num_items(nh) - -+ target->item_pos)); -+ } else { -+ /* new item will start at this offset */ -+ offset = nh40_get_free_space_start(nh); -+ } -+ -+ /* make item header for the new item */ -+ ih = node40_ih_at_coord(target); -+ memcpy(&ih->key, key, sizeof(reiser4_key)); -+ ih40_set_offset(ih, offset); -+ save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id); -+ -+ /* update node header */ -+ nh40_set_free_space(nh, -+ nh40_get_free_space(nh) - data->length - -+ sizeof(item_header40)); -+ nh40_set_free_space_start(nh, -+ nh40_get_free_space_start(nh) + data->length); -+ node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1); -+ -+ /* FIXME: check how does create_item work when between is set to BEFORE_UNIT */ -+ target->unit_pos = 0; -+ target->between = AT_UNIT; -+ coord_clear_iplug(target); -+ -+ /* initialize item */ -+ if (data->iplug->b.init != NULL) { -+ data->iplug->b.init(target, NULL, data); -+ } -+ /* copy item body */ -+ if (data->iplug->b.paste != NULL) { -+ data->iplug->b.paste(target, data, info); -+ } else if (data->data != NULL) { -+ if (data->user) { -+ /* AUDIT: Are we really should not check that pointer -+ from userspace was valid and data bytes were -+ available? How will we return -EFAULT of some kind -+ without this check? 
*/ -+ assert("nikita-3038", reiser4_schedulable()); -+ /* copy data from user space */ -+ __copy_from_user(zdata(target->node) + offset, -+ (const char __user *)data->data, -+ (unsigned)data->length); -+ } else -+ /* copy from kernel space */ -+ memcpy(zdata(target->node) + offset, data->data, -+ (unsigned)data->length); -+ } -+ -+ if (target->item_pos == 0) { -+ /* left delimiting key has to be updated */ -+ prepare_for_update(NULL, target->node, info); -+ } -+ -+ if (item_plugin_by_coord(target)->b.create_hook != NULL) { -+ item_plugin_by_coord(target)->b.create_hook(target, data->arg); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.node.update_item_key -+ look for description of this method in plugin/node/node.h */ -+void -+update_item_key_node40(coord_t * target, const reiser4_key * key, -+ carry_plugin_info * info) -+{ -+ item_header40 *ih; -+ -+ ih = node40_ih_at_coord(target); -+ memcpy(&ih->key, key, sizeof(reiser4_key)); -+ -+ if (target->item_pos == 0) { -+ prepare_for_update(NULL, target->node, info); -+ } -+} -+ -+/* this bits encode cut mode */ -+#define CMODE_TAIL 1 -+#define CMODE_WHOLE 2 -+#define CMODE_HEAD 4 -+ -+struct cut40_info { -+ int mode; -+ pos_in_node_t tail_removed; /* position of item which gets tail removed */ -+ pos_in_node_t first_removed; /* position of first the leftmost item among items removed completely */ -+ pos_in_node_t removed_count; /* number of items removed completely */ -+ pos_in_node_t head_removed; /* position of item which gets head removed */ -+ -+ pos_in_node_t freed_space_start; -+ pos_in_node_t freed_space_end; -+ pos_in_node_t first_moved; -+ pos_in_node_t head_removed_location; -+}; -+ -+static void init_cinfo(struct cut40_info *cinfo) -+{ -+ cinfo->mode = 0; -+ cinfo->tail_removed = MAX_POS_IN_NODE; -+ cinfo->first_removed = MAX_POS_IN_NODE; -+ cinfo->removed_count = MAX_POS_IN_NODE; -+ cinfo->head_removed = MAX_POS_IN_NODE; -+ cinfo->freed_space_start = MAX_POS_IN_NODE; -+ cinfo->freed_space_end = MAX_POS_IN_NODE; 
-+ cinfo->first_moved = MAX_POS_IN_NODE; -+ cinfo->head_removed_location = MAX_POS_IN_NODE; -+} -+ -+/* complete cut_node40/kill_node40 content by removing the gap created by */ -+static void compact(znode * node, struct cut40_info *cinfo) -+{ -+ node40_header *nh; -+ item_header40 *ih; -+ pos_in_node_t freed; -+ pos_in_node_t pos, nr_items; -+ -+ assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE && -+ cinfo->freed_space_end != MAX_POS_IN_NODE && -+ cinfo->first_moved != MAX_POS_IN_NODE)); -+ assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start); -+ -+ nh = node40_node_header(node); -+ nr_items = nh40_get_num_items(nh); -+ -+ /* remove gap made up by removal */ -+ memmove(zdata(node) + cinfo->freed_space_start, -+ zdata(node) + cinfo->freed_space_end, -+ nh40_get_free_space_start(nh) - cinfo->freed_space_end); -+ -+ /* update item headers of moved items - change their locations */ -+ pos = cinfo->first_moved; -+ ih = node40_ih_at(node, pos); -+ if (cinfo->head_removed_location != MAX_POS_IN_NODE) { -+ assert("vs-1580", pos == cinfo->head_removed); -+ ih40_set_offset(ih, cinfo->head_removed_location); -+ pos++; -+ ih--; -+ } -+ -+ freed = cinfo->freed_space_end - cinfo->freed_space_start; -+ for (; pos < nr_items; pos++, ih--) { -+ assert("vs-1581", ih == node40_ih_at(node, pos)); -+ ih40_set_offset(ih, ih40_get_offset(ih) - freed); -+ } -+ -+ /* free space start moved to right */ -+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed); -+ -+ if (cinfo->removed_count != MAX_POS_IN_NODE) { -+ /* number of items changed. 
Remove item headers of those items */ -+ ih = node40_ih_at(node, nr_items - 1); -+ memmove(ih + cinfo->removed_count, ih, -+ sizeof(item_header40) * (nr_items - -+ cinfo->removed_count - -+ cinfo->first_removed)); -+ freed += sizeof(item_header40) * cinfo->removed_count; -+ node40_set_num_items(node, nh, nr_items - cinfo->removed_count); -+ } -+ -+ /* total amount of free space increased */ -+ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed); -+} -+ -+int shrink_item_node40(coord_t * coord, int delta) -+{ -+ node40_header *nh; -+ item_header40 *ih; -+ pos_in_node_t pos; -+ pos_in_node_t nr_items; -+ char *end; -+ znode *node; -+ int off; -+ -+ assert("nikita-3487", coord != NULL); -+ assert("nikita-3488", delta >= 0); -+ -+ node = coord->node; -+ nh = node40_node_header(node); -+ nr_items = nh40_get_num_items(nh); -+ -+ ih = node40_ih_at_coord(coord); -+ assert("nikita-3489", delta <= length_by_coord_node40(coord)); -+ off = ih40_get_offset(ih) + length_by_coord_node40(coord); -+ end = zdata(node) + off; -+ -+ /* remove gap made up by removal */ -+ memmove(end - delta, end, nh40_get_free_space_start(nh) - off); -+ -+ /* update item headers of moved items - change their locations */ -+ pos = coord->item_pos + 1; -+ ih = node40_ih_at(node, pos); -+ for (; pos < nr_items; pos++, ih--) { -+ assert("nikita-3490", ih == node40_ih_at(node, pos)); -+ ih40_set_offset(ih, ih40_get_offset(ih) - delta); -+ } -+ -+ /* free space start moved to left */ -+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta); -+ /* total amount of free space increased */ -+ nh40_set_free_space(nh, nh40_get_free_space(nh) + delta); -+ /* -+ * This method does _not_ changes number of items. Hence, it cannot -+ * make node empty. Also it doesn't remove items at all, which means -+ * that no keys have to be updated either. -+ */ -+ return 0; -+} -+ -+/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. 
There are 2 types -+ of cut. First is when a unit is removed from the middle of an item. In this case this function returns 1. All the -+ rest fits into second case: 0 or 1 of items getting tail cut, 0 or more items removed completely and 0 or 1 item -+ getting head cut. Function returns 0 in this case */ -+static int -+parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params) -+{ -+ reiser4_key left_key, right_key; -+ reiser4_key min_from_key, max_to_key; -+ const reiser4_key *from_key, *to_key; -+ -+ init_cinfo(cinfo); -+ -+ /* calculate minimal key stored in first item of items to be cut (params->from) */ -+ item_key_by_coord(params->from, &min_from_key); -+ /* and max key stored in last item of items to be cut (params->to) */ -+ max_item_key_by_coord(params->to, &max_to_key); -+ -+ /* if cut key range is not defined in input parameters - define it using cut coord range */ -+ if (params->from_key == NULL) { -+ assert("vs-1513", params->to_key == NULL); -+ unit_key_by_coord(params->from, &left_key); -+ from_key = &left_key; -+ max_unit_key_by_coord(params->to, &right_key); -+ to_key = &right_key; -+ } else { -+ from_key = params->from_key; -+ to_key = params->to_key; -+ } -+ -+ if (params->from->item_pos == params->to->item_pos) { -+ if (keylt(&min_from_key, from_key) -+ && keylt(to_key, &max_to_key)) -+ return 1; -+ -+ if (keygt(from_key, &min_from_key)) { -+ /* tail of item is to be cut cut */ -+ cinfo->tail_removed = params->from->item_pos; -+ cinfo->mode |= CMODE_TAIL; -+ } else if (keylt(to_key, &max_to_key)) { -+ /* head of item is to be cut */ -+ cinfo->head_removed = params->from->item_pos; -+ cinfo->mode |= CMODE_HEAD; -+ } else { -+ /* item is removed completely */ -+ cinfo->first_removed = params->from->item_pos; -+ cinfo->removed_count = 1; -+ cinfo->mode |= CMODE_WHOLE; -+ } -+ } else { -+ cinfo->first_removed = params->from->item_pos + 1; -+ cinfo->removed_count = -+ params->to->item_pos - params->from->item_pos - 1; -+ -+ if 
(keygt(from_key, &min_from_key)) { -+ /* first item is not cut completely */ -+ cinfo->tail_removed = params->from->item_pos; -+ cinfo->mode |= CMODE_TAIL; -+ } else { -+ cinfo->first_removed--; -+ cinfo->removed_count++; -+ } -+ if (keylt(to_key, &max_to_key)) { -+ /* last item is not cut completely */ -+ cinfo->head_removed = params->to->item_pos; -+ cinfo->mode |= CMODE_HEAD; -+ } else { -+ cinfo->removed_count++; -+ } -+ if (cinfo->removed_count) -+ cinfo->mode |= CMODE_WHOLE; -+ } -+ -+ return 0; -+} -+ -+static void -+call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count, -+ carry_kill_data * kdata) -+{ -+ coord_t coord; -+ item_plugin *iplug; -+ pos_in_node_t pos; -+ -+ coord.node = node; -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ for (pos = 0; pos < count; pos++) { -+ coord_set_item_pos(&coord, from + pos); -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ iplug = item_plugin_by_coord(&coord); -+ if (iplug->b.kill_hook) { -+ iplug->b.kill_hook(&coord, 0, coord_num_units(&coord), -+ kdata); -+ } -+ } -+} -+ -+/* this is used to kill item partially */ -+static pos_in_node_t -+kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data, -+ reiser4_key * smallest_removed, reiser4_key * new_first_key) -+{ -+ struct carry_kill_data *kdata; -+ item_plugin *iplug; -+ -+ kdata = data; -+ iplug = item_plugin_by_coord(coord); -+ -+ assert("vs-1524", iplug->b.kill_units); -+ return iplug->b.kill_units(coord, from, to, kdata, smallest_removed, -+ new_first_key); -+} -+ -+/* call item plugin to cut tail of file */ -+static pos_in_node_t -+kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed) -+{ -+ struct carry_kill_data *kdata; -+ pos_in_node_t to; -+ -+ kdata = data; -+ to = coord_last_unit_pos(coord); -+ return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed, -+ NULL); -+} -+ -+/* call item plugin to cut head of item */ -+static pos_in_node_t -+kill_head(coord_t * coord, void *data, 
reiser4_key * smallest_removed, -+ reiser4_key * new_first_key) -+{ -+ return kill_units(coord, 0, coord->unit_pos, data, smallest_removed, -+ new_first_key); -+} -+ -+/* this is used to cut item partially */ -+static pos_in_node_t -+cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data, -+ reiser4_key * smallest_removed, reiser4_key * new_first_key) -+{ -+ carry_cut_data *cdata; -+ item_plugin *iplug; -+ -+ cdata = data; -+ iplug = item_plugin_by_coord(coord); -+ assert("vs-302", iplug->b.cut_units); -+ return iplug->b.cut_units(coord, from, to, cdata, smallest_removed, -+ new_first_key); -+} -+ -+/* call item plugin to cut tail of file */ -+static pos_in_node_t -+cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed) -+{ -+ carry_cut_data *cdata; -+ pos_in_node_t to; -+ -+ cdata = data; -+ to = coord_last_unit_pos(cdata->params.from); -+ return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL); -+} -+ -+/* call item plugin to cut head of item */ -+static pos_in_node_t -+cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed, -+ reiser4_key * new_first_key) -+{ -+ return cut_units(coord, 0, coord->unit_pos, data, smallest_removed, -+ new_first_key); -+} -+ -+/* this returns 1 of key of first item changed, 0 - if it did not */ -+static int -+prepare_for_compact(struct cut40_info *cinfo, -+ const struct cut_kill_params *params, int is_cut, -+ void *data, carry_plugin_info * info) -+{ -+ znode *node; -+ item_header40 *ih; -+ pos_in_node_t freed; -+ pos_in_node_t item_pos; -+ coord_t coord; -+ reiser4_key new_first_key; -+ pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t, -+ void *, reiser4_key *, reiser4_key *); -+ pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *); -+ pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *, -+ reiser4_key *); -+ int retval; -+ -+ retval = 0; -+ -+ node = params->from->node; -+ -+ assert("vs-184", node == params->to->node); -+ 
assert("vs-312", !node_is_empty(node)); -+ assert("vs-297", -+ coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT); -+ -+ if (is_cut) { -+ kill_units_f = cut_units; -+ kill_tail_f = cut_tail; -+ kill_head_f = cut_head; -+ } else { -+ kill_units_f = kill_units; -+ kill_tail_f = kill_tail; -+ kill_head_f = kill_head; -+ } -+ -+ if (parse_cut(cinfo, params) == 1) { -+ /* cut from the middle of item */ -+ freed = -+ kill_units_f(params->from, params->from->unit_pos, -+ params->to->unit_pos, data, -+ params->smallest_removed, NULL); -+ -+ item_pos = params->from->item_pos; -+ ih = node40_ih_at(node, item_pos); -+ cinfo->freed_space_start = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos) - freed; -+ cinfo->freed_space_end = cinfo->freed_space_start + freed; -+ cinfo->first_moved = item_pos + 1; -+ } else { -+ assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE || -+ cinfo->first_removed != MAX_POS_IN_NODE || -+ cinfo->head_removed != MAX_POS_IN_NODE)); -+ -+ switch (cinfo->mode) { -+ case CMODE_TAIL: -+ /* one item gets cut partially from its end */ -+ assert("vs-1562", -+ cinfo->tail_removed == params->from->item_pos); -+ -+ freed = -+ kill_tail_f(params->from, data, -+ params->smallest_removed); -+ -+ item_pos = cinfo->tail_removed; -+ ih = node40_ih_at(node, item_pos); -+ cinfo->freed_space_start = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos) - -+ freed; -+ cinfo->freed_space_end = -+ cinfo->freed_space_start + freed; -+ cinfo->first_moved = cinfo->tail_removed + 1; -+ break; -+ -+ case CMODE_WHOLE: -+ /* one or more items get removed completely */ -+ assert("vs-1563", -+ cinfo->first_removed == params->from->item_pos); -+ assert("vs-1564", cinfo->removed_count > 0 -+ && cinfo->removed_count != MAX_POS_IN_NODE); -+ -+ /* call kill hook for all items removed completely */ -+ if (is_cut == 0) -+ call_kill_hooks(node, cinfo->first_removed, -+ cinfo->removed_count, data); -+ -+ item_pos = cinfo->first_removed; -+ ih = 
node40_ih_at(node, item_pos); -+ -+ if (params->smallest_removed) -+ memcpy(params->smallest_removed, &ih->key, -+ sizeof(reiser4_key)); -+ -+ cinfo->freed_space_start = ih40_get_offset(ih); -+ -+ item_pos += (cinfo->removed_count - 1); -+ ih -= (cinfo->removed_count - 1); -+ cinfo->freed_space_end = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos); -+ cinfo->first_moved = item_pos + 1; -+ if (cinfo->first_removed == 0) -+ /* key of first item of the node changes */ -+ retval = 1; -+ break; -+ -+ case CMODE_HEAD: -+ /* one item gets cut partially from its head */ -+ assert("vs-1565", -+ cinfo->head_removed == params->from->item_pos); -+ -+ freed = -+ kill_head_f(params->to, data, -+ params->smallest_removed, -+ &new_first_key); -+ -+ item_pos = cinfo->head_removed; -+ ih = node40_ih_at(node, item_pos); -+ cinfo->freed_space_start = ih40_get_offset(ih); -+ cinfo->freed_space_end = ih40_get_offset(ih) + freed; -+ cinfo->first_moved = cinfo->head_removed + 1; -+ -+ /* item head is removed, therefore, item key changed */ -+ coord.node = node; -+ coord_set_item_pos(&coord, item_pos); -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ update_item_key_node40(&coord, &new_first_key, NULL); -+ if (item_pos == 0) -+ /* key of first item of the node changes */ -+ retval = 1; -+ break; -+ -+ case CMODE_TAIL | CMODE_WHOLE: -+ /* one item gets cut from its end and one or more items get removed completely */ -+ assert("vs-1566", -+ cinfo->tail_removed == params->from->item_pos); -+ assert("vs-1567", -+ cinfo->first_removed == cinfo->tail_removed + 1); -+ assert("vs-1564", cinfo->removed_count > 0 -+ && cinfo->removed_count != MAX_POS_IN_NODE); -+ -+ freed = -+ kill_tail_f(params->from, data, -+ params->smallest_removed); -+ -+ item_pos = cinfo->tail_removed; -+ ih = node40_ih_at(node, item_pos); -+ cinfo->freed_space_start = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos) - -+ freed; -+ -+ /* call kill hook for all items removed completely */ -+ if 
(is_cut == 0) -+ call_kill_hooks(node, cinfo->first_removed, -+ cinfo->removed_count, data); -+ -+ item_pos += cinfo->removed_count; -+ ih -= cinfo->removed_count; -+ cinfo->freed_space_end = -+ ih40_get_offset(ih) + node40_item_length(node, -+ item_pos); -+ cinfo->first_moved = item_pos + 1; -+ break; -+ -+ case CMODE_WHOLE | CMODE_HEAD: -+ /* one or more items get removed completely and one item gets cut partially from its head */ -+ assert("vs-1568", -+ cinfo->first_removed == params->from->item_pos); -+ assert("vs-1564", cinfo->removed_count > 0 -+ && cinfo->removed_count != MAX_POS_IN_NODE); -+ assert("vs-1569", -+ cinfo->head_removed == -+ cinfo->first_removed + cinfo->removed_count); -+ -+ /* call kill hook for all items removed completely */ -+ if (is_cut == 0) -+ call_kill_hooks(node, cinfo->first_removed, -+ cinfo->removed_count, data); -+ -+ item_pos = cinfo->first_removed; -+ ih = node40_ih_at(node, item_pos); -+ -+ if (params->smallest_removed) -+ memcpy(params->smallest_removed, &ih->key, -+ sizeof(reiser4_key)); -+ -+ freed = -+ kill_head_f(params->to, data, NULL, &new_first_key); -+ -+ cinfo->freed_space_start = ih40_get_offset(ih); -+ -+ ih = node40_ih_at(node, cinfo->head_removed); -+ /* this is the most complex case. Item which got head removed and items which are to be moved -+ intact change their location differently. 
*/ -+ cinfo->freed_space_end = ih40_get_offset(ih) + freed; -+ cinfo->first_moved = cinfo->head_removed; -+ cinfo->head_removed_location = cinfo->freed_space_start; -+ -+ /* item head is removed, therefore, item key changed */ -+ coord.node = node; -+ coord_set_item_pos(&coord, cinfo->head_removed); -+ coord.unit_pos = 0; -+ coord.between = AT_UNIT; -+ update_item_key_node40(&coord, &new_first_key, NULL); -+ -+ assert("vs-1579", cinfo->first_removed == 0); -+ /* key of first item of the node changes */ -+ retval = 1; -+ break; -+ -+ case CMODE_TAIL | CMODE_HEAD: -+ /* one item get cut from its end and its neighbor gets cut from its tail */ -+ impossible("vs-1576", "this can not happen currently"); -+ break; -+ -+ case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD: -+ impossible("vs-1577", "this can not happen currently"); -+ break; -+ default: -+ impossible("vs-1578", "unexpected cut mode"); -+ break; -+ } -+ } -+ return retval; -+} -+ -+/* plugin->u.node.kill -+ return value is number of items removed completely */ -+int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info) -+{ -+ znode *node; -+ struct cut40_info cinfo; -+ int first_key_changed; -+ -+ node = kdata->params.from->node; -+ -+ first_key_changed = -+ prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata, -+ info); -+ compact(node, &cinfo); -+ -+ if (info) { -+ /* it is not called by node40_shift, so we have to take care -+ of changes on upper levels */ -+ if (node_is_empty(node) -+ && !(kdata->flags & DELETE_RETAIN_EMPTY)) -+ /* all contents of node is deleted */ -+ prepare_removal_node40(node, info); -+ else if (first_key_changed) { -+ prepare_for_update(NULL, node, info); -+ } -+ } -+ -+ coord_clear_iplug(kdata->params.from); -+ coord_clear_iplug(kdata->params.to); -+ -+ znode_make_dirty(node); -+ return cinfo.removed_count == MAX_POS_IN_NODE ? 
0 : cinfo.removed_count; -+} -+ -+/* plugin->u.node.cut -+ return value is number of items removed completely */ -+int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info) -+{ -+ znode *node; -+ struct cut40_info cinfo; -+ int first_key_changed; -+ -+ node = cdata->params.from->node; -+ -+ first_key_changed = -+ prepare_for_compact(&cinfo, &cdata->params, 1 /* not cut */ , cdata, -+ info); -+ compact(node, &cinfo); -+ -+ if (info) { -+ /* it is not called by node40_shift, so we have to take care -+ of changes on upper levels */ -+ if (node_is_empty(node)) -+ /* all contents of node is deleted */ -+ prepare_removal_node40(node, info); -+ else if (first_key_changed) { -+ prepare_for_update(NULL, node, info); -+ } -+ } -+ -+ coord_clear_iplug(cdata->params.from); -+ coord_clear_iplug(cdata->params.to); -+ -+ znode_make_dirty(node); -+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count; -+} -+ -+/* this structure is used by shift method of node40 plugin */ -+struct shift_params { -+ shift_direction pend; /* when @pend == append - we are shifting to -+ left, when @pend == prepend - to right */ -+ coord_t wish_stop; /* when shifting to left this is last unit we -+ want shifted, when shifting to right - this -+ is set to unit we want to start shifting -+ from */ -+ znode *target; -+ int everything; /* it is set to 1 if everything we have to shift is -+ shifted, 0 - otherwise */ -+ -+ /* FIXME-VS: get rid of read_stop */ -+ -+ /* these are set by estimate_shift */ -+ coord_t real_stop; /* this will be set to last unit which will be -+ really shifted */ -+ -+ /* coordinate in source node before operation of unit which becomes -+ first after shift to left of last after shift to right */ -+ union { -+ coord_t future_first; -+ coord_t future_last; -+ } u; -+ -+ unsigned merging_units; /* number of units of first item which have to -+ be merged with last item of target node */ -+ unsigned merging_bytes; /* number of bytes in those units */ -+ 
-+ unsigned entire; /* items shifted in their entirety */ -+ unsigned entire_bytes; /* number of bytes in those items */ -+ -+ unsigned part_units; /* number of units of partially copied item */ -+ unsigned part_bytes; /* number of bytes in those units */ -+ -+ unsigned shift_bytes; /* total number of bytes in items shifted (item -+ headers not included) */ -+ -+}; -+ -+static int item_creation_overhead(coord_t *item) -+{ -+ return node_plugin_by_coord(item)->item_overhead(item->node, NULL); -+} -+ -+/* how many units are there in @source starting from source->unit_pos -+ but not further than @stop_coord */ -+static int -+wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend) -+{ -+ if (pend == SHIFT_LEFT) { -+ assert("vs-181", source->unit_pos == 0); -+ } else { -+ assert("vs-182", -+ source->unit_pos == coord_last_unit_pos(source)); -+ } -+ -+ if (source->item_pos != stop_coord->item_pos) { -+ /* @source and @stop_coord are different items */ -+ return coord_last_unit_pos(source) + 1; -+ } -+ -+ if (pend == SHIFT_LEFT) { -+ return stop_coord->unit_pos + 1; -+ } else { -+ return source->unit_pos - stop_coord->unit_pos + 1; -+ } -+} -+ -+/* this calculates what can be copied from @shift->wish_stop.node to -+ @shift->target */ -+static void -+estimate_shift(struct shift_params *shift, const reiser4_context * ctx) -+{ -+ unsigned target_free_space, size; -+ pos_in_node_t stop_item; /* item which estimating should not consider */ -+ unsigned want; /* number of units of item we want shifted */ -+ coord_t source; /* item being estimated */ -+ item_plugin *iplug; -+ -+ /* shifting to left/right starts from first/last units of -+ @shift->wish_stop.node */ -+ if (shift->pend == SHIFT_LEFT) { -+ coord_init_first_unit(&source, shift->wish_stop.node); -+ } else { -+ coord_init_last_unit(&source, shift->wish_stop.node); -+ } -+ shift->real_stop = source; -+ -+ /* free space in target node and number of items in source */ -+ target_free_space = 
znode_free_space(shift->target); -+ -+ shift->everything = 0; -+ if (!node_is_empty(shift->target)) { -+ /* target node is not empty, check for boundary items -+ mergeability */ -+ coord_t to; -+ -+ /* item we try to merge @source with */ -+ if (shift->pend == SHIFT_LEFT) { -+ coord_init_last_unit(&to, shift->target); -+ } else { -+ coord_init_first_unit(&to, shift->target); -+ } -+ -+ if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to, -+ &source) : -+ are_items_mergeable(&source, &to)) { -+ /* how many units of @source do we want to merge to -+ item @to */ -+ want = -+ wanted_units(&source, &shift->wish_stop, -+ shift->pend); -+ -+ /* how many units of @source we can merge to item -+ @to */ -+ iplug = item_plugin_by_coord(&source); -+ if (iplug->b.can_shift != NULL) -+ shift->merging_units = -+ iplug->b.can_shift(target_free_space, -+ &source, shift->target, -+ shift->pend, &size, -+ want); -+ else { -+ shift->merging_units = 0; -+ size = 0; -+ } -+ shift->merging_bytes = size; -+ shift->shift_bytes += size; -+ /* update stop coord to be set to last unit of @source -+ we can merge to @target */ -+ if (shift->merging_units) -+ /* at least one unit can be shifted */ -+ shift->real_stop.unit_pos = -+ (shift->merging_units - source.unit_pos - -+ 1) * shift->pend; -+ else { -+ /* nothing can be shifted */ -+ if (shift->pend == SHIFT_LEFT) -+ coord_init_before_first_item(&shift-> -+ real_stop, -+ source. 
-+ node); -+ else -+ coord_init_after_last_item(&shift-> -+ real_stop, -+ source.node); -+ } -+ assert("nikita-2081", shift->real_stop.unit_pos + 1); -+ -+ if (shift->merging_units != want) { -+ /* we could not copy as many as we want, so, -+ there is no reason for estimating any -+ longer */ -+ return; -+ } -+ -+ target_free_space -= size; -+ coord_add_item_pos(&source, shift->pend); -+ } -+ } -+ -+ /* number of item nothing of which we want to shift */ -+ stop_item = shift->wish_stop.item_pos + shift->pend; -+ -+ /* calculate how many items can be copied into given free -+ space as whole */ -+ for (; source.item_pos != stop_item; -+ coord_add_item_pos(&source, shift->pend)) { -+ if (shift->pend == SHIFT_RIGHT) -+ source.unit_pos = coord_last_unit_pos(&source); -+ -+ /* how many units of @source do we want to copy */ -+ want = wanted_units(&source, &shift->wish_stop, shift->pend); -+ -+ if (want == coord_last_unit_pos(&source) + 1) { -+ /* we want this item to be copied entirely */ -+ size = -+ item_length_by_coord(&source) + -+ item_creation_overhead(&source); -+ if (size <= target_free_space) { -+ /* item fits into target node as whole */ -+ target_free_space -= size; -+ shift->shift_bytes += -+ size - item_creation_overhead(&source); -+ shift->entire_bytes += -+ size - item_creation_overhead(&source); -+ shift->entire++; -+ -+ /* update shift->real_stop coord to be set to -+ last unit of @source we can merge to -+ @target */ -+ shift->real_stop = source; -+ if (shift->pend == SHIFT_LEFT) -+ shift->real_stop.unit_pos = -+ coord_last_unit_pos(&shift-> -+ real_stop); -+ else -+ shift->real_stop.unit_pos = 0; -+ continue; -+ } -+ } -+ -+ /* we reach here only for an item which does not fit into -+ target node in its entirety. This item may be either -+ partially shifted, or not shifted at all. We will have to -+ create new item in target node, so decrease amout of free -+ space by an item creation overhead. 
We can reach here also -+ if stop coord is in this item */ -+ if (target_free_space >= -+ (unsigned)item_creation_overhead(&source)) { -+ target_free_space -= item_creation_overhead(&source); -+ iplug = item_plugin_by_coord(&source); -+ if (iplug->b.can_shift) { -+ shift->part_units = iplug->b.can_shift(target_free_space, -+ &source, -+ NULL, /* target */ -+ shift->pend, -+ &size, -+ want); -+ } else { -+ target_free_space = 0; -+ shift->part_units = 0; -+ size = 0; -+ } -+ } else { -+ target_free_space = 0; -+ shift->part_units = 0; -+ size = 0; -+ } -+ shift->part_bytes = size; -+ shift->shift_bytes += size; -+ -+ /* set @shift->real_stop to last unit of @source we can merge -+ to @shift->target */ -+ if (shift->part_units) { -+ shift->real_stop = source; -+ shift->real_stop.unit_pos = -+ (shift->part_units - source.unit_pos - -+ 1) * shift->pend; -+ assert("nikita-2082", shift->real_stop.unit_pos + 1); -+ } -+ -+ if (want != shift->part_units) -+ /* not everything wanted were shifted */ -+ return; -+ break; -+ } -+ -+ shift->everything = 1; -+} -+ -+static void -+copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count, -+ shift_direction dir, unsigned free_space) -+{ -+ item_plugin *iplug; -+ -+ assert("nikita-1463", target != NULL); -+ assert("nikita-1464", source != NULL); -+ assert("nikita-1465", from + count <= coord_num_units(source)); -+ -+ iplug = item_plugin_by_coord(source); -+ assert("nikita-1468", iplug == item_plugin_by_coord(target)); -+ iplug->b.copy_units(target, source, from, count, dir, free_space); -+ -+ if (dir == SHIFT_RIGHT) { -+ /* FIXME-VS: this looks not necessary. 
update_item_key was -+ called already by copy_units method */ -+ reiser4_key split_key; -+ -+ assert("nikita-1469", target->unit_pos == 0); -+ -+ unit_key_by_coord(target, &split_key); -+ node_plugin_by_coord(target)->update_item_key(target, -+ &split_key, NULL); -+ } -+} -+ -+/* copy part of @shift->real_stop.node starting either from its beginning or -+ from its end and ending at @shift->real_stop to either the end or the -+ beginning of @shift->target */ -+static void copy(struct shift_params *shift) -+{ -+ node40_header *nh; -+ coord_t from; -+ coord_t to; -+ item_header40 *from_ih, *to_ih; -+ int free_space_start; -+ int new_items; -+ unsigned old_items; -+ int old_offset; -+ unsigned i; -+ -+ nh = node40_node_header(shift->target); -+ free_space_start = nh40_get_free_space_start(nh); -+ old_items = nh40_get_num_items(nh); -+ new_items = shift->entire + (shift->part_units ? 1 : 0); -+ assert("vs-185", -+ shift->shift_bytes == -+ shift->merging_bytes + shift->entire_bytes + shift->part_bytes); -+ -+ from = shift->wish_stop; -+ -+ coord_init_first_unit(&to, shift->target); -+ -+ /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty, -+ hence to.between is set to EMPTY_NODE above. Looks like we want it -+ to be AT_UNIT. -+ -+ Oh, wonders of ->betweeness... 
-+ -+ */ -+ to.between = AT_UNIT; -+ -+ if (shift->pend == SHIFT_LEFT) { -+ /* copying to left */ -+ -+ coord_set_item_pos(&from, 0); -+ from_ih = node40_ih_at(from.node, 0); -+ -+ coord_set_item_pos(&to, -+ node40_num_of_items_internal(to.node) - 1); -+ if (shift->merging_units) { -+ /* expand last item, so that plugin methods will see -+ correct data */ -+ free_space_start += shift->merging_bytes; -+ nh40_set_free_space_start(nh, -+ (unsigned)free_space_start); -+ nh40_set_free_space(nh, -+ nh40_get_free_space(nh) - -+ shift->merging_bytes); -+ -+ /* appending last item of @target */ -+ copy_units(&to, &from, 0, /* starting from 0-th unit */ -+ shift->merging_units, SHIFT_LEFT, -+ shift->merging_bytes); -+ coord_inc_item_pos(&from); -+ from_ih--; -+ coord_inc_item_pos(&to); -+ } -+ -+ to_ih = node40_ih_at(shift->target, old_items); -+ if (shift->entire) { -+ /* copy @entire items entirely */ -+ -+ /* copy item headers */ -+ memcpy(to_ih - shift->entire + 1, -+ from_ih - shift->entire + 1, -+ shift->entire * sizeof(item_header40)); -+ /* update item header offset */ -+ old_offset = ih40_get_offset(from_ih); -+ /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */ -+ for (i = 0; i < shift->entire; i++, to_ih--, from_ih--) -+ ih40_set_offset(to_ih, -+ ih40_get_offset(from_ih) - -+ old_offset + free_space_start); -+ -+ /* copy item bodies */ -+ memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */ -+ shift->entire_bytes); -+ -+ coord_add_item_pos(&from, (int)shift->entire); -+ coord_add_item_pos(&to, (int)shift->entire); -+ } -+ -+ nh40_set_free_space_start(nh, -+ free_space_start + -+ shift->shift_bytes - -+ shift->merging_bytes); -+ nh40_set_free_space(nh, -+ nh40_get_free_space(nh) - -+ (shift->shift_bytes - shift->merging_bytes + -+ sizeof(item_header40) * new_items)); -+ -+ /* update node header */ 
-+ node40_set_num_items(shift->target, nh, old_items + new_items); -+ assert("vs-170", -+ nh40_get_free_space(nh) < znode_size(shift->target)); -+ -+ if (shift->part_units) { -+ /* copy heading part (@part units) of @source item as -+ a new item into @target->node */ -+ -+ /* copy item header of partially copied item */ -+ coord_set_item_pos(&to, -+ node40_num_of_items_internal(to.node) -+ - 1); -+ memcpy(to_ih, from_ih, sizeof(item_header40)); -+ ih40_set_offset(to_ih, -+ nh40_get_free_space_start(nh) - -+ shift->part_bytes); -+ if (item_plugin_by_coord(&to)->b.init) -+ item_plugin_by_coord(&to)->b.init(&to, &from, -+ NULL); -+ copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT, -+ shift->part_bytes); -+ } -+ -+ } else { -+ /* copying to right */ -+ -+ coord_set_item_pos(&from, -+ node40_num_of_items_internal(from.node) - 1); -+ from_ih = node40_ih_at_coord(&from); -+ -+ coord_set_item_pos(&to, 0); -+ -+ /* prepare space for new items */ -+ memmove(zdata(to.node) + sizeof(node40_header) + -+ shift->shift_bytes, -+ zdata(to.node) + sizeof(node40_header), -+ free_space_start - sizeof(node40_header)); -+ /* update item headers of moved items */ -+ to_ih = node40_ih_at(to.node, 0); -+ /* first item gets @merging_bytes longer. 
free space appears -+ at its beginning */ -+ if (!node_is_empty(to.node)) -+ ih40_set_offset(to_ih, -+ ih40_get_offset(to_ih) + -+ shift->shift_bytes - -+ shift->merging_bytes); -+ -+ for (i = 1; i < old_items; i++) -+ ih40_set_offset(to_ih - i, -+ ih40_get_offset(to_ih - i) + -+ shift->shift_bytes); -+ -+ /* move item headers to make space for new items */ -+ memmove(to_ih - old_items + 1 - new_items, -+ to_ih - old_items + 1, -+ sizeof(item_header40) * old_items); -+ to_ih -= (new_items - 1); -+ -+ nh40_set_free_space_start(nh, -+ free_space_start + -+ shift->shift_bytes); -+ nh40_set_free_space(nh, -+ nh40_get_free_space(nh) - -+ (shift->shift_bytes + -+ sizeof(item_header40) * new_items)); -+ -+ /* update node header */ -+ node40_set_num_items(shift->target, nh, old_items + new_items); -+ assert("vs-170", -+ nh40_get_free_space(nh) < znode_size(shift->target)); -+ -+ if (shift->merging_units) { -+ coord_add_item_pos(&to, new_items); -+ to.unit_pos = 0; -+ to.between = AT_UNIT; -+ /* prepend first item of @to */ -+ copy_units(&to, &from, -+ coord_last_unit_pos(&from) - -+ shift->merging_units + 1, -+ shift->merging_units, SHIFT_RIGHT, -+ shift->merging_bytes); -+ coord_dec_item_pos(&from); -+ from_ih++; -+ } -+ -+ if (shift->entire) { -+ /* copy @entire items entirely */ -+ -+ /* copy item headers */ -+ memcpy(to_ih, from_ih, -+ shift->entire * sizeof(item_header40)); -+ -+ /* update item header offset */ -+ old_offset = -+ ih40_get_offset(from_ih + shift->entire - 1); -+ /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. 
*/ -+ for (i = 0; i < shift->entire; i++, to_ih++, from_ih++) -+ ih40_set_offset(to_ih, -+ ih40_get_offset(from_ih) - -+ old_offset + -+ sizeof(node40_header) + -+ shift->part_bytes); -+ /* copy item bodies */ -+ coord_add_item_pos(&from, -(int)(shift->entire - 1)); -+ memcpy(zdata(to.node) + sizeof(node40_header) + -+ shift->part_bytes, item_by_coord_node40(&from), -+ shift->entire_bytes); -+ coord_dec_item_pos(&from); -+ } -+ -+ if (shift->part_units) { -+ coord_set_item_pos(&to, 0); -+ to.unit_pos = 0; -+ to.between = AT_UNIT; -+ /* copy heading part (@part units) of @source item as -+ a new item into @target->node */ -+ -+ /* copy item header of partially copied item */ -+ memcpy(to_ih, from_ih, sizeof(item_header40)); -+ ih40_set_offset(to_ih, sizeof(node40_header)); -+ if (item_plugin_by_coord(&to)->b.init) -+ item_plugin_by_coord(&to)->b.init(&to, &from, -+ NULL); -+ copy_units(&to, &from, -+ coord_last_unit_pos(&from) - -+ shift->part_units + 1, shift->part_units, -+ SHIFT_RIGHT, shift->part_bytes); -+ } -+ } -+} -+ -+/* remove everything either before or after @fact_stop. 
Number of items -+ removed completely is returned */ -+static int delete_copied(struct shift_params *shift) -+{ -+ coord_t from; -+ coord_t to; -+ struct carry_cut_data cdata; -+ -+ if (shift->pend == SHIFT_LEFT) { -+ /* we were shifting to left, remove everything from the -+ beginning of @shift->wish_stop->node upto -+ @shift->wish_stop */ -+ coord_init_first_unit(&from, shift->real_stop.node); -+ to = shift->real_stop; -+ -+ /* store old coordinate of unit which will be first after -+ shift to left */ -+ shift->u.future_first = to; -+ coord_next_unit(&shift->u.future_first); -+ } else { -+ /* we were shifting to right, remove everything from -+ @shift->stop_coord upto to end of -+ @shift->stop_coord->node */ -+ from = shift->real_stop; -+ coord_init_last_unit(&to, from.node); -+ -+ /* store old coordinate of unit which will be last after -+ shift to right */ -+ shift->u.future_last = from; -+ coord_prev_unit(&shift->u.future_last); -+ } -+ -+ cdata.params.from = &from; -+ cdata.params.to = &to; -+ cdata.params.from_key = NULL; -+ cdata.params.to_key = NULL; -+ cdata.params.smallest_removed = NULL; -+ return cut_node40(&cdata, NULL); -+} -+ -+/* something was moved between @left and @right. Add carry operation to @info -+ list to have carry to update delimiting key between them */ -+static int -+prepare_for_update(znode * left, znode * right, carry_plugin_info * info) -+{ -+ carry_op *op; -+ carry_node *cn; -+ -+ if (info == NULL) -+ /* nowhere to send operation to. */ -+ return 0; -+ -+ if (!should_notify_parent(right)) -+ return 0; -+ -+ op = node_post_carry(info, COP_UPDATE, right, 1); -+ if (IS_ERR(op) || op == NULL) -+ return op ? 
PTR_ERR(op) : -EIO; -+ -+ if (left != NULL) { -+ carry_node *reference; -+ -+ if (info->doing) -+ reference = insert_carry_node(info->doing, -+ info->todo, left); -+ else -+ reference = op->node; -+ assert("nikita-2992", reference != NULL); -+ cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference); -+ if (IS_ERR(cn)) -+ return PTR_ERR(cn); -+ cn->parent = 1; -+ cn->node = left; -+ if (ZF_ISSET(left, JNODE_ORPHAN)) -+ cn->left_before = 1; -+ op->u.update.left = cn; -+ } else -+ op->u.update.left = NULL; -+ return 0; -+} -+ -+/* plugin->u.node.prepare_removal -+ to delete a pointer to @empty from the tree add corresponding carry -+ operation (delete) to @info list */ -+int prepare_removal_node40(znode * empty, carry_plugin_info * info) -+{ -+ carry_op *op; -+ reiser4_tree *tree; -+ -+ if (!should_notify_parent(empty)) -+ return 0; -+ /* already on a road to Styx */ -+ if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE)) -+ return 0; -+ op = node_post_carry(info, COP_DELETE, empty, 1); -+ if (IS_ERR(op) || op == NULL) -+ return RETERR(op ? 
PTR_ERR(op) : -EIO); -+ -+ op->u.delete.child = NULL; -+ op->u.delete.flags = 0; -+ -+ /* fare thee well */ -+ tree = znode_get_tree(empty); -+ read_lock_tree(tree); -+ write_lock_dk(tree); -+ znode_set_ld_key(empty, znode_get_rd_key(empty)); -+ if (znode_is_left_connected(empty) && empty->left) -+ znode_set_rd_key(empty->left, znode_get_rd_key(empty)); -+ write_unlock_dk(tree); -+ read_unlock_tree(tree); -+ -+ ZF_SET(empty, JNODE_HEARD_BANSHEE); -+ return 0; -+} -+ -+/* something were shifted from @insert_coord->node to @shift->target, update -+ @insert_coord correspondingly */ -+static void -+adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed, -+ int including_insert_coord) -+{ -+ /* item plugin was invalidated by shifting */ -+ coord_clear_iplug(insert_coord); -+ -+ if (node_is_empty(shift->wish_stop.node)) { -+ assert("vs-242", shift->everything); -+ if (including_insert_coord) { -+ if (shift->pend == SHIFT_RIGHT) { -+ /* set @insert_coord before first unit of -+ @shift->target node */ -+ coord_init_before_first_item(insert_coord, -+ shift->target); -+ } else { -+ /* set @insert_coord after last in target node */ -+ coord_init_after_last_item(insert_coord, -+ shift->target); -+ } -+ } else { -+ /* set @insert_coord inside of empty node. There is -+ only one possible coord within an empty -+ node. 
init_first_unit will set that coord */ -+ coord_init_first_unit(insert_coord, -+ shift->wish_stop.node); -+ } -+ return; -+ } -+ -+ if (shift->pend == SHIFT_RIGHT) { -+ /* there was shifting to right */ -+ if (shift->everything) { -+ /* everything wanted was shifted */ -+ if (including_insert_coord) { -+ /* @insert_coord is set before first unit of -+ @to node */ -+ coord_init_before_first_item(insert_coord, -+ shift->target); -+ insert_coord->between = BEFORE_UNIT; -+ } else { -+ /* @insert_coord is set after last unit of -+ @insert->node */ -+ coord_init_last_unit(insert_coord, -+ shift->wish_stop.node); -+ insert_coord->between = AFTER_UNIT; -+ } -+ } -+ return; -+ } -+ -+ /* there was shifting to left */ -+ if (shift->everything) { -+ /* everything wanted was shifted */ -+ if (including_insert_coord) { -+ /* @insert_coord is set after last unit in @to node */ -+ coord_init_after_last_item(insert_coord, shift->target); -+ } else { -+ /* @insert_coord is set before first unit in the same -+ node */ -+ coord_init_before_first_item(insert_coord, -+ shift->wish_stop.node); -+ } -+ return; -+ } -+ -+ /* FIXME-VS: the code below is complicated because with between == -+ AFTER_ITEM unit_pos is set to 0 */ -+ -+ if (!removed) { -+ /* no items were shifted entirely */ -+ assert("vs-195", shift->merging_units == 0 -+ || shift->part_units == 0); -+ -+ if (shift->real_stop.item_pos == insert_coord->item_pos) { -+ if (shift->merging_units) { -+ if (insert_coord->between == AFTER_UNIT) { -+ assert("nikita-1441", -+ insert_coord->unit_pos >= -+ shift->merging_units); -+ insert_coord->unit_pos -= -+ shift->merging_units; -+ } else if (insert_coord->between == BEFORE_UNIT) { -+ assert("nikita-2090", -+ insert_coord->unit_pos > -+ shift->merging_units); -+ insert_coord->unit_pos -= -+ shift->merging_units; -+ } -+ -+ assert("nikita-2083", -+ insert_coord->unit_pos + 1); -+ } else { -+ if (insert_coord->between == AFTER_UNIT) { -+ assert("nikita-1442", -+ insert_coord->unit_pos >= 
-+ shift->part_units); -+ insert_coord->unit_pos -= -+ shift->part_units; -+ } else if (insert_coord->between == BEFORE_UNIT) { -+ assert("nikita-2089", -+ insert_coord->unit_pos > -+ shift->part_units); -+ insert_coord->unit_pos -= -+ shift->part_units; -+ } -+ -+ assert("nikita-2084", -+ insert_coord->unit_pos + 1); -+ } -+ } -+ return; -+ } -+ -+ /* we shifted to left and there was no enough space for everything */ -+ switch (insert_coord->between) { -+ case AFTER_UNIT: -+ case BEFORE_UNIT: -+ if (shift->real_stop.item_pos == insert_coord->item_pos) -+ insert_coord->unit_pos -= shift->part_units; -+ case AFTER_ITEM: -+ coord_add_item_pos(insert_coord, -removed); -+ break; -+ default: -+ impossible("nikita-2087", "not ready"); -+ } -+ assert("nikita-2085", insert_coord->unit_pos + 1); -+} -+ -+static int call_shift_hooks(struct shift_params *shift) -+{ -+ unsigned i, shifted; -+ coord_t coord; -+ item_plugin *iplug; -+ -+ assert("vs-275", !node_is_empty(shift->target)); -+ -+ /* number of items shift touches */ -+ shifted = -+ shift->entire + (shift->merging_units ? 1 : 0) + -+ (shift->part_units ? 
1 : 0); -+ -+ if (shift->pend == SHIFT_LEFT) { -+ /* moved items are at the end */ -+ coord_init_last_unit(&coord, shift->target); -+ coord.unit_pos = 0; -+ -+ assert("vs-279", shift->pend == 1); -+ for (i = 0; i < shifted; i++) { -+ unsigned from, count; -+ -+ iplug = item_plugin_by_coord(&coord); -+ if (i == 0 && shift->part_units) { -+ assert("vs-277", -+ coord_num_units(&coord) == -+ shift->part_units); -+ count = shift->part_units; -+ from = 0; -+ } else if (i == shifted - 1 && shift->merging_units) { -+ count = shift->merging_units; -+ from = coord_num_units(&coord) - count; -+ } else { -+ count = coord_num_units(&coord); -+ from = 0; -+ } -+ -+ if (iplug->b.shift_hook) { -+ iplug->b.shift_hook(&coord, from, count, -+ shift->wish_stop.node); -+ } -+ coord_add_item_pos(&coord, -shift->pend); -+ } -+ } else { -+ /* moved items are at the beginning */ -+ coord_init_first_unit(&coord, shift->target); -+ -+ assert("vs-278", shift->pend == -1); -+ for (i = 0; i < shifted; i++) { -+ unsigned from, count; -+ -+ iplug = item_plugin_by_coord(&coord); -+ if (i == 0 && shift->part_units) { -+ assert("vs-277", -+ coord_num_units(&coord) == -+ shift->part_units); -+ count = coord_num_units(&coord); -+ from = 0; -+ } else if (i == shifted - 1 && shift->merging_units) { -+ count = shift->merging_units; -+ from = 0; -+ } else { -+ count = coord_num_units(&coord); -+ from = 0; -+ } -+ -+ if (iplug->b.shift_hook) { -+ iplug->b.shift_hook(&coord, from, count, -+ shift->wish_stop.node); -+ } -+ coord_add_item_pos(&coord, -shift->pend); -+ } -+ } -+ -+ return 0; -+} -+ -+/* shift to left is completed. 
Return 1 if unit @old was moved to left neighbor */ -+static int -+unit_moved_left(const struct shift_params *shift, const coord_t * old) -+{ -+ assert("vs-944", shift->real_stop.node == old->node); -+ -+ if (shift->real_stop.item_pos < old->item_pos) -+ return 0; -+ if (shift->real_stop.item_pos == old->item_pos) { -+ if (shift->real_stop.unit_pos < old->unit_pos) -+ return 0; -+ } -+ return 1; -+} -+ -+/* shift to right is completed. Return 1 if unit @old was moved to right -+ neighbor */ -+static int -+unit_moved_right(const struct shift_params *shift, const coord_t * old) -+{ -+ assert("vs-944", shift->real_stop.node == old->node); -+ -+ if (shift->real_stop.item_pos > old->item_pos) -+ return 0; -+ if (shift->real_stop.item_pos == old->item_pos) { -+ if (shift->real_stop.unit_pos > old->unit_pos) -+ return 0; -+ } -+ return 1; -+} -+ -+/* coord @old was set in node from which shift was performed. What was shifted -+ is stored in @shift. Update @old correspondingly to performed shift */ -+static coord_t *adjust_coord2(const struct shift_params *shift, -+ const coord_t * old, coord_t * new) -+{ -+ coord_clear_iplug(new); -+ new->between = old->between; -+ -+ coord_clear_iplug(new); -+ if (old->node == shift->target) { -+ if (shift->pend == SHIFT_LEFT) { -+ /* coord which is set inside of left neighbor does not -+ change during shift to left */ -+ coord_dup(new, old); -+ return new; -+ } -+ new->node = old->node; -+ coord_set_item_pos(new, -+ old->item_pos + shift->entire + -+ (shift->part_units ? 1 : 0)); -+ new->unit_pos = old->unit_pos; -+ if (old->item_pos == 0 && shift->merging_units) -+ new->unit_pos += shift->merging_units; -+ return new; -+ } -+ -+ assert("vs-977", old->node == shift->wish_stop.node); -+ if (shift->pend == SHIFT_LEFT) { -+ if (unit_moved_left(shift, old)) { -+ /* unit @old moved to left neighbor. 
Calculate its -+ coordinate there */ -+ new->node = shift->target; -+ coord_set_item_pos(new, -+ node_num_items(shift->target) - -+ shift->entire - -+ (shift->part_units ? 1 : 0) + -+ old->item_pos); -+ -+ new->unit_pos = old->unit_pos; -+ if (shift->merging_units) { -+ coord_dec_item_pos(new); -+ if (old->item_pos == 0) { -+ /* unit_pos only changes if item got -+ merged */ -+ new->unit_pos = -+ coord_num_units(new) - -+ (shift->merging_units - -+ old->unit_pos); -+ } -+ } -+ } else { -+ /* unit @old did not move to left neighbor. -+ -+ Use _nocheck, because @old is outside of its node. -+ */ -+ coord_dup_nocheck(new, old); -+ coord_add_item_pos(new, -+ -shift->u.future_first.item_pos); -+ if (new->item_pos == 0) -+ new->unit_pos -= shift->u.future_first.unit_pos; -+ } -+ } else { -+ if (unit_moved_right(shift, old)) { -+ /* unit @old moved to right neighbor */ -+ new->node = shift->target; -+ coord_set_item_pos(new, -+ old->item_pos - -+ shift->real_stop.item_pos); -+ if (new->item_pos == 0) { -+ /* unit @old might change unit pos */ -+ coord_set_item_pos(new, -+ old->unit_pos - -+ shift->real_stop.unit_pos); -+ } -+ } else { -+ /* unit @old did not move to right neighbor, therefore -+ it did not change */ -+ coord_dup(new, old); -+ } -+ } -+ coord_set_iplug(new, item_plugin_by_coord(new)); -+ return new; -+} -+ -+/* this is called when shift is completed (something of source node is copied -+ to target and deleted in source) to update all taps set in current -+ context */ -+static void update_taps(const struct shift_params *shift) -+{ -+ tap_t *tap; -+ coord_t new; -+ -+ for_all_taps(tap) { -+ /* update only taps set to nodes participating in shift */ -+ if (tap->coord->node == shift->wish_stop.node -+ || tap->coord->node == shift->target) -+ tap_to_coord(tap, -+ adjust_coord2(shift, tap->coord, &new)); -+ } -+} -+ -+#if REISER4_DEBUG -+ -+struct shift_check { -+ reiser4_key key; -+ __u16 plugin_id; -+ union { -+ __u64 bytes; -+ __u64 entries; -+ void *unused; 
-+ } u; -+}; -+ -+void *shift_check_prepare(const znode * left, const znode * right) -+{ -+ pos_in_node_t i, nr_items; -+ int mergeable; -+ struct shift_check *data; -+ item_header40 *ih; -+ -+ if (node_is_empty(left) || node_is_empty(right)) -+ mergeable = 0; -+ else { -+ coord_t l, r; -+ -+ coord_init_last_unit(&l, left); -+ coord_init_first_unit(&r, right); -+ mergeable = are_items_mergeable(&l, &r); -+ } -+ nr_items = -+ node40_num_of_items_internal(left) + -+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0); -+ data = -+ kmalloc(sizeof(struct shift_check) * nr_items, -+ reiser4_ctx_gfp_mask_get()); -+ if (data != NULL) { -+ coord_t coord; -+ pos_in_node_t item_pos; -+ -+ coord_init_first_unit(&coord, left); -+ i = 0; -+ -+ for (item_pos = 0; -+ item_pos < node40_num_of_items_internal(left); -+ item_pos++) { -+ -+ coord_set_item_pos(&coord, item_pos); -+ ih = node40_ih_at_coord(&coord); -+ -+ data[i].key = ih->key; -+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id)); -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ data[i].u.bytes = coord_num_units(&coord); -+ break; -+ case EXTENT_POINTER_ID: -+ data[i].u.bytes = -+ reiser4_extent_size(&coord, -+ coord_num_units(&coord)); -+ break; -+ case COMPOUND_DIR_ID: -+ data[i].u.entries = coord_num_units(&coord); -+ break; -+ default: -+ data[i].u.unused = NULL; -+ break; -+ } -+ i++; -+ } -+ -+ coord_init_first_unit(&coord, right); -+ -+ if (mergeable) { -+ assert("vs-1609", i != 0); -+ -+ ih = node40_ih_at_coord(&coord); -+ -+ assert("vs-1589", -+ data[i - 1].plugin_id == -+ le16_to_cpu(get_unaligned(&ih->plugin_id))); -+ switch (data[i - 1].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ data[i - 1].u.bytes += coord_num_units(&coord); -+ break; -+ case EXTENT_POINTER_ID: -+ data[i - 1].u.bytes += -+ reiser4_extent_size(&coord, -+ coord_num_units(&coord)); -+ break; -+ case COMPOUND_DIR_ID: -+ data[i - 1].u.entries += -+ coord_num_units(&coord); -+ break; -+ 
default: -+ impossible("vs-1605", "wrong mergeable item"); -+ break; -+ } -+ item_pos = 1; -+ } else -+ item_pos = 0; -+ for (; item_pos < node40_num_of_items_internal(right); -+ item_pos++) { -+ -+ assert("vs-1604", i < nr_items); -+ coord_set_item_pos(&coord, item_pos); -+ ih = node40_ih_at_coord(&coord); -+ -+ data[i].key = ih->key; -+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id)); -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ data[i].u.bytes = coord_num_units(&coord); -+ break; -+ case EXTENT_POINTER_ID: -+ data[i].u.bytes = -+ reiser4_extent_size(&coord, -+ coord_num_units(&coord)); -+ break; -+ case COMPOUND_DIR_ID: -+ data[i].u.entries = coord_num_units(&coord); -+ break; -+ default: -+ data[i].u.unused = NULL; -+ break; -+ } -+ i++; -+ } -+ assert("vs-1606", i == nr_items); -+ } -+ return data; -+} -+ -+void shift_check(void *vp, const znode * left, const znode * right) -+{ -+ pos_in_node_t i, nr_items; -+ coord_t coord; -+ __u64 last_bytes; -+ int mergeable; -+ item_header40 *ih; -+ pos_in_node_t item_pos; -+ struct shift_check *data; -+ -+ data = (struct shift_check *)vp; -+ -+ if (data == NULL) -+ return; -+ -+ if (node_is_empty(left) || node_is_empty(right)) -+ mergeable = 0; -+ else { -+ coord_t l, r; -+ -+ coord_init_last_unit(&l, left); -+ coord_init_first_unit(&r, right); -+ mergeable = are_items_mergeable(&l, &r); -+ } -+ -+ nr_items = -+ node40_num_of_items_internal(left) + -+ node40_num_of_items_internal(right) - (mergeable ? 
1 : 0); -+ -+ i = 0; -+ last_bytes = 0; -+ -+ coord_init_first_unit(&coord, left); -+ -+ for (item_pos = 0; item_pos < node40_num_of_items_internal(left); -+ item_pos++) { -+ -+ coord_set_item_pos(&coord, item_pos); -+ ih = node40_ih_at_coord(&coord); -+ -+ assert("vs-1611", i == item_pos); -+ assert("vs-1590", keyeq(&ih->key, &data[i].key)); -+ assert("vs-1591", -+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id); -+ if ((i < (node40_num_of_items_internal(left) - 1)) -+ || !mergeable) { -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ assert("vs-1592", -+ data[i].u.bytes == -+ coord_num_units(&coord)); -+ break; -+ case EXTENT_POINTER_ID: -+ assert("vs-1593", -+ data[i].u.bytes == -+ reiser4_extent_size(&coord, -+ coord_num_units -+ (&coord))); -+ break; -+ case COMPOUND_DIR_ID: -+ assert("vs-1594", -+ data[i].u.entries == -+ coord_num_units(&coord)); -+ break; -+ default: -+ break; -+ } -+ } -+ if (item_pos == (node40_num_of_items_internal(left) - 1) -+ && mergeable) { -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ last_bytes = coord_num_units(&coord); -+ break; -+ case EXTENT_POINTER_ID: -+ last_bytes = -+ reiser4_extent_size(&coord, -+ coord_num_units(&coord)); -+ break; -+ case COMPOUND_DIR_ID: -+ last_bytes = coord_num_units(&coord); -+ break; -+ default: -+ impossible("vs-1595", "wrong mergeable item"); -+ break; -+ } -+ } -+ i++; -+ } -+ -+ coord_init_first_unit(&coord, right); -+ if (mergeable) { -+ ih = node40_ih_at_coord(&coord); -+ -+ assert("vs-1589", -+ data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id))); -+ assert("vs-1608", last_bytes != 0); -+ switch (data[i - 1].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ assert("vs-1596", -+ data[i - 1].u.bytes == -+ last_bytes + coord_num_units(&coord)); -+ break; -+ -+ case EXTENT_POINTER_ID: -+ assert("vs-1597", -+ data[i - 1].u.bytes == -+ last_bytes + reiser4_extent_size(&coord, -+ coord_num_units -+ 
(&coord))); -+ break; -+ -+ case COMPOUND_DIR_ID: -+ assert("vs-1598", -+ data[i - 1].u.bytes == -+ last_bytes + coord_num_units(&coord)); -+ break; -+ default: -+ impossible("vs-1599", "wrong mergeable item"); -+ break; -+ } -+ item_pos = 1; -+ } else -+ item_pos = 0; -+ -+ for (; item_pos < node40_num_of_items_internal(right); item_pos++) { -+ -+ coord_set_item_pos(&coord, item_pos); -+ ih = node40_ih_at_coord(&coord); -+ -+ assert("vs-1612", keyeq(&ih->key, &data[i].key)); -+ assert("vs-1613", -+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id); -+ switch (data[i].plugin_id) { -+ case CTAIL_ID: -+ case FORMATTING_ID: -+ assert("vs-1600", -+ data[i].u.bytes == coord_num_units(&coord)); -+ break; -+ case EXTENT_POINTER_ID: -+ assert("vs-1601", -+ data[i].u.bytes == -+ reiser4_extent_size(&coord, -+ coord_num_units -+ (&coord))); -+ break; -+ case COMPOUND_DIR_ID: -+ assert("vs-1602", -+ data[i].u.entries == coord_num_units(&coord)); -+ break; -+ default: -+ break; -+ } -+ i++; -+ } -+ -+ assert("vs-1603", i == nr_items); -+ kfree(data); -+} -+ -+#endif -+ -+/* plugin->u.node.shift -+ look for description of this method in plugin/node/node.h */ -+int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child, /* if @from->node becomes empty - it will be -+ deleted from the tree if this is set to 1 */ -+ int including_stop_coord, carry_plugin_info * info) -+{ -+ struct shift_params shift; -+ int result; -+ znode *left, *right; -+ znode *source; -+ int target_empty; -+ -+ assert("nikita-2161", coord_check(from)); -+ -+ memset(&shift, 0, sizeof(shift)); -+ shift.pend = pend; -+ shift.wish_stop = *from; -+ shift.target = to; -+ -+ assert("nikita-1473", znode_is_write_locked(from->node)); -+ assert("nikita-1474", znode_is_write_locked(to)); -+ -+ source = from->node; -+ -+ /* set @shift.wish_stop to rightmost/leftmost unit among units we want -+ shifted */ -+ if (pend == SHIFT_LEFT) { -+ result = coord_set_to_left(&shift.wish_stop); 
-+ left = to; -+ right = from->node; -+ } else { -+ result = coord_set_to_right(&shift.wish_stop); -+ left = from->node; -+ right = to; -+ } -+ -+ if (result) { -+ /* move insertion coord even if there is nothing to move */ -+ if (including_stop_coord) { -+ /* move insertion coord (@from) */ -+ if (pend == SHIFT_LEFT) { -+ /* after last item in target node */ -+ coord_init_after_last_item(from, to); -+ } else { -+ /* before first item in target node */ -+ coord_init_before_first_item(from, to); -+ } -+ } -+ -+ if (delete_child && node_is_empty(shift.wish_stop.node)) -+ result = -+ prepare_removal_node40(shift.wish_stop.node, info); -+ else -+ result = 0; -+ /* there is nothing to shift */ -+ assert("nikita-2078", coord_check(from)); -+ return result; -+ } -+ -+ target_empty = node_is_empty(to); -+ -+ /* when first node plugin with item body compression is implemented, -+ this must be changed to call node specific plugin */ -+ -+ /* shift->stop_coord is updated to last unit which really will be -+ shifted */ -+ estimate_shift(&shift, get_current_context()); -+ if (!shift.shift_bytes) { -+ /* we could not shift anything */ -+ assert("nikita-2079", coord_check(from)); -+ return 0; -+ } -+ -+ copy(&shift); -+ -+ /* result value of this is important. It is used by adjust_coord below */ -+ result = delete_copied(&shift); -+ -+ assert("vs-1610", result >= 0); -+ assert("vs-1471", -+ ((reiser4_context *) current->journal_info)->magic == -+ context_magic); -+ -+ /* item which has been moved from one node to another might want to do -+ something on that event. 
This can be done by item's shift_hook -+ method, which will be now called for every moved items */ -+ call_shift_hooks(&shift); -+ -+ assert("vs-1472", -+ ((reiser4_context *) current->journal_info)->magic == -+ context_magic); -+ -+ update_taps(&shift); -+ -+ assert("vs-1473", -+ ((reiser4_context *) current->journal_info)->magic == -+ context_magic); -+ -+ /* adjust @from pointer in accordance with @including_stop_coord flag -+ and amount of data which was really shifted */ -+ adjust_coord(from, &shift, result, including_stop_coord); -+ -+ if (target_empty) -+ /* -+ * items were shifted into empty node. Update delimiting key. -+ */ -+ result = prepare_for_update(NULL, left, info); -+ -+ /* add update operation to @info, which is the list of operations to -+ be performed on a higher level */ -+ result = prepare_for_update(left, right, info); -+ if (!result && node_is_empty(source) && delete_child) { -+ /* all contents of @from->node is moved to @to and @from->node -+ has to be removed from the tree, so, on higher level we -+ will be removing the pointer to node @from->node */ -+ result = prepare_removal_node40(source, info); -+ } -+ assert("nikita-2080", coord_check(from)); -+ return result ? 
result : (int)shift.shift_bytes; -+} -+ -+/* plugin->u.node.fast_insert() -+ look for description of this method in plugin/node/node.h */ -+int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) -+{ -+ return 1; -+} -+ -+/* plugin->u.node.fast_paste() -+ look for description of this method in plugin/node/node.h */ -+int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) -+{ -+ return 1; -+} -+ -+/* plugin->u.node.fast_cut() -+ look for description of this method in plugin/node/node.h */ -+int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) -+{ -+ return 1; -+} -+ -+/* plugin->u.node.modify - not defined */ -+ -+/* plugin->u.node.max_item_size */ -+int max_item_size_node40(void) -+{ -+ return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) - -+ sizeof(item_header40); -+} -+ -+/* plugin->u.node.set_item_plugin */ -+int set_item_plugin_node40(coord_t *coord, item_id id) -+{ -+ item_header40 *ih; -+ -+ ih = node40_ih_at_coord(coord); -+ put_unaligned(cpu_to_le16(id), &ih->plugin_id); -+ coord->iplugid = id; -+ return 0; -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/node/node40.h linux-2.6.24/fs/reiser4/plugin/node/node40.h ---- linux-2.6.24.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/node/node40.h 2008-01-25 11:39:07.040234479 +0300 -@@ -0,0 +1,125 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined( __REISER4_NODE40_H__ ) -+#define __REISER4_NODE40_H__ -+ -+#include "../../forward.h" -+#include "../../dformat.h" -+#include "node.h" -+ -+#include -+ -+/* format of node header for 40 node layouts. Keep bloat out of this struct. */ -+typedef struct node40_header { -+ /* identifier of node plugin. 
Must be located at the very beginning -+ of a node. */ -+ common_node_header common_header; /* this is 16 bits */ -+ /* number of items. Should be first element in the node header, -+ because we haven't yet finally decided whether it shouldn't go into -+ common_header. -+ */ -+/* NIKITA-FIXME-HANS: Create a macro such that if there is only one -+ * node format at compile time, and it is this one, accesses do not function dereference when -+ * accessing these fields (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */ -+ d16 nr_items; -+ /* free space in node measured in bytes */ -+ d16 free_space; -+ /* offset to start of free space in node */ -+ d16 free_space_start; -+ /* for reiser4_fsck. When information about what is a free -+ block is corrupted, and we try to recover everything even -+ if marked as freed, then old versions of data may -+ duplicate newer versions, and this field allows us to -+ restore the newer version. Also useful for when users -+ who don't have the new trashcan installed on their linux distro -+ delete the wrong files and send us desperate emails -+ offering $25 for them back. */ -+ -+ /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */ -+ d32 magic; -+ /* flushstamp is made of mk_id and write_counter. mk_id is an -+ id generated randomly at mkreiserfs time. So we can just -+ skip all nodes with different mk_id. write_counter is d64 -+ incrementing counter of writes on disk. It is used for -+ choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */ -+ -+ d32 mkfs_id; -+ d64 flush_id; -+ /* node flags to be used by fsck (reiser4ck or reiser4fsck?) 
-+ and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */ -+ d16 flags; -+ -+ /* 1 is leaf level, 2 is twig level, root is the numerically -+ largest level */ -+ d8 level; -+ -+ d8 pad; -+} PACKED node40_header; -+ -+/* item headers are not standard across all node layouts, pass -+ pos_in_node to functions instead */ -+typedef struct item_header40 { -+ /* key of item */ -+ /* 0 */ reiser4_key key; -+ /* offset from start of a node measured in 8-byte chunks */ -+ /* 24 */ d16 offset; -+ /* 26 */ d16 flags; -+ /* 28 */ d16 plugin_id; -+} PACKED item_header40; -+ -+size_t item_overhead_node40(const znode * node, flow_t * aflow); -+size_t free_space_node40(znode * node); -+node_search_result lookup_node40(znode * node, const reiser4_key * key, -+ lookup_bias bias, coord_t * coord); -+int num_of_items_node40(const znode * node); -+char *item_by_coord_node40(const coord_t * coord); -+int length_by_coord_node40(const coord_t * coord); -+item_plugin *plugin_by_coord_node40(const coord_t * coord); -+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key); -+size_t estimate_node40(znode * node); -+int check_node40(const znode * node, __u32 flags, const char **error); -+int parse_node40(znode * node); -+int init_node40(znode * node); -+#ifdef GUESS_EXISTS -+int guess_node40(const znode * node); -+#endif -+void change_item_size_node40(coord_t * coord, int by); -+int create_item_node40(coord_t * target, const reiser4_key * key, -+ reiser4_item_data * data, carry_plugin_info * info); -+void update_item_key_node40(coord_t * target, const reiser4_key * key, -+ carry_plugin_info * info); -+int kill_node40(struct carry_kill_data *, carry_plugin_info *); -+int cut_node40(struct carry_cut_data *, carry_plugin_info *); -+int shift_node40(coord_t * from, znode * to, shift_direction pend, -+ /* if @from->node becomes -+ empty - it will be deleted from -+ the tree if this is set to 1 -+ */ -+ int delete_child, int including_stop_coord, -+ 
carry_plugin_info * info); -+ -+int fast_insert_node40(const coord_t * coord); -+int fast_paste_node40(const coord_t * coord); -+int fast_cut_node40(const coord_t * coord); -+int max_item_size_node40(void); -+int prepare_removal_node40(znode * empty, carry_plugin_info * info); -+int set_item_plugin_node40(coord_t * coord, item_id id); -+int shrink_item_node40(coord_t * coord, int delta); -+ -+#if REISER4_DEBUG -+void *shift_check_prepare(const znode *left, const znode *right); -+void shift_check(void *vp, const znode *left, const znode *right); -+#endif -+ -+/* __REISER4_NODE40_H__ */ -+#endif -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/node/node.c linux-2.6.24/fs/reiser4/plugin/node/node.c ---- linux-2.6.24.orig/fs/reiser4/plugin/node/node.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/node/node.c 2008-01-25 11:39:07.040234479 +0300 -@@ -0,0 +1,131 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Node plugin interface. -+ -+ Description: The tree provides the abstraction of flows, which it -+ internally fragments into items which it stores in nodes. -+ -+ A key_atom is a piece of data bound to a single key. -+ -+ For reasonable space efficiency to be achieved it is often -+ necessary to store key_atoms in the nodes in the form of items, where -+ an item is a sequence of key_atoms of the same or similar type. It is -+ more space-efficient, because the item can implement (very) -+ efficient compression of key_atom's bodies using internal knowledge -+ about their semantics, and it can often avoid having a key for each -+ key_atom. Each type of item has specific operations implemented by its -+ item handler (see balance.c). 
-+ -+ Rationale: the rest of the code (specifically balancing routines) -+ accesses leaf level nodes through this interface. This way we can -+ implement various block layouts and even combine various layouts -+ within the same tree. Balancing/allocating algorithms should not -+ care about peculiarities of splitting/merging specific item types, -+ but rather should leave that to the item's item handler. -+ -+ Items, including those that provide the abstraction of flows, have -+ the property that if you move them in part or in whole to another -+ node, the balancing code invokes their is_left_mergeable() -+ item_operation to determine if they are mergeable with their new -+ neighbor in the node you have moved them to. For some items the -+ is_left_mergeable() function always returns null. -+ -+ When moving the bodies of items from one node to another: -+ -+ if a partial item is shifted to another node the balancing code invokes -+ an item handler method to handle the item splitting. -+ -+ if the balancing code needs to merge with an item in the node it -+ is shifting to, it will invoke an item handler method to handle -+ the item merging. -+ -+ if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy() -+ adjusting the item headers after the move is done using the node handler. -+*/ -+ -+#include "../../forward.h" -+#include "../../debug.h" -+#include "../../key.h" -+#include "../../coord.h" -+#include "../plugin_header.h" -+#include "../item/item.h" -+#include "node.h" -+#include "../plugin.h" -+#include "../../znode.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../../reiser4.h" -+ -+/** -+ * leftmost_key_in_node - get the smallest key in node -+ * @node: -+ * @key: store result here -+ * -+ * Stores the leftmost key of @node in @key. 
-+ */ -+reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key) -+{ -+ assert("nikita-1634", node != NULL); -+ assert("nikita-1635", key != NULL); -+ -+ if (!node_is_empty(node)) { -+ coord_t first_item; -+ -+ coord_init_first_unit(&first_item, (znode *) node); -+ item_key_by_coord(&first_item, key); -+ } else -+ *key = *reiser4_max_key(); -+ return key; -+} -+ -+node_plugin node_plugins[LAST_NODE_ID] = { -+ [NODE40_ID] = { -+ .h = { -+ .type_id = REISER4_NODE_PLUGIN_TYPE, -+ .id = NODE40_ID, -+ .pops = NULL, -+ .label = "unified", -+ .desc = "unified node layout", -+ .linkage = {NULL, NULL} -+ }, -+ .item_overhead = item_overhead_node40, -+ .free_space = free_space_node40, -+ .lookup = lookup_node40, -+ .num_of_items = num_of_items_node40, -+ .item_by_coord = item_by_coord_node40, -+ .length_by_coord = length_by_coord_node40, -+ .plugin_by_coord = plugin_by_coord_node40, -+ .key_at = key_at_node40, -+ .estimate = estimate_node40, -+ .check = check_node40, -+ .parse = parse_node40, -+ .init = init_node40, -+#ifdef GUESS_EXISTS -+ .guess = guess_node40, -+#endif -+ .change_item_size = change_item_size_node40, -+ .create_item = create_item_node40, -+ .update_item_key = update_item_key_node40, -+ .cut_and_kill = kill_node40, -+ .cut = cut_node40, -+ .shift = shift_node40, -+ .shrink_item = shrink_item_node40, -+ .fast_insert = fast_insert_node40, -+ .fast_paste = fast_paste_node40, -+ .fast_cut = fast_cut_node40, -+ .max_item_size = max_item_size_node40, -+ .prepare_removal = prepare_removal_node40, -+ .set_item_plugin = set_item_plugin_node40 -+ } -+}; -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/node/node.h linux-2.6.24/fs/reiser4/plugin/node/node.h ---- linux-2.6.24.orig/fs/reiser4/plugin/node/node.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/node/node.h 
2008-01-25 11:39:07.044235509 +0300 -@@ -0,0 +1,272 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* We need a definition of the default node layout here. */ -+ -+/* Generally speaking, it is best to have free space in the middle of the -+ node so that two sets of things can grow towards it, and to have the -+ item bodies on the left so that the last one of them grows into free -+ space. We optimize for the case where we append new items to the end -+ of the node, or grow the last item, because it hurts nothing to so -+ optimize and it is a common special case to do massive insertions in -+ increasing key order (and one of cases more likely to have a real user -+ notice the delay time for). -+ -+ formatted leaf default layout: (leaf1) -+ -+ |node header:item bodies:free space:key + pluginid + item offset| -+ -+ We grow towards the middle, optimizing layout for the case where we -+ append new items to the end of the node. The node header is fixed -+ length. Keys, and item offsets plus pluginids for the items -+ corresponding to them are in increasing key order, and are fixed -+ length. Item offsets are relative to start of node (16 bits creating -+ a node size limit of 64k, 12 bits might be a better choice....). Item -+ bodies are in decreasing key order. Item bodies have a variable size. -+ There is a one to one to one mapping of keys to item offsets to item -+ bodies. Item offsets consist of pointers to the zeroth byte of the -+ item body. Item length equals the start of the next item minus the -+ start of this item, except the zeroth item whose length equals the end -+ of the node minus the start of that item (plus a byte). In other -+ words, the item length is not recorded anywhere, and it does not need -+ to be since it is computable. 
-+ -+ Leaf variable length items and keys layout : (lvar) -+ -+ |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies| -+ -+ We grow towards the middle, optimizing layout for the case where we -+ append new items to the end of the node. The node header is fixed -+ length. Keys and item offsets for the items corresponding to them are -+ in increasing key order, and keys are variable length. Item offsets -+ are relative to start of node (16 bits). Item bodies are in -+ decreasing key order. Item bodies have a variable size. There is a -+ one to one to one mapping of keys to item offsets to item bodies. -+ Item offsets consist of pointers to the zeroth byte of the item body. -+ Item length equals the start of the next item's key minus the start of -+ this item, except the zeroth item whose length equals the end of the -+ node minus the start of that item (plus a byte). -+ -+ leaf compressed keys layout: (lcomp) -+ -+ |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies| -+ -+ We grow towards the middle, optimizing layout for the case where we -+ append new items to the end of the node. The node header is fixed -+ length. Keys and item offsets for the items corresponding to them are -+ in increasing key order, and keys are variable length. The "key -+ inherit" field indicates how much of the key prefix is identical to -+ the previous key (stem compression as described in "Managing -+ Gigabytes" is used). key_inherit is a one byte integer. The -+ intra-node searches performed through this layout are linear searches, -+ and this is theorized to not hurt performance much due to the high -+ cost of processor stalls on modern CPUs, and the small number of keys -+ in a single node. Item offsets are relative to start of node (16 -+ bits). Item bodies are in decreasing key order. Item bodies have a -+ variable size. There is a one to one to one mapping of keys to item -+ offsets to item bodies. 
Item offsets consist of pointers to the -+ zeroth byte of the item body. Item length equals the start of the -+ next item minus the start of this item, except the zeroth item whose -+ length equals the end of the node minus the start of that item (plus a -+ byte). In other words, item length and key length is not recorded -+ anywhere, and it does not need to be since it is computable. -+ -+ internal node default layout: (idef1) -+ -+ just like ldef1 except that item bodies are either blocknrs of -+ children or extents, and moving them may require updating parent -+ pointers in the nodes that they point to. -+*/ -+ -+/* There is an inherent 3-way tradeoff between optimizing and -+ exchanging disks between different architectures and code -+ complexity. This is optimal and simple and inexchangeable. -+ Someone else can do the code for exchanging disks and make it -+ complex. It would not be that hard. Using other than the PAGE_SIZE -+ might be suboptimal. -+*/ -+ -+#if !defined( __REISER4_NODE_H__ ) -+#define __REISER4_NODE_H__ -+ -+#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE -+ -+#include "../../dformat.h" -+#include "../plugin_header.h" -+ -+#include -+ -+typedef enum { -+ NS_FOUND = 0, -+ NS_NOT_FOUND = -ENOENT -+} node_search_result; -+ -+/* Maximal possible space overhead for creation of new item in a node */ -+#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 ) -+ -+typedef enum { -+ REISER4_NODE_DKEYS = (1 << 0), -+ REISER4_NODE_TREE_STABLE = (1 << 1) -+} reiser4_node_check_flag; -+ -+/* cut and cut_and_kill have too long list of parameters. 
This structure is just to safe some space on stack */ -+struct cut_list { -+ coord_t *from; -+ coord_t *to; -+ const reiser4_key *from_key; -+ const reiser4_key *to_key; -+ reiser4_key *smallest_removed; -+ carry_plugin_info *info; -+ __u32 flags; -+ struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */ -+ lock_handle *left; -+ lock_handle *right; -+}; -+ -+struct carry_cut_data; -+struct carry_kill_data; -+ -+/* The responsibility of the node plugin is to store and give access -+ to the sequence of items within the node. */ -+typedef struct node_plugin { -+ /* generic plugin fields */ -+ plugin_header h; -+ -+ /* calculates the amount of space that will be required to store an -+ item which is in addition to the space consumed by the item body. -+ (the space consumed by the item body can be gotten by calling -+ item->estimate) */ -+ size_t(*item_overhead) (const znode * node, flow_t * f); -+ -+ /* returns free space by looking into node (i.e., without using -+ znode->free_space). */ -+ size_t(*free_space) (znode * node); -+ /* search within the node for the one item which might -+ contain the key, invoking item->search_within to search within -+ that item to see if it is in there */ -+ node_search_result(*lookup) (znode * node, const reiser4_key * key, -+ lookup_bias bias, coord_t * coord); -+ /* number of items in node */ -+ int (*num_of_items) (const znode * node); -+ -+ /* store information about item in @coord in @data */ -+ /* break into several node ops, don't add any more uses of this before doing so */ -+ /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */ -+ char *(*item_by_coord) (const coord_t * coord); -+ int (*length_by_coord) (const coord_t * coord); -+ item_plugin *(*plugin_by_coord) (const coord_t * coord); -+ -+ /* store item key in @key */ -+ reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key); -+ /* conservatively estimate whether unit of what size can fit -+ into node. 
This estimation should be performed without -+ actually looking into the node's content (free space is saved in -+ znode). */ -+ size_t(*estimate) (znode * node); -+ -+ /* performs every consistency check the node plugin author could -+ imagine. Optional. */ -+ int (*check) (const znode * node, __u32 flags, const char **error); -+ -+ /* Called when node is read into memory and node plugin is -+ already detected. This should read some data into znode (like free -+ space counter) and, optionally, check data consistency. -+ */ -+ int (*parse) (znode * node); -+ /* This method is called on a new node to initialise plugin specific -+ data (header, etc.) */ -+ int (*init) (znode * node); -+ /* Check whether @node content conforms to this plugin format. -+ Probably only useful after support for old V3.x formats is added. -+ Uncomment after 4.0 only. -+ */ -+ /* int ( *guess )( const znode *node ); */ -+#if REISER4_DEBUG -+ void (*print) (const char *prefix, const znode * node, __u32 flags); -+#endif -+ /* change size of @item by @by bytes. @item->node has enough free -+ space. When @by > 0 - free space is appended to end of item. When -+ @by < 0 - item is truncated - it is assumed that last @by bytes if -+ the item are freed already */ -+ void (*change_item_size) (coord_t * item, int by); -+ -+ /* create new item @length bytes long in coord @target */ -+ int (*create_item) (coord_t * target, const reiser4_key * key, -+ reiser4_item_data * data, carry_plugin_info * info); -+ -+ /* update key of item. */ -+ void (*update_item_key) (coord_t * target, const reiser4_key * key, -+ carry_plugin_info * info); -+ -+ int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *); -+ int (*cut) (struct carry_cut_data *, carry_plugin_info *); -+ -+ /* -+ * shrink item pointed to by @coord by @delta bytes. -+ */ -+ int (*shrink_item) (coord_t * coord, int delta); -+ -+ /* copy as much as possible but not more than up to @stop from -+ @stop->node to @target. 
If (pend == append) then data from beginning of -+ @stop->node are copied to the end of @target. If (pend == prepend) then -+ data from the end of @stop->node are copied to the beginning of -+ @target. Copied data are removed from @stop->node. Information -+ about what to do on upper level is stored in @todo */ -+ int (*shift) (coord_t * stop, znode * target, shift_direction pend, -+ int delete_node, int including_insert_coord, -+ carry_plugin_info * info); -+ /* return true if this node allows skip carry() in some situations -+ (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format -+ emulation doesn't. -+ -+ This will speedup insertions that doesn't require updates to the -+ parent, by bypassing initialisation of carry() structures. It's -+ believed that majority of insertions will fit there. -+ -+ */ -+ int (*fast_insert) (const coord_t * coord); -+ int (*fast_paste) (const coord_t * coord); -+ int (*fast_cut) (const coord_t * coord); -+ /* this limits max size of item which can be inserted into a node and -+ number of bytes item in a node may be appended with */ -+ int (*max_item_size) (void); -+ int (*prepare_removal) (znode * empty, carry_plugin_info * info); -+ /* change plugin id of items which are in a node already. Currently it is Used in tail conversion for regular -+ * files */ -+ int (*set_item_plugin) (coord_t * coord, item_id); -+} node_plugin; -+ -+typedef enum { -+ /* standard unified node layout used for both leaf and internal -+ nodes */ -+ NODE40_ID, -+ LAST_NODE_ID -+} reiser4_node_id; -+ -+extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key); -+#if REISER4_DEBUG -+extern void print_node_content(const char *prefix, const znode * node, -+ __u32 flags); -+#endif -+ -+extern void indent_znode(const znode * node); -+ -+typedef struct common_node_header { -+ /* -+ * identifier of node plugin. Must be located at the very beginning of -+ * a node. 
-+ */ -+ __le16 plugin_id; -+} common_node_header; -+ -+/* __REISER4_NODE_H__ */ -+#endif -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/object.c linux-2.6.24/fs/reiser4/plugin/object.c ---- linux-2.6.24.orig/fs/reiser4/plugin/object.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/object.c 2008-01-25 11:39:07.044235509 +0300 -@@ -0,0 +1,531 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ * Examples of object plugins: file, directory, symlink, special file. -+ * -+ * Plugins associated with inode: -+ * -+ * Plugin of inode is plugin referenced by plugin-id field of on-disk -+ * stat-data. How we store this plugin in in-core inode is not -+ * important. Currently pointers are used, another variant is to store offsets -+ * and do array lookup on each access. -+ * -+ * Now, each inode has one selected plugin: object plugin that -+ * determines what type of file this object is: directory, regular etc. -+ * -+ * This main plugin can use other plugins that are thus subordinated to -+ * it. Directory instance of object plugin uses hash; regular file -+ * instance uses tail policy plugin. -+ * -+ * Object plugin is either taken from id in stat-data or guessed from -+ * i_mode bits. Once it is established we ask it to install its -+ * subordinate plugins, by looking again in stat-data or inheriting them -+ * from parent. -+ * -+ * How new inode is initialized during ->read_inode(): -+ * 1 read stat-data and initialize inode fields: i_size, i_mode, -+ * i_generation, capabilities etc. -+ * 2 read plugin id from stat data or try to guess plugin id -+ * from inode->i_mode bits if plugin id is missing. -+ * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields. 
-+ * -+ * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What -+ * if stat data does contain i_size, etc., due to it being an unusual plugin? -+ * -+ * 4 Call ->activate() method of object's plugin. Plugin is either read from -+ * from stat-data or guessed from mode bits -+ * 5 Call ->inherit() method of object plugin to inherit as yet un initialized -+ * plugins from parent. -+ * -+ * Easy induction proves that on last step all plugins of inode would be -+ * initialized. -+ * -+ * When creating new object: -+ * 1 obtain object plugin id (see next period) -+ * NIKITA-FIXME-HANS: period? -+ * 2 ->install() this plugin -+ * 3 ->inherit() the rest from the parent -+ * -+ * We need some examples of creating an object with default and non-default -+ * plugin ids. Nikita, please create them. -+ */ -+ -+#include "../inode.h" -+ -+static int _bugop(void) -+{ -+ BUG_ON(1); -+ return 0; -+} -+ -+#define bugop ((void *)_bugop) -+ -+static int _dummyop(void) -+{ -+ return 0; -+} -+ -+#define dummyop ((void *)_dummyop) -+ -+static int change_file(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ /* cannot change object plugin of already existing object */ -+ if (memb == PSET_FILE) -+ return RETERR(-EINVAL); -+ -+ /* Change PSET_CREATE */ -+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin); -+} -+ -+static reiser4_plugin_ops file_plugin_ops = { -+ .change = change_file -+}; -+ -+static struct inode_operations null_i_ops = {.create = NULL}; -+static struct file_operations null_f_ops = {.owner = NULL}; -+static struct address_space_operations null_a_ops = {.writepage = NULL}; -+ -+/* VFS methods for regular files */ -+static struct inode_operations regular_file_i_ops = { -+ .permission = reiser4_permission_common, -+ .setattr = reiser4_setattr, -+ .getattr = reiser4_getattr_common -+}; -+static struct file_operations regular_file_f_ops = { -+ .llseek = generic_file_llseek, -+ .read = reiser4_read_careful, -+ 
.write = reiser4_write_careful, -+ .aio_read = generic_file_aio_read, -+ .ioctl = reiser4_ioctl_careful, -+ .mmap = reiser4_mmap_careful, -+ .open = reiser4_open_careful, -+ .release = reiser4_release_careful, -+ .fsync = reiser4_sync_file_common, -+ .splice_read = generic_file_splice_read, -+ .splice_write = generic_file_splice_write -+}; -+static struct address_space_operations regular_file_a_ops = { -+ .writepage = reiser4_writepage, -+ .readpage = reiser4_readpage, -+ .sync_page = block_sync_page, -+ .writepages = reiser4_writepages, -+ .set_page_dirty = reiser4_set_page_dirty, -+ .readpages = reiser4_readpages, -+ .prepare_write = reiser4_prepare_write, -+ .commit_write = reiser4_commit_write, -+ .bmap = reiser4_bmap_careful, -+ .invalidatepage = reiser4_invalidatepage, -+ .releasepage = reiser4_releasepage -+}; -+ -+/* VFS methods for symlink files */ -+static struct inode_operations symlink_file_i_ops = { -+ .readlink = generic_readlink, -+ .follow_link = reiser4_follow_link_common, -+ .permission = reiser4_permission_common, -+ .setattr = reiser4_setattr_common, -+ .getattr = reiser4_getattr_common -+}; -+ -+/* VFS methods for special files */ -+static struct inode_operations special_file_i_ops = { -+ .permission = reiser4_permission_common, -+ .setattr = reiser4_setattr_common, -+ .getattr = reiser4_getattr_common -+}; -+ -+/* VFS methods for directories */ -+static struct inode_operations directory_i_ops = { -+ .create = reiser4_create_common, -+ .lookup = reiser4_lookup_common, -+ .link = reiser4_link_common, -+ .unlink = reiser4_unlink_common, -+ .symlink = reiser4_symlink_common, -+ .mkdir = reiser4_mkdir_common, -+ .rmdir = reiser4_unlink_common, -+ .mknod = reiser4_mknod_common, -+ .rename = reiser4_rename_common, -+ .permission = reiser4_permission_common, -+ .setattr = reiser4_setattr_common, -+ .getattr = reiser4_getattr_common -+}; -+static struct file_operations directory_f_ops = { -+ .llseek = reiser4_llseek_dir_common, -+ .read = 
generic_read_dir, -+ .readdir = reiser4_readdir_common, -+ .release = reiser4_release_dir_common, -+ .fsync = reiser4_sync_common -+}; -+static struct address_space_operations directory_a_ops = { -+ .writepage = bugop, -+ .sync_page = bugop, -+ .writepages = dummyop, -+ .set_page_dirty = bugop, -+ .readpages = bugop, -+ .prepare_write = bugop, -+ .commit_write = bugop, -+ .bmap = bugop, -+ .invalidatepage = bugop, -+ .releasepage = bugop -+}; -+ -+/* -+ * Definitions of object plugins. -+ */ -+ -+file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = { -+ [UNIX_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = UNIX_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_REGULAR_FILE), -+ .pops = &file_plugin_ops, -+ .label = "reg", -+ .desc = "regular file", -+ .linkage = {NULL, NULL}, -+ }, -+ /* -+ * invariant vfs ops -+ */ -+ .inode_ops = ®ular_file_i_ops, -+ .file_ops = ®ular_file_f_ops, -+ .as_ops = ®ular_file_a_ops, -+ /* -+ * private i_ops -+ */ -+ .setattr = setattr_unix_file, -+ .open = open_unix_file, -+ .read = read_unix_file, -+ .write = write_unix_file, -+ .ioctl = ioctl_unix_file, -+ .mmap = mmap_unix_file, -+ .release = release_unix_file, -+ /* -+ * private f_ops -+ */ -+ .readpage = readpage_unix_file, -+ .readpages = readpages_unix_file, -+ .writepages = writepages_unix_file, -+ .prepare_write = prepare_write_unix_file, -+ .commit_write = commit_write_unix_file, -+ /* -+ * private a_ops -+ */ -+ .bmap = bmap_unix_file, -+ /* -+ * other private methods -+ */ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .flow_by_inode = flow_by_inode_unix_file, -+ .key_by_inode = key_by_inode_and_offset_common, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_common, -+ .create_object = reiser4_create_object_common, -+ .delete_object = delete_object_unix_file, -+ .add_link = reiser4_add_link_common, -+ .rem_link = reiser4_rem_link_common, -+ .owns_item = owns_item_unix_file, -+ .can_add_link = 
can_add_link_common, -+ .detach = dummyop, -+ .bind = dummyop, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common -+ }, -+ .init_inode_data = init_inode_data_unix_file, -+ .cut_tree_worker = cut_tree_worker_common, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ } -+ }, -+ [DIRECTORY_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = DIRECTORY_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_DIRECTORY_FILE), -+ .pops = &file_plugin_ops, -+ .label = "dir", -+ .desc = "directory", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = &null_i_ops, -+ .file_ops = &null_f_ops, -+ .as_ops = &null_a_ops, -+ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .flow_by_inode = bugop, -+ .key_by_inode = bugop, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_common_dir, -+ .create_object = reiser4_create_object_common, -+ .delete_object = reiser4_delete_dir_common, -+ .add_link = reiser4_add_link_common, -+ .rem_link = rem_link_common_dir, -+ .owns_item = owns_item_common_dir, -+ .can_add_link = can_add_link_common, -+ .can_rem_link = can_rem_link_common_dir, -+ .detach = reiser4_detach_common_dir, -+ .bind = reiser4_bind_common_dir, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common_dir, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common_dir -+ }, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ }, -+ .init_inode_data = init_inode_ordering, -+ .cut_tree_worker = cut_tree_worker_common, -+ }, -+ [SYMLINK_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = SYMLINK_FILE_PLUGIN_ID, -+ .groups = (1 << 
REISER4_SYMLINK_FILE), -+ .pops = &file_plugin_ops, -+ .label = "symlink", -+ .desc = "symbolic link", -+ .linkage = {NULL,NULL} -+ }, -+ .inode_ops = &symlink_file_i_ops, -+ /* inode->i_fop of symlink is initialized -+ by NULL in setup_inode_ops */ -+ .file_ops = &null_f_ops, -+ .as_ops = &null_a_ops, -+ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_common, -+ .create_object = reiser4_create_symlink, -+ .delete_object = reiser4_delete_object_common, -+ .add_link = reiser4_add_link_common, -+ .rem_link = reiser4_rem_link_common, -+ .can_add_link = can_add_link_common, -+ .detach = dummyop, -+ .bind = dummyop, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common -+ }, -+ .init_inode_data = init_inode_ordering, -+ .cut_tree_worker = cut_tree_worker_common, -+ .destroy_inode = destroy_inode_symlink, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ } -+ }, -+ [SPECIAL_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = SPECIAL_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_SPECIAL_FILE), -+ .pops = &file_plugin_ops, -+ .label = "special", -+ .desc = -+ "special: fifo, device or socket", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = &special_file_i_ops, -+ /* file_ops of special files (sockets, block, char, fifo) are -+ initialized by init_special_inode. 
*/ -+ .file_ops = &null_f_ops, -+ .as_ops = &null_a_ops, -+ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .set_plug_in_inode = set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_common, -+ .create_object = reiser4_create_object_common, -+ .delete_object = reiser4_delete_object_common, -+ .add_link = reiser4_add_link_common, -+ .rem_link = reiser4_rem_link_common, -+ .owns_item = owns_item_common, -+ .can_add_link = can_add_link_common, -+ .detach = dummyop, -+ .bind = dummyop, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common -+ }, -+ .init_inode_data = init_inode_ordering, -+ .cut_tree_worker = cut_tree_worker_common, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ } -+ }, -+ [CRYPTCOMPRESS_FILE_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .id = CRYPTCOMPRESS_FILE_PLUGIN_ID, -+ .groups = (1 << REISER4_REGULAR_FILE), -+ .pops = &file_plugin_ops, -+ .label = "cryptcompress", -+ .desc = "cryptcompress file", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = ®ular_file_i_ops, -+ .file_ops = ®ular_file_f_ops, -+ .as_ops = ®ular_file_a_ops, -+ -+ .setattr = setattr_cryptcompress, -+ .open = open_cryptcompress, -+ .read = read_cryptcompress, -+ .write = write_cryptcompress, -+ .ioctl = ioctl_cryptcompress, -+ .mmap = mmap_cryptcompress, -+ .release = release_cryptcompress, -+ -+ .readpage = readpage_cryptcompress, -+ .readpages = readpages_cryptcompress, -+ .writepages = writepages_cryptcompress, -+ .prepare_write = prepare_write_cryptcompress, -+ .commit_write = commit_write_cryptcompress, -+ -+ .bmap = bmap_cryptcompress, -+ -+ .write_sd_by_inode = write_sd_by_inode_common, -+ .flow_by_inode = flow_by_inode_cryptcompress, -+ .key_by_inode = key_by_inode_cryptcompress, -+ .set_plug_in_inode = 
set_plug_in_inode_common, -+ .adjust_to_parent = adjust_to_parent_cryptcompress, -+ .create_object = create_object_cryptcompress, -+ .delete_object = delete_object_cryptcompress, -+ .add_link = reiser4_add_link_common, -+ .rem_link = reiser4_rem_link_common, -+ .owns_item = owns_item_common, -+ .can_add_link = can_add_link_common, -+ .detach = dummyop, -+ .bind = dummyop, -+ .safelink = safelink_common, -+ .estimate = { -+ .create = estimate_create_common, -+ .update = estimate_update_common, -+ .unlink = estimate_unlink_common -+ }, -+ .init_inode_data = init_inode_data_cryptcompress, -+ .cut_tree_worker = cut_tree_worker_cryptcompress, -+ .destroy_inode = destroy_inode_cryptcompress, -+ .wire = { -+ .write = wire_write_common, -+ .read = wire_read_common, -+ .get = wire_get_common, -+ .size = wire_size_common, -+ .done = wire_done_common -+ } -+ } -+}; -+ -+static int change_dir(struct inode *inode, -+ reiser4_plugin * plugin, -+ pset_member memb) -+{ -+ /* cannot change dir plugin of already existing object */ -+ return RETERR(-EINVAL); -+} -+ -+static reiser4_plugin_ops dir_plugin_ops = { -+ .change = change_dir -+}; -+ -+/* -+ * definition of directory plugins -+ */ -+ -+dir_plugin dir_plugins[LAST_DIR_ID] = { -+ /* standard hashed directory plugin */ -+ [HASHED_DIR_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_DIR_PLUGIN_TYPE, -+ .id = HASHED_DIR_PLUGIN_ID, -+ .pops = &dir_plugin_ops, -+ .label = "dir", -+ .desc = "hashed directory", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = &directory_i_ops, -+ .file_ops = &directory_f_ops, -+ .as_ops = &directory_a_ops, -+ -+ .get_parent = get_parent_common, -+ .is_name_acceptable = is_name_acceptable_common, -+ .build_entry_key = build_entry_key_hashed, -+ .build_readdir_key = build_readdir_key_common, -+ .add_entry = reiser4_add_entry_common, -+ .rem_entry = reiser4_rem_entry_common, -+ .init = reiser4_dir_init_common, -+ .done = reiser4_dir_done_common, -+ .attach = reiser4_attach_common, -+ .detach = 
reiser4_detach_common, -+ .estimate = { -+ .add_entry = estimate_add_entry_common, -+ .rem_entry = estimate_rem_entry_common, -+ .unlink = dir_estimate_unlink_common -+ } -+ }, -+ /* hashed directory for which seekdir/telldir are guaranteed to -+ * work. Brain-damage. */ -+ [SEEKABLE_HASHED_DIR_PLUGIN_ID] = { -+ .h = { -+ .type_id = REISER4_DIR_PLUGIN_TYPE, -+ .id = SEEKABLE_HASHED_DIR_PLUGIN_ID, -+ .pops = &dir_plugin_ops, -+ .label = "dir32", -+ .desc = "directory hashed with 31 bit hash", -+ .linkage = {NULL, NULL} -+ }, -+ .inode_ops = &directory_i_ops, -+ .file_ops = &directory_f_ops, -+ .as_ops = &directory_a_ops, -+ -+ .get_parent = get_parent_common, -+ .is_name_acceptable = is_name_acceptable_common, -+ .build_entry_key = build_entry_key_seekable, -+ .build_readdir_key = build_readdir_key_common, -+ .add_entry = reiser4_add_entry_common, -+ .rem_entry = reiser4_rem_entry_common, -+ .init = reiser4_dir_init_common, -+ .done = reiser4_dir_done_common, -+ .attach = reiser4_attach_common, -+ .detach = reiser4_detach_common, -+ .estimate = { -+ .add_entry = estimate_add_entry_common, -+ .rem_entry = estimate_rem_entry_common, -+ .unlink = dir_estimate_unlink_common -+ } -+ } -+}; -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/object.h linux-2.6.24/fs/reiser4/plugin/object.h ---- linux-2.6.24.orig/fs/reiser4/plugin/object.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/object.h 2008-01-25 11:39:07.044235509 +0300 -@@ -0,0 +1,121 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Declaration of object plugin functions. 
*/ -+ -+#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ ) -+#define __FS_REISER4_PLUGIN_OBJECT_H__ -+ -+#include "../type_safe_hash.h" -+ -+/* common implementations of inode operations */ -+int reiser4_create_common(struct inode *parent, struct dentry *dentry, -+ int mode, struct nameidata *); -+struct dentry * reiser4_lookup_common(struct inode *parent, -+ struct dentry *dentry, -+ struct nameidata *nameidata); -+int reiser4_link_common(struct dentry *existing, struct inode *parent, -+ struct dentry *newname); -+int reiser4_unlink_common(struct inode *parent, struct dentry *victim); -+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode); -+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry, -+ const char *linkname); -+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry, -+ int mode, dev_t rdev); -+int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name, -+ struct inode *new_dir, struct dentry *new_name); -+void *reiser4_follow_link_common(struct dentry *, struct nameidata *data); -+int reiser4_permission_common(struct inode *, int mask, -+ struct nameidata *nameidata); -+int reiser4_setattr_common(struct dentry *, struct iattr *); -+int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *, -+ struct kstat *); -+ -+/* common implementations of file operations */ -+loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin); -+int reiser4_readdir_common(struct file *, void *dirent, filldir_t); -+int reiser4_release_dir_common(struct inode *, struct file *); -+int reiser4_sync_common(struct file *, struct dentry *, int datasync); -+ -+/* common implementations of address space operations */ -+int prepare_write_common(struct file *, struct page *, unsigned from, -+ unsigned to); -+ -+/* file plugin operations: common implementations */ -+int write_sd_by_inode_common(struct inode *); -+int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *); -+int 
set_plug_in_inode_common(struct inode *object, struct inode *parent, -+ reiser4_object_create_data *); -+int adjust_to_parent_common(struct inode *object, struct inode *parent, -+ struct inode *root); -+int adjust_to_parent_common_dir(struct inode *object, struct inode *parent, -+ struct inode *root); -+int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent, -+ struct inode *root); -+int reiser4_create_object_common(struct inode *object, struct inode *parent, -+ reiser4_object_create_data *); -+int reiser4_delete_object_common(struct inode *); -+int reiser4_delete_dir_common(struct inode *); -+int reiser4_add_link_common(struct inode *object, struct inode *parent); -+int reiser4_rem_link_common(struct inode *object, struct inode *parent); -+int rem_link_common_dir(struct inode *object, struct inode *parent); -+int owns_item_common(const struct inode *, const coord_t *); -+int owns_item_common_dir(const struct inode *, const coord_t *); -+int can_add_link_common(const struct inode *); -+int can_rem_link_common_dir(const struct inode *); -+int reiser4_detach_common_dir(struct inode *child, struct inode *parent); -+int reiser4_bind_common_dir(struct inode *child, struct inode *parent); -+int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value); -+reiser4_block_nr estimate_create_common(const struct inode *); -+reiser4_block_nr estimate_create_common_dir(const struct inode *); -+reiser4_block_nr estimate_update_common(const struct inode *); -+reiser4_block_nr estimate_unlink_common(const struct inode *, -+ const struct inode *); -+reiser4_block_nr estimate_unlink_common_dir(const struct inode *, -+ const struct inode *); -+char *wire_write_common(struct inode *, char *start); -+char *wire_read_common(char *addr, reiser4_object_on_wire *); -+struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *); -+int wire_size_common(struct inode *); -+void wire_done_common(reiser4_object_on_wire *); -+ -+/* dir plugin 
operations: common implementations */ -+struct dentry *get_parent_common(struct inode *child); -+int is_name_acceptable_common(const struct inode *, const char *name, int len); -+void build_entry_key_common(const struct inode *, -+ const struct qstr *qname, reiser4_key *); -+int build_readdir_key_common(struct file *dir, reiser4_key *); -+int reiser4_add_entry_common(struct inode *object, struct dentry *where, -+ reiser4_object_create_data *, reiser4_dir_entry_desc *); -+int reiser4_rem_entry_common(struct inode *object, struct dentry *where, -+ reiser4_dir_entry_desc *); -+int reiser4_dir_init_common(struct inode *object, struct inode *parent, -+ reiser4_object_create_data *); -+int reiser4_dir_done_common(struct inode *); -+int reiser4_attach_common(struct inode *child, struct inode *parent); -+int reiser4_detach_common(struct inode *object, struct inode *parent); -+reiser4_block_nr estimate_add_entry_common(const struct inode *); -+reiser4_block_nr estimate_rem_entry_common(const struct inode *); -+reiser4_block_nr dir_estimate_unlink_common(const struct inode *, -+ const struct inode *); -+ -+/* these are essential parts of common implementations, they are to make -+ customized implementations easier */ -+int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to); -+ -+/* merely useful functions */ -+int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *, -+ const reiser4_key *, int silent); -+ -+/* __FS_REISER4_PLUGIN_OBJECT_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/plugin.c linux-2.6.24/fs/reiser4/plugin/plugin.c ---- linux-2.6.24.orig/fs/reiser4/plugin/plugin.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/plugin.c 2008-01-25 11:39:07.048236540 +0300 -@@ -0,0 +1,559 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Basic plugin infrastructure, lookup etc. */ -+ -+/* PLUGINS: -+ -+ Plugins are internal Reiser4 "modules" or "objects" used to increase -+ extensibility and allow external users to easily adapt reiser4 to -+ their needs. -+ -+ Plugins are classified into several disjoint "types". Plugins -+ belonging to the particular plugin type are termed "instances" of -+ this type. Existing types are listed by enum reiser4_plugin_type -+ (see plugin/plugin_header.h) -+ -+NIKITA-FIXME-HANS: update this list, and review this entire comment for currency -+ -+ Object (file) plugin determines how given file-system object serves -+ standard VFS requests for read, write, seek, mmap etc. Instances of -+ file plugins are: regular file, directory, symlink. Another example -+ of file plugin is audit plugin, that optionally records accesses to -+ underlying object and forwards requests to it. -+ -+ Hash plugins compute hashes used by reiser4 to store and locate -+ files within directories. Instances of hash plugin type are: r5, -+ tea, rupasov. -+ -+ Tail plugins (or, more precisely, tail policy plugins) determine -+ when last part of the file should be stored in a formatted item. -+ -+ Scope and lookup: -+ -+ label such that pair ( type_label, plugin_label ) is unique. This -+ pair is a globally persistent and user-visible plugin -+ identifier. 
Internally kernel maintains plugins and plugin types in -+ arrays using an index into those arrays as plugin and plugin type -+ identifiers. File-system in turn, also maintains persistent -+ "dictionary" which is mapping from plugin label to numerical -+ identifier which is stored in file-system objects. That is, we -+ store the offset into the plugin array for that plugin type as the -+ plugin id in the stat data of the filesystem object. -+ -+ Internal kernel plugin type identifier (index in plugins[] array) is -+ of type reiser4_plugin_type. Set of available plugin types is -+ currently static, but dynamic loading doesn't seem to pose -+ insurmountable problems. -+ -+ Within each type plugins are addressed by the identifiers of type -+ reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]). -+ Such identifiers are only required to be unique within one type, -+ not globally. -+ -+ Thus, plugin in memory is uniquely identified by the pair (type_id, -+ id). -+ -+ Usage: -+ -+ There exists only one instance of each plugin instance, but this -+ single instance can be associated with many entities (file-system -+ objects, items, nodes, transactions, file-descriptors etc.). Entity -+ to which plugin of given type is termed (due to the lack of -+ imagination) "subject" of this plugin type and, by abuse of -+ terminology, subject of particular instance of this type to which -+ it's attached currently. For example, inode is subject of object -+ plugin type. Inode representing directory is subject of directory -+ plugin, hash plugin type and some particular instance of hash plugin -+ type. Inode, representing regular file is subject of "regular file" -+ plugin, tail-policy plugin type etc. -+ -+ With each subject the plugin possibly stores some state. For example, -+ the state of a directory plugin (instance of object plugin type) is pointer -+ to hash plugin (if directories always use hashing that is). 
-+ -+ Interface: -+ -+ In addition to a scalar identifier, each plugin type and plugin -+ proper has a "label": short string and a "description"---longer -+ descriptive string. Labels and descriptions of plugin types are -+ hard-coded into plugins[] array, declared and defined in -+ plugin.c. Label and description of plugin are stored in .label and -+ .desc fields of reiser4_plugin_header respectively. It's possible to -+ locate plugin by the pair of labels. -+ -+ Features (not implemented): -+ -+ . user-level plugin manipulations: -+ + reiser4("filename/..file_plugin<='audit'"); -+ + write(open("filename/..file_plugin"), "audit", 8); -+ -+ . user level utilities lsplug and chplug to manipulate plugins. -+ Utilities are not of primary priority. Possibly they will be not -+ working on v4.0 -+ -+ NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount -+ option, do you agree? I don't think that specifying it at mount time, -+ and then changing it with each mount, is a good model for usage. -+ -+ . mount option "plug" to set-up plugins of root-directory. -+ "plug=foo:bar" will set "bar" as default plugin of type "foo". -+ -+ Limitations: -+ -+ . each plugin type has to provide at least one builtin -+ plugin. This is technical limitation and it can be lifted in the -+ future. -+ -+ TODO: -+ -+ New plugin types/plugings: -+ Things we should be able to separately choose to inherit: -+ -+ security plugins -+ -+ stat data -+ -+ file bodies -+ -+ file plugins -+ -+ dir plugins -+ -+ . perm:acl -+ -+ . audi---audit plugin intercepting and possibly logging all -+ accesses to object. Requires to put stub functions in file_operations -+ in stead of generic_file_*. -+ -+NIKITA-FIXME-HANS: why make overflows a plugin? -+ . over---handle hash overflows -+ -+ . sqnt---handle different access patterns and instruments read-ahead -+ -+NIKITA-FIXME-HANS: describe the line below in more detail. -+ -+ . 
hier---handle inheritance of plugins along file-system hierarchy -+ -+ Different kinds of inheritance: on creation vs. on access. -+ Compatible/incompatible plugins. -+ Inheritance for multi-linked files. -+ Layered plugins. -+ Notion of plugin context is abandoned. -+ -+Each file is associated -+ with one plugin and dependant plugins (hash, etc.) are stored as -+ main plugin state. Now, if we have plugins used for regular files -+ but not for directories, how such plugins would be inherited? -+ . always store them with directories also -+ -+NIKTIA-FIXME-HANS: Do the line above. It is not exclusive of doing -+the line below which is also useful. -+ -+ . use inheritance hierarchy, independent of file-system namespace -+*/ -+ -+#include "../debug.h" -+#include "../dformat.h" -+#include "plugin_header.h" -+#include "item/static_stat.h" -+#include "node/node.h" -+#include "security/perm.h" -+#include "space/space_allocator.h" -+#include "disk_format/disk_format.h" -+#include "plugin.h" -+#include "../reiser4.h" -+#include "../jnode.h" -+#include "../inode.h" -+ -+#include /* for struct super_block */ -+ -+/* -+ * init_plugins - initialize plugin sub-system. -+ * Just call this once on reiser4 startup. -+ * -+ * Initializes plugin sub-system. It is part of reiser4 module -+ * initialization. For each plugin of each type init method is called and each -+ * plugin is put into list of plugins. -+ */ -+int init_plugins(void) -+{ -+ reiser4_plugin_type type_id; -+ -+ for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) { -+ struct reiser4_plugin_type_data *ptype; -+ int i; -+ -+ ptype = &plugins[type_id]; -+ assert("nikita-3508", ptype->label != NULL); -+ assert("nikita-3509", ptype->type_id == type_id); -+ -+ INIT_LIST_HEAD(&ptype->plugins_list); -+/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. 
*/ -+ for (i = 0; i < ptype->builtin_num; ++i) { -+ reiser4_plugin *plugin; -+ -+ plugin = plugin_at(ptype, i); -+ -+ if (plugin->h.label == NULL) -+ /* uninitialized slot encountered */ -+ continue; -+ assert("nikita-3445", plugin->h.type_id == type_id); -+ plugin->h.id = i; -+ if (plugin->h.pops != NULL && -+ plugin->h.pops->init != NULL) { -+ int result; -+ -+ result = plugin->h.pops->init(plugin); -+ if (result != 0) -+ return result; -+ } -+ INIT_LIST_HEAD(&plugin->h.linkage); -+ list_add_tail(&plugin->h.linkage, &ptype->plugins_list); -+ } -+ } -+ return 0; -+} -+ -+/* true if plugin type id is valid */ -+int is_plugin_type_valid(reiser4_plugin_type type) -+{ -+ /* "type" is unsigned, so no comparison with 0 is -+ necessary */ -+ return (type < REISER4_PLUGIN_TYPES); -+} -+ -+/* true if plugin id is valid */ -+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id) -+{ -+ assert("nikita-1653", is_plugin_type_valid(type)); -+ return id < plugins[type].builtin_num; -+} -+ -+/* return plugin by its @type and @id. -+ -+ Both arguments are checked for validness: this is supposed to be called -+ from user-level. -+ -+NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in -+user space, and passed to the filesystem by use of method files? Your -+comment really confused me on the first reading.... 
-+ -+*/ -+reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type -+ * unchecked */, -+ reiser4_plugin_id id /* plugin id, -+ * unchecked */) -+{ -+ if (is_plugin_type_valid(type)) { -+ if (is_plugin_id_valid(type, id)) -+ return plugin_at(&plugins[type], id); -+ else -+ /* id out of bounds */ -+ warning("nikita-2913", -+ "Invalid plugin id: [%i:%i]", type, id); -+ } else -+ /* type_id out of bounds */ -+ warning("nikita-2914", "Invalid type_id: %i", type); -+ return NULL; -+} -+ -+/** -+ * save_plugin_id - store plugin id in disk format -+ * @plugin: plugin to convert -+ * @area: where to store result -+ * -+ * Puts id of @plugin in little endian format to address @area. -+ */ -+int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ , -+ d16 *area /* where to store result */ ) -+{ -+ assert("nikita-1261", plugin != NULL); -+ assert("nikita-1262", area != NULL); -+ -+ put_unaligned(cpu_to_le16(plugin->h.id), area); -+ return 0; -+} -+ -+/* list of all plugins of given type */ -+struct list_head *get_plugin_list(reiser4_plugin_type type) -+{ -+ assert("nikita-1056", is_plugin_type_valid(type)); -+ return &plugins[type].plugins_list; -+} -+ -+static void update_pset_mask(reiser4_inode * info, pset_member memb) -+{ -+ struct dentry *rootdir; -+ reiser4_inode *root; -+ -+ assert("edward-1443", memb != PSET_FILE); -+ -+ rootdir = inode_by_reiser4_inode(info)->i_sb->s_root; -+ if (rootdir != NULL) { -+ root = reiser4_inode_data(rootdir->d_inode); -+ /* -+ * if inode is different from the default one, or we are -+ * changing plugin of root directory, update plugin_mask -+ */ -+ if (aset_get(info->pset, memb) != -+ aset_get(root->pset, memb) || -+ info == root) -+ info->plugin_mask |= (1 << memb); -+ else -+ info->plugin_mask &= ~(1 << memb); -+ } -+} -+ -+/* Get specified plugin set member from parent, -+ or from fs-defaults (if no parent is given) and -+ install the result to pset of @self */ -+int grab_plugin_pset(struct inode *self, -+ 
struct inode *ancestor, -+ pset_member memb) -+{ -+ reiser4_plugin *plug; -+ reiser4_inode *info; -+ int result = 0; -+ -+ /* Do not grab if initialised already. */ -+ info = reiser4_inode_data(self); -+ if (aset_get(info->pset, memb) != NULL) -+ return 0; -+ if (ancestor) { -+ reiser4_inode *parent; -+ -+ parent = reiser4_inode_data(ancestor); -+ plug = aset_get(parent->hset, memb) ? : -+ aset_get(parent->pset, memb); -+ } -+ else -+ plug = get_default_plugin(memb); -+ -+ result = set_plugin(&info->pset, memb, plug); -+ if (result == 0) { -+ if (!ancestor || self->i_sb->s_root->d_inode != self) -+ update_pset_mask(info, memb); -+ } -+ return result; -+} -+ -+/* Take missing pset members from root inode */ -+int finish_pset(struct inode *inode) -+{ -+ reiser4_plugin *plug; -+ reiser4_inode *root; -+ reiser4_inode *info; -+ pset_member memb; -+ int result = 0; -+ -+ root = reiser4_inode_data(inode->i_sb->s_root->d_inode); -+ info = reiser4_inode_data(inode); -+ -+ assert("edward-1455", root != NULL); -+ assert("edward-1456", info != NULL); -+ -+ /* file and directory plugins are already initialized. */ -+ for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) { -+ -+ /* Do not grab if initialised already. */ -+ if (aset_get(info->pset, memb) != NULL) -+ continue; -+ -+ plug = aset_get(root->pset, memb); -+ result = set_plugin(&info->pset, memb, plug); -+ if (result != 0) -+ break; -+ } -+ if (result != 0) { -+ warning("nikita-3447", -+ "Cannot set up plugins for %lli", -+ (unsigned long long) -+ get_inode_oid(inode)); -+ } -+ return result; -+} -+ -+int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin * plug) -+{ -+ reiser4_inode *info; -+ int result = 0; -+ -+ if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) { -+ /* Changing pset in the root object. 
*/ -+ return RETERR(-EINVAL); -+ } -+ -+ info = reiser4_inode_data(self); -+ if (plug->h.pops != NULL && plug->h.pops->change != NULL) -+ result = plug->h.pops->change(self, plug, memb); -+ else -+ result = aset_set_unsafe(&info->pset, memb, plug); -+ if (result == 0) { -+ __u16 oldmask = info->plugin_mask; -+ -+ update_pset_mask(info, memb); -+ if (oldmask != info->plugin_mask) -+ reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN); -+ } -+ return result; -+} -+ -+struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = { -+ /* C90 initializers */ -+ [REISER4_FILE_PLUGIN_TYPE] = { -+ .type_id = REISER4_FILE_PLUGIN_TYPE, -+ .label = "file", -+ .desc = "Object plugins", -+ .builtin_num = sizeof_array(file_plugins), -+ .builtin = file_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(file_plugin) -+ }, -+ [REISER4_DIR_PLUGIN_TYPE] = { -+ .type_id = REISER4_DIR_PLUGIN_TYPE, -+ .label = "dir", -+ .desc = "Directory plugins", -+ .builtin_num = sizeof_array(dir_plugins), -+ .builtin = dir_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(dir_plugin) -+ }, -+ [REISER4_HASH_PLUGIN_TYPE] = { -+ .type_id = REISER4_HASH_PLUGIN_TYPE, -+ .label = "hash", -+ .desc = "Directory hashes", -+ .builtin_num = sizeof_array(hash_plugins), -+ .builtin = hash_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(hash_plugin) -+ }, -+ [REISER4_FIBRATION_PLUGIN_TYPE] = { -+ .type_id = -+ REISER4_FIBRATION_PLUGIN_TYPE, -+ .label = "fibration", -+ .desc = "Directory fibrations", -+ .builtin_num = sizeof_array(fibration_plugins), -+ .builtin = fibration_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(fibration_plugin) -+ }, -+ [REISER4_CIPHER_PLUGIN_TYPE] = { -+ .type_id = REISER4_CIPHER_PLUGIN_TYPE, -+ .label = "cipher", -+ .desc = "Cipher plugins", -+ .builtin_num = sizeof_array(cipher_plugins), -+ .builtin = cipher_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(cipher_plugin) -+ }, -+ [REISER4_DIGEST_PLUGIN_TYPE] = { -+ .type_id = 
REISER4_DIGEST_PLUGIN_TYPE, -+ .label = "digest", -+ .desc = "Digest plugins", -+ .builtin_num = sizeof_array(digest_plugins), -+ .builtin = digest_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(digest_plugin) -+ }, -+ [REISER4_COMPRESSION_PLUGIN_TYPE] = { -+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .label = "compression", -+ .desc = "Compression plugins", -+ .builtin_num = sizeof_array(compression_plugins), -+ .builtin = compression_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(compression_plugin) -+ }, -+ [REISER4_FORMATTING_PLUGIN_TYPE] = { -+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, -+ .label = "formatting", -+ .desc = "Tail inlining policies", -+ .builtin_num = sizeof_array(formatting_plugins), -+ .builtin = formatting_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(formatting_plugin) -+ }, -+ [REISER4_PERM_PLUGIN_TYPE] = { -+ .type_id = REISER4_PERM_PLUGIN_TYPE, -+ .label = "perm", -+ .desc = "Permission checks", -+ .builtin_num = sizeof_array(perm_plugins), -+ .builtin = perm_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(perm_plugin) -+ }, -+ [REISER4_ITEM_PLUGIN_TYPE] = { -+ .type_id = REISER4_ITEM_PLUGIN_TYPE, -+ .label = "item", -+ .desc = "Item handlers", -+ .builtin_num = sizeof_array(item_plugins), -+ .builtin = item_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(item_plugin) -+ }, -+ [REISER4_NODE_PLUGIN_TYPE] = { -+ .type_id = REISER4_NODE_PLUGIN_TYPE, -+ .label = "node", -+ .desc = "node layout handlers", -+ .builtin_num = sizeof_array(node_plugins), -+ .builtin = node_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(node_plugin) -+ }, -+ [REISER4_SD_EXT_PLUGIN_TYPE] = { -+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE, -+ .label = "sd_ext", -+ .desc = "Parts of stat-data", -+ .builtin_num = sizeof_array(sd_ext_plugins), -+ .builtin = sd_ext_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(sd_ext_plugin) -+ }, -+ [REISER4_FORMAT_PLUGIN_TYPE] = { -+ .type_id = 
REISER4_FORMAT_PLUGIN_TYPE, -+ .label = "disk_layout", -+ .desc = "defines filesystem on disk layout", -+ .builtin_num = sizeof_array(format_plugins), -+ .builtin = format_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(disk_format_plugin) -+ }, -+ [REISER4_JNODE_PLUGIN_TYPE] = { -+ .type_id = REISER4_JNODE_PLUGIN_TYPE, -+ .label = "jnode", -+ .desc = "defines kind of jnode", -+ .builtin_num = sizeof_array(jnode_plugins), -+ .builtin = jnode_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(jnode_plugin) -+ }, -+ [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = { -+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .label = "compression_mode", -+ .desc = "Defines compression mode", -+ .builtin_num = sizeof_array(compression_mode_plugins), -+ .builtin = compression_mode_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(compression_mode_plugin) -+ }, -+ [REISER4_CLUSTER_PLUGIN_TYPE] = { -+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, -+ .label = "cluster", -+ .desc = "Defines cluster size", -+ .builtin_num = sizeof_array(cluster_plugins), -+ .builtin = cluster_plugins, -+ .plugins_list = {NULL, NULL}, -+ .size = sizeof(cluster_plugin) -+ } -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/plugin.h linux-2.6.24/fs/reiser4/plugin/plugin.h ---- linux-2.6.24.orig/fs/reiser4/plugin/plugin.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/plugin.h 2008-01-25 11:39:07.052237570 +0300 -@@ -0,0 +1,937 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Basic plugin data-types. 
-+ see fs/reiser4/plugin/plugin.c for details */ -+ -+#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ ) -+#define __FS_REISER4_PLUGIN_TYPES_H__ -+ -+#include "../forward.h" -+#include "../debug.h" -+#include "../dformat.h" -+#include "../key.h" -+#include "compress/compress.h" -+#include "crypto/cipher.h" -+#include "plugin_header.h" -+#include "item/static_stat.h" -+#include "item/internal.h" -+#include "item/sde.h" -+#include "item/cde.h" -+#include "item/item.h" -+#include "node/node.h" -+#include "node/node40.h" -+#include "security/perm.h" -+#include "fibration.h" -+ -+#include "space/bitmap.h" -+#include "space/space_allocator.h" -+ -+#include "disk_format/disk_format40.h" -+#include "disk_format/disk_format.h" -+ -+#include /* for struct super_block, address_space */ -+#include /* for struct page */ -+#include /* for struct buffer_head */ -+#include /* for struct dentry */ -+#include -+#include -+ -+typedef struct reiser4_object_on_wire reiser4_object_on_wire; -+ -+/* -+ * File plugin. Defines the set of methods that file plugins implement, some -+ * of which are optional. -+ * -+ * A file plugin offers to the caller an interface for IO ( writing to and/or -+ * reading from) to what the caller sees as one sequence of bytes. An IO to it -+ * may affect more than one physical sequence of bytes, or no physical sequence -+ * of bytes, it may affect sequences of bytes offered by other file plugins to -+ * the semantic layer, and the file plugin may invoke other plugins and -+ * delegate work to them, but its interface is structured for offering the -+ * caller the ability to read and/or write what the caller sees as being a -+ * single sequence of bytes. 
-+ * -+ * The file plugin must present a sequence of bytes to the caller, but it does -+ * not necessarily have to store a sequence of bytes, it does not necessarily -+ * have to support efficient tree traversal to any offset in the sequence of -+ * bytes (tail and extent items, whose keys contain offsets, do however provide -+ * efficient non-sequential lookup of any offset in the sequence of bytes). -+ * -+ * Directory plugins provide methods for selecting file plugins by resolving a -+ * name for them. -+ * -+ * The functionality other filesystems call an attribute, and rigidly tie -+ * together, we decompose into orthogonal selectable features of files. Using -+ * the terminology we will define next, an attribute is a perhaps constrained, -+ * perhaps static length, file whose parent has a uni-count-intra-link to it, -+ * which might be grandparent-major-packed, and whose parent has a deletion -+ * method that deletes it. -+ * -+ * File plugins can implement constraints. -+ * -+ * Files can be of variable length (e.g. regular unix files), or of static -+ * length (e.g. static sized attributes). -+ * -+ * An object may have many sequences of bytes, and many file plugins, but, it -+ * has exactly one objectid. It is usually desirable that an object has a -+ * deletion method which deletes every item with that objectid. Items cannot -+ * in general be found by just their objectids. This means that an object must -+ * have either a method built into its deletion plugin method for knowing what -+ * items need to be deleted, or links stored with the object that provide the -+ * plugin with a method for finding those items. Deleting a file within an -+ * object may or may not have the effect of deleting the entire object, -+ * depending on the file plugin's deletion method. -+ * -+ * LINK TAXONOMY: -+ * -+ * Many objects have a reference count, and when the reference count reaches 0 -+ * the object's deletion method is invoked. 
Some links embody a reference -+ * count increase ("countlinks"), and others do not ("nocountlinks"). -+ * -+ * Some links are bi-directional links ("bilinks"), and some are -+ * uni-directional ("unilinks"). -+ * -+ * Some links are between parts of the same object ("intralinks"), and some are -+ * between different objects ("interlinks"). -+ * -+ * PACKING TAXONOMY: -+ * -+ * Some items of an object are stored with a major packing locality based on -+ * their object's objectid (e.g. unix directory items in plan A), and these are -+ * called "self-major-packed". -+ * -+ * Some items of an object are stored with a major packing locality based on -+ * their semantic parent object's objectid (e.g. unix file bodies in plan A), -+ * and these are called "parent-major-packed". -+ * -+ * Some items of an object are stored with a major packing locality based on -+ * their semantic grandparent, and these are called "grandparent-major-packed". -+ * Now carefully notice that we run into trouble with key length if we have to -+ * store an 8 byte major+minor grandparent based packing locality, an 8 byte -+ * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in -+ * a 24 byte key. One of these fields must be sacrificed if an item is to be -+ * grandparent-major-packed, and which to sacrifice is left to the item author -+ * choosing to make the item grandparent-major-packed. You cannot make tail -+ * items and extent items grandparent-major-packed, though you could make them -+ * self-major-packed (usually they are parent-major-packed). -+ * -+ * In the case of ACLs (which are composed of fixed length ACEs which consist -+ * of {subject-type, subject, and permission bitmask} triples), it makes sense -+ * to not have an offset field in the ACE item key, and to allow duplicate keys -+ * for ACEs.
Thus, the set of ACEs for a given file is found by looking for a -+ * key consisting of the objectid of the grandparent (thus grouping all ACLs in -+ * a directory together), the minor packing locality of ACE, the objectid of -+ * the file, and 0. -+ * -+ * IO involves moving data from one location to another, which means that two -+ * locations must be specified, source and destination. -+ * -+ * This source and destination can be in the filesystem, or they can be a -+ * pointer in the user process address space plus a byte count. -+ * -+ * If both source and destination are in the filesystem, then at least one of -+ * them must be representable as a pure stream of bytes (which we call a flow, -+ * and define as a struct containing a key, a data pointer, and a length). -+ * This may mean converting one of them into a flow. We provide a generic -+ * cast_into_flow() method, which will work for any plugin supporting -+ * read_flow(), though it is inefficiently implemented in that it temporarily -+ * stores the flow in a buffer (Question: what to do with huge flows that -+ * cannot fit into memory? Answer: we must not convert them all at once.) -+ * -+ * Performing a write requires resolving the write request into a flow defining -+ * the source, and a method that performs the write, and a key that defines -+ * where in the tree the write is to go. -+ * -+ * Performing a read requires resolving the read request into a flow defining -+ * the target, and a method that performs the read, and a key that defines -+ * where in the tree the read is to come from. -+ * -+ * There will exist file plugins which have no pluginid stored on the disk for -+ * them, and which are only invoked by other plugins. -+ */ -+ -+/* This should be incremented with each new contributed -+ pair (plugin type, plugin id).
-+ NOTE: Make sure there is a release of reiser4progs -+ with the corresponding version number */ -+#define PLUGIN_LIBRARY_VERSION 0 -+ -+ /* enumeration of fields within plugin_set */ -+typedef enum { -+ PSET_FILE, -+ PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements: -+ * inode.c:read_inode() depends on this. */ -+ PSET_PERM, -+ PSET_FORMATTING, -+ PSET_HASH, -+ PSET_FIBRATION, -+ PSET_SD, -+ PSET_DIR_ITEM, -+ PSET_CIPHER, -+ PSET_DIGEST, -+ PSET_COMPRESSION, -+ PSET_COMPRESSION_MODE, -+ PSET_CLUSTER, -+ PSET_CREATE, -+ PSET_LAST -+} pset_member; -+ -+/* builtin file-plugins */ -+typedef enum { -+ /* regular file */ -+ UNIX_FILE_PLUGIN_ID, -+ /* directory */ -+ DIRECTORY_FILE_PLUGIN_ID, -+ /* symlink */ -+ SYMLINK_FILE_PLUGIN_ID, -+ /* for objects completely handled by the VFS: fifos, devices, -+ sockets */ -+ SPECIAL_FILE_PLUGIN_ID, -+ /* regular cryptcompress file */ -+ CRYPTCOMPRESS_FILE_PLUGIN_ID, -+ /* number of file plugins. Used as size of arrays to hold -+ file plugins. */ -+ LAST_FILE_PLUGIN_ID -+} reiser4_file_id; -+ -+typedef struct file_plugin { -+ -+ /* generic fields */ -+ plugin_header h; -+ -+ /* VFS methods. -+ * Must be invariant with respect to plugin conversion. -+ * It can be achieved by using "common" methods, which -+ * are the same for all plugins that take participation in -+ * conversion, or by using "generic" or "careful" methods, -+ * which provide automatic redirection to proper private -+ * plugin methods ("careful" are the same as "generic", -+ * but with protection of pset and other disk structures -+ * from being rebuilt during conversion. -+ */ -+ struct inode_operations * inode_ops; -+ struct file_operations * file_ops; -+ struct address_space_operations * as_ops; -+ /** -+ * Private methods. These are optional. If used they will allow you -+ * to minimize the amount of code needed to implement a deviation -+ * from some other method that also uses them. 
-+ */ -+ /* -+ * private inode_ops -+ */ -+ int (*setattr)(struct dentry *, struct iattr *); -+ /* -+ * private file_ops -+ */ -+ /* do whatever is necessary to do when object is opened */ -+ int (*open) (struct inode * inode, struct file * file); -+ ssize_t (*read) (struct file *, char __user *buf, size_t read_amount, -+ loff_t *off); -+ /* write as much as possible bytes from nominated @write_amount -+ * before plugin scheduling is occurred. Save scheduling state -+ * in @cont */ -+ ssize_t (*write) (struct file *, const char __user *buf, -+ size_t write_amount, loff_t * off, -+ struct psched_context * cont); -+ int (*ioctl) (struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg); -+ int (*mmap) (struct file *, struct vm_area_struct *); -+ int (*release) (struct inode *, struct file *); -+ /* -+ * private a_ops -+ */ -+ int (*readpage) (struct file *file, struct page *page); -+ int (*readpages)(struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages); -+ int (*writepages)(struct address_space *mapping, -+ struct writeback_control *wbc); -+ int (*prepare_write)(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+ int (*commit_write)(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+ sector_t (*bmap) (struct address_space * mapping, sector_t lblock); -+ /* other private methods */ -+ /* save inode cached stat-data onto disk. It was called -+ reiserfs_update_sd() in 3.x */ -+ int (*write_sd_by_inode) (struct inode *); -+ /* -+ * Construct flow into @flow according to user-supplied data. -+ * -+ * This is used by read/write methods to construct a flow to -+ * write/read. ->flow_by_inode() is plugin method, rather than single -+ * global implementation, because key in a flow used by plugin may -+ * depend on data in a @buf. -+ * -+ * NIKITA-FIXME-HANS: please create statistics on what functions are -+ * dereferenced how often for the mongo benchmark. 
You can supervise -+ * Elena doing this for you if that helps. Email me the list of the -+ * top 10, with their counts, and an estimate of the total number of -+ * CPU cycles spent dereferencing as a percentage of CPU cycles spent -+ * processing (non-idle processing). If the total percent is, say, -+ * less than 1%, it will make our coding discussions much easier, and -+ * keep me from questioning whether functions like the below are too -+ * frequently called to be dereferenced. If the total percent is more -+ * than 1%, perhaps private methods should be listed in a "required" -+ * comment at the top of each plugin (with stern language about how if -+ * the comment is missing it will not be accepted by the maintainer), -+ * and implemented using macros not dereferenced functions. How about -+ * replacing this whole private methods part of the struct with a -+ * thorough documentation of what the standard helper functions are for -+ * use in constructing plugins? I think users have been asking for -+ * that, though not in so many words. -+ */ -+ int (*flow_by_inode) (struct inode *, const char __user *buf, -+ int user, loff_t size, -+ loff_t off, rw_op op, flow_t *); -+ /* -+ * Return the key used to retrieve an offset of a file. It is used by -+ * default implementation of ->flow_by_inode() method -+ * (common_build_flow()) and, among other things, to get to the extent -+ * from jnode of unformatted node. -+ */ -+ int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *); -+ -+ /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */ -+ /* -+ * set the plugin for a file. Called during file creation in creat() -+ * but not reiser4() unless an inode already exists for the file. 
-+ */ -+ int (*set_plug_in_inode) (struct inode *inode, struct inode *parent, -+ reiser4_object_create_data *); -+ -+ /* NIKITA-FIXME-HANS: comment and name seem to say different things, -+ * are you setting up the object itself also or just adjusting the -+ * parent?.... */ -+ /* set up plugins for new @object created in @parent. @root is root -+ directory. */ -+ int (*adjust_to_parent) (struct inode *object, struct inode *parent, -+ struct inode *root); -+ /* -+ * this does whatever is necessary to do when object is created. For -+ * instance, for unix files stat data is inserted. It is supposed to be -+ * called by create of struct inode_operations. -+ */ -+ int (*create_object) (struct inode *object, struct inode *parent, -+ reiser4_object_create_data *); -+ /* -+ * this method should check REISER4_NO_SD and set REISER4_NO_SD on -+ * success. Deletion of an object usually includes removal of items -+ * building file body (for directories this is removal of "." and "..") -+ * and removal of stat-data item. -+ */ -+ int (*delete_object) (struct inode *); -+ -+ /* add link from @parent to @object */ -+ int (*add_link) (struct inode *object, struct inode *parent); -+ -+ /* remove link from @parent to @object */ -+ int (*rem_link) (struct inode *object, struct inode *parent); -+ -+ /* -+ * return true if item addressed by @coord belongs to @inode. This is -+ * used by read/write to properly slice flow into items in presence of -+ * multiple key assignment policies, because items of a file are not -+ * necessarily contiguous in a key space, for example, in a plan-b. -+ */ -+ int (*owns_item) (const struct inode *, const coord_t *); -+ -+ /* checks whether yet another hard links to this object can be -+ added */ -+ int (*can_add_link) (const struct inode *); -+ -+ /* checks whether hard links to this object can be removed */ -+ int (*can_rem_link) (const struct inode *); -+ -+ /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. 
It calls -+ detach of directory plugin to remove ".." */ -+ int (*detach) (struct inode * child, struct inode * parent); -+ -+ /* called when @child was just looked up in the @parent. It is not -+ empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of -+ directory plugin */ -+ int (*bind) (struct inode * child, struct inode * parent); -+ -+ /* process safe-link during mount */ -+ int (*safelink) (struct inode * object, reiser4_safe_link_t link, -+ __u64 value); -+ -+ /* The couple of estimate methods for all file operations */ -+ struct { -+ reiser4_block_nr(*create) (const struct inode *); -+ reiser4_block_nr(*update) (const struct inode *); -+ reiser4_block_nr(*unlink) (const struct inode *, -+ const struct inode *); -+ } estimate; -+ -+ /* -+ * reiser4 specific part of inode has a union of structures which are -+ * specific to a plugin. This method is called when inode is read -+ * (read_inode) and when file is created (common_create_child) so that -+ * file plugin could initialize its inode data -+ */ -+ void (*init_inode_data) (struct inode *, reiser4_object_create_data *, -+ int); -+ -+ /* -+ * This method performs progressive deletion of items and whole nodes -+ * from right to left. -+ * -+ * @tap: the point deletion process begins from, -+ * @from_key: the beginning of the deleted key range, -+ * @to_key: the end of the deleted key range, -+ * @smallest_removed: the smallest removed key, -+ * -+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree -+ * operation was interrupted for allowing atom commit . -+ */ -+ int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed, struct inode *, -+ int, int *); -+ -+ /* called from ->destroy_inode() */ -+ void (*destroy_inode) (struct inode *); -+ -+ /* -+ * methods to serialize object identify. This is used, for example, by -+ * reiser4_{en,de}code_fh(). 
-+ */ -+ struct { -+ /* store object's identity at @area */ -+ char *(*write) (struct inode * inode, char *area); -+ /* parse object from wire to the @obj */ -+ char *(*read) (char *area, reiser4_object_on_wire * obj); -+ /* given object identity in @obj, find or create its dentry */ -+ struct dentry *(*get) (struct super_block * s, -+ reiser4_object_on_wire * obj); -+ /* how many bytes ->wire.write() consumes */ -+ int (*size) (struct inode * inode); -+ /* finish with object identify */ -+ void (*done) (reiser4_object_on_wire * obj); -+ } wire; -+} file_plugin; -+ -+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID]; -+ -+struct reiser4_object_on_wire { -+ file_plugin *plugin; -+ union { -+ struct { -+ obj_key_id key_id; -+ } std; -+ void *generic; -+ } u; -+}; -+ -+/* builtin dir-plugins */ -+typedef enum { -+ HASHED_DIR_PLUGIN_ID, -+ SEEKABLE_HASHED_DIR_PLUGIN_ID, -+ LAST_DIR_ID -+} reiser4_dir_id; -+ -+typedef struct dir_plugin { -+ /* generic fields */ -+ plugin_header h; -+ -+ struct inode_operations * inode_ops; -+ struct file_operations * file_ops; -+ struct address_space_operations * as_ops; -+ -+ /* -+ * private methods: These are optional. If used they will allow you to -+ * minimize the amount of code needed to implement a deviation from -+ * some other method that uses them. You could logically argue that -+ * they should be a separate type of plugin. -+ */ -+ -+ struct dentry *(*get_parent) (struct inode * childdir); -+ -+ /* -+ * check whether "name" is acceptable name to be inserted into this -+ * object. Optionally implemented by directory-like objects. 
Can check -+ * for maximal length, reserved symbols etc -+ */ -+ int (*is_name_acceptable) (const struct inode * inode, const char *name, -+ int len); -+ -+ void (*build_entry_key) (const struct inode * dir /* directory where -+ * entry is (or will -+ * be) in.*/ , -+ const struct qstr * name /* name of file -+ * referenced by this -+ * entry */ , -+ reiser4_key * result /* resulting key of -+ * directory entry */ ); -+ int (*build_readdir_key) (struct file * dir, reiser4_key * result); -+ int (*add_entry) (struct inode * object, struct dentry * where, -+ reiser4_object_create_data * data, -+ reiser4_dir_entry_desc * entry); -+ int (*rem_entry) (struct inode * object, struct dentry * where, -+ reiser4_dir_entry_desc * entry); -+ -+ /* -+ * initialize directory structure for newly created object. For normal -+ * unix directories, insert dot and dotdot. -+ */ -+ int (*init) (struct inode * object, struct inode * parent, -+ reiser4_object_create_data * data); -+ -+ /* destroy directory */ -+ int (*done) (struct inode * child); -+ -+ /* called when @subdir was just looked up in the @dir */ -+ int (*attach) (struct inode * subdir, struct inode * dir); -+ int (*detach) (struct inode * subdir, struct inode * dir); -+ -+ struct { -+ reiser4_block_nr(*add_entry) (const struct inode *); -+ reiser4_block_nr(*rem_entry) (const struct inode *); -+ reiser4_block_nr(*unlink) (const struct inode *, -+ const struct inode *); -+ } estimate; -+} dir_plugin; -+ -+extern dir_plugin dir_plugins[LAST_DIR_ID]; -+ -+typedef struct formatting_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* returns non-zero iff file's tail has to be stored -+ in a direct item. 
*/ -+ int (*have_tail) (const struct inode * inode, loff_t size); -+} formatting_plugin; -+ -+typedef struct hash_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* computes hash of the given name */ -+ __u64(*hash) (const unsigned char *name, int len); -+} hash_plugin; -+ -+typedef struct cipher_plugin { -+ /* generic fields */ -+ plugin_header h; -+ struct crypto_blkcipher * (*alloc) (void); -+ void (*free) (struct crypto_blkcipher * tfm); -+ /* Offset translator. For each offset this returns (k * offset), where -+ k (k >= 1) is an expansion factor of the cipher algorithm. -+ For all symmetric algorithms k == 1. For asymmetric algorithms (which -+ inflate data) offset translation guarantees that all disk cluster's -+ units will have keys smaller then next cluster's one. -+ */ -+ loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src); -+ /* Cipher algorithms can accept data only by chunks of cipher block -+ size. This method is to align any flow up to cipher block size when -+ we pass it to cipher algorithm. To align means to append padding of -+ special format specific to the cipher algorithm */ -+ int (*align_stream) (__u8 * tail, int clust_size, int blocksize); -+ /* low-level key manager (check, install, etc..) */ -+ int (*setkey) (struct crypto_tfm * tfm, const __u8 * key, -+ unsigned int keylen); -+ /* main text processing procedures */ -+ void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src); -+ void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src); -+} cipher_plugin; -+ -+typedef struct digest_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* fingerprint size in bytes */ -+ int fipsize; -+ struct crypto_hash * (*alloc) (void); -+ void (*free) (struct crypto_hash * tfm); -+} digest_plugin; -+ -+typedef struct compression_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int (*init) (void); -+ /* the maximum number of bytes the size of the "compressed" data can -+ * exceed the uncompressed data. 
*/ -+ int (*overrun) (unsigned src_len); -+ coa_t(*alloc) (tfm_action act); -+ void (*free) (coa_t coa, tfm_action act); -+ /* minimal size of the flow we still try to compress */ -+ int (*min_size_deflate) (void); -+ __u32(*checksum) (char *data, __u32 length); -+ /* main transform procedures */ -+ void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len); -+ void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len, -+ __u8 * dst_first, unsigned *dst_len); -+} compression_plugin; -+ -+typedef struct compression_mode_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* this is called when estimating compressibility -+ of a logical cluster by its content */ -+ int (*should_deflate) (struct inode * inode, cloff_t index); -+ /* this is called when results of compression should be saved */ -+ int (*accept_hook) (struct inode * inode, cloff_t index); -+ /* this is called when results of compression should be discarded */ -+ int (*discard_hook) (struct inode * inode, cloff_t index); -+} compression_mode_plugin; -+ -+typedef struct cluster_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int shift; -+} cluster_plugin; -+ -+typedef struct sd_ext_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int (*present) (struct inode * inode, char **area, int *len); -+ int (*absent) (struct inode * inode); -+ int (*save_len) (struct inode * inode); -+ int (*save) (struct inode * inode, char **area); -+ /* alignment requirement for this stat-data part */ -+ int alignment; -+} sd_ext_plugin; -+ -+/* this plugin contains methods to allocate objectid for newly created files, -+ to deallocate objectid when file gets removed, to report number of used and -+ free objectids */ -+typedef struct oid_allocator_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files, -+ __u64 oids); -+ /* used to report statfs->f_files */ -+ __u64(*oids_used) 
(reiser4_oid_allocator * map); -+ /* get next oid to use */ -+ __u64(*next_oid) (reiser4_oid_allocator * map); -+ /* used to report statfs->f_ffree */ -+ __u64(*oids_free) (reiser4_oid_allocator * map); -+ /* allocate new objectid */ -+ int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *); -+ /* release objectid */ -+ int (*release_oid) (reiser4_oid_allocator * map, oid_t); -+ /* how many pages to reserve in transaction for allocation of new -+ objectid */ -+ int (*oid_reserve_allocate) (reiser4_oid_allocator * map); -+ /* how many pages to reserve in transaction for freeing of an -+ objectid */ -+ int (*oid_reserve_release) (reiser4_oid_allocator * map); -+ void (*print_info) (const char *, reiser4_oid_allocator *); -+} oid_allocator_plugin; -+ -+/* disk layout plugin: this specifies super block, journal, bitmap (if there -+ are any) locations, etc */ -+typedef struct disk_format_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* replay journal, initialize super_info_data, etc */ -+ int (*init_format) (struct super_block *, void *data); -+ -+ /* key of root directory stat data */ -+ const reiser4_key *(*root_dir_key) (const struct super_block *); -+ -+ int (*release) (struct super_block *); -+ jnode *(*log_super) (struct super_block *); -+ int (*check_open) (const struct inode * object); -+ int (*version_update) (struct super_block *); -+} disk_format_plugin; -+ -+struct jnode_plugin { -+ /* generic fields */ -+ plugin_header h; -+ int (*init) (jnode * node); -+ int (*parse) (jnode * node); -+ struct address_space *(*mapping) (const jnode * node); -+ unsigned long (*index) (const jnode * node); -+ jnode *(*clone) (jnode * node); -+}; -+ -+/* plugin instance. */ -+/* */ -+/* This is "wrapper" union for all types of plugins. Most of the code uses */ -+/* plugins of particular type (file_plugin, dir_plugin, etc.) rather than */ -+/* operates with pointers to reiser4_plugin. 
This union is only used in */ -+/* some generic code in plugin/plugin.c that operates on all */ -+/* plugins. Technically speaking purpose of this union is to add type */ -+/* safety to said generic code: each plugin type (file_plugin, for */ -+/* example), contains plugin_header as its first memeber. This first member */ -+/* is located at the same place in memory as .h member of */ -+/* reiser4_plugin. Generic code, obtains pointer to reiser4_plugin and */ -+/* looks in the .h which is header of plugin type located in union. This */ -+/* allows to avoid type-casts. */ -+union reiser4_plugin { -+ /* generic fields */ -+ plugin_header h; -+ /* file plugin */ -+ file_plugin file; -+ /* directory plugin */ -+ dir_plugin dir; -+ /* hash plugin, used by directory plugin */ -+ hash_plugin hash; -+ /* fibration plugin used by directory plugin */ -+ fibration_plugin fibration; -+ /* cipher transform plugin, used by file plugin */ -+ cipher_plugin cipher; -+ /* digest transform plugin, used by file plugin */ -+ digest_plugin digest; -+ /* compression transform plugin, used by file plugin */ -+ compression_plugin compression; -+ /* tail plugin, used by file plugin */ -+ formatting_plugin formatting; -+ /* permission plugin */ -+ perm_plugin perm; -+ /* node plugin */ -+ node_plugin node; -+ /* item plugin */ -+ item_plugin item; -+ /* stat-data extension plugin */ -+ sd_ext_plugin sd_ext; -+ /* disk layout plugin */ -+ disk_format_plugin format; -+ /* object id allocator plugin */ -+ oid_allocator_plugin oid_allocator; -+ /* plugin for different jnode types */ -+ jnode_plugin jnode; -+ /* compression mode plugin, used by object plugin */ -+ compression_mode_plugin compression_mode; -+ /* cluster plugin, used by object plugin */ -+ cluster_plugin clust; -+ /* place-holder for new plugin types that can be registered -+ dynamically, and used by other dynamically loaded plugins. 
*/ -+ void *generic; -+}; -+ -+struct reiser4_plugin_ops { -+ /* called when plugin is initialized */ -+ int (*init) (reiser4_plugin * plugin); -+ /* called when plugin is unloaded */ -+ int (*done) (reiser4_plugin * plugin); -+ /* load given plugin from disk */ -+ int (*load) (struct inode * inode, -+ reiser4_plugin * plugin, char **area, int *len); -+ /* how many space is required to store this plugin's state -+ in stat-data */ -+ int (*save_len) (struct inode * inode, reiser4_plugin * plugin); -+ /* save persistent plugin-data to disk */ -+ int (*save) (struct inode * inode, reiser4_plugin * plugin, -+ char **area); -+ /* alignment requirement for on-disk state of this plugin -+ in number of bytes */ -+ int alignment; -+ /* install itself into given inode. This can return error -+ (e.g., you cannot change hash of non-empty directory). */ -+ int (*change) (struct inode * inode, reiser4_plugin * plugin, -+ pset_member memb); -+ /* install itself into given inode. This can return error -+ (e.g., you cannot change hash of non-empty directory). 
*/ -+ int (*inherit) (struct inode * inode, struct inode * parent, -+ reiser4_plugin * plugin); -+}; -+ -+/* functions implemented in fs/reiser4/plugin/plugin.c */ -+ -+/* stores plugin reference in reiser4-specific part of inode */ -+extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id); -+extern int init_plugins(void); -+ -+/* builtin plugins */ -+ -+/* builtin hash-plugins */ -+ -+typedef enum { -+ RUPASOV_HASH_ID, -+ R5_HASH_ID, -+ TEA_HASH_ID, -+ FNV1_HASH_ID, -+ DEGENERATE_HASH_ID, -+ LAST_HASH_ID -+} reiser4_hash_id; -+ -+/* builtin cipher plugins */ -+ -+typedef enum { -+ NONE_CIPHER_ID, -+ LAST_CIPHER_ID -+} reiser4_cipher_id; -+ -+/* builtin digest plugins */ -+ -+typedef enum { -+ SHA256_32_DIGEST_ID, -+ LAST_DIGEST_ID -+} reiser4_digest_id; -+ -+/* builtin compression mode plugins */ -+typedef enum { -+ NONE_COMPRESSION_MODE_ID, -+ LATTD_COMPRESSION_MODE_ID, -+ ULTIM_COMPRESSION_MODE_ID, -+ FORCE_COMPRESSION_MODE_ID, -+ CONVX_COMPRESSION_MODE_ID, -+ LAST_COMPRESSION_MODE_ID -+} reiser4_compression_mode_id; -+ -+/* builtin cluster plugins */ -+typedef enum { -+ CLUSTER_64K_ID, -+ CLUSTER_32K_ID, -+ CLUSTER_16K_ID, -+ CLUSTER_8K_ID, -+ CLUSTER_4K_ID, -+ LAST_CLUSTER_ID -+} reiser4_cluster_id; -+ -+/* builtin tail-plugins */ -+ -+typedef enum { -+ NEVER_TAILS_FORMATTING_ID, -+ ALWAYS_TAILS_FORMATTING_ID, -+ SMALL_FILE_FORMATTING_ID, -+ LAST_TAIL_FORMATTING_ID -+} reiser4_formatting_id; -+ -+/* data type used to pack parameters that we pass to vfs object creation -+ function create_object() */ -+struct reiser4_object_create_data { -+ /* plugin to control created object */ -+ reiser4_file_id id; -+ /* mode of regular file, directory or special file */ -+/* what happens if some other sort of perm plugin is in use? */ -+ int mode; -+ /* rdev of special file */ -+ dev_t rdev; -+ /* symlink target */ -+ const char *name; -+ /* add here something for non-standard objects you invent, like -+ query for interpolation file etc. 
*/ -+ -+ struct reiser4_crypto_info * crypto; -+ -+ struct inode *parent; -+ struct dentry *dentry; -+}; -+ -+/* description of directory entry being created/destroyed/sought for -+ -+ It is passed down to the directory plugin and farther to the -+ directory item plugin methods. Creation of new directory is done in -+ several stages: first we search for an entry with the same name, then -+ create new one. reiser4_dir_entry_desc is used to store some information -+ collected at some stage of this process and required later: key of -+ item that we want to insert/delete and pointer to an object that will -+ be bound by the new directory entry. Probably some more fields will -+ be added there. -+ -+*/ -+struct reiser4_dir_entry_desc { -+ /* key of directory entry */ -+ reiser4_key key; -+ /* object bound by this entry. */ -+ struct inode *obj; -+}; -+ -+#define MAX_PLUGIN_TYPE_LABEL_LEN 32 -+#define MAX_PLUGIN_PLUG_LABEL_LEN 32 -+ -+#define PLUGIN_BY_ID(TYPE,ID,FIELD) \ -+static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \ -+{ \ -+ reiser4_plugin *plugin = plugin_by_id ( ID, id ); \ -+ return plugin ? & plugin -> FIELD : NULL; \ -+} \ -+static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \ -+{ \ -+ reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \ -+ return plugin ? & plugin -> FIELD : NULL; \ -+} \ -+static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \ -+{ \ -+ reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \ -+ return plugin ? 
& plugin -> FIELD : NULL; \ -+} \ -+static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \ -+{ \ -+ return ( reiser4_plugin * ) plugin; \ -+} \ -+static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \ -+{ \ -+ return TYPE ## _to_plugin (plugin) -> h.id; \ -+} \ -+typedef struct { int foo; } TYPE ## _plugin_dummy -+ -+PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item); -+PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file); -+PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir); -+PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node); -+PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext); -+PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm); -+PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash); -+PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration); -+PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher); -+PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest); -+PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression); -+PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting); -+PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format); -+PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode); -+PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ compression_mode); -+PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust); -+ -+extern int save_plugin_id(reiser4_plugin * plugin, d16 * area); -+ -+extern struct list_head *get_plugin_list(reiser4_plugin_type type_id); -+ -+#define for_all_plugins(ptype, plugin) \ -+for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \ -+ get_plugin_list(ptype) != &plugin->h.linkage; \ -+ plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage)) -+ -+ -+extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, pset_member memb); -+extern int force_plugin_pset(struct inode *self, 
pset_member memb, reiser4_plugin *plug); -+extern int finish_pset(struct inode *inode); -+ -+/* defined in fs/reiser4/plugin/object.c */ -+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID]; -+/* defined in fs/reiser4/plugin/object.c */ -+extern dir_plugin dir_plugins[LAST_DIR_ID]; -+/* defined in fs/reiser4/plugin/item/static_stat.c */ -+extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION]; -+/* defined in fs/reiser4/plugin/hash.c */ -+extern hash_plugin hash_plugins[LAST_HASH_ID]; -+/* defined in fs/reiser4/plugin/fibration.c */ -+extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID]; -+/* defined in fs/reiser4/plugin/crypt.c */ -+extern cipher_plugin cipher_plugins[LAST_CIPHER_ID]; -+/* defined in fs/reiser4/plugin/digest.c */ -+extern digest_plugin digest_plugins[LAST_DIGEST_ID]; -+/* defined in fs/reiser4/plugin/compress/compress.c */ -+extern compression_plugin compression_plugins[LAST_COMPRESSION_ID]; -+/* defined in fs/reiser4/plugin/compress/compression_mode.c */ -+extern compression_mode_plugin -+compression_mode_plugins[LAST_COMPRESSION_MODE_ID]; -+/* defined in fs/reiser4/plugin/cluster.c */ -+extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID]; -+/* defined in fs/reiser4/plugin/tail.c */ -+extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID]; -+/* defined in fs/reiser4/plugin/security/security.c */ -+extern perm_plugin perm_plugins[LAST_PERM_ID]; -+/* defined in fs/reiser4/plugin/item/item.c */ -+extern item_plugin item_plugins[LAST_ITEM_ID]; -+/* defined in fs/reiser4/plugin/node/node.c */ -+extern node_plugin node_plugins[LAST_NODE_ID]; -+/* defined in fs/reiser4/plugin/disk_format/disk_format.c */ -+extern disk_format_plugin format_plugins[LAST_FORMAT_ID]; -+ -+/* __FS_REISER4_PLUGIN_TYPES_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.24/fs/reiser4/plugin/plugin_header.h ---- linux-2.6.24.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/plugin_header.h 2008-01-25 11:39:07.052237570 +0300 -@@ -0,0 +1,155 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* plugin header. Data structures required by all plugin types. */ -+ -+#if !defined( __PLUGIN_HEADER_H__ ) -+#define __PLUGIN_HEADER_H__ -+ -+/* plugin data-types and constants */ -+ -+#include "../debug.h" -+#include "../dformat.h" -+ -+/* Every plugin type can be considered as a class of virtual objects -+ {(type, i) | i = 0, 1, ...}, which has one the following categories -+ of virtualization: -+ A - no virtualization; -+ F - per-file virtualization; -+ S - per-superblock virtualization; -+ FIXME-EDWARD: Define every such category */ -+ -+/* Supported plugin types: (id, (virtualization category), short description) */ -+typedef enum { -+ REISER4_FILE_PLUGIN_TYPE, /* (F) service VFS enry-points */ -+ REISER4_DIR_PLUGIN_TYPE, /* (F) service VFS enry-points */ -+ REISER4_ITEM_PLUGIN_TYPE, /* (F) manage items */ -+ REISER4_NODE_PLUGIN_TYPE, /* (S) manage formatted nodes */ -+ REISER4_HASH_PLUGIN_TYPE, /* (F) compute hash */ -+ REISER4_FIBRATION_PLUGIN_TYPE, /* (F) directory fibrations */ -+ REISER4_FORMATTING_PLUGIN_TYPE, /* (F) tail-packing policy */ -+ REISER4_PERM_PLUGIN_TYPE, /* stub (vacancy) */ -+ REISER4_SD_EXT_PLUGIN_TYPE, /* (A) stat-data extensions */ -+ REISER4_FORMAT_PLUGIN_TYPE, /* (S) specify disk format */ -+ REISER4_JNODE_PLUGIN_TYPE, /* (A) in-memory node headers */ -+ REISER4_CIPHER_PLUGIN_TYPE, /* (F) cipher transform algs */ -+ REISER4_DIGEST_PLUGIN_TYPE, /* (F) digest transform algs */ -+ 
REISER4_COMPRESSION_PLUGIN_TYPE, /* (F) compression tfm algs */ -+ REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* (F) compression heuristic */ -+ REISER4_CLUSTER_PLUGIN_TYPE, /* (F) size of logical cluster */ -+ REISER4_PLUGIN_TYPES -+} reiser4_plugin_type; -+ -+/* Supported plugin groups */ -+typedef enum { -+ REISER4_DIRECTORY_FILE, -+ REISER4_REGULAR_FILE, -+ REISER4_SYMLINK_FILE, -+ REISER4_SPECIAL_FILE, -+} file_plugin_group; -+ -+struct reiser4_plugin_ops; -+/* generic plugin operations, supported by each -+ plugin type. */ -+typedef struct reiser4_plugin_ops reiser4_plugin_ops; -+ -+/* the common part of all plugin instances. */ -+typedef struct plugin_header { -+ /* plugin type */ -+ reiser4_plugin_type type_id; -+ /* id of this plugin */ -+ reiser4_plugin_id id; -+ /* bitmask of groups the plugin belongs to. */ -+ reiser4_plugin_groups groups; -+ /* plugin operations */ -+ reiser4_plugin_ops *pops; -+/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */ -+ /* short label of this plugin */ -+ const char *label; -+ /* descriptive string.. */ -+ const char *desc; -+ /* list linkage */ -+ struct list_head linkage; -+} plugin_header; -+ -+#define plugin_of_group(plug, group) (plug->h.groups & (1 << group)) -+ -+/* PRIVATE INTERFACES */ -+/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */ -+/* plugin type representation. */ -+struct reiser4_plugin_type_data { -+ /* internal plugin type identifier. Should coincide with -+ index of this item in plugins[] array. */ -+ reiser4_plugin_type type_id; -+ /* short symbolic label of this plugin type. Should be no longer -+ than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. 
*/ -+ const char *label; -+ /* plugin type description longer than .label */ -+ const char *desc; -+ -+/* NIKITA-FIXME-HANS: define built-in */ -+ /* number of built-in plugin instances of this type */ -+ int builtin_num; -+ /* array of built-in plugins */ -+ void *builtin; -+ struct list_head plugins_list; -+ size_t size; -+}; -+ -+extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES]; -+ -+int is_plugin_type_valid(reiser4_plugin_type type); -+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id); -+ -+static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data * ptype, -+ int i) -+{ -+ char *builtin; -+ -+ builtin = ptype->builtin; -+ return (reiser4_plugin *) (builtin + i * ptype->size); -+} -+ -+/* return plugin by its @type_id and @id */ -+static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type, -+ reiser4_plugin_id id) -+{ -+ assert("nikita-1651", is_plugin_type_valid(type)); -+ assert("nikita-1652", is_plugin_id_valid(type, id)); -+ return plugin_at(&plugins[type], id); -+} -+ -+extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id, -+ reiser4_plugin_id id); -+ -+/** -+ * plugin_by_disk_id - get reiser4_plugin -+ * @type_id: plugin type id -+ * @did: plugin id in disk format -+ * -+ * Returns reiser4_plugin by plugin type id an dplugin_id. -+ */ -+static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG, -+ reiser4_plugin_type type_id, -+ __le16 *plugin_id) -+{ -+ /* -+ * what we should do properly is to maintain within each file-system a -+ * dictionary that maps on-disk plugin ids to "universal" ids. This -+ * dictionary will be resolved on mount time, so that this function -+ * will perform just one additional array lookup. 
-+ */ -+ return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id)); -+} -+ -+/* __PLUGIN_HEADER_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.24/fs/reiser4/plugin/plugin_set.c ---- linux-2.6.24.orig/fs/reiser4/plugin/plugin_set.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/plugin_set.c 2008-01-25 11:39:07.052237570 +0300 -@@ -0,0 +1,379 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+/* This file contains Reiser4 plugin set operations */ -+ -+/* plugin sets -+ * -+ * Each file in reiser4 is controlled by a whole set of plugins (file plugin, -+ * directory plugin, hash plugin, tail policy plugin, security plugin, etc.) -+ * assigned (inherited, deduced from mode bits, etc.) at creation time. This -+ * set of plugins (so called pset) is described by structure plugin_set (see -+ * plugin/plugin_set.h), which contains pointers to all required plugins. -+ * -+ * Children can inherit some pset members from their parent, however sometimes -+ * it is useful to specify members different from parent ones. Since object's -+ * pset can not be easily changed without fatal consequences, we use for this -+ * purpose another special plugin table (so called hset, or heir set) described -+ * by the same structure. -+ * -+ * Inode only stores pointers to pset and hset. Different inodes with the -+ * same set of pset (hset) members point to the same pset (hset). This is -+ * achieved by storing psets and hsets in a global hash table. Races are avoided -+ * by the simple (and efficient so far) solution of never recycling psets, even -+ * when last inode pointing to it is destroyed.
-+ */ -+ -+#include "../debug.h" -+#include "../super.h" -+#include "plugin_set.h" -+ -+#include -+#include -+ -+/* slab for plugin sets */ -+static struct kmem_cache *plugin_set_slab; -+ -+static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = { -+ [0 ... 7] = SPIN_LOCK_UNLOCKED -+}; -+ -+/* hash table support */ -+ -+#define PS_TABLE_SIZE (32) -+ -+static inline plugin_set *cast_to(const unsigned long *a) -+{ -+ return container_of(a, plugin_set, hashval); -+} -+ -+static inline int pseq(const unsigned long *a1, const unsigned long *a2) -+{ -+ plugin_set *set1; -+ plugin_set *set2; -+ -+ /* make sure fields are not missed in the code below */ -+ cassert(sizeof *set1 == -+ sizeof set1->hashval + -+ sizeof set1->link + -+ sizeof set1->file + -+ sizeof set1->dir + -+ sizeof set1->perm + -+ sizeof set1->formatting + -+ sizeof set1->hash + -+ sizeof set1->fibration + -+ sizeof set1->sd + -+ sizeof set1->dir_item + -+ sizeof set1->cipher + -+ sizeof set1->digest + -+ sizeof set1->compression + -+ sizeof set1->compression_mode + -+ sizeof set1->cluster + -+ sizeof set1->create); -+ -+ set1 = cast_to(a1); -+ set2 = cast_to(a2); -+ return -+ set1->hashval == set2->hashval && -+ set1->file == set2->file && -+ set1->dir == set2->dir && -+ set1->perm == set2->perm && -+ set1->formatting == set2->formatting && -+ set1->hash == set2->hash && -+ set1->fibration == set2->fibration && -+ set1->sd == set2->sd && -+ set1->dir_item == set2->dir_item && -+ set1->cipher == set2->cipher && -+ set1->digest == set2->digest && -+ set1->compression == set2->compression && -+ set1->compression_mode == set2->compression_mode && -+ set1->cluster == set2->cluster && -+ set1->create == set2->create; -+} -+ -+#define HASH_FIELD(hash, set, field) \ -+({ \ -+ (hash) += (unsigned long)(set)->field >> 2; \ -+}) -+ -+static inline unsigned long calculate_hash(const plugin_set * set) -+{ -+ unsigned long result; -+ -+ result = 0; -+ HASH_FIELD(result, set, file); -+ HASH_FIELD(result, set, 
dir); -+ HASH_FIELD(result, set, perm); -+ HASH_FIELD(result, set, formatting); -+ HASH_FIELD(result, set, hash); -+ HASH_FIELD(result, set, fibration); -+ HASH_FIELD(result, set, sd); -+ HASH_FIELD(result, set, dir_item); -+ HASH_FIELD(result, set, cipher); -+ HASH_FIELD(result, set, digest); -+ HASH_FIELD(result, set, compression); -+ HASH_FIELD(result, set, compression_mode); -+ HASH_FIELD(result, set, cluster); -+ HASH_FIELD(result, set, create); -+ return result & (PS_TABLE_SIZE - 1); -+} -+ -+static inline unsigned long -+pshash(ps_hash_table * table, const unsigned long *a) -+{ -+ return *a; -+} -+ -+/* The hash table definition */ -+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get()) -+#define KFREE(ptr, size) kfree(ptr) -+TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash, -+ pseq); -+#undef KFREE -+#undef KMALLOC -+ -+static ps_hash_table ps_table; -+static plugin_set empty_set = { -+ .hashval = 0, -+ .file = NULL, -+ .dir = NULL, -+ .perm = NULL, -+ .formatting = NULL, -+ .hash = NULL, -+ .fibration = NULL, -+ .sd = NULL, -+ .dir_item = NULL, -+ .cipher = NULL, -+ .digest = NULL, -+ .compression = NULL, -+ .compression_mode = NULL, -+ .cluster = NULL, -+ .create = NULL, -+ .link = {NULL} -+}; -+ -+plugin_set *plugin_set_get_empty(void) -+{ -+ return &empty_set; -+} -+ -+void plugin_set_put(plugin_set * set) -+{ -+} -+ -+static inline unsigned long *pset_field(plugin_set * set, int offset) -+{ -+ return (unsigned long *)(((char *)set) + offset); -+} -+ -+static int plugin_set_field(plugin_set ** set, const unsigned long val, -+ const int offset) -+{ -+ unsigned long *spot; -+ spinlock_t *lock; -+ plugin_set replica; -+ plugin_set *twin; -+ plugin_set *psal; -+ plugin_set *orig; -+ -+ assert("nikita-2902", set != NULL); -+ assert("nikita-2904", *set != NULL); -+ -+ spot = pset_field(*set, offset); -+ if (unlikely(*spot == val)) -+ return 0; -+ -+ replica = *(orig = *set); -+ *pset_field(&replica, offset) = val; -+ 
replica.hashval = calculate_hash(&replica); -+ rcu_read_lock(); -+ twin = ps_hash_find(&ps_table, &replica.hashval); -+ if (unlikely(twin == NULL)) { -+ rcu_read_unlock(); -+ psal = kmem_cache_alloc(plugin_set_slab, -+ reiser4_ctx_gfp_mask_get()); -+ if (psal == NULL) -+ return RETERR(-ENOMEM); -+ *psal = replica; -+ lock = &plugin_set_lock[replica.hashval & 7]; -+ spin_lock(lock); -+ twin = ps_hash_find(&ps_table, &replica.hashval); -+ if (likely(twin == NULL)) { -+ *set = psal; -+ ps_hash_insert_rcu(&ps_table, psal); -+ } else { -+ *set = twin; -+ kmem_cache_free(plugin_set_slab, psal); -+ } -+ spin_unlock(lock); -+ } else { -+ rcu_read_unlock(); -+ *set = twin; -+ } -+ return 0; -+} -+ -+static struct { -+ int offset; -+ reiser4_plugin_groups groups; -+ reiser4_plugin_type type; -+} pset_descr[PSET_LAST] = { -+ [PSET_FILE] = { -+ .offset = offsetof(plugin_set, file), -+ .type = REISER4_FILE_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_DIR] = { -+ .offset = offsetof(plugin_set, dir), -+ .type = REISER4_DIR_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_PERM] = { -+ .offset = offsetof(plugin_set, perm), -+ .type = REISER4_PERM_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_FORMATTING] = { -+ .offset = offsetof(plugin_set, formatting), -+ .type = REISER4_FORMATTING_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_HASH] = { -+ .offset = offsetof(plugin_set, hash), -+ .type = REISER4_HASH_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_FIBRATION] = { -+ .offset = offsetof(plugin_set, fibration), -+ .type = REISER4_FIBRATION_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_SD] = { -+ .offset = offsetof(plugin_set, sd), -+ .type = REISER4_ITEM_PLUGIN_TYPE, -+ .groups = (1 << STAT_DATA_ITEM_TYPE) -+ }, -+ [PSET_DIR_ITEM] = { -+ .offset = offsetof(plugin_set, dir_item), -+ .type = REISER4_ITEM_PLUGIN_TYPE, -+ .groups = (1 << DIR_ENTRY_ITEM_TYPE) -+ }, -+ [PSET_CIPHER] = { -+ .offset = offsetof(plugin_set, cipher), -+ .type = REISER4_CIPHER_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_DIGEST] = { -+ 
.offset = offsetof(plugin_set, digest), -+ .type = REISER4_DIGEST_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_COMPRESSION] = { -+ .offset = offsetof(plugin_set, compression), -+ .type = REISER4_COMPRESSION_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_COMPRESSION_MODE] = { -+ .offset = offsetof(plugin_set, compression_mode), -+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_CLUSTER] = { -+ .offset = offsetof(plugin_set, cluster), -+ .type = REISER4_CLUSTER_PLUGIN_TYPE, -+ .groups = 0 -+ }, -+ [PSET_CREATE] = { -+ .offset = offsetof(plugin_set, create), -+ .type = REISER4_FILE_PLUGIN_TYPE, -+ .groups = (1 << REISER4_REGULAR_FILE) -+ } -+}; -+ -+#define DEFINE_PSET_OPS(PREFIX) \ -+ reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \ -+{ \ -+ if (memb > PSET_LAST) \ -+ return REISER4_PLUGIN_TYPES; \ -+ return pset_descr[memb].type; \ -+} \ -+ \ -+int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \ -+ reiser4_plugin * plugin) \ -+{ \ -+ assert("nikita-3492", set != NULL); \ -+ assert("nikita-3493", *set != NULL); \ -+ assert("nikita-3494", plugin != NULL); \ -+ assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \ -+ assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \ -+ \ -+ if (pset_descr[memb].groups) \ -+ if (!(pset_descr[memb].groups & plugin->h.groups)) \ -+ return -EINVAL; \ -+ \ -+ return plugin_set_field(set, \ -+ (unsigned long)plugin, pset_descr[memb].offset); \ -+} \ -+ \ -+reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \ -+{ \ -+ assert("nikita-3497", set != NULL); \ -+ assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \ -+ \ -+ return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \ -+} -+ -+DEFINE_PSET_OPS(aset); -+ -+int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) { -+ return plugin_set_field(set, -+ (unsigned long)plugin, pset_descr[memb].offset); -+} -+ -+/** -+ * init_plugin_set - create plugin set cache and 
hash table -+ * -+ * Initializes slab cache of plugin_set-s and their hash table. It is part of -+ * reiser4 module initialization. -+ */ -+int init_plugin_set(void) -+{ -+ int result; -+ -+ result = ps_hash_init(&ps_table, PS_TABLE_SIZE); -+ if (result == 0) { -+ plugin_set_slab = kmem_cache_create("plugin_set", -+ sizeof(plugin_set), 0, -+ SLAB_HWCACHE_ALIGN, -+ NULL); -+ if (plugin_set_slab == NULL) -+ result = RETERR(-ENOMEM); -+ } -+ return result; -+} -+ -+/** -+ * done_plugin_set - delete plugin_set cache and plugin_set hash table -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void done_plugin_set(void) -+{ -+ plugin_set *cur, *next; -+ -+ for_all_in_htable(&ps_table, ps, cur, next) { -+ ps_hash_remove(&ps_table, cur); -+ kmem_cache_free(plugin_set_slab, cur); -+ } -+ destroy_reiser4_cache(&plugin_set_slab); -+ ps_hash_done(&ps_table); -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.24/fs/reiser4/plugin/plugin_set.h ---- linux-2.6.24.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/plugin_set.h 2008-01-25 11:39:07.056238601 +0300 -@@ -0,0 +1,77 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Reiser4 plugin set definition. 
-+ See fs/reiser4/plugin/plugin_set.c for details */ -+ -+#if !defined( __PLUGIN_SET_H__ ) -+#define __PLUGIN_SET_H__ -+ -+#include "../type_safe_hash.h" -+#include "plugin.h" -+ -+#include -+ -+struct plugin_set; -+typedef struct plugin_set plugin_set; -+ -+TYPE_SAFE_HASH_DECLARE(ps, plugin_set); -+ -+struct plugin_set { -+ unsigned long hashval; -+ /* plugin of file */ -+ file_plugin *file; -+ /* plugin of dir */ -+ dir_plugin *dir; -+ /* perm plugin for this file */ -+ perm_plugin *perm; -+ /* tail policy plugin. Only meaningful for regular files */ -+ formatting_plugin *formatting; -+ /* hash plugin. Only meaningful for directories. */ -+ hash_plugin *hash; -+ /* fibration plugin. Only meaningful for directories. */ -+ fibration_plugin *fibration; -+ /* plugin of stat-data */ -+ item_plugin *sd; -+ /* plugin of items a directory is built of */ -+ item_plugin *dir_item; -+ /* cipher plugin */ -+ cipher_plugin *cipher; -+ /* digest plugin */ -+ digest_plugin *digest; -+ /* compression plugin */ -+ compression_plugin *compression; -+ /* compression mode plugin */ -+ compression_mode_plugin *compression_mode; -+ /* cluster plugin */ -+ cluster_plugin *cluster; -+ /* this specifies file plugin of regular children. -+ only meaningful for directories */ -+ file_plugin *create; -+ ps_hash_link link; -+}; -+ -+extern plugin_set *plugin_set_get_empty(void); -+extern void plugin_set_put(plugin_set * set); -+ -+extern int init_plugin_set(void); -+extern void done_plugin_set(void); -+ -+extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb); -+extern int set_plugin(plugin_set ** set, pset_member memb, -+ reiser4_plugin * plugin); -+extern int aset_set_unsafe(plugin_set ** set, pset_member memb, -+ reiser4_plugin * plugin); -+extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb); -+ -+/* __PLUGIN_SET_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/security/Makefile linux-2.6.24/fs/reiser4/plugin/security/Makefile ---- linux-2.6.24.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/security/Makefile 2008-01-25 11:39:07.056238601 +0300 -@@ -0,0 +1,4 @@ -+obj-$(CONFIG_REISER4_FS) += security_plugins.o -+ -+security_plugins-objs := \ -+ perm.o -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/security/perm.c linux-2.6.24/fs/reiser4/plugin/security/perm.c ---- linux-2.6.24.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/security/perm.c 2008-01-25 11:39:07.056238601 +0300 -@@ -0,0 +1,33 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* -+ * This file contains implementation of permission plugins. 
-+ * See the comments in perm.h -+ */ -+ -+#include "../plugin.h" -+#include "../plugin_header.h" -+#include "../../debug.h" -+ -+perm_plugin perm_plugins[LAST_PERM_ID] = { -+ [NULL_PERM_ID] = { -+ .h = { -+ .type_id = REISER4_PERM_PLUGIN_TYPE, -+ .id = NULL_PERM_ID, -+ .pops = NULL, -+ .label = "null", -+ .desc = "stub permission plugin", -+ .linkage = {NULL, NULL} -+ } -+ } -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/security/perm.h linux-2.6.24/fs/reiser4/plugin/security/perm.h ---- linux-2.6.24.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/security/perm.h 2008-01-25 11:39:07.060239631 +0300 -@@ -0,0 +1,38 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Perm (short for "permissions") plugins common stuff. */ -+ -+#if !defined( __REISER4_PERM_H__ ) -+#define __REISER4_PERM_H__ -+ -+#include "../../forward.h" -+#include "../plugin_header.h" -+ -+#include -+ -+/* Definition of permission plugin */ -+/* NIKITA-FIXME-HANS: define what this is targeted for. -+ It does not seem to be intended for use with sys_reiser4. Explain. */ -+ -+/* NOTE-EDWARD: This seems to be intended for deprecated sys_reiser4. -+ Consider it like a temporary "seam" and reserved pset member. -+ If you have something usefull to add, then rename this plugin and add here */ -+typedef struct perm_plugin { -+ /* generic plugin fields */ -+ plugin_header h; -+} perm_plugin; -+ -+typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id; -+ -+/* __REISER4_PERM_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.24/fs/reiser4/plugin/space/bitmap.c ---- linux-2.6.24.orig/fs/reiser4/plugin/space/bitmap.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/space/bitmap.c 2008-01-25 11:39:07.064240661 +0300 -@@ -0,0 +1,1585 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#include "../../debug.h" -+#include "../../dformat.h" -+#include "../../txnmgr.h" -+#include "../../jnode.h" -+#include "../../block_alloc.h" -+#include "../../tree.h" -+#include "../../super.h" -+#include "../plugin.h" -+#include "space_allocator.h" -+#include "bitmap.h" -+ -+#include -+#include /* for struct super_block */ -+#include -+#include -+ -+/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap -+ * blocks -+ -+ A useful optimization of reiser4 bitmap handling would be dynamic bitmap -+ blocks loading/unloading which is different from v3.x where all bitmap -+ blocks are loaded at mount time. -+ -+ To implement bitmap blocks unloading we need to count bitmap block usage -+ and detect currently unused blocks allowing them to be unloaded. It is not -+ a simple task since we allow several threads to modify one bitmap block -+ simultaneously. -+ -+ Briefly speaking, the following schema is proposed: we count in special -+ variable associated with each bitmap block. That is for counting of block -+ alloc/dealloc operations on that bitmap block. With a deferred block -+ deallocation feature of reiser4 all those operation will be represented in -+ atom dirty/deleted lists as jnodes for freshly allocated or deleted -+ nodes. -+ -+ So, we increment usage counter for each new node allocated or deleted, and -+ decrement it at atom commit one time for each node from the dirty/deleted -+ atom's list. 
Of course, freshly allocated node deletion and node reusing -+ from atom deleted (if we do so) list should decrement bitmap usage counter -+ also. -+ -+ This schema seems to be working but that reference counting is -+ not easy to debug. I think we should agree with Hans and do not implement -+ it in v4.0. Current code implements "on-demand" bitmap blocks loading only. -+ -+ For simplicity all bitmap nodes (both commit and working bitmap blocks) are -+ loaded into memory on fs mount time or each bitmap nodes are loaded at the -+ first access to it, the "dont_load_bitmap" mount option controls whether -+ bimtap nodes should be loaded at mount time. Dynamic unloading of bitmap -+ nodes currently is not supported. */ -+ -+#define CHECKSUM_SIZE 4 -+ -+#define BYTES_PER_LONG (sizeof(long)) -+ -+#if BITS_PER_LONG == 64 -+# define LONG_INT_SHIFT (6) -+#else -+# define LONG_INT_SHIFT (5) -+#endif -+ -+#define LONG_INT_MASK (BITS_PER_LONG - 1UL) -+ -+typedef unsigned long ulong_t; -+ -+#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE) -+#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3) -+ -+/* Block allocation/deallocation are done through special bitmap objects which -+ are allocated in an array at fs mount. */ -+struct bitmap_node { -+ struct mutex mutex; /* long term lock object */ -+ -+ jnode *wjnode; /* j-nodes for WORKING ... */ -+ jnode *cjnode; /* ... 
and COMMIT bitmap blocks */ -+ -+ bmap_off_t first_zero_bit; /* for skip_busy option implementation */ -+ -+ atomic_t loaded; /* a flag which shows that bnode is loaded -+ * already */ -+}; -+ -+static inline char *bnode_working_data(struct bitmap_node *bnode) -+{ -+ char *data; -+ -+ data = jdata(bnode->wjnode); -+ assert("zam-429", data != NULL); -+ -+ return data + CHECKSUM_SIZE; -+} -+ -+static inline char *bnode_commit_data(const struct bitmap_node *bnode) -+{ -+ char *data; -+ -+ data = jdata(bnode->cjnode); -+ assert("zam-430", data != NULL); -+ -+ return data + CHECKSUM_SIZE; -+} -+ -+static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode) -+{ -+ char *data; -+ -+ data = jdata(bnode->cjnode); -+ assert("vpf-261", data != NULL); -+ -+ return le32_to_cpu(get_unaligned((d32 *)data)); -+} -+ -+static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc) -+{ -+ char *data; -+ -+ data = jdata(bnode->cjnode); -+ assert("vpf-261", data != NULL); -+ -+ put_unaligned(cpu_to_le32(crc), (d32 *)data); -+} -+ -+/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having -+ * written the code, does this added abstraction still have */ -+/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the -+ * reiser4_space_allocator structure) */ -+/* ZAM-FIXME-HANS: I don't understand your english in comment above. */ -+/* FIXME-HANS(Zam): I don't understand the questions like "might be a union -+ * someday?". What they about? If there is a reason to have a union, it should -+ * be a union, if not, it should not be a union. "..might be someday" means no -+ * reason. 
*/ -+struct bitmap_allocator_data { -+ /* an array for bitmap blocks direct access */ -+ struct bitmap_node *bitmap; -+}; -+ -+#define get_barray(super) \ -+(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap) -+ -+#define get_bnode(super, i) (get_barray(super) + i) -+ -+/* allocate and initialize jnode with JNODE_BITMAP type */ -+static jnode *bnew(void) -+{ -+ jnode *jal = jalloc(); -+ -+ if (jal) -+ jnode_init(jal, current_tree, JNODE_BITMAP); -+ -+ return jal; -+} -+ -+/* this file contains: -+ - bitmap based implementation of space allocation plugin -+ - all the helper functions like set bit, find_first_zero_bit, etc */ -+ -+/* Audited by: green(2002.06.12) */ -+static int find_next_zero_bit_in_word(ulong_t word, int start_bit) -+{ -+ ulong_t mask = 1UL << start_bit; -+ int i = start_bit; -+ -+ while ((word & mask) != 0) { -+ mask <<= 1; -+ if (++i >= BITS_PER_LONG) -+ break; -+ } -+ -+ return i; -+} -+ -+#include -+ -+#if BITS_PER_LONG == 64 -+ -+#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3) -+#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1))) -+ -+static inline void reiser4_set_bit(int nr, void *addr) -+{ -+ ext2_set_bit(nr + OFF(addr), BASE(addr)); -+} -+ -+static inline void reiser4_clear_bit(int nr, void *addr) -+{ -+ ext2_clear_bit(nr + OFF(addr), BASE(addr)); -+} -+ -+static inline int reiser4_test_bit(int nr, void *addr) -+{ -+ return ext2_test_bit(nr + OFF(addr), BASE(addr)); -+} -+static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset, -+ int offset) -+{ -+ int off = OFF(addr); -+ -+ return ext2_find_next_zero_bit(BASE(addr), maxoffset + off, -+ offset + off) - off; -+} -+ -+#else -+ -+#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr) -+#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr) -+#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr) -+ -+#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \ 
-+ext2_find_next_zero_bit(addr, maxoffset, offset) -+#endif -+ -+/* Search for a set bit in the bit array [@start_offset, @max_offset[, offsets -+ * are counted from @addr, return the offset of the first bit if it is found, -+ * @maxoffset otherwise. */ -+static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset, -+ bmap_off_t start_offset) -+{ -+ ulong_t *base = addr; -+ /* start_offset is in bits, convert it to byte offset within bitmap. */ -+ int word_nr = start_offset >> LONG_INT_SHIFT; -+ /* bit number within the byte. */ -+ int bit_nr = start_offset & LONG_INT_MASK; -+ int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT; -+ -+ assert("zam-387", max_offset != 0); -+ -+ /* Unaligned @start_offset case. */ -+ if (bit_nr != 0) { -+ bmap_nr_t nr; -+ -+ nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr); -+ -+ if (nr < BITS_PER_LONG) -+ return (word_nr << LONG_INT_SHIFT) + nr; -+ -+ ++word_nr; -+ } -+ -+ /* Fast scan trough aligned words. */ -+ while (word_nr <= max_word_nr) { -+ if (base[word_nr] != 0) { -+ return (word_nr << LONG_INT_SHIFT) -+ + find_next_zero_bit_in_word(~(base[word_nr]), 0); -+ } -+ -+ ++word_nr; -+ } -+ -+ return max_offset; -+} -+ -+#if BITS_PER_LONG == 64 -+ -+static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset, -+ bmap_off_t start_offset) -+{ -+ bmap_off_t off = OFF(addr); -+ -+ return __reiser4_find_next_set_bit(BASE(addr), max_offset + off, -+ start_offset + off) - off; -+} -+ -+#else -+#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \ -+ __reiser4_find_next_set_bit(addr, max_offset, start_offset) -+#endif -+ -+/* search for the first set bit in single word. 
*/ -+static int find_last_set_bit_in_word(ulong_t word, int start_bit) -+{ -+ ulong_t bit_mask; -+ int nr = start_bit; -+ -+ assert("zam-965", start_bit < BITS_PER_LONG); -+ assert("zam-966", start_bit >= 0); -+ -+ bit_mask = (1UL << nr); -+ -+ while (bit_mask != 0) { -+ if (bit_mask & word) -+ return nr; -+ bit_mask >>= 1; -+ nr--; -+ } -+ return BITS_PER_LONG; -+} -+ -+/* Search bitmap for a set bit in backward direction from the end to the -+ * beginning of given region -+ * -+ * @result: result offset of the last set bit -+ * @addr: base memory address, -+ * @low_off: low end of the search region, edge bit included into the region, -+ * @high_off: high end of the search region, edge bit included into the region, -+ * -+ * @return: 0 - set bit was found, -1 otherwise. -+ */ -+static int -+reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off, -+ bmap_off_t high_off) -+{ -+ ulong_t *base = addr; -+ int last_word; -+ int first_word; -+ int last_bit; -+ int nr; -+ -+ assert("zam-962", high_off >= low_off); -+ -+ last_word = high_off >> LONG_INT_SHIFT; -+ last_bit = high_off & LONG_INT_MASK; -+ first_word = low_off >> LONG_INT_SHIFT; -+ -+ if (last_bit < BITS_PER_LONG) { -+ nr = find_last_set_bit_in_word(base[last_word], last_bit); -+ if (nr < BITS_PER_LONG) { -+ *result = (last_word << LONG_INT_SHIFT) + nr; -+ return 0; -+ } -+ --last_word; -+ } -+ while (last_word >= first_word) { -+ if (base[last_word] != 0x0) { -+ last_bit = -+ find_last_set_bit_in_word(base[last_word], -+ BITS_PER_LONG - 1); -+ assert("zam-972", last_bit < BITS_PER_LONG); -+ *result = (last_word << LONG_INT_SHIFT) + last_bit; -+ return 0; -+ } -+ --last_word; -+ } -+ -+ return -1; /* set bit not found */ -+} -+ -+/* Search bitmap for a clear bit in backward direction from the end to the -+ * beginning of given region */ -+static int -+reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off, -+ bmap_off_t high_off) -+{ -+ ulong_t *base = addr; -+ 
int last_word; -+ int first_word; -+ int last_bit; -+ int nr; -+ -+ last_word = high_off >> LONG_INT_SHIFT; -+ last_bit = high_off & LONG_INT_MASK; -+ first_word = low_off >> LONG_INT_SHIFT; -+ -+ if (last_bit < BITS_PER_LONG) { -+ nr = find_last_set_bit_in_word(~base[last_word], last_bit); -+ if (nr < BITS_PER_LONG) { -+ *result = (last_word << LONG_INT_SHIFT) + nr; -+ return 0; -+ } -+ --last_word; -+ } -+ while (last_word >= first_word) { -+ if (base[last_word] != (ulong_t) (-1)) { -+ *result = (last_word << LONG_INT_SHIFT) + -+ find_last_set_bit_in_word(~base[last_word], -+ BITS_PER_LONG - 1); -+ return 0; -+ } -+ --last_word; -+ } -+ -+ return -1; /* zero bit not found */ -+} -+ -+/* Audited by: green(2002.06.12) */ -+static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end) -+{ -+ int first_byte; -+ int last_byte; -+ -+ unsigned char first_byte_mask = 0xFF; -+ unsigned char last_byte_mask = 0xFF; -+ -+ assert("zam-410", start < end); -+ -+ first_byte = start >> 3; -+ last_byte = (end - 1) >> 3; -+ -+ if (last_byte > first_byte + 1) -+ memset(addr + first_byte + 1, 0, -+ (size_t) (last_byte - first_byte - 1)); -+ -+ first_byte_mask >>= 8 - (start & 0x7); -+ last_byte_mask <<= ((end - 1) & 0x7) + 1; -+ -+ if (first_byte == last_byte) { -+ addr[first_byte] &= (first_byte_mask | last_byte_mask); -+ } else { -+ addr[first_byte] &= first_byte_mask; -+ addr[last_byte] &= last_byte_mask; -+ } -+} -+ -+/* Audited by: green(2002.06.12) */ -+/* ZAM-FIXME-HANS: comment this */ -+static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end) -+{ -+ int first_byte; -+ int last_byte; -+ -+ unsigned char first_byte_mask = 0xFF; -+ unsigned char last_byte_mask = 0xFF; -+ -+ assert("zam-386", start < end); -+ -+ first_byte = start >> 3; -+ last_byte = (end - 1) >> 3; -+ -+ if (last_byte > first_byte + 1) -+ memset(addr + first_byte + 1, 0xFF, -+ (size_t) (last_byte - first_byte - 1)); -+ -+ first_byte_mask <<= start & 0x7; -+ last_byte_mask >>= 7 
- ((end - 1) & 0x7); -+ -+ if (first_byte == last_byte) { -+ addr[first_byte] |= (first_byte_mask & last_byte_mask); -+ } else { -+ addr[first_byte] |= first_byte_mask; -+ addr[last_byte] |= last_byte_mask; -+ } -+} -+ -+#define ADLER_BASE 65521 -+#define ADLER_NMAX 5552 -+ -+/* Calculates the adler32 checksum for the data pointed by `data` of the -+ length `len`. This function was originally taken from zlib, version 1.1.3, -+ July 9th, 1998. -+ -+ Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler -+ -+ This software is provided 'as-is', without any express or implied -+ warranty. In no event will the authors be held liable for any damages -+ arising from the use of this software. -+ -+ Permission is granted to anyone to use this software for any purpose, -+ including commercial applications, and to alter it and redistribute it -+ freely, subject to the following restrictions: -+ -+ 1. The origin of this software must not be misrepresented; you must not -+ claim that you wrote the original software. If you use this software -+ in a product, an acknowledgment in the product documentation would be -+ appreciated but is not required. -+ 2. Altered source versions must be plainly marked as such, and must not be -+ misrepresented as being the original software. -+ 3. This notice may not be removed or altered from any source distribution. -+ -+ Jean-loup Gailly Mark Adler -+ jloup@gzip.org madler@alumni.caltech.edu -+ -+ The above comment applies only to the reiser4_adler32 function. -+*/ -+ -+__u32 reiser4_adler32(char *data, __u32 len) -+{ -+ unsigned char *t = data; -+ __u32 s1 = 1; -+ __u32 s2 = 0; -+ int k; -+ -+ while (len > 0) { -+ k = len < ADLER_NMAX ? 
len : ADLER_NMAX; -+ len -= k; -+ -+ while (k--) { -+ s1 += *t++; -+ s2 += s1; -+ } -+ -+ s1 %= ADLER_BASE; -+ s2 %= ADLER_BASE; -+ } -+ return (s2 << 16) | s1; -+} -+ -+#define sb_by_bnode(bnode) \ -+ ((struct super_block *)jnode_get_tree(bnode->wjnode)->super) -+ -+static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size) -+{ -+ return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size)); -+} -+ -+static int -+bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size) -+{ -+ if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) { -+ bmap_nr_t bmap; -+ -+ bmap = bnode - get_bnode(sb_by_bnode(bnode), 0); -+ -+ warning("vpf-263", -+ "Checksum for the bitmap block %llu is incorrect", -+ bmap); -+ -+ return RETERR(-EIO); -+ } -+ -+ return 0; -+} -+ -+#define REISER4_CHECK_BMAP_CRC (0) -+ -+#if REISER4_CHECK_BMAP_CRC -+static int bnode_check_crc(const struct bitmap_node *bnode) -+{ -+ return bnode_check_adler32(bnode, -+ bmap_size(sb_by_bnode(bnode)->s_blocksize)); -+} -+ -+/* REISER4_CHECK_BMAP_CRC */ -+#else -+ -+#define bnode_check_crc(bnode) (0) -+ -+/* REISER4_CHECK_BMAP_CRC */ -+#endif -+ -+/* Recalculates the adler32 checksum for only 1 byte change. -+ adler - previous adler checksum -+ old_data, data - old, new byte values. -+ tail == (chunk - offset) : length, checksum was calculated for, - offset of -+ the changed byte within this chunk. -+ This function can be used for checksum calculation optimisation. -+*/ -+ -+static __u32 -+adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data, -+ __u32 tail) -+{ -+ __u32 delta = data - old_data + 2 * ADLER_BASE; -+ __u32 s1 = adler & 0xffff; -+ __u32 s2 = (adler >> 16) & 0xffff; -+ -+ s1 = (delta + s1) % ADLER_BASE; -+ s2 = (delta * tail + s2) % ADLER_BASE; -+ -+ return (s2 << 16) | s1; -+} -+ -+#define LIMIT(val, boundary) ((val) > (boundary) ? 
(boundary) : (val)) -+ -+/** -+ * get_nr_bitmap - calculate number of bitmap blocks -+ * @super: super block with initialized blocksize and block count -+ * -+ * Calculates number of bitmap blocks of a filesystem which uses bitmaps to -+ * maintain free disk space. It assumes that each bitmap addresses the same -+ * number of blocks which is calculated by bmap_block_count macro defined in -+ * above. Number of blocks in the filesystem has to be initialized in reiser4 -+ * private data of super block already so that it can be obtained via -+ * reiser4_block_count(). Unfortunately, number of blocks addressed by a bitmap -+ * is not power of 2 because 4 bytes are used for checksum. Therefore, we have -+ * to use special function to divide and modulo 64bits filesystem block -+ * counters. -+ * -+ * Example: suppose filesystem have 32768 blocks. Blocksize is 4096. Each bitmap -+ * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address -+ * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2. -+ */ -+static bmap_nr_t get_nr_bmap(const struct super_block *super) -+{ -+ u64 quotient; -+ -+ assert("zam-393", reiser4_block_count(super) != 0); -+ -+ quotient = reiser4_block_count(super) - 1; -+ do_div(quotient, bmap_bit_count(super->s_blocksize)); -+ return quotient + 1; -+} -+ -+/** -+ * parse_blocknr - calculate bitmap number and offset in it by block number -+ * @block: pointer to block number to calculate location in bitmap of -+ * @bmap: pointer where to store bitmap block number -+ * @offset: pointer where to store offset within bitmap block -+ * -+ * Calculates location of bit which is responsible for allocation/freeing of -+ * block @*block. That location is represented by bitmap block number and offset -+ * within that bitmap block. 
-+ */ -+static void -+parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap, -+ bmap_off_t *offset) -+{ -+ struct super_block *super = get_current_context()->super; -+ u64 quotient = *block; -+ -+ *offset = do_div(quotient, bmap_bit_count(super->s_blocksize)); -+ *bmap = quotient; -+ -+ assert("zam-433", *bmap < get_nr_bmap(super)); -+ assert("", *offset < bmap_bit_count(super->s_blocksize)); -+} -+ -+#if REISER4_DEBUG -+/* Audited by: green(2002.06.12) */ -+static void -+check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ -+ assert("zam-436", sb != NULL); -+ -+ assert("zam-455", start != NULL); -+ assert("zam-437", *start != 0); -+ assert("zam-541", !reiser4_blocknr_is_fake(start)); -+ assert("zam-441", *start < reiser4_block_count(sb)); -+ -+ if (len != NULL) { -+ assert("zam-438", *len != 0); -+ assert("zam-442", *start + *len <= reiser4_block_count(sb)); -+ } -+} -+ -+static void check_bnode_loaded(const struct bitmap_node *bnode) -+{ -+ assert("zam-485", bnode != NULL); -+ assert("zam-483", jnode_page(bnode->wjnode) != NULL); -+ assert("zam-484", jnode_page(bnode->cjnode) != NULL); -+ assert("nikita-2820", jnode_is_loaded(bnode->wjnode)); -+ assert("nikita-2821", jnode_is_loaded(bnode->cjnode)); -+} -+ -+#else -+ -+# define check_block_range(start, len) do { /* nothing */} while(0) -+# define check_bnode_loaded(bnode) do { /* nothing */} while(0) -+ -+#endif -+ -+/* modify bnode->first_zero_bit (if we free bits before); bnode should be -+ spin-locked */ -+static inline void -+adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset) -+{ -+ if (offset < bnode->first_zero_bit) -+ bnode->first_zero_bit = offset; -+} -+ -+/* return a physical disk address for logical bitmap number @bmap */ -+/* FIXME-VS: this is somehow related to disk layout? */ -+/* ZAM-FIXME-HANS: your answer is? 
Use not more than one function dereference -+ * per block allocation so that performance is not affected. Probably this -+ * whole file should be considered part of the disk layout plugin, and other -+ * disk layouts can use other defines and efficiency will not be significantly -+ * affected. */ -+ -+#define REISER4_FIRST_BITMAP_BLOCK \ -+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2) -+ -+/* Audited by: green(2002.06.12) */ -+static void -+get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap, -+ reiser4_block_nr * bnr) -+{ -+ -+ assert("zam-390", bmap < get_nr_bmap(super)); -+ -+#ifdef CONFIG_REISER4_BADBLOCKS -+#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff)) -+ /* Check if the diskmap have this already, first. */ -+ if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0) -+ return; /* Found it in diskmap */ -+#endif -+ /* FIXME_ZAM: before discussing of disk layouts and disk format -+ plugins I implement bitmap location scheme which is close to scheme -+ used in reiser 3.6 */ -+ if (bmap == 0) { -+ *bnr = REISER4_FIRST_BITMAP_BLOCK; -+ } else { -+ *bnr = bmap * bmap_bit_count(super->s_blocksize); -+ } -+} -+ -+/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */ -+/* Audited by: green(2002.06.12) */ -+static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr) -+{ -+ *bnr = -+ (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) | -+ REISER4_BITMAP_BLOCKS_STATUS_VALUE); -+} -+ -+/* bnode structure initialization */ -+static void -+init_bnode(struct bitmap_node *bnode, -+ struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG) -+{ -+ memset(bnode, 0, sizeof(struct bitmap_node)); -+ -+ mutex_init(&bnode->mutex); -+ atomic_set(&bnode->loaded, 0); -+} -+ -+static void release(jnode * node) -+{ -+ jrelse(node); -+ JF_SET(node, JNODE_HEARD_BANSHEE); -+ jput(node); -+} -+ -+/* This function is for internal bitmap.c use because it assumes that jnode is -+ in under full 
control of this thread */ -+static void done_bnode(struct bitmap_node *bnode) -+{ -+ if (bnode) { -+ atomic_set(&bnode->loaded, 0); -+ if (bnode->wjnode != NULL) -+ release(bnode->wjnode); -+ if (bnode->cjnode != NULL) -+ release(bnode->cjnode); -+ bnode->wjnode = bnode->cjnode = NULL; -+ } -+} -+ -+/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/ -+static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret, -+ jnode **wjnode_ret) -+{ -+ struct super_block *super; -+ jnode *cjnode; -+ jnode *wjnode; -+ bmap_nr_t bmap; -+ int ret; -+ -+ super = reiser4_get_current_sb(); -+ -+ *wjnode_ret = wjnode = bnew(); -+ if (wjnode == NULL) { -+ *cjnode_ret = NULL; -+ return RETERR(-ENOMEM); -+ } -+ -+ *cjnode_ret = cjnode = bnew(); -+ if (cjnode == NULL) -+ return RETERR(-ENOMEM); -+ -+ bmap = bnode - get_bnode(super, 0); -+ -+ get_working_bitmap_blocknr(bmap, &wjnode->blocknr); -+ get_bitmap_blocknr(super, bmap, &cjnode->blocknr); -+ -+ jref(cjnode); -+ jref(wjnode); -+ -+ /* load commit bitmap */ -+ ret = jload_gfp(cjnode, GFP_NOFS, 1); -+ -+ if (ret) -+ goto error; -+ -+ /* allocate memory for working bitmap block. Note that for -+ * bitmaps jinit_new() doesn't actually modifies node content, -+ * so parallel calls to this are ok. */ -+ ret = jinit_new(wjnode, GFP_NOFS); -+ -+ if (ret != 0) { -+ jrelse(cjnode); -+ goto error; -+ } -+ -+ return 0; -+ -+ error: -+ jput(cjnode); -+ jput(wjnode); -+ *wjnode_ret = *cjnode_ret = NULL; -+ return ret; -+ -+} -+ -+/* Check the bnode data on read. */ -+static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize) -+{ -+ void *data; -+ int ret; -+ -+ /* Check CRC */ -+ ret = bnode_check_adler32(bnode, blksize); -+ -+ if (ret) { -+ return ret; -+ } -+ -+ data = jdata(bnode->cjnode) + CHECKSUM_SIZE; -+ -+ /* Check the very first bit -- it must be busy. 
*/ -+ if (!reiser4_test_bit(0, data)) { -+ warning("vpf-1362", "The allocator block %llu is not marked " -+ "as used.", (unsigned long long)bnode->cjnode->blocknr); -+ -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+/* load bitmap blocks "on-demand" */ -+static int load_and_lock_bnode(struct bitmap_node *bnode) -+{ -+ int ret; -+ -+ jnode *cjnode; -+ jnode *wjnode; -+ -+ assert("nikita-3040", reiser4_schedulable()); -+ -+/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not -+ * need to be atomic, right? Just leave a comment that if bitmaps were -+ * unloadable, this would need to be atomic. */ -+ if (atomic_read(&bnode->loaded)) { -+ /* bitmap is already loaded, nothing to do */ -+ check_bnode_loaded(bnode); -+ mutex_lock(&bnode->mutex); -+ assert("nikita-2827", atomic_read(&bnode->loaded)); -+ return 0; -+ } -+ -+ ret = prepare_bnode(bnode, &cjnode, &wjnode); -+ if (ret == 0) { -+ mutex_lock(&bnode->mutex); -+ -+ if (!atomic_read(&bnode->loaded)) { -+ assert("nikita-2822", cjnode != NULL); -+ assert("nikita-2823", wjnode != NULL); -+ assert("nikita-2824", jnode_is_loaded(cjnode)); -+ assert("nikita-2825", jnode_is_loaded(wjnode)); -+ -+ bnode->wjnode = wjnode; -+ bnode->cjnode = cjnode; -+ -+ ret = check_struct_bnode(bnode, current_blocksize); -+ if (!ret) { -+ cjnode = wjnode = NULL; -+ atomic_set(&bnode->loaded, 1); -+ /* working bitmap is initialized by on-disk -+ * commit bitmap. This should be performed -+ * under mutex. */ -+ memcpy(bnode_working_data(bnode), -+ bnode_commit_data(bnode), -+ bmap_size(current_blocksize)); -+ } else -+ mutex_unlock(&bnode->mutex); -+ } else -+ /* race: someone already loaded bitmap while we were -+ * busy initializing data. 
*/ -+ check_bnode_loaded(bnode); -+ } -+ -+ if (wjnode != NULL) { -+ release(wjnode); -+ bnode->wjnode = NULL; -+ } -+ if (cjnode != NULL) { -+ release(cjnode); -+ bnode->cjnode = NULL; -+ } -+ -+ return ret; -+} -+ -+static void release_and_unlock_bnode(struct bitmap_node *bnode) -+{ -+ check_bnode_loaded(bnode); -+ mutex_unlock(&bnode->mutex); -+} -+ -+/* This function does all block allocation work but only for one bitmap -+ block.*/ -+/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap -+ block responsibility zone boundaries. This had no sense in v3.6 but may -+ have it in v4.x */ -+/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */ -+static int -+search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset, -+ bmap_off_t max_offset, int min_len, int max_len) -+{ -+ struct super_block *super = get_current_context()->super; -+ struct bitmap_node *bnode = get_bnode(super, bmap); -+ -+ char *data; -+ -+ bmap_off_t search_end; -+ bmap_off_t start; -+ bmap_off_t end; -+ -+ int set_first_zero_bit = 0; -+ -+ int ret; -+ -+ assert("zam-364", min_len > 0); -+ assert("zam-365", max_len >= min_len); -+ assert("zam-366", *offset <= max_offset); -+ -+ ret = load_and_lock_bnode(bnode); -+ -+ if (ret) -+ return ret; -+ -+ data = bnode_working_data(bnode); -+ -+ start = *offset; -+ -+ if (bnode->first_zero_bit >= start) { -+ start = bnode->first_zero_bit; -+ set_first_zero_bit = 1; -+ } -+ -+ while (start + min_len < max_offset) { -+ -+ start = -+ reiser4_find_next_zero_bit((long *)data, max_offset, start); -+ if (set_first_zero_bit) { -+ bnode->first_zero_bit = start; -+ set_first_zero_bit = 0; -+ } -+ if (start >= max_offset) -+ break; -+ -+ search_end = LIMIT(start + max_len, max_offset); -+ end = -+ reiser4_find_next_set_bit((long *)data, search_end, start); -+ if (end >= start + min_len) { -+ /* we can't trust find_next_set_bit result if set bit -+ was not fount, result may be bigger than -+ max_offset */ -+ if (end > search_end) -+ 
end = search_end; -+ -+ ret = end - start; -+ *offset = start; -+ -+ reiser4_set_bits(data, start, end); -+ -+ /* FIXME: we may advance first_zero_bit if [start, -+ end] region overlaps the first_zero_bit point */ -+ -+ break; -+ } -+ -+ start = end + 1; -+ } -+ -+ release_and_unlock_bnode(bnode); -+ -+ return ret; -+} -+ -+static int -+search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset, -+ bmap_off_t end_offset, int min_len, int max_len) -+{ -+ struct super_block *super = get_current_context()->super; -+ struct bitmap_node *bnode = get_bnode(super, bmap); -+ char *data; -+ bmap_off_t start; -+ int ret; -+ -+ assert("zam-958", min_len > 0); -+ assert("zam-959", max_len >= min_len); -+ assert("zam-960", *start_offset >= end_offset); -+ -+ ret = load_and_lock_bnode(bnode); -+ if (ret) -+ return ret; -+ -+ data = bnode_working_data(bnode); -+ start = *start_offset; -+ -+ while (1) { -+ bmap_off_t end, search_end; -+ -+ /* Find the beginning of the zero filled region */ -+ if (reiser4_find_last_zero_bit(&start, data, end_offset, start)) -+ break; -+ /* Is there more than `min_len' bits from `start' to -+ * `end_offset'? */ -+ if (start < end_offset + min_len - 1) -+ break; -+ -+ /* Do not search to `end_offset' if we need to find less than -+ * `max_len' zero bits. */ -+ if (end_offset + max_len - 1 < start) -+ search_end = start - max_len + 1; -+ else -+ search_end = end_offset; -+ -+ if (reiser4_find_last_set_bit(&end, data, search_end, start)) -+ end = search_end; -+ else -+ end++; -+ -+ if (end + min_len <= start + 1) { -+ if (end < search_end) -+ end = search_end; -+ ret = start - end + 1; -+ *start_offset = end; /* `end' is lowest offset */ -+ assert("zam-987", -+ reiser4_find_next_set_bit(data, start + 1, -+ end) >= start + 1); -+ reiser4_set_bits(data, end, start + 1); -+ break; -+ } -+ -+ if (end <= end_offset) -+ /* left search boundary reached. 
*/ -+ break; -+ start = end - 1; -+ } -+ -+ release_and_unlock_bnode(bnode); -+ return ret; -+} -+ -+/* allocate contiguous range of blocks in bitmap */ -+static int bitmap_alloc_forward(reiser4_block_nr * start, -+ const reiser4_block_nr * end, int min_len, -+ int max_len) -+{ -+ bmap_nr_t bmap, end_bmap; -+ bmap_off_t offset, end_offset; -+ int len; -+ -+ reiser4_block_nr tmp; -+ -+ struct super_block *super = get_current_context()->super; -+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize); -+ -+ parse_blocknr(start, &bmap, &offset); -+ -+ tmp = *end - 1; -+ parse_blocknr(&tmp, &end_bmap, &end_offset); -+ ++end_offset; -+ -+ assert("zam-358", end_bmap >= bmap); -+ assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset)); -+ -+ for (; bmap < end_bmap; bmap++, offset = 0) { -+ len = -+ search_one_bitmap_forward(bmap, &offset, max_offset, -+ min_len, max_len); -+ if (len != 0) -+ goto out; -+ } -+ -+ len = -+ search_one_bitmap_forward(bmap, &offset, end_offset, min_len, -+ max_len); -+ out: -+ *start = bmap * max_offset + offset; -+ return len; -+} -+ -+/* allocate contiguous range of blocks in bitmap (from @start to @end in -+ * backward direction) */ -+static int bitmap_alloc_backward(reiser4_block_nr * start, -+ const reiser4_block_nr * end, int min_len, -+ int max_len) -+{ -+ bmap_nr_t bmap, end_bmap; -+ bmap_off_t offset, end_offset; -+ int len; -+ struct super_block *super = get_current_context()->super; -+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize); -+ -+ parse_blocknr(start, &bmap, &offset); -+ parse_blocknr(end, &end_bmap, &end_offset); -+ -+ assert("zam-961", end_bmap <= bmap); -+ assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset)); -+ -+ for (; bmap > end_bmap; bmap--, offset = max_offset - 1) { -+ len = -+ search_one_bitmap_backward(bmap, &offset, 0, min_len, -+ max_len); -+ if (len != 0) -+ goto out; -+ } -+ -+ len = -+ search_one_bitmap_backward(bmap, &offset, end_offset, min_len, -+ max_len); 
-+ out: -+ *start = bmap * max_offset + offset; -+ return len; -+} -+ -+/* plugin->u.space_allocator.alloc_blocks() */ -+static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed, -+ reiser4_block_nr *start, reiser4_block_nr *len) -+{ -+ struct super_block *super = get_current_context()->super; -+ int actual_len; -+ -+ reiser4_block_nr search_start; -+ reiser4_block_nr search_end; -+ -+ assert("zam-398", super != NULL); -+ assert("zam-412", hint != NULL); -+ assert("zam-397", hint->blk <= reiser4_block_count(super)); -+ -+ if (hint->max_dist == 0) -+ search_end = reiser4_block_count(super); -+ else -+ search_end = -+ LIMIT(hint->blk + hint->max_dist, -+ reiser4_block_count(super)); -+ -+ /* We use @hint -> blk as a search start and search from it to the end -+ of the disk or in given region if @hint -> max_dist is not zero */ -+ search_start = hint->blk; -+ -+ actual_len = -+ bitmap_alloc_forward(&search_start, &search_end, 1, needed); -+ -+ /* There is only one bitmap search if max_dist was specified or first -+ pass was from the beginning of the bitmap. We also do one pass for -+ scanning bitmap in backward direction. 
*/ -+ if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) { -+ /* next step is a scanning from 0 to search_start */ -+ search_end = search_start; -+ search_start = 0; -+ actual_len = -+ bitmap_alloc_forward(&search_start, &search_end, 1, needed); -+ } -+ if (actual_len == 0) -+ return RETERR(-ENOSPC); -+ if (actual_len < 0) -+ return RETERR(actual_len); -+ *len = actual_len; -+ *start = search_start; -+ return 0; -+} -+ -+static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed, -+ reiser4_block_nr * start, -+ reiser4_block_nr * len) -+{ -+ reiser4_block_nr search_start; -+ reiser4_block_nr search_end; -+ int actual_len; -+ -+ ON_DEBUG(struct super_block *super = reiser4_get_current_sb()); -+ -+ assert("zam-969", super != NULL); -+ assert("zam-970", hint != NULL); -+ assert("zam-971", hint->blk <= reiser4_block_count(super)); -+ -+ search_start = hint->blk; -+ if (hint->max_dist == 0 || search_start <= hint->max_dist) -+ search_end = 0; -+ else -+ search_end = search_start - hint->max_dist; -+ -+ actual_len = -+ bitmap_alloc_backward(&search_start, &search_end, 1, needed); -+ if (actual_len == 0) -+ return RETERR(-ENOSPC); -+ if (actual_len < 0) -+ return RETERR(actual_len); -+ *len = actual_len; -+ *start = search_start; -+ return 0; -+} -+ -+/* plugin->u.space_allocator.alloc_blocks() */ -+int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator, -+ reiser4_blocknr_hint * hint, int needed, -+ reiser4_block_nr * start, reiser4_block_nr * len) -+{ -+ if (hint->backward) -+ return alloc_blocks_backward(hint, needed, start, len); -+ return alloc_blocks_forward(hint, needed, start, len); -+} -+ -+/* plugin->u.space_allocator.dealloc_blocks(). */ -+/* It just frees blocks in WORKING BITMAP. Usually formatted an unformatted -+ nodes deletion is deferred until transaction commit. 
However, deallocation -+ of temporary objects like wandered blocks and transaction commit records -+ requires immediate node deletion from WORKING BITMAP.*/ -+void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator, -+ reiser4_block_nr start, reiser4_block_nr len) -+{ -+ struct super_block *super = reiser4_get_current_sb(); -+ -+ bmap_nr_t bmap; -+ bmap_off_t offset; -+ -+ struct bitmap_node *bnode; -+ int ret; -+ -+ assert("zam-468", len != 0); -+ check_block_range(&start, &len); -+ -+ parse_blocknr(&start, &bmap, &offset); -+ -+ assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize)); -+ -+ bnode = get_bnode(super, bmap); -+ -+ assert("zam-470", bnode != NULL); -+ -+ ret = load_and_lock_bnode(bnode); -+ assert("zam-481", ret == 0); -+ -+ reiser4_clear_bits(bnode_working_data(bnode), offset, -+ (bmap_off_t) (offset + len)); -+ -+ adjust_first_zero_bit(bnode, offset); -+ -+ release_and_unlock_bnode(bnode); -+} -+ -+/* plugin->u.space_allocator.check_blocks(). 
*/ -+void reiser4_check_blocks_bitmap(const reiser4_block_nr * start, -+ const reiser4_block_nr * len, int desired) -+{ -+#if REISER4_DEBUG -+ struct super_block *super = reiser4_get_current_sb(); -+ -+ bmap_nr_t bmap; -+ bmap_off_t start_offset; -+ bmap_off_t end_offset; -+ -+ struct bitmap_node *bnode; -+ int ret; -+ -+ assert("zam-622", len != NULL); -+ check_block_range(start, len); -+ parse_blocknr(start, &bmap, &start_offset); -+ -+ end_offset = start_offset + *len; -+ assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize)); -+ -+ bnode = get_bnode(super, bmap); -+ -+ assert("nikita-2215", bnode != NULL); -+ -+ ret = load_and_lock_bnode(bnode); -+ assert("zam-626", ret == 0); -+ -+ assert("nikita-2216", jnode_is_loaded(bnode->wjnode)); -+ -+ if (desired) { -+ assert("zam-623", -+ reiser4_find_next_zero_bit(bnode_working_data(bnode), -+ end_offset, start_offset) -+ >= end_offset); -+ } else { -+ assert("zam-624", -+ reiser4_find_next_set_bit(bnode_working_data(bnode), -+ end_offset, start_offset) -+ >= end_offset); -+ } -+ -+ release_and_unlock_bnode(bnode); -+#endif -+} -+ -+/* conditional insertion of @node into atom's overwrite set if it was not there */ -+static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node) -+{ -+ assert("zam-546", atom != NULL); -+ assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT); -+ assert("zam-548", node != NULL); -+ -+ spin_lock_atom(atom); -+ spin_lock_jnode(node); -+ -+ if (node->atom == NULL) { -+ JF_SET(node, JNODE_OVRWR); -+ insert_into_atom_ovrwr_list(atom, node); -+ } else { -+ assert("zam-549", node->atom == atom); -+ } -+ -+ spin_unlock_jnode(node); -+ spin_unlock_atom(atom); -+} -+ -+/* an actor which applies delete set to COMMIT bitmap pages and link modified -+ pages in a single-linked list */ -+static int -+apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start, -+ const reiser4_block_nr * len, void *data) -+{ -+ -+ bmap_nr_t bmap; -+ bmap_off_t offset; -+ int ret; -+ 
-+ long long *blocks_freed_p = data; -+ -+ struct bitmap_node *bnode; -+ -+ struct super_block *sb = reiser4_get_current_sb(); -+ -+ check_block_range(start, len); -+ -+ parse_blocknr(start, &bmap, &offset); -+ -+ /* FIXME-ZAM: we assume that all block ranges are allocated by this -+ bitmap-based allocator and each block range can't go over a zone of -+ responsibility of one bitmap block; same assumption is used in -+ other journal hooks in bitmap code. */ -+ bnode = get_bnode(sb, bmap); -+ assert("zam-448", bnode != NULL); -+ -+ /* it is safe to unlock atom with is in ASTAGE_PRE_COMMIT */ -+ assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT); -+ ret = load_and_lock_bnode(bnode); -+ if (ret) -+ return ret; -+ -+ /* put bnode into atom's overwrite set */ -+ cond_add_to_overwrite_set(atom, bnode->cjnode); -+ -+ data = bnode_commit_data(bnode); -+ -+ ret = bnode_check_crc(bnode); -+ if (ret != 0) -+ return ret; -+ -+ if (len != NULL) { -+ /* FIXME-ZAM: a check that all bits are set should be there */ -+ assert("zam-443", -+ offset + *len <= bmap_bit_count(sb->s_blocksize)); -+ reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len)); -+ -+ (*blocks_freed_p) += *len; -+ } else { -+ reiser4_clear_bit(offset, data); -+ (*blocks_freed_p)++; -+ } -+ -+ bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize)); -+ -+ release_and_unlock_bnode(bnode); -+ -+ return 0; -+} -+ -+/* plugin->u.space_allocator.pre_commit_hook(). */ -+/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the -+ rest is done by transaction manager (allocate wandered locations for COMMIT -+ BITMAP blocks, copy COMMIT BITMAP blocks data). 
*/ -+/* Only one instance of this function can be running at one given time, because -+ only one transaction can be committed a time, therefore it is safe to access -+ some global variables without any locking */ -+ -+int reiser4_pre_commit_hook_bitmap(void) -+{ -+ struct super_block *super = reiser4_get_current_sb(); -+ txn_atom *atom; -+ -+ long long blocks_freed = 0; -+ -+ atom = get_current_atom_locked(); -+ assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT); -+ spin_unlock_atom(atom); -+ -+ { /* scan atom's captured list and find all freshly allocated nodes, -+ * mark corresponded bits in COMMIT BITMAP as used */ -+ struct list_head *head = ATOM_CLEAN_LIST(atom); -+ jnode *node = list_entry(head->next, jnode, capture_link); -+ -+ while (head != &node->capture_link) { -+ /* we detect freshly allocated jnodes */ -+ if (JF_ISSET(node, JNODE_RELOC)) { -+ int ret; -+ bmap_nr_t bmap; -+ -+ bmap_off_t offset; -+ bmap_off_t index; -+ struct bitmap_node *bn; -+ __u32 size = bmap_size(super->s_blocksize); -+ __u32 crc; -+ char byte; -+ -+ assert("zam-559", !JF_ISSET(node, JNODE_OVRWR)); -+ assert("zam-460", -+ !reiser4_blocknr_is_fake(&node->blocknr)); -+ -+ parse_blocknr(&node->blocknr, &bmap, &offset); -+ bn = get_bnode(super, bmap); -+ -+ index = offset >> 3; -+ assert("vpf-276", index < size); -+ -+ ret = bnode_check_crc(bnode); -+ if (ret != 0) -+ return ret; -+ -+ check_bnode_loaded(bn); -+ load_and_lock_bnode(bn); -+ -+ byte = *(bnode_commit_data(bn) + index); -+ reiser4_set_bit(offset, bnode_commit_data(bn)); -+ -+ crc = adler32_recalc(bnode_commit_crc(bn), byte, -+ *(bnode_commit_data(bn) + -+ index), -+ size - index), -+ bnode_set_commit_crc(bn, crc); -+ -+ release_and_unlock_bnode(bn); -+ -+ ret = bnode_check_crc(bn); -+ if (ret != 0) -+ return ret; -+ -+ /* working of this depends on how it inserts -+ new j-node into clean list, because we are -+ scanning the same list now. 
It is OK, if -+ insertion is done to the list front */ -+ cond_add_to_overwrite_set(atom, bn->cjnode); -+ } -+ -+ node = list_entry(node->capture_link.next, jnode, capture_link); -+ } -+ } -+ -+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap, -+ &blocks_freed, 0); -+ -+ blocks_freed -= atom->nr_blocks_allocated; -+ -+ { -+ reiser4_super_info_data *sbinfo; -+ -+ sbinfo = get_super_private(super); -+ -+ spin_lock_reiser4_super(sbinfo); -+ sbinfo->blocks_free_committed += blocks_freed; -+ spin_unlock_reiser4_super(sbinfo); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.space_allocator.init_allocator -+ constructor of reiser4_space_allocator object. It is called on fs mount */ -+int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator, -+ struct super_block *super, void *arg) -+{ -+ struct bitmap_allocator_data *data = NULL; -+ bmap_nr_t bitmap_blocks_nr; -+ bmap_nr_t i; -+ -+ assert("nikita-3039", reiser4_schedulable()); -+ -+ /* getting memory for bitmap allocator private data holder */ -+ data = -+ kmalloc(sizeof(struct bitmap_allocator_data), -+ reiser4_ctx_gfp_mask_get()); -+ -+ if (data == NULL) -+ return RETERR(-ENOMEM); -+ -+ /* allocation and initialization for the array of bnodes */ -+ bitmap_blocks_nr = get_nr_bmap(super); -+ -+ /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps -+ which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17, -+ may I never meet someone who still uses the ia32 architecture when -+ storage devices of that size enter the market, and wants to use ia32 -+ with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and, -+ probably, another dynamic data structure should replace a static -+ array of bnodes. 
*/ -+ /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */ -+ data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr); -+ if (data->bitmap == NULL) { -+ kfree(data); -+ return RETERR(-ENOMEM); -+ } -+ -+ for (i = 0; i < bitmap_blocks_nr; i++) -+ init_bnode(data->bitmap + i, super, i); -+ -+ allocator->u.generic = data; -+ -+#if REISER4_DEBUG -+ get_super_private(super)->min_blocks_used += bitmap_blocks_nr; -+#endif -+ -+ /* Load all bitmap blocks at mount time. */ -+ if (!test_bit -+ (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) { -+ __u64 start_time, elapsed_time; -+ struct bitmap_node *bnode; -+ int ret; -+ -+ if (REISER4_DEBUG) -+ printk(KERN_INFO "loading reiser4 bitmap..."); -+ start_time = jiffies; -+ -+ for (i = 0; i < bitmap_blocks_nr; i++) { -+ bnode = data->bitmap + i; -+ ret = load_and_lock_bnode(bnode); -+ if (ret) { -+ reiser4_destroy_allocator_bitmap(allocator, -+ super); -+ return ret; -+ } -+ release_and_unlock_bnode(bnode); -+ } -+ -+ elapsed_time = jiffies - start_time; -+ if (REISER4_DEBUG) -+ printk("...done (%llu jiffies)\n", -+ (unsigned long long)elapsed_time); -+ } -+ -+ return 0; -+} -+ -+/* plugin->u.space_allocator.destroy_allocator -+ destructor. 
It is called on fs unmount */ -+int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator, -+ struct super_block *super) -+{ -+ bmap_nr_t bitmap_blocks_nr; -+ bmap_nr_t i; -+ -+ struct bitmap_allocator_data *data = allocator->u.generic; -+ -+ assert("zam-414", data != NULL); -+ assert("zam-376", data->bitmap != NULL); -+ -+ bitmap_blocks_nr = get_nr_bmap(super); -+ -+ for (i = 0; i < bitmap_blocks_nr; i++) { -+ struct bitmap_node *bnode = data->bitmap + i; -+ -+ mutex_lock(&bnode->mutex); -+ -+#if REISER4_DEBUG -+ if (atomic_read(&bnode->loaded)) { -+ jnode *wj = bnode->wjnode; -+ jnode *cj = bnode->cjnode; -+ -+ assert("zam-480", jnode_page(cj) != NULL); -+ assert("zam-633", jnode_page(wj) != NULL); -+ -+ assert("zam-634", -+ memcmp(jdata(wj), jdata(wj), -+ bmap_size(super->s_blocksize)) == 0); -+ -+ } -+#endif -+ done_bnode(bnode); -+ mutex_unlock(&bnode->mutex); -+ } -+ -+ vfree(data->bitmap); -+ kfree(data); -+ -+ allocator->u.generic = NULL; -+ -+ return 0; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * scroll-step: 1 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.24/fs/reiser4/plugin/space/bitmap.h ---- linux-2.6.24.orig/fs/reiser4/plugin/space/bitmap.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/space/bitmap.h 2008-01-25 11:39:07.068241692 +0300 -@@ -0,0 +1,47 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__) -+#define __REISER4_PLUGIN_SPACE_BITMAP_H__ -+ -+#include "../../dformat.h" -+#include "../../block_alloc.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block */ -+/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. 
*/ -+/* declarations of functions implementing methods of space allocator plugin for -+ bitmap based allocator. The functions themselves are in bitmap.c */ -+extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *, -+ struct super_block *, void *); -+extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *, -+ struct super_block *); -+extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *, -+ reiser4_blocknr_hint *, int needed, -+ reiser4_block_nr * start, -+ reiser4_block_nr * len); -+extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *, -+ const reiser4_block_nr *, int); -+extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *, -+ reiser4_block_nr, -+ reiser4_block_nr); -+extern int reiser4_pre_commit_hook_bitmap(void); -+ -+#define reiser4_post_commit_hook_bitmap() do{}while(0) -+#define reiser4_post_write_back_hook_bitmap() do{}while(0) -+#define reiser4_print_info_bitmap(pref, al) do{}while(0) -+ -+typedef __u64 bmap_nr_t; -+typedef __u32 bmap_off_t; -+ -+#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/space/Makefile linux-2.6.24/fs/reiser4/plugin/space/Makefile ---- linux-2.6.24.orig/fs/reiser4/plugin/space/Makefile 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/space/Makefile 2008-01-25 11:39:07.068241692 +0300 -@@ -0,0 +1,4 @@ -+obj-$(CONFIG_REISER4_FS) += space_plugins.o -+ -+space_plugins-objs := \ -+ bitmap.o -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.24/fs/reiser4/plugin/space/space_allocator.h ---- linux-2.6.24.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/space/space_allocator.h 2008-01-25 11:39:07.068241692 +0300 -@@ -0,0 +1,80 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#ifndef __SPACE_ALLOCATOR_H__ -+#define __SPACE_ALLOCATOR_H__ -+ -+#include "../../forward.h" -+#include "bitmap.h" -+/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now, -+ * but... 
*/ -+#define DEF_SPACE_ALLOCATOR(allocator) \ -+ \ -+static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \ -+{ \ -+ return reiser4_init_allocator_##allocator (al, s, opaque); \ -+} \ -+ \ -+static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \ -+{ \ -+ reiser4_destroy_allocator_##allocator (al, s); \ -+} \ -+ \ -+static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \ -+ int needed, reiser4_block_nr * start, reiser4_block_nr * len) \ -+{ \ -+ return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \ -+} \ -+static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \ -+{ \ -+ reiser4_dealloc_blocks_##allocator (al, start, len); \ -+} \ -+ \ -+static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \ -+{ \ -+ reiser4_check_blocks_##allocator (start, end, desired); \ -+} \ -+ \ -+static inline void sa_pre_commit_hook (void) \ -+{ \ -+ reiser4_pre_commit_hook_##allocator (); \ -+} \ -+ \ -+static inline void sa_post_commit_hook (void) \ -+{ \ -+ reiser4_post_commit_hook_##allocator (); \ -+} \ -+ \ -+static inline void sa_post_write_back_hook (void) \ -+{ \ -+ reiser4_post_write_back_hook_##allocator(); \ -+} \ -+ \ -+static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \ -+{ \ -+ reiser4_print_info_##allocator (prefix, al); \ -+} -+ -+DEF_SPACE_ALLOCATOR(bitmap) -+ -+/* this object is part of reiser4 private in-core super block */ -+struct reiser4_space_allocator { -+ union { -+ /* space allocators might use this pointer to reference their -+ * data. */ -+ void *generic; -+ } u; -+}; -+ -+/* __SPACE_ALLOCATOR_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.24/fs/reiser4/plugin/tail_policy.c ---- linux-2.6.24.orig/fs/reiser4/plugin/tail_policy.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/plugin/tail_policy.c 2008-01-25 11:39:07.068241692 +0300 -@@ -0,0 +1,113 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Formatting policy plugins */ -+ -+/* -+ * Formatting policy plugin is used by object plugin (of regular file) to -+ * convert file between two representations. -+ * -+ * Currently following policies are implemented: -+ * never store file in formatted nodes -+ * always store file in formatted nodes -+ * store file in formatted nodes if file is smaller than 4 blocks (default) -+ */ -+ -+#include "../tree.h" -+#include "../inode.h" -+#include "../super.h" -+#include "object.h" -+#include "plugin.h" -+#include "node/node.h" -+#include "plugin_header.h" -+ -+#include -+#include /* For struct inode */ -+ -+/** -+ * have_formatting_never - -+ * @inode: -+ * @size: -+ * -+ * -+ */ -+/* Never store file's tail as direct item */ -+/* Audited by: green(2002.06.12) */ -+static int have_formatting_never(const struct inode *inode UNUSED_ARG -+ /* inode to operate on */ , -+ loff_t size UNUSED_ARG /* new object size */ ) -+{ -+ return 0; -+} -+ -+/* Always store file's tail as direct item */ -+/* Audited by: green(2002.06.12) */ -+static int -+have_formatting_always(const struct inode *inode UNUSED_ARG -+ /* inode to operate on */ , -+ loff_t size UNUSED_ARG /* new object size */ ) -+{ -+ return 1; -+} -+ -+/* This function makes test if we should store file denoted @inode as tails only or -+ as extents only. 
*/ -+static int -+have_formatting_default(const struct inode *inode UNUSED_ARG -+ /* inode to operate on */ , -+ loff_t size /* new object size */ ) -+{ -+ assert("umka-1253", inode != NULL); -+ -+ if (size > inode->i_sb->s_blocksize * 4) -+ return 0; -+ -+ return 1; -+} -+ -+/* tail plugins */ -+formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = { -+ [NEVER_TAILS_FORMATTING_ID] = { -+ .h = { -+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, -+ .id = NEVER_TAILS_FORMATTING_ID, -+ .pops = NULL, -+ .label = "never", -+ .desc = "Never store file's tail", -+ .linkage = {NULL, NULL} -+ }, -+ .have_tail = have_formatting_never -+ }, -+ [ALWAYS_TAILS_FORMATTING_ID] = { -+ .h = { -+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, -+ .id = ALWAYS_TAILS_FORMATTING_ID, -+ .pops = NULL, -+ .label = "always", -+ .desc = "Always store file's tail", -+ .linkage = {NULL, NULL} -+ }, -+ .have_tail = have_formatting_always -+ }, -+ [SMALL_FILE_FORMATTING_ID] = { -+ .h = { -+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE, -+ .id = SMALL_FILE_FORMATTING_ID, -+ .pops = NULL, -+ .label = "4blocks", -+ .desc = "store files shorter than 4 blocks in tail items", -+ .linkage = {NULL, NULL} -+ }, -+ .have_tail = have_formatting_default -+ } -+}; -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/pool.c linux-2.6.24/fs/reiser4/pool.c ---- linux-2.6.24.orig/fs/reiser4/pool.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/pool.c 2008-01-25 11:39:07.072242722 +0300 -@@ -0,0 +1,231 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Fast pool allocation. -+ -+ There are situations when some sub-system normally asks memory allocator -+ for only few objects, but under some circumstances could require much -+ more. Typical and actually motivating example is tree balancing. 
It needs -+ to keep track of nodes that were involved into it, and it is well-known -+ that in reasonable packed balanced tree most (92.938121%) percent of all -+ balancings end up after working with only few nodes (3.141592 on -+ average). But in rare cases balancing can involve much more nodes -+ (3*tree_height+1 in extremal situation). -+ -+ On the one hand, we don't want to resort to dynamic allocation (slab, -+ malloc(), etc.) to allocate data structures required to keep track of -+ nodes during balancing. On the other hand, we cannot statically allocate -+ required amount of space on the stack, because first: it is useless wastage -+ of precious resource, and second: this amount is unknown in advance (tree -+ height can change). -+ -+ Pools, implemented in this file are solution for this problem: -+ -+ - some configurable amount of objects is statically preallocated on the -+ stack -+ -+ - if this preallocated pool is exhausted and more objects is requested -+ they are allocated dynamically. -+ -+ Pools encapsulate distinction between statically and dynamically allocated -+ objects. Both allocation and recycling look exactly the same. -+ -+ To keep track of dynamically allocated objects, pool adds its own linkage -+ to each object. -+ -+ NOTE-NIKITA This linkage also contains some balancing-specific data. This -+ is not perfect. On the other hand, balancing is currently the only client -+ of pool code. -+ -+ NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation -+ functions in the style of tslist/tshash, i.e., make them unreadable, but -+ type-safe. 
-+ -+*/ -+ -+#include "debug.h" -+#include "pool.h" -+#include "super.h" -+ -+#include -+#include -+ -+/* initialize new pool object @h */ -+static void reiser4_init_pool_obj(struct reiser4_pool_header * h) -+{ -+ INIT_LIST_HEAD(&h->usage_linkage); -+ INIT_LIST_HEAD(&h->level_linkage); -+ INIT_LIST_HEAD(&h->extra_linkage); -+} -+ -+/* initialize new pool */ -+void reiser4_init_pool(struct reiser4_pool * pool /* pool to initialize */ , -+ size_t obj_size /* size of objects in @pool */ , -+ int num_of_objs /* number of preallocated objects */ , -+ char *data /* area for preallocated objects */ ) -+{ -+ struct reiser4_pool_header *h; -+ int i; -+ -+ assert("nikita-955", pool != NULL); -+ assert("nikita-1044", obj_size > 0); -+ assert("nikita-956", num_of_objs >= 0); -+ assert("nikita-957", data != NULL); -+ -+ memset(pool, 0, sizeof *pool); -+ pool->obj_size = obj_size; -+ pool->data = data; -+ INIT_LIST_HEAD(&pool->free); -+ INIT_LIST_HEAD(&pool->used); -+ INIT_LIST_HEAD(&pool->extra); -+ memset(data, 0, obj_size * num_of_objs); -+ for (i = 0; i < num_of_objs; ++i) { -+ h = (struct reiser4_pool_header *) (data + i * obj_size); -+ reiser4_init_pool_obj(h); -+ /* add pool header to the end of pool's free list */ -+ list_add_tail(&h->usage_linkage, &pool->free); -+ } -+} -+ -+/* release pool resources -+ -+ Release all resources acquired by this pool, specifically, dynamically -+ allocated objects. -+ -+*/ -+void reiser4_done_pool(struct reiser4_pool * pool UNUSED_ARG) -+{ -+} -+ -+/* allocate carry object from @pool -+ -+ First, try to get preallocated object. If this fails, resort to dynamic -+ allocation. 
-+ -+*/ -+static void *reiser4_pool_alloc(struct reiser4_pool * pool) -+{ -+ struct reiser4_pool_header *result; -+ -+ assert("nikita-959", pool != NULL); -+ -+ if (!list_empty(&pool->free)) { -+ struct list_head *linkage; -+ -+ linkage = pool->free.next; -+ list_del(linkage); -+ INIT_LIST_HEAD(linkage); -+ result = list_entry(linkage, struct reiser4_pool_header, -+ usage_linkage); -+ BUG_ON(!list_empty(&result->level_linkage) || -+ !list_empty(&result->extra_linkage)); -+ } else { -+ /* pool is empty. Extra allocations don't deserve dedicated -+ slab to be served from, as they are expected to be rare. */ -+ result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get()); -+ if (result != 0) { -+ reiser4_init_pool_obj(result); -+ list_add(&result->extra_linkage, &pool->extra); -+ } else -+ return ERR_PTR(RETERR(-ENOMEM)); -+ BUG_ON(!list_empty(&result->usage_linkage) || -+ !list_empty(&result->level_linkage)); -+ } -+ ++pool->objs; -+ list_add(&result->usage_linkage, &pool->used); -+ memset(result + 1, 0, pool->obj_size - sizeof *result); -+ return result; -+} -+ -+/* return object back to the pool */ -+void reiser4_pool_free(struct reiser4_pool * pool, -+ struct reiser4_pool_header * h) -+{ -+ assert("nikita-961", h != NULL); -+ assert("nikita-962", pool != NULL); -+ -+ --pool->objs; -+ assert("nikita-963", pool->objs >= 0); -+ -+ list_del_init(&h->usage_linkage); -+ list_del_init(&h->level_linkage); -+ -+ if (list_empty(&h->extra_linkage)) -+ /* -+ * pool header is not an extra one. Push it onto free list -+ * using usage_linkage -+ */ -+ list_add(&h->usage_linkage, &pool->free); -+ else { -+ /* remove pool header from pool's extra list and kfree it */ -+ list_del(&h->extra_linkage); -+ kfree(h); -+ } -+} -+ -+/* add new object to the carry level list -+ -+ Carry level is FIFO most of the time, but not always. 
Complications arise -+ when make_space() function tries to go to the left neighbor and thus adds -+ carry node before existing nodes, and also, when updating delimiting keys -+ after moving data between two nodes, we want left node to be locked before -+ right node. -+ -+ Latter case is confusing at the first glance. Problem is that COP_UPDATE -+ opration that updates delimiting keys is sometimes called with two nodes -+ (when data are moved between two nodes) and sometimes with only one node -+ (when leftmost item is deleted in a node). In any case operation is -+ supplied with at least node whose left delimiting key is to be updated -+ (that is "right" node). -+ -+ @pool - from which to allocate new object; -+ @list - where to add object; -+ @reference - after (or before) which existing object to add -+*/ -+struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool, -+ struct list_head *list, -+ pool_ordering order, -+ struct reiser4_pool_header * reference) -+{ -+ struct reiser4_pool_header *result; -+ -+ assert("nikita-972", pool != NULL); -+ -+ result = reiser4_pool_alloc(pool); -+ if (IS_ERR(result)) -+ return result; -+ -+ assert("nikita-973", result != NULL); -+ -+ switch (order) { -+ case POOLO_BEFORE: -+ __list_add(&result->level_linkage, -+ reference->level_linkage.prev, -+ &reference->level_linkage); -+ break; -+ case POOLO_AFTER: -+ __list_add(&result->level_linkage, -+ &reference->level_linkage, -+ reference->level_linkage.next); -+ break; -+ case POOLO_LAST: -+ list_add_tail(&result->level_linkage, list); -+ break; -+ case POOLO_FIRST: -+ list_add(&result->level_linkage, list); -+ break; -+ default: -+ wrong_return_value("nikita-927", "order"); -+ } -+ return result; -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/pool.h linux-2.6.24/fs/reiser4/pool.h ---- linux-2.6.24.orig/fs/reiser4/pool.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/pool.h 2008-01-25 11:39:07.072242722 +0300 -@@ -0,0 +1,56 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Fast pool allocation */ -+ -+#ifndef __REISER4_POOL_H__ -+#define __REISER4_POOL_H__ -+ -+#include -+ -+struct reiser4_pool { -+ size_t obj_size; -+ int objs; -+ char *data; -+ struct list_head free; -+ struct list_head used; -+ struct list_head extra; -+}; -+ -+struct reiser4_pool_header { -+ /* object is either on free or "used" lists */ -+ struct list_head usage_linkage; -+ struct list_head level_linkage; -+ struct list_head extra_linkage; -+}; -+ -+typedef enum { -+ POOLO_BEFORE, -+ POOLO_AFTER, -+ POOLO_LAST, -+ POOLO_FIRST -+} pool_ordering; -+ -+/* pool manipulation functions */ -+ -+extern void reiser4_init_pool(struct reiser4_pool * pool, size_t obj_size, -+ int num_of_objs, char *data); -+extern void reiser4_done_pool(struct reiser4_pool * pool); -+extern void reiser4_pool_free(struct reiser4_pool * pool, -+ struct reiser4_pool_header * h); -+struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool, -+ struct list_head * list, -+ pool_ordering order, -+ struct reiser4_pool_header *reference); -+ -+/* __REISER4_POOL_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/readahead.c linux-2.6.24/fs/reiser4/readahead.c ---- linux-2.6.24.orig/fs/reiser4/readahead.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/readahead.c 2008-01-25 11:39:07.072242722 +0300 -@@ -0,0 +1,138 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "forward.h" -+#include "tree.h" -+#include "tree_walk.h" -+#include "super.h" -+#include "inode.h" -+#include "key.h" -+#include "znode.h" -+ -+#include /* for totalram_pages */ -+ -+void reiser4_init_ra_info(ra_info_t * rai) -+{ -+ rai->key_to_stop = *reiser4_min_key(); -+} -+ -+/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */ -+static inline int ra_adjacent_only(int flags) -+{ -+ return flags & RA_ADJACENT_ONLY; -+} -+ -+/* this is used by formatted_readahead to decide whether read for right neighbor of node is to be issued. It returns 1 -+ if right neighbor's first key is less or equal to readahead's stop key */ -+static int should_readahead_neighbor(znode * node, ra_info_t * info) -+{ -+ int result; -+ -+ read_lock_dk(znode_get_tree(node)); -+ result = keyle(znode_get_rd_key(node), &info->key_to_stop); -+ read_unlock_dk(znode_get_tree(node)); -+ return result; -+} -+ -+#define LOW_MEM_PERCENTAGE (5) -+ -+static int low_on_memory(void) -+{ -+ unsigned int freepages; -+ -+ freepages = nr_free_pages(); -+ return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100); -+} -+ -+/* start read for @node and for a few of its right neighbors */ -+void formatted_readahead(znode * node, ra_info_t * info) -+{ -+ struct formatted_ra_params *ra_params; -+ znode *cur; -+ int i; -+ int grn_flags; -+ lock_handle next_lh; -+ -+ /* do nothing if node block number has not been assigned to node (which means it is still in cache). 
*/ -+ if (reiser4_blocknr_is_fake(znode_get_block(node))) -+ return; -+ -+ ra_params = get_current_super_ra_params(); -+ -+ if (znode_page(node) == NULL) -+ jstartio(ZJNODE(node)); -+ -+ if (znode_get_level(node) != LEAF_LEVEL) -+ return; -+ -+ /* don't waste memory for read-ahead when low on memory */ -+ if (low_on_memory()) -+ return; -+ -+ /* We can have locked nodes on upper tree levels, in this situation lock -+ priorities do not help to resolve deadlocks, we have to use TRY_LOCK -+ here. */ -+ grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK); -+ -+ i = 0; -+ cur = zref(node); -+ init_lh(&next_lh); -+ while (i < ra_params->max) { -+ const reiser4_block_nr *nextblk; -+ -+ if (!should_readahead_neighbor(cur, info)) -+ break; -+ -+ if (reiser4_get_right_neighbor -+ (&next_lh, cur, ZNODE_READ_LOCK, grn_flags)) -+ break; -+ -+ nextblk = znode_get_block(next_lh.node); -+ if (reiser4_blocknr_is_fake(nextblk) || -+ (ra_adjacent_only(ra_params->flags) -+ && *nextblk != *znode_get_block(cur) + 1)) { -+ break; -+ } -+ -+ zput(cur); -+ cur = zref(next_lh.node); -+ done_lh(&next_lh); -+ if (znode_page(cur) == NULL) -+ jstartio(ZJNODE(cur)); -+ else -+ /* Do not scan read-ahead window if pages already -+ * allocated (and i/o already started). 
*/ -+ break; -+ -+ i++; -+ } -+ zput(cur); -+ done_lh(&next_lh); -+} -+ -+void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap) -+{ -+ reiser4_key *stop_key; -+ -+ assert("nikita-3542", dir != NULL); -+ assert("nikita-3543", tap != NULL); -+ -+ stop_key = &tap->ra_info.key_to_stop; -+ /* initialize readdir readahead information: include into readahead -+ * stat data of all files of the directory */ -+ set_key_locality(stop_key, get_inode_oid(dir)); -+ set_key_type(stop_key, KEY_SD_MINOR); -+ set_key_ordering(stop_key, get_key_ordering(reiser4_max_key())); -+ set_key_objectid(stop_key, get_key_objectid(reiser4_max_key())); -+ set_key_offset(stop_key, get_key_offset(reiser4_max_key())); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/readahead.h linux-2.6.24/fs/reiser4/readahead.h ---- linux-2.6.24.orig/fs/reiser4/readahead.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/readahead.h 2008-01-25 11:39:07.072242722 +0300 -@@ -0,0 +1,51 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#ifndef __READAHEAD_H__ -+#define __READAHEAD_H__ -+ -+#include "key.h" -+ -+typedef enum { -+ RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent. -+ Default is NO (not only adjacent) */ -+} ra_global_flags; -+ -+/* reiser4 super block has a field of this type. -+ It controls readahead during tree traversals */ -+struct formatted_ra_params { -+ unsigned long max; /* request not more than this amount of nodes. 
-+ Default is totalram_pages / 4 */ -+ int flags; -+}; -+ -+typedef struct { -+ reiser4_key key_to_stop; -+} ra_info_t; -+ -+void formatted_readahead(znode *, ra_info_t *); -+void reiser4_init_ra_info(ra_info_t * rai); -+ -+struct reiser4_file_ra_state { -+ loff_t start; /* Current window */ -+ loff_t size; -+ loff_t next_size; /* Next window size */ -+ loff_t ahead_start; /* Ahead window */ -+ loff_t ahead_size; -+ loff_t max_window_size; /* Maximum readahead window */ -+ loff_t slow_start; /* enlarging r/a size algorithm. */ -+}; -+ -+extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap); -+ -+/* __READAHEAD_H__ */ -+#endif -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/README linux-2.6.24/fs/reiser4/README ---- linux-2.6.24.orig/fs/reiser4/README 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/README 2008-01-25 11:39:07.076243753 +0300 -@@ -0,0 +1,128 @@ -+[LICENSING] -+ -+Reiser4 is hereby licensed under the GNU General -+Public License version 2. -+ -+Source code files that contain the phrase "licensing governed by -+reiser4/README" are "governed files" throughout this file. Governed -+files are licensed under the GPL. The portions of them owned by Hans -+Reiser, or authorized to be licensed by him, have been in the past, -+and likely will be in the future, licensed to other parties under -+other licenses. If you add your code to governed files, and don't -+want it to be owned by Hans Reiser, put your copyright label on that -+code so the poor blight and his customers can keep things straight. -+All portions of governed files not labeled otherwise are owned by Hans -+Reiser, and by adding your code to it, widely distributing it to -+others or sending us a patch, and leaving the sentence in stating that -+licensing is governed by the statement in this file, you accept this. 
-+It will be a kindness if you identify whether Hans Reiser is allowed -+to license code labeled as owned by you on your behalf other than -+under the GPL, because he wants to know if it is okay to do so and put -+a check in the mail to you (for non-trivial improvements) when he -+makes his next sale. He makes no guarantees as to the amount if any, -+though he feels motivated to motivate contributors, and you can surely -+discuss this with him before or after contributing. You have the -+right to decline to allow him to license your code contribution other -+than under the GPL. -+ -+Further licensing options are available for commercial and/or other -+interests directly from Hans Reiser: reiser@namesys.com. If you interpret -+the GPL as not allowing those additional licensing options, you read -+it wrongly, and Richard Stallman agrees with me, when carefully read -+you can see that those restrictions on additional terms do not apply -+to the owner of the copyright, and my interpretation of this shall -+govern for this license. -+ -+[END LICENSING] -+ -+Reiser4 is a file system based on dancing tree algorithms, and is -+described at http://www.namesys.com -+ -+mkfs.reiser4 and other utilities are on our webpage or wherever your -+Linux provider put them. You really want to be running the latest -+version off the website if you use fsck. -+ -+Yes, if you update your reiser4 kernel module you do have to -+recompile your kernel, most of the time. The errors you get will be -+quite cryptic if your forget to do so. -+ -+Hideous Commercial Pitch: Spread your development costs across other OS -+vendors. Select from the best in the world, not the best in your -+building, by buying from third party OS component suppliers. Leverage -+the software component development power of the internet. 
Be the most -+aggressive in taking advantage of the commercial possibilities of -+decentralized internet development, and add value through your branded -+integration that you sell as an operating system. Let your competitors -+be the ones to compete against the entire internet by themselves. Be -+hip, get with the new economic trend, before your competitors do. Send -+email to reiser@namesys.com -+ -+Hans Reiser was the primary architect of Reiser4, but a whole team -+chipped their ideas in. He invested everything he had into Namesys -+for 5.5 dark years of no money before Reiser3 finally started to work well -+enough to bring in money. He owns the copyright. -+ -+DARPA was the primary sponsor of Reiser4. DARPA does not endorse -+Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal -+opinion, unique in its willingness to invest into things more -+theoretical than the VC community can readily understand, and more -+longterm than allows them to be sure that they will be the ones to -+extract the economic benefits from. DARPA also integrated us into a -+security community that transformed our security worldview. -+ -+Vladimir Saveliev is our lead programmer, with us from the beginning, -+and he worked long hours writing the cleanest code. This is why he is -+now the lead programmer after years of commitment to our work. He -+always made the effort to be the best he could be, and to make his -+code the best that it could be. What resulted was quite remarkable. I -+don't think that money can ever motivate someone to work the way he -+did, he is one of the most selfless men I know. -+ -+Alexander Lyamin was our sysadmin, and helped to educate us in -+security issues. Moscow State University and IMT were very generous -+in the internet access they provided us, and in lots of other little -+ways that a generous institution can be. 
-+ -+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the -+locking code, the block allocator, and finished the flushing code. -+His code is always crystal clean and well structured. -+ -+Nikita Danilov wrote the core of the balancing code, the core of the -+plugins code, and the directory code. He worked a steady pace of long -+hours that produced a whole lot of well abstracted code. He is our -+senior computer scientist. -+ -+Vladimir Demidov wrote the parser. Writing an in kernel parser is -+something very few persons have the skills for, and it is thanks to -+him that we can say that the parser is really not so big compared to -+various bits of our other code, and making a parser work in the kernel -+was not so complicated as everyone would imagine mainly because it was -+him doing it... -+ -+Joshua McDonald wrote the transaction manager, and the flush code. -+The flush code unexpectedly turned out be extremely hairy for reasons -+you can read about on our web page, and he did a great job on an -+extremely difficult task. -+ -+Nina Reiser handled our accounting, government relations, and much -+more. -+ -+Ramon Reiser developed our website. -+ -+Beverly Palmer drew our graphics. -+ -+Vitaly Fertman developed librepair, userspace plugins repair code, fsck -+and worked with Umka on developing libreiser4 and userspace plugins. -+ -+Yury Umanets (aka Umka) developed libreiser4, userspace plugins and -+userspace tools (reiser4progs). -+ -+Oleg Drokin (aka Green) is the release manager who fixes everything. -+It is so nice to have someone like that on the team. He (plus Chris -+and Jeff) make it possible for the entire rest of the Namesys team to -+focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It -+is just amazing to watch his talent for spotting bugs in action. 
-+ -+Edward Shishkin wrote cryptcompress file plugin (which manages files -+built of encrypted and(or) compressed bodies) and other plugins related -+to transparent encryption and compression support. -diff -urN linux-2.6.24.orig/fs/reiser4/reiser4.h linux-2.6.24/fs/reiser4/reiser4.h ---- linux-2.6.24.orig/fs/reiser4/reiser4.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/reiser4.h 2008-01-25 12:25:01.861363382 +0300 -@@ -0,0 +1,270 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* definitions of common constants used by reiser4 */ -+ -+#if !defined( __REISER4_H__ ) -+#define __REISER4_H__ -+ -+#include /* for HZ */ -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * reiser4 compilation options. -+ */ -+ -+#if defined(CONFIG_REISER4_DEBUG) -+/* turn on assertion checks */ -+#define REISER4_DEBUG (1) -+#else -+#define REISER4_DEBUG (0) -+#endif -+ -+#if defined(CONFIG_ZLIB_INFLATE) -+/* turn on zlib */ -+#define REISER4_ZLIB (1) -+#else -+#define REISER4_ZLIB (0) -+#endif -+ -+#if defined(CONFIG_CRYPTO_SHA256) -+#define REISER4_SHA256 (1) -+#else -+#define REISER4_SHA256 (0) -+#endif -+ -+/* -+ * Turn on large keys mode. In his mode (which is default), reiser4 key has 4 -+ * 8-byte components. In the old "small key" mode, it's 3 8-byte -+ * components. Additional component, referred to as "ordering" is used to -+ * order items from which given object is composed of. As such, ordering is -+ * placed between locality and objectid. For directory item ordering contains -+ * initial prefix of the file name this item is for. This sorts all directory -+ * items within given directory lexicographically (but see -+ * fibration.[ch]). For file body and stat-data, ordering contains initial -+ * prefix of the name file was initially created with. 
In the common case -+ * (files with single name) this allows to order file bodies and stat-datas in -+ * the same order as their respective directory entries, thus speeding up -+ * readdir. -+ * -+ * Note, that kernel can only mount file system with the same key size as one -+ * it is compiled for, so flipping this option may render your data -+ * inaccessible. -+ */ -+#define REISER4_LARGE_KEY (1) -+/*#define REISER4_LARGE_KEY (0)*/ -+ -+/*#define GUESS_EXISTS 1*/ -+ -+/* -+ * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation -+ * option -+ */ -+ -+extern const char *REISER4_SUPER_MAGIC_STRING; -+extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the -+ * beginning of device */ -+ -+/* here go tunable parameters that are not worth special entry in kernel -+ configuration */ -+ -+/* default number of slots in coord-by-key caches */ -+#define CBK_CACHE_SLOTS (16) -+/* how many elementary tree operation to carry on the next level */ -+#define CARRIES_POOL_SIZE (5) -+/* size of pool of preallocated nodes for carry process. */ -+#define NODES_LOCKED_POOL_SIZE (5) -+ -+#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT) -+#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT) -+#define REISER4_PASTE_FLAGS (COPI_GO_LEFT) -+#define REISER4_INSERT_FLAGS (COPI_GO_LEFT) -+ -+/* we are supporting reservation of disk space on uid basis */ -+#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0) -+/* we are supporting reservation of disk space for groups */ -+#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0) -+/* we are supporting reservation of disk space for root */ -+#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0) -+/* we use rapid flush mode, see flush.c for comments. */ -+#define REISER4_USE_RAPID_FLUSH (1) -+ -+/* -+ * set this to 0 if you don't want to use wait-for-flush in ->writepage(). 
-+ */ -+#define REISER4_USE_ENTD (1) -+ -+/* key allocation is Plan-A */ -+#define REISER4_PLANA_KEY_ALLOCATION (1) -+/* key allocation follows good old 3.x scheme */ -+#define REISER4_3_5_KEY_ALLOCATION (0) -+ -+/* size of hash-table for znodes */ -+#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13) -+ -+/* number of buckets in lnode hash-table */ -+#define LNODE_HTABLE_BUCKETS (1024) -+ -+/* some ridiculously high maximal limit on height of znode tree. This -+ is used in declaration of various per level arrays and -+ to allocate stattistics gathering array for per-level stats. */ -+#define REISER4_MAX_ZTREE_HEIGHT (8) -+ -+#define REISER4_PANIC_MSG_BUFFER_SIZE (1024) -+ -+/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements then, -+ sequential search is on average faster than binary. This is because -+ of better optimization and because sequential search is more CPU -+ cache friendly. This number (25) was found by experiments on dual AMD -+ Athlon(tm), 1400MHz. -+ -+ NOTE: testing in kernel has shown that binary search is more effective than -+ implied by results of the user level benchmarking. Probably because in the -+ node keys are separated by other data. So value was adjusted after few -+ tests. More thorough tuning is needed. -+*/ -+#define REISER4_SEQ_SEARCH_BREAK (3) -+ -+/* don't allow tree to be lower than this */ -+#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL) -+ -+/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to -+ * available memory. */ -+/* Default value of maximal atom size. Can be ovewritten by -+ tmgr.atom_max_size mount option. By default infinity. */ -+#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0)) -+ -+/* Default value of maximal atom age (in jiffies). After reaching this age -+ atom will be forced to commit, either synchronously or asynchronously. Can -+ be overwritten by tmgr.atom_max_age mount option. 
*/ -+#define REISER4_ATOM_MAX_AGE (600 * HZ) -+ -+/* sleeping period for ktxnmrgd */ -+#define REISER4_TXNMGR_TIMEOUT (5 * HZ) -+ -+/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */ -+#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000) -+ -+/* start complaining after that many restarts in coord_by_key(). -+ -+ This either means incredibly heavy contention for this part of a tree, or -+ some corruption or bug. -+*/ -+#define REISER4_CBK_ITERATIONS_LIMIT (100) -+ -+/* return -EIO after that many iterations in coord_by_key(). -+ -+ I have witnessed more than 800 iterations (in 30 thread test) before cbk -+ finished. --nikita -+*/ -+#define REISER4_MAX_CBK_ITERATIONS 500000 -+ -+/* put a per-inode limit on maximal number of directory entries with identical -+ keys in hashed directory. -+ -+ Disable this until inheritance interfaces stabilize: we need some way to -+ set per directory limit. -+*/ -+#define REISER4_USE_COLLISION_LIMIT (0) -+ -+/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it -+ will force them to be relocated. */ -+#define FLUSH_RELOCATE_THRESHOLD 64 -+/* If flush finds can find a block allocation closer than at most FLUSH_RELOCATE_DISTANCE -+ from the preceder it will relocate to that position. */ -+#define FLUSH_RELOCATE_DISTANCE 64 -+ -+/* If we have written this much or more blocks before encountering busy jnode -+ in flush list - abort flushing hoping that next time we get called -+ this jnode will be clean already, and we will save some seeks. */ -+#define FLUSH_WRITTEN_THRESHOLD 50 -+ -+/* The maximum number of nodes to scan left on a level during flush. */ -+#define FLUSH_SCAN_MAXNODES 10000 -+ -+/* per-atom limit of flushers */ -+#define ATOM_MAX_FLUSHERS (1) -+ -+/* default tracing buffer size */ -+#define REISER4_TRACE_BUF_SIZE (1 << 15) -+ -+/* what size units of IO we would like cp, etc., to use, in writing to -+ reiser4. In bytes. 
-+ -+ Can be overwritten by optimal_io_size mount option. -+*/ -+#define REISER4_OPTIMAL_IO_SIZE (64 * 1024) -+ -+/* see comments in inode.c:oid_to_uino() */ -+#define REISER4_UINO_SHIFT (1 << 30) -+ -+/* Mark function argument as unused to avoid compiler warnings. */ -+#define UNUSED_ARG __attribute__((unused)) -+ -+#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3) -+#define NONNULL __attribute__((nonnull)) -+#else -+#define NONNULL -+#endif -+ -+/* master super block offset in bytes.*/ -+#define REISER4_MASTER_OFFSET 65536 -+ -+/* size of VFS block */ -+#define VFS_BLKSIZE 512 -+/* number of bits in size of VFS block (512==2^9) */ -+#define VFS_BLKSIZE_BITS 9 -+ -+#define REISER4_I reiser4_inode_data -+ -+/* implication */ -+#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) ) -+/* logical equivalence */ -+#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) ) -+ -+#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0]))) -+ -+#define NOT_YET (0) -+ -+/** Reiser4 specific error codes **/ -+ -+#define REISER4_ERROR_CODE_BASE 10000 -+ -+/* Neighbor is not available (side neighbor or parent) */ -+#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE) -+ -+/* Node was not found in cache */ -+#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1) -+ -+/* node has no free space enough for completion of balancing operation */ -+#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2) -+ -+/* repeat operation */ -+#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3) -+ -+/* deadlock happens */ -+#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4) -+ -+/* operation cannot be performed, because it would block and non-blocking mode -+ * was requested. */ -+#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5) -+ -+/* wait some event (depends on context), then repeat */ -+#define E_WAIT (REISER4_ERROR_CODE_BASE + 6) -+ -+#endif /* __REISER4_H__ */ -+ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/safe_link.c linux-2.6.24/fs/reiser4/safe_link.c ---- linux-2.6.24.orig/fs/reiser4/safe_link.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/safe_link.c 2008-01-25 11:39:07.076243753 +0300 -@@ -0,0 +1,352 @@ -+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Safe-links. */ -+ -+/* -+ * Safe-links are used to maintain file system consistency during operations -+ * that spawns multiple transactions. For example: -+ * -+ * 1. Unlink. UNIX supports "open-but-unlinked" files, that is files -+ * without user-visible names in the file system, but still opened by some -+ * active process. What happens here is that unlink proper (i.e., removal -+ * of the last file name) and file deletion (truncate of file body to zero -+ * and deletion of stat-data, that happens when last file descriptor is -+ * closed), may belong to different transactions T1 and T2. If a crash -+ * happens after T1 commit, but before T2 commit, on-disk file system has -+ * a file without name, that is, disk space leak. -+ * -+ * 2. Truncate. Truncate of large file may spawn multiple transactions. If -+ * system crashes while truncate was in-progress, file is left partially -+ * truncated, which violates "atomicity guarantees" of reiser4, viz. that -+ * every system is atomic. -+ * -+ * Safe-links address both above cases. Basically, safe-link is a way post -+ * some operation to be executed during commit of some other transaction than -+ * current one. (Another way to look at the safe-link is to interpret it as a -+ * logical logging.) -+ * -+ * Specifically, at the beginning of unlink safe-link in inserted in the -+ * tree. This safe-link is normally removed by file deletion code (during -+ * transaction T2 in the above terms). 
Truncate also inserts safe-link that is -+ * normally removed when truncate operation is finished. -+ * -+ * This means, that in the case of "clean umount" there are no safe-links in -+ * the tree. If safe-links are observed during mount, it means that (a) system -+ * was terminated abnormally, and (b) safe-link correspond to the "pending" -+ * (i.e., not finished) operations that were in-progress during system -+ * termination. Each safe-link record enough information to complete -+ * corresponding operation, and mount simply "replays" them (hence, the -+ * analogy with the logical logging). -+ * -+ * Safe-links are implemented as blackbox items (see -+ * plugin/item/blackbox.[ch]). -+ * -+ * For the reference: ext3 also has similar mechanism, it's called "an orphan -+ * list" there. -+ */ -+ -+#include "safe_link.h" -+#include "debug.h" -+#include "inode.h" -+ -+#include "plugin/item/blackbox.h" -+ -+#include -+ -+/* -+ * On-disk format of safe-link. -+ */ -+typedef struct safelink { -+ reiser4_key sdkey; /* key of stat-data for the file safe-link is -+ * for */ -+ d64 size; /* size to which file should be truncated */ -+} safelink_t; -+ -+/* -+ * locality where safe-link items are stored. Next to the objectid of root -+ * directory. -+ */ -+static oid_t safe_link_locality(reiser4_tree * tree) -+{ -+ return get_key_objectid(get_super_private(tree->super)->df_plug-> -+ root_dir_key(tree->super)) + 1; -+} -+ -+/* -+ Construct a key for the safe-link. Key has the following format: -+ -+| 60 | 4 | 64 | 4 | 60 | 64 | -++---------------+---+------------------+---+---------------+------------------+ -+| locality | 0 | 0 | 0 | objectid | link type | -++---------------+---+------------------+---+---------------+------------------+ -+| | | | | -+| 8 bytes | 8 bytes | 8 bytes | 8 bytes | -+ -+ This is in large keys format. In small keys format second 8 byte chunk is -+ out. Locality is a constant returned by safe_link_locality(). 
objectid is -+ an oid of a file on which operation protected by this safe-link is -+ performed. link-type is used to distinguish safe-links for different -+ operations. -+ -+ */ -+static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid, -+ reiser4_safe_link_t link, reiser4_key * key) -+{ -+ reiser4_key_init(key); -+ set_key_locality(key, safe_link_locality(tree)); -+ set_key_objectid(key, oid); -+ set_key_offset(key, link); -+ return key; -+} -+ -+/* -+ * how much disk space is necessary to insert and remove (in the -+ * error-handling path) safe-link. -+ */ -+static __u64 safe_link_tograb(reiser4_tree * tree) -+{ -+ return -+ /* insert safe link */ -+ estimate_one_insert_item(tree) + -+ /* remove safe link */ -+ estimate_one_item_removal(tree) + -+ /* drill to the leaf level during insertion */ -+ 1 + estimate_one_insert_item(tree) + -+ /* -+ * possible update of existing safe-link. Actually, if -+ * safe-link existed already (we failed to remove it), then no -+ * insertion is necessary, so this term is already "covered", -+ * but for simplicity let's left it. -+ */ -+ 1; -+} -+ -+/* -+ * grab enough disk space to insert and remove (in the error-handling path) -+ * safe-link. -+ */ -+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags) -+{ -+ int result; -+ -+ grab_space_enable(); -+ /* The sbinfo->delete_mutex can be taken here. -+ * safe_link_release() should be called before leaving reiser4 -+ * context. */ -+ result = -+ reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags); -+ grab_space_enable(); -+ return result; -+} -+ -+/* -+ * release unused disk space reserved by safe_link_grab(). -+ */ -+void safe_link_release(reiser4_tree * tree) -+{ -+ reiser4_release_reserved(tree->super); -+} -+ -+/* -+ * insert into tree safe-link for operation @link on inode @inode. 
-+ */ -+int safe_link_add(struct inode *inode, reiser4_safe_link_t link) -+{ -+ reiser4_key key; -+ safelink_t sl; -+ int length; -+ int result; -+ reiser4_tree *tree; -+ -+ build_sd_key(inode, &sl.sdkey); -+ length = sizeof sl.sdkey; -+ -+ if (link == SAFE_TRUNCATE) { -+ /* -+ * for truncate we have to store final file length also, -+ * expand item. -+ */ -+ length += sizeof(sl.size); -+ put_unaligned(cpu_to_le64(inode->i_size), &sl.size); -+ } -+ tree = reiser4_tree_by_inode(inode); -+ build_link_key(tree, get_inode_oid(inode), link, &key); -+ -+ result = store_black_box(tree, &key, &sl, length); -+ if (result == -EEXIST) -+ result = update_black_box(tree, &key, &sl, length); -+ return result; -+} -+ -+/* -+ * remove safe-link corresponding to the operation @link on inode @inode from -+ * the tree. -+ */ -+int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link) -+{ -+ reiser4_key key; -+ -+ return kill_black_box(tree, build_link_key(tree, oid, link, &key)); -+} -+ -+/* -+ * in-memory structure to keep information extracted from safe-link. This is -+ * used to iterate over all safe-links. -+ */ -+struct safe_link_context { -+ reiser4_tree *tree; /* internal tree */ -+ reiser4_key key; /* safe-link key */ -+ reiser4_key sdkey; /* key of object stat-data */ -+ reiser4_safe_link_t link; /* safe-link type */ -+ oid_t oid; /* object oid */ -+ __u64 size; /* final size for truncate */ -+}; -+ -+/* -+ * start iterating over all safe-links. -+ */ -+static void safe_link_iter_begin(reiser4_tree * tree, -+ struct safe_link_context * ctx) -+{ -+ ctx->tree = tree; -+ reiser4_key_init(&ctx->key); -+ set_key_locality(&ctx->key, safe_link_locality(tree)); -+ set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key())); -+ set_key_offset(&ctx->key, get_key_offset(reiser4_max_key())); -+} -+ -+/* -+ * return next safe-link. 
-+ */ -+static int safe_link_iter_next(struct safe_link_context * ctx) -+{ -+ int result; -+ safelink_t sl; -+ -+ result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0); -+ if (result == 0) { -+ ctx->oid = get_key_objectid(&ctx->key); -+ ctx->link = get_key_offset(&ctx->key); -+ ctx->sdkey = sl.sdkey; -+ if (ctx->link == SAFE_TRUNCATE) -+ ctx->size = le64_to_cpu(get_unaligned(&sl.size)); -+ } -+ return result; -+} -+ -+/* -+ * check are there any more safe-links left in the tree. -+ */ -+static int safe_link_iter_finished(struct safe_link_context * ctx) -+{ -+ return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree); -+} -+ -+/* -+ * finish safe-link iteration. -+ */ -+static void safe_link_iter_end(struct safe_link_context * ctx) -+{ -+ /* nothing special */ -+} -+ -+/* -+ * process single safe-link. -+ */ -+static int process_safelink(struct super_block *super, reiser4_safe_link_t link, -+ reiser4_key * sdkey, oid_t oid, __u64 size) -+{ -+ struct inode *inode; -+ int result; -+ -+ /* -+ * obtain object inode by reiser4_iget(), then call object plugin -+ * ->safelink() method to do actual work, then delete safe-link on -+ * success. -+ */ -+ inode = reiser4_iget(super, sdkey, 1); -+ if (!IS_ERR(inode)) { -+ file_plugin *fplug; -+ -+ fplug = inode_file_plugin(inode); -+ assert("nikita-3428", fplug != NULL); -+ assert("", oid == get_inode_oid(inode)); -+ if (fplug->safelink != NULL) { -+ /* reiser4_txn_restart_current is not necessary because -+ * mounting is signle thread. However, without it -+ * deadlock detection code will complain (see -+ * nikita-3361). 
*/ -+ reiser4_txn_restart_current(); -+ result = fplug->safelink(inode, link, size); -+ } else { -+ warning("nikita-3430", -+ "Cannot handle safelink for %lli", -+ (unsigned long long)oid); -+ reiser4_print_key("key", sdkey); -+ result = 0; -+ } -+ if (result != 0) { -+ warning("nikita-3431", -+ "Error processing safelink for %lli: %i", -+ (unsigned long long)oid, result); -+ } -+ reiser4_iget_complete(inode); -+ iput(inode); -+ if (result == 0) { -+ result = safe_link_grab(reiser4_get_tree(super), BA_CAN_COMMIT); -+ if (result == 0) -+ result = -+ safe_link_del(reiser4_get_tree(super), oid, link); -+ safe_link_release(reiser4_get_tree(super)); -+ /* -+ * restart transaction: if there was large number of -+ * safe-links, their processing may fail to fit into -+ * single transaction. -+ */ -+ if (result == 0) -+ reiser4_txn_restart_current(); -+ } -+ } else -+ result = PTR_ERR(inode); -+ return result; -+} -+ -+/* -+ * iterate over all safe-links in the file-system processing them one by one. -+ */ -+int process_safelinks(struct super_block *super) -+{ -+ struct safe_link_context ctx; -+ int result; -+ -+ if (rofs_super(super)) -+ /* do nothing on the read-only file system */ -+ return 0; -+ safe_link_iter_begin(&get_super_private(super)->tree, &ctx); -+ result = 0; -+ do { -+ result = safe_link_iter_next(&ctx); -+ if (safe_link_iter_finished(&ctx) || result == -ENOENT) { -+ result = 0; -+ break; -+ } -+ if (result == 0) -+ result = process_safelink(super, ctx.link, -+ &ctx.sdkey, ctx.oid, -+ ctx.size); -+ } while (result == 0); -+ safe_link_iter_end(&ctx); -+ return result; -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/safe_link.h linux-2.6.24/fs/reiser4/safe_link.h ---- linux-2.6.24.orig/fs/reiser4/safe_link.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/safe_link.h 2008-01-25 11:39:07.080244783 +0300 -@@ -0,0 +1,29 @@ -+/* Copyright 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Safe-links. See safe_link.c for details. */ -+ -+#if !defined( __FS_SAFE_LINK_H__ ) -+#define __FS_SAFE_LINK_H__ -+ -+#include "tree.h" -+ -+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags); -+void safe_link_release(reiser4_tree * tree); -+int safe_link_add(struct inode *inode, reiser4_safe_link_t link); -+int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link); -+ -+int process_safelinks(struct super_block *super); -+ -+/* __FS_SAFE_LINK_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/seal.c linux-2.6.24/fs/reiser4/seal.c ---- linux-2.6.24.orig/fs/reiser4/seal.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/seal.c 2008-01-25 11:39:07.080244783 +0300 -@@ -0,0 +1,218 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+/* Seals implementation. */ -+/* Seals are "weak" tree pointers. They are analogous to tree coords in -+ allowing to bypass tree traversal. But normal usage of coords implies that -+ node pointed to by coord is locked, whereas seals don't keep a lock (or -+ even a reference) to znode. In stead, each znode contains a version number, -+ increased on each znode modification. This version number is copied into a -+ seal when seal is created. 
Later, one can "validate" seal by calling -+ reiser4_seal_validate(). If znode is in cache and its version number is -+ still the same, seal is "pristine" and coord associated with it can be -+ re-used immediately. -+ -+ If, on the other hand, znode is out of cache, or it is obviously different -+ one from the znode seal was initially attached to (for example, it is on -+ the different level, or is being removed from the tree), seal is -+ irreparably invalid ("burned") and tree traversal has to be repeated. -+ -+ Otherwise, there is some hope, that while znode was modified (and seal was -+ "broken" as a result), key attached to the seal is still in the node. This -+ is checked by first comparing this key with delimiting keys of node and, if -+ key is ok, doing intra-node lookup. -+ -+ Znode version is maintained in the following way: -+ -+ there is reiser4_tree.znode_epoch counter. Whenever new znode is created, -+ znode_epoch is incremented and its new value is stored in ->version field -+ of new znode. Whenever znode is dirtied (which means it was probably -+ modified), znode_epoch is also incremented and its new value is stored in -+ znode->version. This is done so, because just incrementing znode->version -+ on each update is not enough: it may so happen, that znode get deleted, new -+ znode is allocated for the same disk block and gets the same version -+ counter, tricking seal code into false positive. -+*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "key.h" -+#include "coord.h" -+#include "seal.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "jnode.h" -+#include "znode.h" -+#include "super.h" -+ -+static znode *seal_node(const seal_t * seal); -+static int seal_matches(const seal_t * seal, znode * node); -+ -+/* initialise seal. This can be called several times on the same seal. @coord -+ and @key can be NULL. 
*/ -+void reiser4_seal_init(seal_t * seal /* seal to initialise */ , -+ const coord_t * coord /* coord @seal will be -+ * attached to */ , -+ const reiser4_key * key UNUSED_ARG /* key @seal will be -+ * attached to */ ) -+{ -+ assert("nikita-1886", seal != NULL); -+ memset(seal, 0, sizeof *seal); -+ if (coord != NULL) { -+ znode *node; -+ -+ node = coord->node; -+ assert("nikita-1987", node != NULL); -+ spin_lock_znode(node); -+ seal->version = node->version; -+ assert("nikita-1988", seal->version != 0); -+ seal->block = *znode_get_block(node); -+#if REISER4_DEBUG -+ seal->coord1 = *coord; -+ if (key != NULL) -+ seal->key = *key; -+#endif -+ spin_unlock_znode(node); -+ } -+} -+ -+/* finish with seal */ -+void reiser4_seal_done(seal_t * seal /* seal to clear */ ) -+{ -+ assert("nikita-1887", seal != NULL); -+ seal->version = 0; -+} -+ -+/* true if seal was initialised */ -+int reiser4_seal_is_set(const seal_t * seal /* seal to query */ ) -+{ -+ assert("nikita-1890", seal != NULL); -+ return seal->version != 0; -+} -+ -+#if REISER4_DEBUG -+/* helper function for reiser4_seal_validate(). It checks that item at @coord -+ * has expected key. This is to detect cases where node was modified but wasn't -+ * marked dirty. */ -+static inline int check_seal_match(const coord_t * coord /* coord to check */ , -+ const reiser4_key * k /* expected key */ ) -+{ -+ reiser4_key ukey; -+ -+ return (coord->between != AT_UNIT) || -+ /* FIXME-VS: we only can compare keys for items whose units -+ represent exactly one key */ -+ ((coord_is_existing_unit(coord)) -+ && (item_is_extent(coord) -+ || keyeq(k, unit_key_by_coord(coord, &ukey)))) -+ || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord)) -+ && keyge(k, unit_key_by_coord(coord, &ukey))); -+} -+#endif -+ -+/* this is used by reiser4_seal_validate. It accepts return value of -+ * longterm_lock_znode and returns 1 if it can be interpreted as seal -+ * validation failure. 
For instance, when longterm_lock_znode returns -EINVAL, -+ * reiser4_seal_validate returns -E_REPEAT and caller will call tre search. -+ * We cannot do this in longterm_lock_znode(), because sometimes we want to -+ * distinguish between -EINVAL and -E_REPEAT. */ -+static int should_repeat(int return_code) -+{ -+ return return_code == -EINVAL; -+} -+ -+/* (re-)validate seal. -+ -+ Checks whether seal is pristine, and try to revalidate it if possible. -+ -+ If seal was burned, or broken irreparably, return -E_REPEAT. -+ -+ NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if key we are -+ looking for is in range of keys covered by the sealed node, but item wasn't -+ found by node ->lookup() method. Alternative is to return -ENOENT in this -+ case, but this would complicate callers logic. -+ -+*/ -+int reiser4_seal_validate(seal_t * seal /* seal to validate */, -+ coord_t * coord /* coord to validate against */, -+ const reiser4_key * key /* key to validate against */, -+ lock_handle * lh /* resulting lock handle */, -+ znode_lock_mode mode /* lock node */, -+ znode_lock_request request /* locking priority */) -+{ -+ znode *node; -+ int result; -+ -+ assert("nikita-1889", seal != NULL); -+ assert("nikita-1881", reiser4_seal_is_set(seal)); -+ assert("nikita-1882", key != NULL); -+ assert("nikita-1883", coord != NULL); -+ assert("nikita-1884", lh != NULL); -+ assert("nikita-1885", keyeq(&seal->key, key)); -+ assert("nikita-1989", coords_equal(&seal->coord1, coord)); -+ -+ /* obtain znode by block number */ -+ node = seal_node(seal); -+ if (node != NULL) { -+ /* znode was in cache, lock it */ -+ result = longterm_lock_znode(lh, node, mode, request); -+ zput(node); -+ if (result == 0) { -+ if (seal_matches(seal, node)) { -+ /* if seal version and znode version -+ coincide */ -+ ON_DEBUG(coord_update_v(coord)); -+ assert("nikita-1990", -+ node == seal->coord1.node); -+ assert("nikita-1898", -+ WITH_DATA_RET(coord->node, 1, -+ check_seal_match(coord, -+ 
key))); -+ } else -+ result = RETERR(-E_REPEAT); -+ } -+ if (result != 0) { -+ if (should_repeat(result)) -+ result = RETERR(-E_REPEAT); -+ /* unlock node on failure */ -+ done_lh(lh); -+ } -+ } else { -+ /* znode wasn't in cache */ -+ result = RETERR(-E_REPEAT); -+ } -+ return result; -+} -+ -+/* helpers functions */ -+ -+/* obtain reference to znode seal points to, if in cache */ -+static znode *seal_node(const seal_t * seal /* seal to query */ ) -+{ -+ assert("nikita-1891", seal != NULL); -+ return zlook(current_tree, &seal->block); -+} -+ -+/* true if @seal version and @node version coincide */ -+static int seal_matches(const seal_t * seal /* seal to check */ , -+ znode * node /* node to check */ ) -+{ -+ int result; -+ -+ assert("nikita-1991", seal != NULL); -+ assert("nikita-1993", node != NULL); -+ -+ spin_lock_znode(node); -+ result = (seal->version == node->version); -+ spin_unlock_znode(node); -+ return result; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/seal.h linux-2.6.24/fs/reiser4/seal.h ---- linux-2.6.24.orig/fs/reiser4/seal.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/seal.h 2008-01-25 11:39:07.080244783 +0300 -@@ -0,0 +1,49 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */ -+ -+#ifndef __SEAL_H__ -+#define __SEAL_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+ -+/* for __u?? types */ -+/*#include */ -+ -+/* seal. 
See comment at the top of seal.c */ -+typedef struct seal_s { -+ /* version of znode recorder at the time of seal creation */ -+ __u64 version; -+ /* block number of znode attached to this seal */ -+ reiser4_block_nr block; -+#if REISER4_DEBUG -+ /* coord this seal is attached to. For debugging. */ -+ coord_t coord1; -+ /* key this seal is attached to. For debugging. */ -+ reiser4_key key; -+#endif -+} seal_t; -+ -+extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *); -+extern void reiser4_seal_done(seal_t *); -+extern int reiser4_seal_is_set(const seal_t *); -+extern int reiser4_seal_validate(seal_t *, coord_t *, -+ const reiser4_key *, lock_handle *, -+ znode_lock_mode mode, znode_lock_request request); -+ -+/* __SEAL_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/search.c linux-2.6.24/fs/reiser4/search.c ---- linux-2.6.24.orig/fs/reiser4/search.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/search.c 2008-01-25 11:39:07.084245813 +0300 -@@ -0,0 +1,1611 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "seal.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "plugin/plugin.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "tree.h" -+#include "reiser4.h" -+#include "super.h" -+#include "inode.h" -+ -+#include -+ -+static const char *bias_name(lookup_bias bias); -+ -+/* tree searching algorithm, intranode searching algorithms are in -+ plugin/node/ */ -+ -+/* tree lookup cache -+ * -+ * The coord by key cache consists of small list of recently accessed nodes -+ * maintained according to the LRU 
discipline. Before doing real top-to-down -+ * tree traversal this cache is scanned for nodes that can contain key -+ * requested. -+ * -+ * The efficiency of coord cache depends heavily on locality of reference for -+ * tree accesses. Our user level simulations show reasonably good hit ratios -+ * for coord cache under most loads so far. -+ */ -+ -+/* Initialise coord cache slot */ -+static void cbk_cache_init_slot(cbk_cache_slot *slot) -+{ -+ assert("nikita-345", slot != NULL); -+ -+ INIT_LIST_HEAD(&slot->lru); -+ slot->node = NULL; -+} -+ -+/* Initialize coord cache */ -+int cbk_cache_init(cbk_cache *cache /* cache to init */ ) -+{ -+ int i; -+ -+ assert("nikita-346", cache != NULL); -+ -+ cache->slot = -+ kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots, -+ reiser4_ctx_gfp_mask_get()); -+ if (cache->slot == NULL) -+ return RETERR(-ENOMEM); -+ -+ INIT_LIST_HEAD(&cache->lru); -+ for (i = 0; i < cache->nr_slots; ++i) { -+ cbk_cache_init_slot(cache->slot + i); -+ list_add_tail(&((cache->slot + i)->lru), &cache->lru); -+ } -+ rwlock_init(&cache->guard); -+ return 0; -+} -+ -+/* free cbk cache data */ -+void cbk_cache_done(cbk_cache * cache /* cache to release */ ) -+{ -+ assert("nikita-2493", cache != NULL); -+ if (cache->slot != NULL) { -+ kfree(cache->slot); -+ cache->slot = NULL; -+ } -+} -+ -+/* macro to iterate over all cbk cache slots */ -+#define for_all_slots(cache, slot) \ -+ for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \ -+ &(cache)->lru != &(slot)->lru; \ -+ (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru)) -+ -+#if REISER4_DEBUG -+/* this function assures that [cbk-cache-invariant] invariant holds */ -+static int cbk_cache_invariant(const cbk_cache *cache) -+{ -+ cbk_cache_slot *slot; -+ int result; -+ int unused; -+ -+ if (cache->nr_slots == 0) -+ return 1; -+ -+ assert("nikita-2469", cache != NULL); -+ unused = 0; -+ result = 1; -+ read_lock(&((cbk_cache *)cache)->guard); -+ for_all_slots(cache, slot) { -+ /* in LRU first 
go all `used' slots followed by `unused' */ -+ if (unused && (slot->node != NULL)) -+ result = 0; -+ if (slot->node == NULL) -+ unused = 1; -+ else { -+ cbk_cache_slot *scan; -+ -+ /* all cached nodes are different */ -+ scan = slot; -+ while (result) { -+ scan = list_entry(scan->lru.next, cbk_cache_slot, lru); -+ if (&cache->lru == &scan->lru) -+ break; -+ if (slot->node == scan->node) -+ result = 0; -+ } -+ } -+ if (!result) -+ break; -+ } -+ read_unlock(&((cbk_cache *)cache)->guard); -+ return result; -+} -+ -+#endif -+ -+/* Remove references, if any, to @node from coord cache */ -+void cbk_cache_invalidate(const znode * node /* node to remove from cache */ , -+ reiser4_tree * tree /* tree to remove node from */ ) -+{ -+ cbk_cache_slot *slot; -+ cbk_cache *cache; -+ int i; -+ -+ assert("nikita-350", node != NULL); -+ assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree)); -+ -+ cache = &tree->cbk_cache; -+ assert("nikita-2470", cbk_cache_invariant(cache)); -+ -+ write_lock(&(cache->guard)); -+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) { -+ if (slot->node == node) { -+ list_move_tail(&slot->lru, &cache->lru); -+ slot->node = NULL; -+ break; -+ } -+ } -+ write_unlock(&(cache->guard)); -+ assert("nikita-2471", cbk_cache_invariant(cache)); -+} -+ -+/* add to the cbk-cache in the "tree" information about "node". This -+ can actually be update of existing slot in a cache. 
*/ -+static void cbk_cache_add(const znode *node /* node to add to the cache */ ) -+{ -+ cbk_cache *cache; -+ cbk_cache_slot *slot; -+ int i; -+ -+ assert("nikita-352", node != NULL); -+ -+ cache = &znode_get_tree(node)->cbk_cache; -+ assert("nikita-2472", cbk_cache_invariant(cache)); -+ -+ if (cache->nr_slots == 0) -+ return; -+ -+ write_lock(&(cache->guard)); -+ /* find slot to update/add */ -+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) { -+ /* oops, this node is already in a cache */ -+ if (slot->node == node) -+ break; -+ } -+ /* if all slots are used, reuse least recently used one */ -+ if (i == cache->nr_slots) { -+ slot = list_entry(cache->lru.prev, cbk_cache_slot, lru); -+ slot->node = (znode *) node; -+ } -+ list_move(&slot->lru, &cache->lru); -+ write_unlock(&(cache->guard)); -+ assert("nikita-2473", cbk_cache_invariant(cache)); -+} -+ -+static int setup_delimiting_keys(cbk_handle * h); -+static lookup_result coord_by_handle(cbk_handle * handle); -+static lookup_result traverse_tree(cbk_handle * h); -+static int cbk_cache_search(cbk_handle * h); -+ -+static level_lookup_result cbk_level_lookup(cbk_handle * h); -+static level_lookup_result cbk_node_lookup(cbk_handle * h); -+ -+/* helper functions */ -+ -+static void update_stale_dk(reiser4_tree * tree, znode * node); -+ -+/* release parent node during traversal */ -+static void put_parent(cbk_handle * h); -+/* check consistency of fields */ -+static int sanity_check(cbk_handle * h); -+/* release resources in handle */ -+static void hput(cbk_handle * h); -+ -+static level_lookup_result search_to_left(cbk_handle * h); -+ -+/* pack numerous (numberous I should say) arguments of coord_by_key() into -+ * cbk_handle */ -+static cbk_handle *cbk_pack(cbk_handle * handle, -+ reiser4_tree * tree, -+ const reiser4_key * key, -+ coord_t * coord, -+ lock_handle * active_lh, -+ lock_handle * parent_lh, -+ znode_lock_mode lock_mode, -+ lookup_bias bias, -+ tree_level lock_level, -+ tree_level 
stop_level, -+ __u32 flags, ra_info_t * info) -+{ -+ memset(handle, 0, sizeof *handle); -+ -+ handle->tree = tree; -+ handle->key = key; -+ handle->lock_mode = lock_mode; -+ handle->bias = bias; -+ handle->lock_level = lock_level; -+ handle->stop_level = stop_level; -+ handle->coord = coord; -+ /* set flags. See comment in tree.h:cbk_flags */ -+ handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK; -+ -+ handle->active_lh = active_lh; -+ handle->parent_lh = parent_lh; -+ handle->ra_info = info; -+ return handle; -+} -+ -+/* main tree lookup procedure -+ -+ Check coord cache. If key we are looking for is not found there, call cbk() -+ to do real tree traversal. -+ -+ As we have extents on the twig level, @lock_level and @stop_level can -+ be different from LEAF_LEVEL and each other. -+ -+ Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode -+ long term locks) while calling this. -+*/ -+lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search -+ * in. Usually this tree is -+ * part of file-system -+ * super-block */ , -+ const reiser4_key * key /* key to look for */ , -+ coord_t * coord /* where to store found -+ * position in a tree. Fields -+ * in "coord" are only valid if -+ * coord_by_key() returned -+ * "CBK_COORD_FOUND" */ , -+ lock_handle * lh, /* resulting lock handle */ -+ znode_lock_mode lock_mode /* type of lookup we -+ * want on node. Pass -+ * ZNODE_READ_LOCK here -+ * if you only want to -+ * read item found and -+ * ZNODE_WRITE_LOCK if -+ * you want to modify -+ * it */ , -+ lookup_bias bias /* what to return if coord -+ * with exactly the @key is -+ * not in the tree */ , -+ tree_level lock_level /* tree level where to start -+ * taking @lock type of -+ * locks */ , -+ tree_level stop_level /* tree level to stop. 
Pass -+ * LEAF_LEVEL or TWIG_LEVEL -+ * here Item being looked -+ * for has to be between -+ * @lock_level and -+ * @stop_level, inclusive */ , -+ __u32 flags /* search flags */ , -+ ra_info_t * -+ info -+ /* information about desired tree traversal readahead */ -+ ) -+{ -+ cbk_handle handle; -+ lock_handle parent_lh; -+ lookup_result result; -+ -+ init_lh(lh); -+ init_lh(&parent_lh); -+ -+ assert("nikita-3023", reiser4_schedulable()); -+ -+ assert("nikita-353", tree != NULL); -+ assert("nikita-354", key != NULL); -+ assert("nikita-355", coord != NULL); -+ assert("nikita-356", (bias == FIND_EXACT) -+ || (bias == FIND_MAX_NOT_MORE_THAN)); -+ assert("nikita-357", stop_level >= LEAF_LEVEL); -+ /* no locks can be held during tree traversal */ -+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack())); -+ -+ cbk_pack(&handle, -+ tree, -+ key, -+ coord, -+ lh, -+ &parent_lh, -+ lock_mode, bias, lock_level, stop_level, flags, info); -+ -+ result = coord_by_handle(&handle); -+ assert("nikita-3247", -+ ergo(!IS_CBKERR(result), coord->node == lh->node)); -+ return result; -+} -+ -+/* like coord_by_key(), but starts traversal from vroot of @object rather than -+ * from tree root. 
*/ -+lookup_result reiser4_object_lookup(struct inode * object, -+ const reiser4_key * key, -+ coord_t * coord, -+ lock_handle * lh, -+ znode_lock_mode lock_mode, -+ lookup_bias bias, -+ tree_level lock_level, -+ tree_level stop_level, __u32 flags, -+ ra_info_t * info) -+{ -+ cbk_handle handle; -+ lock_handle parent_lh; -+ lookup_result result; -+ -+ init_lh(lh); -+ init_lh(&parent_lh); -+ -+ assert("nikita-3023", reiser4_schedulable()); -+ -+ assert("nikita-354", key != NULL); -+ assert("nikita-355", coord != NULL); -+ assert("nikita-356", (bias == FIND_EXACT) -+ || (bias == FIND_MAX_NOT_MORE_THAN)); -+ assert("nikita-357", stop_level >= LEAF_LEVEL); -+ /* no locks can be held during tree search by key */ -+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack())); -+ -+ cbk_pack(&handle, -+ object != NULL ? reiser4_tree_by_inode(object) : current_tree, -+ key, -+ coord, -+ lh, -+ &parent_lh, -+ lock_mode, bias, lock_level, stop_level, flags, info); -+ handle.object = object; -+ -+ result = coord_by_handle(&handle); -+ assert("nikita-3247", -+ ergo(!IS_CBKERR(result), coord->node == lh->node)); -+ return result; -+} -+ -+/* lookup by cbk_handle. Common part of coord_by_key() and -+ reiser4_object_lookup(). */ -+static lookup_result coord_by_handle(cbk_handle * handle) -+{ -+ /* -+ * first check cbk_cache (which is look-aside cache for our tree) and -+ * of this fails, start traversal. -+ */ -+ /* first check whether "key" is in cache of recent lookups. */ -+ if (cbk_cache_search(handle) == 0) -+ return handle->result; -+ else -+ return traverse_tree(handle); -+} -+ -+/* Execute actor for each item (or unit, depending on @through_units_p), -+ starting from @coord, right-ward, until either: -+ -+ - end of the tree is reached -+ - unformatted node is met -+ - error occurred -+ - @actor returns 0 or less -+ -+ Error code, or last actor return value is returned. 
-+ -+ This is used by plugin/dir/hashe_dir.c:reiser4_find_entry() to move through -+ sequence of entries with identical keys and alikes. -+*/ -+int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ , -+ coord_t * coord /* coord to start from */ , -+ lock_handle * lh /* lock handle to start with and to -+ * update along the way */ , -+ tree_iterate_actor_t actor /* function to call on each -+ * item/unit */ , -+ void *arg /* argument to pass to @actor */ , -+ znode_lock_mode mode /* lock mode on scanned nodes */ , -+ int through_units_p /* call @actor on each item or on -+ * each unit */ ) -+{ -+ int result; -+ -+ assert("nikita-1143", tree != NULL); -+ assert("nikita-1145", coord != NULL); -+ assert("nikita-1146", lh != NULL); -+ assert("nikita-1147", actor != NULL); -+ -+ result = zload(coord->node); -+ coord_clear_iplug(coord); -+ if (result != 0) -+ return result; -+ if (!coord_is_existing_unit(coord)) { -+ zrelse(coord->node); -+ return -ENOENT; -+ } -+ while ((result = actor(tree, coord, lh, arg)) > 0) { -+ /* move further */ -+ if ((through_units_p && coord_next_unit(coord)) || -+ (!through_units_p && coord_next_item(coord))) { -+ do { -+ lock_handle couple; -+ -+ /* move to the next node */ -+ init_lh(&couple); -+ result = -+ reiser4_get_right_neighbor(&couple, -+ coord->node, -+ (int)mode, -+ GN_CAN_USE_UPPER_LEVELS); -+ zrelse(coord->node); -+ if (result == 0) { -+ -+ result = zload(couple.node); -+ if (result != 0) { -+ done_lh(&couple); -+ return result; -+ } -+ -+ coord_init_first_unit(coord, -+ couple.node); -+ done_lh(lh); -+ move_lh(lh, &couple); -+ } else -+ return result; -+ } while (node_is_empty(coord->node)); -+ } -+ -+ assert("nikita-1149", coord_is_existing_unit(coord)); -+ } -+ zrelse(coord->node); -+ return result; -+} -+ -+/* return locked uber znode for @tree */ -+int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode, -+ znode_lock_request pri, lock_handle * lh) -+{ -+ int result; -+ -+ result = longterm_lock_znode(lh, 
tree->uber, mode, pri); -+ return result; -+} -+ -+/* true if @key is strictly within @node -+ -+ we are looking for possibly non-unique key and it is item is at the edge of -+ @node. May be it is in the neighbor. -+*/ -+static int znode_contains_key_strict(znode * node /* node to check key -+ * against */ , -+ const reiser4_key * -+ key /* key to check */ , -+ int isunique) -+{ -+ int answer; -+ -+ assert("nikita-1760", node != NULL); -+ assert("nikita-1722", key != NULL); -+ -+ if (keyge(key, &node->rd_key)) -+ return 0; -+ -+ answer = keycmp(&node->ld_key, key); -+ -+ if (isunique) -+ return answer != GREATER_THAN; -+ else -+ return answer == LESS_THAN; -+} -+ -+/* -+ * Virtual Root (vroot) code. -+ * -+ * For given file system object (e.g., regular file or directory) let's -+ * define its "virtual root" as lowest in the tree (that is, furtherest -+ * from the tree root) node such that all body items of said object are -+ * located in a tree rooted at this node. -+ * -+ * Once vroot of object is found all tree lookups for items within body of -+ * this object ("object lookups") can be started from its vroot rather -+ * than from real root. This has following advantages: -+ * -+ * 1. amount of nodes traversed during lookup (and, hence, amount of -+ * key comparisons made) decreases, and -+ * -+ * 2. contention on tree root is decreased. This latter was actually -+ * motivating reason behind vroot, because spin lock of root node, -+ * which is taken when acquiring long-term lock on root node is the -+ * hottest lock in the reiser4. -+ * -+ * How to find vroot. -+ * -+ * When vroot of object F is not yet determined, all object lookups start -+ * from the root of the tree. At each tree level during traversal we have -+ * a node N such that a key we are looking for (which is the key inside -+ * object's body) is located within N. In function handle_vroot() called -+ * from cbk_level_lookup() we check whether N is possible vroot for -+ * F. 
Check is trivial---if neither leftmost nor rightmost item of N -+ * belongs to F (and we already have helpful ->owns_item() method of -+ * object plugin for this), then N is possible vroot of F. This, of -+ * course, relies on the assumption that each object occupies contiguous -+ * range of keys in the tree. -+ * -+ * Thus, traversing tree downward and checking each node as we go, we can -+ * find lowest such node, which, by definition, is vroot. -+ * -+ * How to track vroot. -+ * -+ * Nohow. If actual vroot changes, next object lookup will just restart -+ * from the actual tree root, refreshing object's vroot along the way. -+ * -+ */ -+ -+/* -+ * Check whether @node is possible vroot of @object. -+ */ -+static void handle_vroot(struct inode *object, znode * node) -+{ -+ file_plugin *fplug; -+ coord_t coord; -+ -+ fplug = inode_file_plugin(object); -+ assert("nikita-3353", fplug != NULL); -+ assert("nikita-3354", fplug->owns_item != NULL); -+ -+ if (unlikely(node_is_empty(node))) -+ return; -+ -+ coord_init_first_unit(&coord, node); -+ /* -+ * if leftmost item of @node belongs to @object, we cannot be sure -+ * that @node is vroot of @object, because, some items of @object are -+ * probably in the sub-tree rooted at the left neighbor of @node. -+ */ -+ if (fplug->owns_item(object, &coord)) -+ return; -+ coord_init_last_unit(&coord, node); -+ /* mutatis mutandis for the rightmost item */ -+ if (fplug->owns_item(object, &coord)) -+ return; -+ /* otherwise, @node is possible vroot of @object */ -+ inode_set_vroot(object, node); -+} -+ -+/* -+ * helper function used by traverse tree to start tree traversal not from the -+ * tree root, but from @h->object's vroot, if possible. -+ */ -+static int prepare_object_lookup(cbk_handle * h) -+{ -+ znode *vroot; -+ int result; -+ -+ vroot = inode_get_vroot(h->object); -+ if (vroot == NULL) { -+ /* -+ * object doesn't have known vroot, start from real tree root. 
-+ */ -+ return LOOKUP_CONT; -+ } -+ -+ h->level = znode_get_level(vroot); -+ /* take a long-term lock on vroot */ -+ h->result = longterm_lock_znode(h->active_lh, vroot, -+ cbk_lock_mode(h->level, h), -+ ZNODE_LOCK_LOPRI); -+ result = LOOKUP_REST; -+ if (h->result == 0) { -+ int isunique; -+ int inside; -+ -+ isunique = h->flags & CBK_UNIQUE; -+ /* check that key is inside vroot */ -+ read_lock_dk(h->tree); -+ inside = (znode_contains_key_strict(vroot, h->key, isunique) && -+ !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE)); -+ read_unlock_dk(h->tree); -+ if (inside) { -+ h->result = zload(vroot); -+ if (h->result == 0) { -+ /* search for key in vroot. */ -+ result = cbk_node_lookup(h); -+ zrelse(vroot); /*h->active_lh->node); */ -+ if (h->active_lh->node != vroot) { -+ result = LOOKUP_REST; -+ } else if (result == LOOKUP_CONT) { -+ move_lh(h->parent_lh, h->active_lh); -+ h->flags &= ~CBK_DKSET; -+ } -+ } -+ } -+ } -+ -+ zput(vroot); -+ -+ if (IS_CBKERR(h->result) || result == LOOKUP_REST) -+ hput(h); -+ return result; -+} -+ -+/* main function that handles common parts of tree traversal: starting -+ (fake znode handling), restarts, error handling, completion */ -+static lookup_result traverse_tree(cbk_handle * h /* search handle */ ) -+{ -+ int done; -+ int iterations; -+ int vroot_used; -+ -+ assert("nikita-365", h != NULL); -+ assert("nikita-366", h->tree != NULL); -+ assert("nikita-367", h->key != NULL); -+ assert("nikita-368", h->coord != NULL); -+ assert("nikita-369", (h->bias == FIND_EXACT) -+ || (h->bias == FIND_MAX_NOT_MORE_THAN)); -+ assert("nikita-370", h->stop_level >= LEAF_LEVEL); -+ assert("nikita-2949", !(h->flags & CBK_DKSET)); -+ assert("zam-355", lock_stack_isclean(get_current_lock_stack())); -+ -+ done = 0; -+ iterations = 0; -+ vroot_used = 0; -+ -+ /* loop for restarts */ -+ restart: -+ -+ assert("nikita-3024", reiser4_schedulable()); -+ -+ h->result = CBK_COORD_FOUND; -+ /* connect_znode() needs it */ -+ h->ld_key = *reiser4_min_key(); -+ h->rd_key = 
*reiser4_max_key(); -+ h->flags |= CBK_DKSET; -+ h->error = NULL; -+ -+ if (!vroot_used && h->object != NULL) { -+ vroot_used = 1; -+ done = prepare_object_lookup(h); -+ if (done == LOOKUP_REST) { -+ goto restart; -+ } else if (done == LOOKUP_DONE) -+ return h->result; -+ } -+ if (h->parent_lh->node == NULL) { -+ done = -+ get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI, -+ h->parent_lh); -+ -+ assert("nikita-1637", done != -E_DEADLOCK); -+ -+ h->block = h->tree->root_block; -+ h->level = h->tree->height; -+ h->coord->node = h->parent_lh->node; -+ -+ if (done != 0) -+ return done; -+ } -+ -+ /* loop descending a tree */ -+ while (!done) { -+ -+ if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) && -+ IS_POW(iterations))) { -+ warning("nikita-1481", "Too many iterations: %i", -+ iterations); -+ reiser4_print_key("key", h->key); -+ ++iterations; -+ } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) { -+ h->error = -+ "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring."; -+ h->result = RETERR(-EIO); -+ break; -+ } -+ switch (cbk_level_lookup(h)) { -+ case LOOKUP_CONT: -+ move_lh(h->parent_lh, h->active_lh); -+ continue; -+ default: -+ wrong_return_value("nikita-372", "cbk_level"); -+ case LOOKUP_DONE: -+ done = 1; -+ break; -+ case LOOKUP_REST: -+ hput(h); -+ /* deadlock avoidance is normal case. */ -+ if (h->result != -E_DEADLOCK) -+ ++iterations; -+ reiser4_preempt_point(); -+ goto restart; -+ } -+ } -+ /* that's all. 
The rest is error handling */ -+ if (unlikely(h->error != NULL)) { -+ warning("nikita-373", "%s: level: %i, " -+ "lock_level: %i, stop_level: %i " -+ "lock_mode: %s, bias: %s", -+ h->error, h->level, h->lock_level, h->stop_level, -+ lock_mode_name(h->lock_mode), bias_name(h->bias)); -+ reiser4_print_address("block", &h->block); -+ reiser4_print_key("key", h->key); -+ print_coord_content("coord", h->coord); -+ } -+ /* `unlikely' error case */ -+ if (unlikely(IS_CBKERR(h->result))) { -+ /* failure. do cleanup */ -+ hput(h); -+ } else { -+ assert("nikita-1605", WITH_DATA_RET -+ (h->coord->node, 1, -+ ergo((h->result == CBK_COORD_FOUND) && -+ (h->bias == FIND_EXACT) && -+ (!node_is_empty(h->coord->node)), -+ coord_is_existing_item(h->coord)))); -+ } -+ return h->result; -+} -+ -+/* find delimiting keys of child -+ -+ Determine left and right delimiting keys for child pointed to by -+ @parent_coord. -+ -+*/ -+static void find_child_delimiting_keys(znode * parent /* parent znode, passed -+ * locked */ , -+ const coord_t * parent_coord /* coord where -+ * pointer to -+ * child is -+ * stored */ , -+ reiser4_key * ld /* where to store left -+ * delimiting key */ , -+ reiser4_key * rd /* where to store right -+ * delimiting key */ ) -+{ -+ coord_t neighbor; -+ -+ assert("nikita-1484", parent != NULL); -+ assert_rw_locked(&(znode_get_tree(parent)->dk_lock)); -+ -+ coord_dup(&neighbor, parent_coord); -+ -+ if (neighbor.between == AT_UNIT) -+ /* imitate item ->lookup() behavior. 
*/ -+ neighbor.between = AFTER_UNIT; -+ -+ if (coord_set_to_left(&neighbor) == 0) -+ unit_key_by_coord(&neighbor, ld); -+ else { -+ assert("nikita-14851", 0); -+ *ld = *znode_get_ld_key(parent); -+ } -+ -+ coord_dup(&neighbor, parent_coord); -+ if (neighbor.between == AT_UNIT) -+ neighbor.between = AFTER_UNIT; -+ if (coord_set_to_right(&neighbor) == 0) -+ unit_key_by_coord(&neighbor, rd); -+ else -+ *rd = *znode_get_rd_key(parent); -+} -+ -+/* -+ * setup delimiting keys for a child -+ * -+ * @parent parent node -+ * -+ * @coord location in @parent where pointer to @child is -+ * -+ * @child child node -+ */ -+int -+set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child) -+{ -+ reiser4_tree *tree; -+ -+ assert("nikita-2952", -+ znode_get_level(parent) == znode_get_level(coord->node)); -+ -+ /* fast check without taking dk lock. This is safe, because -+ * JNODE_DKSET is never cleared once set. */ -+ if (!ZF_ISSET(child, JNODE_DKSET)) { -+ tree = znode_get_tree(parent); -+ write_lock_dk(tree); -+ if (likely(!ZF_ISSET(child, JNODE_DKSET))) { -+ find_child_delimiting_keys(parent, coord, -+ &child->ld_key, -+ &child->rd_key); -+ ON_DEBUG(child->ld_key_version = -+ atomic_inc_return(&delim_key_version); -+ child->rd_key_version = -+ atomic_inc_return(&delim_key_version);); -+ ZF_SET(child, JNODE_DKSET); -+ } -+ write_unlock_dk(tree); -+ return 1; -+ } -+ return 0; -+} -+ -+/* Perform tree lookup at one level. This is called from cbk_traverse() -+ function that drives lookup through tree and calls cbk_node_lookup() to -+ perform lookup within one node. -+ -+ See comments in a code. 
-+*/ -+static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ ) -+{ -+ int ret; -+ int setdk; -+ int ldkeyset = 0; -+ reiser4_key ldkey; -+ reiser4_key key; -+ znode *active; -+ -+ assert("nikita-3025", reiser4_schedulable()); -+ -+ /* acquire reference to @active node */ -+ active = -+ zget(h->tree, &h->block, h->parent_lh->node, h->level, -+ reiser4_ctx_gfp_mask_get()); -+ -+ if (IS_ERR(active)) { -+ h->result = PTR_ERR(active); -+ return LOOKUP_DONE; -+ } -+ -+ /* lock @active */ -+ h->result = longterm_lock_znode(h->active_lh, -+ active, -+ cbk_lock_mode(h->level, h), -+ ZNODE_LOCK_LOPRI); -+ /* longterm_lock_znode() acquires additional reference to znode (which -+ will be later released by longterm_unlock_znode()). Release -+ reference acquired by zget(). -+ */ -+ zput(active); -+ if (unlikely(h->result != 0)) -+ goto fail_or_restart; -+ -+ setdk = 0; -+ /* if @active is accessed for the first time, setup delimiting keys on -+ it. Delimiting keys are taken from the parent node. See -+ setup_delimiting_keys() for details. -+ */ -+ if (h->flags & CBK_DKSET) { -+ setdk = setup_delimiting_keys(h); -+ h->flags &= ~CBK_DKSET; -+ } else { -+ znode *parent; -+ -+ parent = h->parent_lh->node; -+ h->result = zload(parent); -+ if (unlikely(h->result != 0)) -+ goto fail_or_restart; -+ -+ if (!ZF_ISSET(active, JNODE_DKSET)) -+ setdk = set_child_delimiting_keys(parent, -+ h->coord, active); -+ else { -+ read_lock_dk(h->tree); -+ find_child_delimiting_keys(parent, h->coord, &ldkey, -+ &key); -+ read_unlock_dk(h->tree); -+ ldkeyset = 1; -+ } -+ zrelse(parent); -+ } -+ -+ /* this is ugly kludge. Reminder: this is necessary, because -+ ->lookup() method returns coord with ->between field probably set -+ to something different from AT_UNIT. 
-+ */ -+ h->coord->between = AT_UNIT; -+ -+ if (znode_just_created(active) && (h->coord->node != NULL)) { -+ write_lock_tree(h->tree); -+ /* if we are going to load znode right now, setup -+ ->in_parent: coord where pointer to this node is stored in -+ parent. -+ */ -+ coord_to_parent_coord(h->coord, &active->in_parent); -+ write_unlock_tree(h->tree); -+ } -+ -+ /* check connectedness without holding tree lock---false negatives -+ * will be re-checked by connect_znode(), and false positives are -+ * impossible---@active cannot suddenly turn into unconnected -+ * state. */ -+ if (!znode_is_connected(active)) { -+ h->result = connect_znode(h->coord, active); -+ if (unlikely(h->result != 0)) { -+ put_parent(h); -+ goto fail_or_restart; -+ } -+ } -+ -+ jload_prefetch(ZJNODE(active)); -+ -+ if (setdk) -+ update_stale_dk(h->tree, active); -+ -+ /* put_parent() cannot be called earlier, because connect_znode() -+ assumes parent node is referenced; */ -+ put_parent(h); -+ -+ if ((!znode_contains_key_lock(active, h->key) && -+ (h->flags & CBK_TRUST_DK)) -+ || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) { -+ /* 1. key was moved out of this node while this thread was -+ waiting for the lock. Restart. More elaborate solution is -+ to determine where key moved (to the left, or to the right) -+ and try to follow it through sibling pointers. -+ -+ 2. or, node itself is going to be removed from the -+ tree. Release lock and restart. -+ */ -+ h->result = -E_REPEAT; -+ } -+ if (h->result == -E_REPEAT) -+ return LOOKUP_REST; -+ -+ h->result = zload_ra(active, h->ra_info); -+ if (h->result) { -+ return LOOKUP_DONE; -+ } -+ -+ /* sanity checks */ -+ if (sanity_check(h)) { -+ zrelse(active); -+ return LOOKUP_DONE; -+ } -+ -+ /* check that key of leftmost item in the @active is the same as in -+ * its parent */ -+ if (ldkeyset && !node_is_empty(active) && -+ !keyeq(leftmost_key_in_node(active, &key), &ldkey)) { -+ warning("vs-3533", "Keys are inconsistent. 
Fsck?"); -+ reiser4_print_key("inparent", &ldkey); -+ reiser4_print_key("inchild", &key); -+ h->result = RETERR(-EIO); -+ zrelse(active); -+ return LOOKUP_DONE; -+ } -+ -+ if (h->object != NULL) -+ handle_vroot(h->object, active); -+ -+ ret = cbk_node_lookup(h); -+ -+ /* h->active_lh->node might change, but active is yet to be zrelsed */ -+ zrelse(active); -+ -+ return ret; -+ -+ fail_or_restart: -+ if (h->result == -E_DEADLOCK) -+ return LOOKUP_REST; -+ return LOOKUP_DONE; -+} -+ -+#if REISER4_DEBUG -+/* check left and right delimiting keys of a znode */ -+void check_dkeys(znode * node) -+{ -+ znode *left; -+ znode *right; -+ -+ read_lock_tree(current_tree); -+ read_lock_dk(current_tree); -+ -+ assert("vs-1710", znode_is_any_locked(node)); -+ assert("vs-1197", -+ !keygt(znode_get_ld_key(node), znode_get_rd_key(node))); -+ -+ left = node->left; -+ right = node->right; -+ -+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET) -+ && left != NULL && ZF_ISSET(left, JNODE_DKSET)) -+ /* check left neighbor. Note that left neighbor is not locked, -+ so it might get wrong delimiting keys therefore */ -+ assert("vs-1198", -+ (keyeq(znode_get_rd_key(left), znode_get_ld_key(node)) -+ || ZF_ISSET(left, JNODE_HEARD_BANSHEE))); -+ -+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET) -+ && right != NULL && ZF_ISSET(right, JNODE_DKSET)) -+ /* check right neighbor. 
Note that right neighbor is not -+ locked, so it might get wrong delimiting keys therefore */ -+ assert("vs-1199", -+ (keyeq(znode_get_rd_key(node), znode_get_ld_key(right)) -+ || ZF_ISSET(right, JNODE_HEARD_BANSHEE))); -+ -+ read_unlock_dk(current_tree); -+ read_unlock_tree(current_tree); -+} -+#endif -+ -+/* true if @key is left delimiting key of @node */ -+static int key_is_ld(znode * node, const reiser4_key * key) -+{ -+ int ld; -+ -+ assert("nikita-1716", node != NULL); -+ assert("nikita-1758", key != NULL); -+ -+ read_lock_dk(znode_get_tree(node)); -+ assert("nikita-1759", znode_contains_key(node, key)); -+ ld = keyeq(znode_get_ld_key(node), key); -+ read_unlock_dk(znode_get_tree(node)); -+ return ld; -+} -+ -+/* Process one node during tree traversal. -+ -+ This is called by cbk_level_lookup(). */ -+static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ ) -+{ -+ /* node plugin of @active */ -+ node_plugin *nplug; -+ /* item plugin of item that was found */ -+ item_plugin *iplug; -+ /* search bias */ -+ lookup_bias node_bias; -+ /* node we are operating upon */ -+ znode *active; -+ /* tree we are searching in */ -+ reiser4_tree *tree; -+ /* result */ -+ int result; -+ -+ assert("nikita-379", h != NULL); -+ -+ active = h->active_lh->node; -+ tree = h->tree; -+ -+ nplug = active->nplug; -+ assert("nikita-380", nplug != NULL); -+ -+ ON_DEBUG(check_dkeys(active)); -+ -+ /* return item from "active" node with maximal key not greater than -+ "key" */ -+ node_bias = h->bias; -+ result = nplug->lookup(active, h->key, node_bias, h->coord); -+ if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) { -+ /* error occurred */ -+ h->result = result; -+ return LOOKUP_DONE; -+ } -+ if (h->level == h->stop_level) { -+ /* welcome to the stop level */ -+ assert("nikita-381", h->coord->node == active); -+ if (result == NS_FOUND) { -+ /* success of tree lookup */ -+ if (!(h->flags & CBK_UNIQUE) -+ && key_is_ld(active, h->key)) { -+ return 
search_to_left(h); -+ } else -+ h->result = CBK_COORD_FOUND; -+ } else { -+ h->result = CBK_COORD_NOTFOUND; -+ } -+ if (!(h->flags & CBK_IN_CACHE)) -+ cbk_cache_add(active); -+ return LOOKUP_DONE; -+ } -+ -+ if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) { -+ h->error = "not found on internal node"; -+ h->result = result; -+ return LOOKUP_DONE; -+ } -+ -+ assert("vs-361", h->level > h->stop_level); -+ -+ if (handle_eottl(h, &result)) { -+ assert("vs-1674", (result == LOOKUP_DONE || -+ result == LOOKUP_REST)); -+ return result; -+ } -+ -+ /* go down to next level */ -+ check_me("vs-12", zload(h->coord->node) == 0); -+ assert("nikita-2116", item_is_internal(h->coord)); -+ iplug = item_plugin_by_coord(h->coord); -+ iplug->s.internal.down_link(h->coord, h->key, &h->block); -+ zrelse(h->coord->node); -+ --h->level; -+ return LOOKUP_CONT; /* continue */ -+} -+ -+/* scan cbk_cache slots looking for a match for @h */ -+static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ ) -+{ -+ level_lookup_result llr; -+ znode *node; -+ reiser4_tree *tree; -+ cbk_cache_slot *slot; -+ cbk_cache *cache; -+ tree_level level; -+ int isunique; -+ const reiser4_key *key; -+ int result; -+ -+ assert("nikita-1317", h != NULL); -+ assert("nikita-1315", h->tree != NULL); -+ assert("nikita-1316", h->key != NULL); -+ -+ tree = h->tree; -+ cache = &tree->cbk_cache; -+ if (cache->nr_slots == 0) -+ /* size of cbk cache was set to 0 by mount time option. */ -+ return RETERR(-ENOENT); -+ -+ assert("nikita-2474", cbk_cache_invariant(cache)); -+ node = NULL; /* to keep gcc happy */ -+ level = h->level; -+ key = h->key; -+ isunique = h->flags & CBK_UNIQUE; -+ result = RETERR(-ENOENT); -+ -+ /* -+ * this is time-critical function and dragons had, hence, been settled -+ * here. -+ * -+ * Loop below scans cbk cache slots trying to find matching node with -+ * suitable range of delimiting keys and located at the h->level. 
-+ * -+ * Scan is done under cbk cache spin lock that protects slot->node -+ * pointers. If suitable node is found we want to pin it in -+ * memory. But slot->node can point to the node with x_count 0 -+ * (unreferenced). Such node can be recycled at any moment, or can -+ * already be in the process of being recycled (within jput()). -+ * -+ * As we found node in the cbk cache, it means that jput() hasn't yet -+ * called cbk_cache_invalidate(). -+ * -+ * We acquire reference to the node without holding tree lock, and -+ * later, check node's RIP bit. This avoids races with jput(). -+ */ -+ -+ rcu_read_lock(); -+ read_lock(&((cbk_cache *)cache)->guard); -+ -+ slot = list_entry(cache->lru.next, cbk_cache_slot, lru); -+ slot = list_entry(slot->lru.prev, cbk_cache_slot, lru); -+ BUG_ON(&slot->lru != &cache->lru);/*????*/ -+ while (1) { -+ -+ slot = list_entry(slot->lru.next, cbk_cache_slot, lru); -+ -+ if (&cache->lru != &slot->lru) -+ node = slot->node; -+ else -+ node = NULL; -+ -+ if (unlikely(node == NULL)) -+ break; -+ -+ /* -+ * this is (hopefully) the only place in the code where we are -+ * working with delimiting keys without holding dk lock. This -+ * is fine here, because this is only "guess" anyway---keys -+ * are rechecked under dk lock below. 
-+ */ -+ if (znode_get_level(node) == level && -+ /* reiser4_min_key < key < reiser4_max_key */ -+ znode_contains_key_strict(node, key, isunique)) { -+ zref(node); -+ result = 0; -+ spin_lock_prefetch(&tree->tree_lock); -+ break; -+ } -+ } -+ read_unlock(&((cbk_cache *)cache)->guard); -+ -+ assert("nikita-2475", cbk_cache_invariant(cache)); -+ -+ if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP))) -+ result = -ENOENT; -+ -+ rcu_read_unlock(); -+ -+ if (result != 0) { -+ h->result = CBK_COORD_NOTFOUND; -+ return RETERR(-ENOENT); -+ } -+ -+ result = -+ longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h), -+ ZNODE_LOCK_LOPRI); -+ zput(node); -+ if (result != 0) -+ return result; -+ result = zload(node); -+ if (result != 0) -+ return result; -+ -+ /* recheck keys */ -+ read_lock_dk(tree); -+ result = (znode_contains_key_strict(node, key, isunique) && -+ !ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ read_unlock_dk(tree); -+ if (result) { -+ /* do lookup inside node */ -+ llr = cbk_node_lookup(h); -+ /* if cbk_node_lookup() wandered to another node (due to eottl -+ or non-unique keys), adjust @node */ -+ /*node = h->active_lh->node; */ -+ -+ if (llr != LOOKUP_DONE) { -+ /* restart or continue on the next level */ -+ result = RETERR(-ENOENT); -+ } else if (IS_CBKERR(h->result)) -+ /* io or oom */ -+ result = RETERR(-ENOENT); -+ else { -+ /* good. Either item found or definitely not found. */ -+ result = 0; -+ -+ write_lock(&(cache->guard)); -+ if (slot->node == h->active_lh->node /*node */ ) { -+ /* if this node is still in cbk cache---move -+ its slot to the head of the LRU list. */ -+ list_move(&slot->lru, &cache->lru); -+ } -+ write_unlock(&(cache->guard)); -+ } -+ } else { -+ /* race. While this thread was waiting for the lock, node was -+ rebalanced and item we are looking for, shifted out of it -+ (if it ever was here). 
-+ -+ Continuing scanning is almost hopeless: node key range was -+ moved to, is almost certainly at the beginning of the LRU -+ list at this time, because it's hot, but restarting -+ scanning from the very beginning is complex. Just return, -+ so that cbk() will be performed. This is not that -+ important, because such races should be rare. Are they? -+ */ -+ result = RETERR(-ENOENT); /* -ERAUGHT */ -+ } -+ zrelse(node); -+ assert("nikita-2476", cbk_cache_invariant(cache)); -+ return result; -+} -+ -+/* look for item with given key in the coord cache -+ -+ This function, called by coord_by_key(), scans "coord cache" (&cbk_cache) -+ which is a small LRU list of znodes accessed lately. For each znode in -+ znode in this list, it checks whether key we are looking for fits into key -+ range covered by this node. If so, and in addition, node lies at allowed -+ level (this is to handle extents on a twig level), node is locked, and -+ lookup inside it is performed. -+ -+ we need a measurement of the cost of this cache search compared to the cost -+ of coord_by_key. -+ -+*/ -+static int cbk_cache_search(cbk_handle * h /* cbk handle */ ) -+{ -+ int result = 0; -+ tree_level level; -+ -+ /* add CBK_IN_CACHE to the handle flags. This means that -+ * cbk_node_lookup() assumes that cbk_cache is scanned and would add -+ * found node to the cache. */ -+ h->flags |= CBK_IN_CACHE; -+ for (level = h->stop_level; level <= h->lock_level; ++level) { -+ h->level = level; -+ result = cbk_cache_scan_slots(h); -+ if (result != 0) { -+ done_lh(h->active_lh); -+ done_lh(h->parent_lh); -+ } else { -+ assert("nikita-1319", !IS_CBKERR(h->result)); -+ break; -+ } -+ } -+ h->flags &= ~CBK_IN_CACHE; -+ return result; -+} -+ -+/* type of lock we want to obtain during tree traversal. On stop level -+ we want type of lock user asked for, on upper levels: read lock. 
*/ -+znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h) -+{ -+ assert("nikita-382", h != NULL); -+ -+ return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK; -+} -+ -+/* update outdated delimiting keys */ -+static void stale_dk(reiser4_tree * tree, znode * node) -+{ -+ znode *right; -+ -+ read_lock_tree(tree); -+ write_lock_dk(tree); -+ right = node->right; -+ -+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && -+ right && ZF_ISSET(right, JNODE_DKSET) && -+ !keyeq(znode_get_rd_key(node), znode_get_ld_key(right))) -+ znode_set_rd_key(node, znode_get_ld_key(right)); -+ -+ write_unlock_dk(tree); -+ read_unlock_tree(tree); -+} -+ -+/* check for possibly outdated delimiting keys, and update them if -+ * necessary. */ -+static void update_stale_dk(reiser4_tree * tree, znode * node) -+{ -+ znode *right; -+ reiser4_key rd; -+ -+ read_lock_tree(tree); -+ read_lock_dk(tree); -+ rd = *znode_get_rd_key(node); -+ right = node->right; -+ if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && -+ right && ZF_ISSET(right, JNODE_DKSET) && -+ !keyeq(&rd, znode_get_ld_key(right)))) { -+ assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET)); -+ read_unlock_dk(tree); -+ read_unlock_tree(tree); -+ stale_dk(tree, node); -+ return; -+ } -+ read_unlock_dk(tree); -+ read_unlock_tree(tree); -+} -+ -+/* -+ * handle searches a the non-unique key. -+ * -+ * Suppose that we are looking for an item with possibly non-unique key 100. -+ * -+ * Root node contains two pointers: one to a node with left delimiting key 0, -+ * and another to a node with left delimiting key 100. Item we interested in -+ * may well happen in the sub-tree rooted at the first pointer. -+ * -+ * To handle this search_to_left() is called when search reaches stop -+ * level. 
This function checks it is _possible_ that item we are looking for -+ * is in the left neighbor (this can be done by comparing delimiting keys) and -+ * if so, tries to lock left neighbor (this is low priority lock, so it can -+ * deadlock, tree traversal is just restarted if it did) and then checks -+ * whether left neighbor actually contains items with our key. -+ * -+ * Note that this is done on the stop level only. It is possible to try such -+ * left-check on each level, but as duplicate keys are supposed to be rare -+ * (very unlikely that more than one node is completely filled with items with -+ * duplicate keys), it sis cheaper to scan to the left on the stop level once. -+ * -+ */ -+static level_lookup_result search_to_left(cbk_handle * h /* search handle */ ) -+{ -+ level_lookup_result result; -+ coord_t *coord; -+ znode *node; -+ znode *neighbor; -+ -+ lock_handle lh; -+ -+ assert("nikita-1761", h != NULL); -+ assert("nikita-1762", h->level == h->stop_level); -+ -+ init_lh(&lh); -+ coord = h->coord; -+ node = h->active_lh->node; -+ assert("nikita-1763", coord_is_leftmost_unit(coord)); -+ -+ h->result = -+ reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode, -+ GN_CAN_USE_UPPER_LEVELS); -+ neighbor = NULL; -+ switch (h->result) { -+ case -E_DEADLOCK: -+ result = LOOKUP_REST; -+ break; -+ case 0:{ -+ node_plugin *nplug; -+ coord_t crd; -+ lookup_bias bias; -+ -+ neighbor = lh.node; -+ h->result = zload(neighbor); -+ if (h->result != 0) { -+ result = LOOKUP_DONE; -+ break; -+ } -+ -+ nplug = neighbor->nplug; -+ -+ coord_init_zero(&crd); -+ bias = h->bias; -+ h->bias = FIND_EXACT; -+ h->result = -+ nplug->lookup(neighbor, h->key, h->bias, &crd); -+ h->bias = bias; -+ -+ if (h->result == NS_NOT_FOUND) { -+ case -E_NO_NEIGHBOR: -+ h->result = CBK_COORD_FOUND; -+ if (!(h->flags & CBK_IN_CACHE)) -+ cbk_cache_add(node); -+ default: /* some other error */ -+ result = LOOKUP_DONE; -+ } else if (h->result == NS_FOUND) { -+ read_lock_dk(znode_get_tree(neighbor)); 
-+ h->rd_key = *znode_get_ld_key(node); -+ leftmost_key_in_node(neighbor, &h->ld_key); -+ read_unlock_dk(znode_get_tree(neighbor)); -+ h->flags |= CBK_DKSET; -+ -+ h->block = *znode_get_block(neighbor); -+ /* clear coord -> node so that cbk_level_lookup() -+ wouldn't overwrite parent hint in neighbor. -+ -+ Parent hint was set up by -+ reiser4_get_left_neighbor() -+ */ -+ /* FIXME: why do we have to spinlock here? */ -+ write_lock_tree(znode_get_tree(neighbor)); -+ h->coord->node = NULL; -+ write_unlock_tree(znode_get_tree(neighbor)); -+ result = LOOKUP_CONT; -+ } else { -+ result = LOOKUP_DONE; -+ } -+ if (neighbor != NULL) -+ zrelse(neighbor); -+ } -+ } -+ done_lh(&lh); -+ return result; -+} -+ -+/* debugging aid: return symbolic name of search bias */ -+static const char *bias_name(lookup_bias bias /* bias to get name of */ ) -+{ -+ if (bias == FIND_EXACT) -+ return "exact"; -+ else if (bias == FIND_MAX_NOT_MORE_THAN) -+ return "left-slant"; -+/* else if( bias == RIGHT_SLANT_BIAS ) */ -+/* return "right-bias"; */ -+ else { -+ static char buf[30]; -+ -+ sprintf(buf, "unknown: %i", bias); -+ return buf; -+ } -+} -+ -+#if REISER4_DEBUG -+/* debugging aid: print human readable information about @p */ -+void print_coord_content(const char *prefix /* prefix to print */ , -+ coord_t * p /* coord to print */ ) -+{ -+ reiser4_key key; -+ -+ if (p == NULL) { -+ printk("%s: null\n", prefix); -+ return; -+ } -+ if ((p->node != NULL) && znode_is_loaded(p->node) -+ && coord_is_existing_item(p)) -+ printk("%s: data: %p, length: %i\n", prefix, -+ item_body_by_coord(p), item_length_by_coord(p)); -+ if (znode_is_loaded(p->node)) { -+ item_key_by_coord(p, &key); -+ reiser4_print_key(prefix, &key); -+ } -+} -+ -+/* debugging aid: print human readable information about @block */ -+void reiser4_print_address(const char *prefix /* prefix to print */ , -+ const reiser4_block_nr * block /* block number to print */ ) -+{ -+ printk("%s: %s\n", prefix, sprint_address(block)); -+} -+#endif 
-+ -+/* return string containing human readable representation of @block */ -+char *sprint_address(const reiser4_block_nr * -+ block /* block number to print */ ) -+{ -+ static char address[30]; -+ -+ if (block == NULL) -+ sprintf(address, "null"); -+ else if (reiser4_blocknr_is_fake(block)) -+ sprintf(address, "%llx", (unsigned long long)(*block)); -+ else -+ sprintf(address, "%llu", (unsigned long long)(*block)); -+ return address; -+} -+ -+/* release parent node during traversal */ -+static void put_parent(cbk_handle * h /* search handle */ ) -+{ -+ assert("nikita-383", h != NULL); -+ if (h->parent_lh->node != NULL) { -+ longterm_unlock_znode(h->parent_lh); -+ } -+} -+ -+/* helper function used by coord_by_key(): release reference to parent znode -+ stored in handle before processing its child. */ -+static void hput(cbk_handle * h /* search handle */ ) -+{ -+ assert("nikita-385", h != NULL); -+ done_lh(h->parent_lh); -+ done_lh(h->active_lh); -+} -+ -+/* Helper function used by cbk(): update delimiting keys of child node (stored -+ in h->active_lh->node) using key taken from parent on the parent level. */ -+static int setup_delimiting_keys(cbk_handle * h /* search handle */ ) -+{ -+ znode *active; -+ reiser4_tree *tree; -+ -+ assert("nikita-1088", h != NULL); -+ -+ active = h->active_lh->node; -+ -+ /* fast check without taking dk lock. This is safe, because -+ * JNODE_DKSET is never cleared once set. */ -+ if (!ZF_ISSET(active, JNODE_DKSET)) { -+ tree = znode_get_tree(active); -+ write_lock_dk(tree); -+ if (!ZF_ISSET(active, JNODE_DKSET)) { -+ znode_set_ld_key(active, &h->ld_key); -+ znode_set_rd_key(active, &h->rd_key); -+ ZF_SET(active, JNODE_DKSET); -+ } -+ write_unlock_dk(tree); -+ return 1; -+ } -+ return 0; -+} -+ -+/* true if @block makes sense for the @tree. 
Used to detect corrupted node -+ * pointers */ -+static int -+block_nr_is_correct(reiser4_block_nr * block /* block number to check */ , -+ reiser4_tree * tree /* tree to check against */ ) -+{ -+ assert("nikita-757", block != NULL); -+ assert("nikita-758", tree != NULL); -+ -+ /* check to see if it exceeds the size of the device. */ -+ return reiser4_blocknr_is_sane_for(tree->super, block); -+} -+ -+/* check consistency of fields */ -+static int sanity_check(cbk_handle * h /* search handle */ ) -+{ -+ assert("nikita-384", h != NULL); -+ -+ if (h->level < h->stop_level) { -+ h->error = "Buried under leaves"; -+ h->result = RETERR(-EIO); -+ return LOOKUP_DONE; -+ } else if (!block_nr_is_correct(&h->block, h->tree)) { -+ h->error = "bad block number"; -+ h->result = RETERR(-EIO); -+ return LOOKUP_DONE; -+ } else -+ return 0; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/status_flags.c linux-2.6.24/fs/reiser4/status_flags.c ---- linux-2.6.24.orig/fs/reiser4/status_flags.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/status_flags.c 2008-01-25 11:54:46.665843146 +0300 -@@ -0,0 +1,170 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Functions that deal with reiser4 status block, query status and update it, if needed */ -+ -+#include -+#include -+#include -+#include -+#include "debug.h" -+#include "dformat.h" -+#include "status_flags.h" -+#include "super.h" -+ -+/* This is our end I/O handler that marks page uptodate if IO was successful. It also -+ unconditionally unlocks the page, so we can see that io was done. -+ We do not free bio, because we hope to reuse that. 
*/ -+static void reiser4_status_endio(struct bio *bio, int err) -+{ -+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) { -+ SetPageUptodate(bio->bi_io_vec->bv_page); -+ } else { -+ ClearPageUptodate(bio->bi_io_vec->bv_page); -+ SetPageError(bio->bi_io_vec->bv_page); -+ } -+ unlock_page(bio->bi_io_vec->bv_page); -+} -+ -+/* Initialise status code. This is expected to be called from the disk format -+ code. block paremeter is where status block lives. */ -+int reiser4_status_init(reiser4_block_nr block) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ struct reiser4_status *statuspage; -+ struct bio *bio; -+ struct page *page; -+ -+ get_super_private(sb)->status_page = NULL; -+ get_super_private(sb)->status_bio = NULL; -+ -+ page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0); -+ if (!page) -+ return -ENOMEM; -+ -+ bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1); -+ if (bio != NULL) { -+ bio->bi_sector = block * (sb->s_blocksize >> 9); -+ bio->bi_bdev = sb->s_bdev; -+ bio->bi_io_vec[0].bv_page = page; -+ bio->bi_io_vec[0].bv_len = sb->s_blocksize; -+ bio->bi_io_vec[0].bv_offset = 0; -+ bio->bi_vcnt = 1; -+ bio->bi_size = sb->s_blocksize; -+ bio->bi_end_io = reiser4_status_endio; -+ } else { -+ __free_pages(page, 0); -+ return -ENOMEM; -+ } -+ lock_page(page); -+ submit_bio(READ, bio); -+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping); -+ wait_on_page_locked(page); -+ if (!PageUptodate(page)) { -+ warning("green-2007", -+ "I/O error while tried to read status page\n"); -+ return -EIO; -+ } -+ -+ statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0); -+ if (memcmp -+ (statuspage->magic, REISER4_STATUS_MAGIC, -+ sizeof(REISER4_STATUS_MAGIC))) { -+ /* Magic does not match. 
*/ -+ kunmap_atomic((char *)statuspage, KM_USER0); -+ warning("green-2008", "Wrong magic in status block\n"); -+ __free_pages(page, 0); -+ bio_put(bio); -+ return -EINVAL; -+ } -+ kunmap_atomic((char *)statuspage, KM_USER0); -+ -+ get_super_private(sb)->status_page = page; -+ get_super_private(sb)->status_bio = bio; -+ return 0; -+} -+ -+/* Query the status of fs. Returns if the FS can be safely mounted. -+ Also if "status" and "extended" parameters are given, it will fill -+ actual parts of status from disk there. */ -+int reiser4_status_query(u64 * status, u64 * extended) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ struct reiser4_status *statuspage; -+ int retval; -+ -+ if (!get_super_private(sb)->status_page) { // No status page? -+ return REISER4_STATUS_MOUNT_UNKNOWN; -+ } -+ statuspage = (struct reiser4_status *) -+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0); -+ switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work. -+ case REISER4_STATUS_OK: -+ retval = REISER4_STATUS_MOUNT_OK; -+ break; -+ case REISER4_STATUS_CORRUPTED: -+ retval = REISER4_STATUS_MOUNT_WARN; -+ break; -+ case REISER4_STATUS_DAMAGED: -+ case REISER4_STATUS_DESTROYED: -+ case REISER4_STATUS_IOERROR: -+ retval = REISER4_STATUS_MOUNT_RO; -+ break; -+ default: -+ retval = REISER4_STATUS_MOUNT_UNKNOWN; -+ break; -+ } -+ -+ if (status) -+ *status = le64_to_cpu(get_unaligned(&statuspage->status)); -+ if (extended) -+ *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status)); -+ -+ kunmap_atomic((char *)statuspage, KM_USER0); -+ return retval; -+} -+ -+/* This function should be called when something bad happens (e.g. from reiser4_panic). -+ It fills the status structure and tries to push it to disk. 
*/ -+int reiser4_status_write(__u64 status, __u64 extended_status, char *message) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ struct reiser4_status *statuspage; -+ struct bio *bio = get_super_private(sb)->status_bio; -+ -+ if (!get_super_private(sb)->status_page) { // No status page? -+ return -1; -+ } -+ statuspage = (struct reiser4_status *) -+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0); -+ -+ put_unaligned(cpu_to_le64(status), &statuspage->status); -+ put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status); -+ strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN); -+ -+ kunmap_atomic((char *)statuspage, KM_USER0); -+ bio->bi_bdev = sb->s_bdev; -+ bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page; -+ bio->bi_io_vec[0].bv_len = sb->s_blocksize; -+ bio->bi_io_vec[0].bv_offset = 0; -+ bio->bi_vcnt = 1; -+ bio->bi_size = sb->s_blocksize; -+ bio->bi_end_io = reiser4_status_endio; -+ lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page. -+ /* We can block now, but we have no other choice anyway */ -+ submit_bio(WRITE, bio); -+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping); -+ return 0; // We do not wait for io to finish. -+} -+ -+/* Frees the page with status and bio structure. 
Should be called by disk format at umount time */ -+int reiser4_status_finish(void) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ -+ __free_pages(get_super_private(sb)->status_page, 0); -+ get_super_private(sb)->status_page = NULL; -+ bio_put(get_super_private(sb)->status_bio); -+ get_super_private(sb)->status_bio = NULL; -+ return 0; -+} -diff -urN linux-2.6.24.orig/fs/reiser4/status_flags.h linux-2.6.24/fs/reiser4/status_flags.h ---- linux-2.6.24.orig/fs/reiser4/status_flags.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/status_flags.h 2008-01-25 11:39:07.088246844 +0300 -@@ -0,0 +1,43 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Here we declare structures and flags that store reiser4 status on disk. -+ The status that helps us to find out if the filesystem is valid or if it -+ contains some critical, or not so critical errors */ -+ -+#if !defined( __REISER4_STATUS_FLAGS_H__ ) -+#define __REISER4_STATUS_FLAGS_H__ -+ -+#include "dformat.h" -+/* These are major status flags */ -+#define REISER4_STATUS_OK 0 -+#define REISER4_STATUS_CORRUPTED 0x1 -+#define REISER4_STATUS_DAMAGED 0x2 -+#define REISER4_STATUS_DESTROYED 0x4 -+#define REISER4_STATUS_IOERROR 0x8 -+ -+/* Return values for reiser4_status_query() */ -+#define REISER4_STATUS_MOUNT_OK 0 -+#define REISER4_STATUS_MOUNT_WARN 1 -+#define REISER4_STATUS_MOUNT_RO 2 -+#define REISER4_STATUS_MOUNT_UNKNOWN -1 -+ -+#define REISER4_TEXTERROR_LEN 256 -+ -+#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl" -+/* We probably need to keep its size under sector size which is 512 bytes */ -+struct reiser4_status { -+ char magic[16]; -+ d64 status; /* Current FS state */ -+ d64 extended_status; /* Any additional info that might have sense in addition to "status". E.g. 
-+ last sector where io error happened if status is "io error encountered" */ -+ d64 stacktrace[10]; /* Last ten functional calls made (addresses) */ -+ char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */ -+}; -+ -+int reiser4_status_init(reiser4_block_nr block); -+int reiser4_status_query(u64 * status, u64 * extended); -+int reiser4_status_write(u64 status, u64 extended_status, char *message); -+int reiser4_status_finish(void); -+ -+#endif -diff -urN linux-2.6.24.orig/fs/reiser4/super.c linux-2.6.24/fs/reiser4/super.c ---- linux-2.6.24.orig/fs/reiser4/super.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/super.c 2008-01-25 11:39:07.088246844 +0300 -@@ -0,0 +1,316 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Super-block manipulations. */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "plugin/security/perm.h" -+#include "plugin/space/space_allocator.h" -+#include "plugin/plugin.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include /* for __u?? 
*/ -+#include /* for struct super_block */ -+ -+static __u64 reserved_for_gid(const struct super_block *super, gid_t gid); -+static __u64 reserved_for_uid(const struct super_block *super, uid_t uid); -+static __u64 reserved_for_root(const struct super_block *super); -+ -+/* Return reiser4-specific part of super block */ -+reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block -+ * queried */ ) -+{ -+ return (reiser4_super_info_data *) super->s_fs_info; -+} -+ -+/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */ -+long reiser4_statfs_type(const struct super_block *super UNUSED_ARG) -+{ -+ assert("nikita-448", super != NULL); -+ assert("nikita-449", is_reiser4_super(super)); -+ return (long)REISER4_SUPER_MAGIC; -+} -+ -+/* functions to read/modify fields of reiser4_super_info_data */ -+ -+/* get number of blocks in file system */ -+__u64 reiser4_block_count(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("vs-494", super != NULL); -+ assert("vs-495", is_reiser4_super(super)); -+ return get_super_private(super)->block_count; -+} -+ -+#if REISER4_DEBUG -+/* -+ * number of blocks in the current file system -+ */ -+__u64 reiser4_current_block_count(void) -+{ -+ return get_current_super_private()->block_count; -+} -+#endif /* REISER4_DEBUG */ -+ -+/* set number of block in filesystem */ -+void reiser4_set_block_count(const struct super_block *super, __u64 nr) -+{ -+ assert("vs-501", super != NULL); -+ assert("vs-502", is_reiser4_super(super)); -+ get_super_private(super)->block_count = nr; -+ /* -+ * The proper calculation of the reserved space counter (%5 of device -+ * block counter) we need a 64 bit division which is missing in Linux -+ * on i386 platform. Because we do not need a precise calculation here -+ * we can replace a div64 operation by this combination of -+ * multiplication and shift: 51. / (2^10) == .0498 . -+ * FIXME: this is a bug. 
It comes up only for very small filesystems -+ * which probably are never used. Nevertheless, it is a bug. Number of -+ * reserved blocks must be not less than maximal number of blocks which -+ * get grabbed with BA_RESERVED. -+ */ -+ get_super_private(super)->blocks_reserved = ((nr * 51) >> 10); -+} -+ -+/* amount of blocks used (allocated for data) in file system */ -+__u64 reiser4_data_blocks(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("nikita-452", super != NULL); -+ assert("nikita-453", is_reiser4_super(super)); -+ return get_super_private(super)->blocks_used; -+} -+ -+/* set number of block used in filesystem */ -+void reiser4_set_data_blocks(const struct super_block *super, __u64 nr) -+{ -+ assert("vs-503", super != NULL); -+ assert("vs-504", is_reiser4_super(super)); -+ get_super_private(super)->blocks_used = nr; -+} -+ -+/* amount of free blocks in file system */ -+__u64 reiser4_free_blocks(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("nikita-454", super != NULL); -+ assert("nikita-455", is_reiser4_super(super)); -+ return get_super_private(super)->blocks_free; -+} -+ -+/* set number of blocks free in filesystem */ -+void reiser4_set_free_blocks(const struct super_block *super, __u64 nr) -+{ -+ assert("vs-505", super != NULL); -+ assert("vs-506", is_reiser4_super(super)); -+ get_super_private(super)->blocks_free = nr; -+} -+ -+/* get mkfs unique identifier */ -+__u32 reiser4_mkfs_id(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("vpf-221", super != NULL); -+ assert("vpf-222", is_reiser4_super(super)); -+ return get_super_private(super)->mkfs_id; -+} -+ -+/* amount of free blocks in file system */ -+__u64 reiser4_free_committed_blocks(const struct super_block *super) -+{ -+ assert("vs-497", super != NULL); -+ assert("vs-498", is_reiser4_super(super)); -+ return get_super_private(super)->blocks_free_committed; -+} -+ -+/* amount of blocks in the file system 
reserved for @uid and @gid */ -+long reiser4_reserved_blocks(const struct super_block *super /* super block -+ queried */ , -+ uid_t uid /* user id */ , -+ gid_t gid /* group id */ ) -+{ -+ long reserved; -+ -+ assert("nikita-456", super != NULL); -+ assert("nikita-457", is_reiser4_super(super)); -+ -+ reserved = 0; -+ if (REISER4_SUPPORT_GID_SPACE_RESERVATION) -+ reserved += reserved_for_gid(super, gid); -+ if (REISER4_SUPPORT_UID_SPACE_RESERVATION) -+ reserved += reserved_for_uid(super, uid); -+ if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0)) -+ reserved += reserved_for_root(super); -+ return reserved; -+} -+ -+/* get/set value of/to grabbed blocks counter */ -+__u64 reiser4_grabbed_blocks(const struct super_block * super) -+{ -+ assert("zam-512", super != NULL); -+ assert("zam-513", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_grabbed; -+} -+ -+__u64 reiser4_flush_reserved(const struct super_block * super) -+{ -+ assert("vpf-285", super != NULL); -+ assert("vpf-286", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_flush_reserved; -+} -+ -+/* get/set value of/to counter of fake allocated formatted blocks */ -+__u64 reiser4_fake_allocated(const struct super_block * super) -+{ -+ assert("zam-516", super != NULL); -+ assert("zam-517", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_fake_allocated; -+} -+ -+/* get/set value of/to counter of fake allocated unformatted blocks */ -+__u64 reiser4_fake_allocated_unformatted(const struct super_block * super) -+{ -+ assert("zam-516", super != NULL); -+ assert("zam-517", is_reiser4_super(super)); -+ -+ return get_super_private(super)->blocks_fake_allocated_unformatted; -+} -+ -+/* get/set value of/to counter of clustered blocks */ -+__u64 reiser4_clustered_blocks(const struct super_block * super) -+{ -+ assert("edward-601", super != NULL); -+ assert("edward-602", is_reiser4_super(super)); -+ -+ return 
get_super_private(super)->blocks_clustered; -+} -+ -+/* space allocator used by this file system */ -+reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block -+ *super) -+{ -+ assert("nikita-1965", super != NULL); -+ assert("nikita-1966", is_reiser4_super(super)); -+ return &get_super_private(super)->space_allocator; -+} -+ -+/* return fake inode used to bind formatted nodes in the page cache */ -+struct inode *reiser4_get_super_fake(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("nikita-1757", super != NULL); -+ return get_super_private(super)->fake; -+} -+ -+/* return fake inode used to bind copied on capture nodes in the page cache */ -+struct inode *reiser4_get_cc_fake(const struct super_block *super /* super block -+ queried */ ) -+{ -+ assert("nikita-1757", super != NULL); -+ return get_super_private(super)->cc; -+} -+ -+/* return fake inode used to bind bitmaps and journal heads */ -+struct inode *reiser4_get_bitmap_fake(const struct super_block *super) -+{ -+ assert("nikita-17571", super != NULL); -+ return get_super_private(super)->bitmap; -+} -+ -+/* tree used by this file system */ -+reiser4_tree *reiser4_get_tree(const struct super_block * super /* super block -+ * queried */ ) -+{ -+ assert("nikita-460", super != NULL); -+ assert("nikita-461", is_reiser4_super(super)); -+ return &get_super_private(super)->tree; -+} -+ -+/* Check that @super is (looks like) reiser4 super block. This is mainly for -+ use in assertions. 
*/ -+int is_reiser4_super(const struct super_block *super /* super block -+ * queried */ ) -+{ -+ return -+ super != NULL && -+ get_super_private(super) != NULL && -+ super->s_op == &(get_super_private(super)->ops.super); -+} -+ -+int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f) -+{ -+ return test_bit((int)f, &get_super_private(super)->fs_flags); -+} -+ -+/* amount of blocks reserved for given group in file system */ -+static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super -+ * block -+ * queried */ , -+ gid_t gid UNUSED_ARG /* group id */ ) -+{ -+ return 0; -+} -+ -+/* amount of blocks reserved for given user in file system */ -+static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super -+ block -+ queried */ , -+ uid_t uid UNUSED_ARG /* user id */ ) -+{ -+ return 0; -+} -+ -+/* amount of blocks reserved for super user in file system */ -+static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super -+ block -+ queried */ ) -+{ -+ return 0; -+} -+ -+/* -+ * true if block number @blk makes sense for the file system at @super. -+ */ -+int -+reiser4_blocknr_is_sane_for(const struct super_block *super, -+ const reiser4_block_nr * blk) -+{ -+ reiser4_super_info_data *sbinfo; -+ -+ assert("nikita-2957", super != NULL); -+ assert("nikita-2958", blk != NULL); -+ -+ if (reiser4_blocknr_is_fake(blk)) -+ return 1; -+ -+ sbinfo = get_super_private(super); -+ return *blk < sbinfo->block_count; -+} -+ -+#if REISER4_DEBUG -+/* -+ * true, if block number @blk makes sense for the current file system -+ */ -+int reiser4_blocknr_is_sane(const reiser4_block_nr * blk) -+{ -+ return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk); -+} -+#endif /* REISER4_DEBUG */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/super.h linux-2.6.24/fs/reiser4/super.h ---- linux-2.6.24.orig/fs/reiser4/super.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/super.h 2008-01-25 11:39:07.088246844 +0300 -@@ -0,0 +1,466 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Super-block functions. See super.c for details. */ -+ -+#if !defined( __REISER4_SUPER_H__ ) -+#define __REISER4_SUPER_H__ -+ -+#include -+ -+#include "tree.h" -+#include "entd.h" -+#include "wander.h" -+#include "fsdata.h" -+#include "plugin/object.h" -+#include "plugin/space/space_allocator.h" -+ -+/* -+ * Flush algorithm parameters. -+ */ -+struct flush_params { -+ unsigned relocate_threshold; -+ unsigned relocate_distance; -+ unsigned written_threshold; -+ unsigned scan_maxnodes; -+}; -+ -+typedef enum { -+ /* -+ * True if this file system doesn't support hard-links (multiple names) -+ * for directories: this is default UNIX behavior. -+ * -+ * If hard-links on directories are not allowed, file system is Acyclic -+ * Directed Graph (modulo dot, and dotdot, of course). -+ * -+ * This is used by reiser4_link(). -+ */ -+ REISER4_ADG = 0, -+ /* -+ * set if all nodes in internal tree have the same node layout plugin. -+ * If so, znode_guess_plugin() will return tree->node_plugin instead -+ * of guessing plugin by plugin id stored in the node. -+ */ -+ REISER4_ONE_NODE_PLUGIN = 1, -+ /* if set, bsd gid assignment is supported. */ -+ REISER4_BSD_GID = 2, -+ /* [mac]_time are 32 bit in inode */ -+ REISER4_32_BIT_TIMES = 3, -+ /* do not load all bitmap blocks at mount time */ -+ REISER4_DONT_LOAD_BITMAP = 5, -+ /* enforce atomicity during write(2) */ -+ REISER4_ATOMIC_WRITE = 6, -+ /* don't use write barriers in the log writer code. 
*/ -+ REISER4_NO_WRITE_BARRIER = 7 -+} reiser4_fs_flag; -+ -+/* -+ * VFS related operation vectors. -+ */ -+struct object_ops { -+ struct super_operations super; -+ struct dentry_operations dentry; -+ struct export_operations export; -+}; -+ -+/* reiser4-specific part of super block -+ -+ Locking -+ -+ Fields immutable after mount: -+ -+ ->oid* -+ ->space* -+ ->default_[ug]id -+ ->mkfs_id -+ ->trace_flags -+ ->debug_flags -+ ->fs_flags -+ ->df_plug -+ ->optimal_io_size -+ ->plug -+ ->flush -+ ->u (bad name) -+ ->txnmgr -+ ->ra_params -+ ->fsuid -+ ->journal_header -+ ->journal_footer -+ -+ Fields protected by ->lnode_guard -+ -+ ->lnode_htable -+ -+ Fields protected by per-super block spin lock -+ -+ ->block_count -+ ->blocks_used -+ ->blocks_free -+ ->blocks_free_committed -+ ->blocks_grabbed -+ ->blocks_fake_allocated_unformatted -+ ->blocks_fake_allocated -+ ->blocks_flush_reserved -+ ->eflushed -+ ->blocknr_hint_default -+ -+ After journal replaying during mount, -+ -+ ->last_committed_tx -+ -+ is protected by ->tmgr.commit_mutex -+ -+ Invariants involving this data-type: -+ -+ [sb-block-counts] -+ [sb-grabbed] -+ [sb-fake-allocated] -+*/ -+struct reiser4_super_info_data { -+ /* -+ * guard spinlock which protects reiser4 super block fields (currently -+ * blocks_free, blocks_free_committed) -+ */ -+ spinlock_t guard; -+ -+ /* next oid that will be returned by oid_allocate() */ -+ oid_t next_to_use; -+ /* total number of used oids */ -+ oid_t oids_in_use; -+ -+ /* space manager plugin */ -+ reiser4_space_allocator space_allocator; -+ -+ /* reiser4 internal tree */ -+ reiser4_tree tree; -+ -+ /* -+ * default user id used for light-weight files without their own -+ * stat-data. -+ */ -+ uid_t default_uid; -+ -+ /* -+ * default group id used for light-weight files without their own -+ * stat-data. -+ */ -+ gid_t default_gid; -+ -+ /* mkfs identifier generated at mkfs time. 
*/ -+ __u32 mkfs_id; -+ /* amount of blocks in a file system */ -+ __u64 block_count; -+ -+ /* inviolable reserve */ -+ __u64 blocks_reserved; -+ -+ /* amount of blocks used by file system data and meta-data. */ -+ __u64 blocks_used; -+ -+ /* -+ * amount of free blocks. This is "working" free blocks counter. It is -+ * like "working" bitmap, please see block_alloc.c for description. -+ */ -+ __u64 blocks_free; -+ -+ /* -+ * free block count for fs committed state. This is "commit" version of -+ * free block counter. -+ */ -+ __u64 blocks_free_committed; -+ -+ /* -+ * number of blocks reserved for further allocation, for all -+ * threads. -+ */ -+ __u64 blocks_grabbed; -+ -+ /* number of fake allocated unformatted blocks in tree. */ -+ __u64 blocks_fake_allocated_unformatted; -+ -+ /* number of fake allocated formatted blocks in tree. */ -+ __u64 blocks_fake_allocated; -+ -+ /* number of blocks reserved for flush operations. */ -+ __u64 blocks_flush_reserved; -+ -+ /* number of blocks reserved for cluster operations. */ -+ __u64 blocks_clustered; -+ -+ /* unique file-system identifier */ -+ __u32 fsuid; -+ -+ /* On-disk format version. If does not equal to the disk_format -+ plugin version, some format updates (e.g. enlarging plugin -+ set, etc) may have place on mount. */ -+ int version; -+ -+ /* file-system wide flags. 
See reiser4_fs_flag enum */ -+ unsigned long fs_flags; -+ -+ /* transaction manager */ -+ txn_mgr tmgr; -+ -+ /* ent thread */ -+ entd_context entd; -+ -+ /* fake inode used to bind formatted nodes */ -+ struct inode *fake; -+ /* inode used to bind bitmaps (and journal heads) */ -+ struct inode *bitmap; -+ /* inode used to bind copied on capture nodes */ -+ struct inode *cc; -+ -+ /* disk layout plugin */ -+ disk_format_plugin *df_plug; -+ -+ /* disk layout specific part of reiser4 super info data */ -+ union { -+ format40_super_info format40; -+ } u; -+ -+ /* value we return in st_blksize on stat(2) */ -+ unsigned long optimal_io_size; -+ -+ /* parameters for the flush algorithm */ -+ struct flush_params flush; -+ -+ /* pointers to jnodes for journal header and footer */ -+ jnode *journal_header; -+ jnode *journal_footer; -+ -+ journal_location jloc; -+ -+ /* head block number of last committed transaction */ -+ __u64 last_committed_tx; -+ -+ /* -+ * we remember last written location for using as a hint for new block -+ * allocation -+ */ -+ __u64 blocknr_hint_default; -+ -+ /* committed number of files (oid allocator state variable ) */ -+ __u64 nr_files_committed; -+ -+ struct formatted_ra_params ra_params; -+ -+ /* -+ * A mutex for serializing cut tree operation if out-of-free-space: -+ * the only one cut_tree thread is allowed to grab space from reserved -+ * area (it is 5% of disk space) -+ */ -+ struct mutex delete_mutex; -+ /* task owning ->delete_mutex */ -+ struct task_struct *delete_mutex_owner; -+ -+ /* Diskmap's blocknumber */ -+ __u64 diskmap_block; -+ -+ /* What to do in case of error */ -+ int onerror; -+ -+ /* operations for objects on this file system */ -+ struct object_ops ops; -+ -+ /* -+ * structure to maintain d_cursors. 
See plugin/file_ops_readdir.c for -+ * more details -+ */ -+ struct d_cursor_info d_info; -+ -+#ifdef CONFIG_REISER4_BADBLOCKS -+ /* Alternative master superblock offset (in bytes) */ -+ unsigned long altsuper; -+#endif -+ struct repacker *repacker; -+ struct page *status_page; -+ struct bio *status_bio; -+ -+#if REISER4_DEBUG -+ /* -+ * minimum used blocks value (includes super blocks, bitmap blocks and -+ * other fs reserved areas), depends on fs format and fs size. -+ */ -+ __u64 min_blocks_used; -+ -+ /* -+ * when debugging is on, all jnodes (including znodes, bitmaps, etc.) -+ * are kept on a list anchored at sbinfo->all_jnodes. This list is -+ * protected by sbinfo->all_guard spin lock. This lock should be taken -+ * with _irq modifier, because it is also modified from interrupt -+ * contexts (by RCU). -+ */ -+ spinlock_t all_guard; -+ /* list of all jnodes */ -+ struct list_head all_jnodes; -+#endif -+ struct dentry *debugfs_root; -+}; -+ -+extern reiser4_super_info_data *get_super_private_nocheck(const struct -+ super_block *super); -+ -+/* Return reiser4-specific part of super block */ -+static inline reiser4_super_info_data *get_super_private(const struct -+ super_block *super) -+{ -+ assert("nikita-447", super != NULL); -+ -+ return (reiser4_super_info_data *) super->s_fs_info; -+} -+ -+/* get ent context for the @super */ -+static inline entd_context *get_entd_context(struct super_block *super) -+{ -+ return &get_super_private(super)->entd; -+} -+ -+/* "Current" super-block: main super block used during current system -+ call. Reference to this super block is stored in reiser4_context. */ -+static inline struct super_block *reiser4_get_current_sb(void) -+{ -+ return get_current_context()->super; -+} -+ -+/* Reiser4-specific part of "current" super-block: main super block used -+ during current system call. Reference to this super block is stored in -+ reiser4_context. 
*/ -+static inline reiser4_super_info_data *get_current_super_private(void) -+{ -+ return get_super_private(reiser4_get_current_sb()); -+} -+ -+static inline struct formatted_ra_params *get_current_super_ra_params(void) -+{ -+ return &(get_current_super_private()->ra_params); -+} -+ -+/* -+ * true, if file system on @super is read-only -+ */ -+static inline int rofs_super(struct super_block *super) -+{ -+ return super->s_flags & MS_RDONLY; -+} -+ -+/* -+ * true, if @tree represents read-only file system -+ */ -+static inline int rofs_tree(reiser4_tree * tree) -+{ -+ return rofs_super(tree->super); -+} -+ -+/* -+ * true, if file system where @inode lives on, is read-only -+ */ -+static inline int rofs_inode(struct inode *inode) -+{ -+ return rofs_super(inode->i_sb); -+} -+ -+/* -+ * true, if file system where @node lives on, is read-only -+ */ -+static inline int rofs_jnode(jnode * node) -+{ -+ return rofs_tree(jnode_get_tree(node)); -+} -+ -+extern __u64 reiser4_current_block_count(void); -+ -+extern void build_object_ops(struct super_block *super, struct object_ops * ops); -+ -+#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */ -+ -+static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo) -+{ -+ spin_lock(&(sbinfo->guard)); -+} -+ -+static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo) -+{ -+ assert_spin_locked(&(sbinfo->guard)); -+ spin_unlock(&(sbinfo->guard)); -+} -+ -+extern __u64 reiser4_flush_reserved(const struct super_block *); -+extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f); -+extern long reiser4_statfs_type(const struct super_block *super); -+extern __u64 reiser4_block_count(const struct super_block *super); -+extern void reiser4_set_block_count(const struct super_block *super, __u64 nr); -+extern __u64 reiser4_data_blocks(const struct super_block *super); -+extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr); -+extern __u64 
reiser4_free_blocks(const struct super_block *super); -+extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr); -+extern __u32 reiser4_mkfs_id(const struct super_block *super); -+ -+extern __u64 reiser4_free_committed_blocks(const struct super_block *super); -+ -+extern __u64 reiser4_grabbed_blocks(const struct super_block *); -+extern __u64 reiser4_fake_allocated(const struct super_block *); -+extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *); -+extern __u64 reiser4_clustered_blocks(const struct super_block *); -+ -+extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid, -+ gid_t gid); -+ -+extern reiser4_space_allocator * -+reiser4_get_space_allocator(const struct super_block *super); -+extern reiser4_oid_allocator * -+reiser4_get_oid_allocator(const struct super_block *super); -+extern struct inode *reiser4_get_super_fake(const struct super_block *super); -+extern struct inode *reiser4_get_cc_fake(const struct super_block *super); -+extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super); -+extern reiser4_tree *reiser4_get_tree(const struct super_block *super); -+extern int is_reiser4_super(const struct super_block *super); -+ -+extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk); -+extern int reiser4_blocknr_is_sane_for(const struct super_block *super, -+ const reiser4_block_nr * blk); -+extern int reiser4_fill_super(struct super_block *s, void *data, int silent); -+extern int reiser4_done_super(struct super_block *s); -+ -+/* step of fill super */ -+extern int reiser4_init_fs_info(struct super_block *); -+extern void reiser4_done_fs_info(struct super_block *); -+extern int reiser4_init_super_data(struct super_block *, char *opt_string); -+extern int reiser4_init_read_super(struct super_block *, int silent); -+extern int reiser4_init_root_inode(struct super_block *); -+extern reiser4_plugin *get_default_plugin(pset_member memb); -+ -+/* Maximal possible 
object id. */ -+#define ABSOLUTE_MAX_OID ((oid_t)~0) -+ -+#define OIDS_RESERVED ( 1 << 16 ) -+int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next); -+oid_t oid_allocate(struct super_block *); -+int oid_release(struct super_block *, oid_t); -+oid_t oid_next(const struct super_block *); -+void oid_count_allocated(void); -+void oid_count_released(void); -+long oids_used(const struct super_block *); -+ -+#if REISER4_DEBUG -+void print_fs_info(const char *prefix, const struct super_block *); -+#endif -+ -+extern void destroy_reiser4_cache(struct kmem_cache **); -+ -+extern struct super_operations reiser4_super_operations; -+extern struct export_operations reiser4_export_operations; -+extern struct dentry_operations reiser4_dentry_operations; -+ -+/* __REISER4_SUPER_H__ */ -+#endif -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 120 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/super_ops.c linux-2.6.24/fs/reiser4/super_ops.c ---- linux-2.6.24.orig/fs/reiser4/super_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/super_ops.c 2008-01-25 12:23:33.922660872 +0300 -@@ -0,0 +1,724 @@ -+/* Copyright 2005 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+#include "inode.h" -+#include "page_cache.h" -+#include "ktxnmgrd.h" -+#include "flush.h" -+#include "safe_link.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+/* slab cache for inodes */ -+static struct kmem_cache *inode_cache; -+ -+static struct dentry *reiser4_debugfs_root = NULL; -+ -+/** -+ * init_once - constructor for reiser4 inodes -+ * @cache: cache @obj belongs to -+ * @obj: inode to be initialized -+ * -+ * Initialization function to be called when new page is allocated by reiser4 -+ * inode cache. It is set on inode cache creation. 
-+ */ -+static void init_once(struct kmem_cache *cache, void *obj) -+{ -+ struct reiser4_inode_object *info; -+ -+ info = obj; -+ -+ /* initialize vfs inode */ -+ inode_init_once(&info->vfs_inode); -+ -+ /* -+ * initialize reiser4 specific part of inode. -+ * NOTE-NIKITA add here initializations for locks, list heads, -+ * etc. that will be added to our private inode part. -+ */ -+ INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode)); -+ init_rwsem(&info->p.conv_sem); -+ /* init semaphore which is used during inode loading */ -+ loading_init_once(&info->p); -+ INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p), -+ GFP_ATOMIC); -+#if REISER4_DEBUG -+ info->p.nr_jnodes = 0; -+#endif -+} -+ -+/** -+ * init_inodes - create inode cache -+ * -+ * Initializes slab cache of inodes. It is part of reiser4 module initialization. -+ */ -+static int init_inodes(void) -+{ -+ inode_cache = kmem_cache_create("reiser4_inode", -+ sizeof(struct reiser4_inode_object), -+ 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, init_once); -+ if (inode_cache == NULL) -+ return RETERR(-ENOMEM); -+ return 0; -+} -+ -+/** -+ * done_inodes - delete inode cache -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+static void done_inodes(void) -+{ -+ destroy_reiser4_cache(&inode_cache); -+} -+ -+/** -+ * reiser4_alloc_inode - alloc_inode of super operations -+ * @super: super block new inode is allocated for -+ * -+ * Allocates new inode, initializes reiser4 specific part of it. 
-+ */ -+static struct inode *reiser4_alloc_inode(struct super_block *super) -+{ -+ struct reiser4_inode_object *obj; -+ -+ assert("nikita-1696", super != NULL); -+ obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get()); -+ if (obj != NULL) { -+ reiser4_inode *info; -+ -+ info = &obj->p; -+ -+ info->pset = plugin_set_get_empty(); -+ info->hset = plugin_set_get_empty(); -+ info->extmask = 0; -+ info->locality_id = 0ull; -+ info->plugin_mask = 0; -+ info->heir_mask = 0; -+#if !REISER4_INO_IS_OID -+ info->oid_hi = 0; -+#endif -+ reiser4_seal_init(&info->sd_seal, NULL, NULL); -+ coord_init_invalid(&info->sd_coord, NULL); -+ info->flags = 0; -+ spin_lock_init(&info->guard); -+ /* this deals with info's loading semaphore */ -+ loading_alloc(info); -+ info->vroot = UBER_TREE_ADDR; -+ return &obj->vfs_inode; -+ } else -+ return NULL; -+} -+ -+/** -+ * reiser4_destroy_inode - destroy_inode of super operations -+ * @inode: inode being destroyed -+ * -+ * Puts reiser4 specific portion of inode, frees memory occupied by inode. -+ */ -+static void reiser4_destroy_inode(struct inode *inode) -+{ -+ reiser4_inode *info; -+ -+ info = reiser4_inode_data(inode); -+ -+ assert("vs-1220", inode_has_no_jnodes(info)); -+ -+ if (!is_bad_inode(inode) && is_inode_loaded(inode)) { -+ file_plugin *fplug = inode_file_plugin(inode); -+ if (fplug->destroy_inode != NULL) -+ fplug->destroy_inode(inode); -+ } -+ reiser4_dispose_cursors(inode); -+ if (info->pset) -+ plugin_set_put(info->pset); -+ if (info->hset) -+ plugin_set_put(info->hset); -+ -+ /* -+ * cannot add similar assertion about ->i_list as prune_icache return -+ * inode into slab with dangling ->list.{next,prev}. This is safe, -+ * because they are re-initialized in the new_inode(). 
-+ */ -+ assert("nikita-2895", list_empty(&inode->i_dentry)); -+ assert("nikita-2896", hlist_unhashed(&inode->i_hash)); -+ assert("nikita-2898", list_empty_careful(get_readdir_list(inode))); -+ -+ /* this deals with info's loading semaphore */ -+ loading_destroy(info); -+ -+ kmem_cache_free(inode_cache, -+ container_of(info, struct reiser4_inode_object, p)); -+} -+ -+/** -+ * reiser4_dirty_inode - dirty_inode of super operations -+ * @inode: inode being dirtied -+ * -+ * Updates stat data. -+ */ -+static void reiser4_dirty_inode(struct inode *inode) -+{ -+ int result; -+ -+ if (!is_in_reiser4_context()) -+ return; -+ assert("", !IS_RDONLY(inode)); -+ assert("", (inode_file_plugin(inode)->estimate.update(inode) <= -+ get_current_context()->grabbed_blocks)); -+ -+ result = reiser4_update_sd(inode); -+ if (result) -+ warning("", "failed to dirty inode for %llu: %d", -+ get_inode_oid(inode), result); -+} -+ -+/** -+ * reiser4_delete_inode - delete_inode of super operations -+ * @inode: inode to delete -+ * -+ * Calls file plugin's delete_object method to delete object items from -+ * filesystem tree and calls clear_inode. -+ */ -+static void reiser4_delete_inode(struct inode *inode) -+{ -+ reiser4_context *ctx; -+ file_plugin *fplug; -+ -+ ctx = reiser4_init_context(inode->i_sb); -+ if (IS_ERR(ctx)) { -+ warning("vs-15", "failed to init context"); -+ return; -+ } -+ -+ if (is_inode_loaded(inode)) { -+ fplug = inode_file_plugin(inode); -+ if (fplug != NULL && fplug->delete_object != NULL) -+ fplug->delete_object(inode); -+ } -+ -+ truncate_inode_pages(&inode->i_data, 0); -+ inode->i_blocks = 0; -+ clear_inode(inode); -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_put_super - put_super of super operations -+ * @super: super block to free -+ * -+ * Stops daemons, release resources, umounts in short. 
-+ */ -+static void reiser4_put_super(struct super_block *super) -+{ -+ reiser4_super_info_data *sbinfo; -+ reiser4_context *ctx; -+ -+ sbinfo = get_super_private(super); -+ assert("vs-1699", sbinfo); -+ -+ debugfs_remove(sbinfo->tmgr.debugfs_atom_count); -+ debugfs_remove(sbinfo->tmgr.debugfs_id_count); -+ debugfs_remove(sbinfo->debugfs_root); -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) { -+ warning("vs-17", "failed to init context"); -+ return; -+ } -+ -+ /* have the disk format plugin free its resources */ -+ if (get_super_private(super)->df_plug->release) -+ get_super_private(super)->df_plug->release(super); -+ -+ reiser4_done_formatted_fake(super); -+ -+ /* stop daemons: entd and ktxnmgrd */ -+ reiser4_done_entd(super); -+ reiser4_done_ktxnmgrd(super); -+ reiser4_done_txnmgr(&sbinfo->tmgr); -+ -+ reiser4_done_fs_info(super); -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_write_super - write_super of super operations -+ * @super: super block to write -+ * -+ * Captures znode associated with super block, commits all transactions. -+ */ -+static void reiser4_write_super(struct super_block *super) -+{ -+ int ret; -+ reiser4_context *ctx; -+ -+ assert("vs-1700", !rofs_super(super)); -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) { -+ warning("vs-16", "failed to init context"); -+ return; -+ } -+ -+ ret = reiser4_capture_super_block(super); -+ if (ret != 0) -+ warning("vs-1701", -+ "reiser4_capture_super_block failed in write_super: %d", -+ ret); -+ ret = txnmgr_force_commit_all(super, 0); -+ if (ret != 0) -+ warning("jmacd-77113", -+ "txn_force failed in write_super: %d", ret); -+ -+ super->s_dirt = 0; -+ -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_statfs - statfs of super operations -+ * @dentry: dentry whose super block (->d_sb) is queried -+ * @statfs: buffer to fill with statistics -+ * -+ * Returns information about filesystem. 
-+ */ -+static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs) -+{ -+ sector_t total; -+ sector_t reserved; -+ sector_t free; -+ sector_t forroot; -+ sector_t deleted; -+ reiser4_context *ctx; -+ struct super_block *super = dentry->d_sb; -+ -+ assert("nikita-408", super != NULL); -+ assert("nikita-409", statfs != NULL); -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) -+ return PTR_ERR(ctx); -+ -+ statfs->f_type = reiser4_statfs_type(super); -+ statfs->f_bsize = super->s_blocksize; -+ -+ /* -+ * 5% of total block space is reserved. This is needed for flush and -+ * for truncates (so that we are able to perform truncate/unlink even -+ * on the otherwise completely full file system). If this reservation -+ * is hidden from statfs(2), users will mistakenly guess that they -+ * have enough free space to complete some operation, which is -+ * frustrating. -+ * -+ * Another possible solution is to subtract ->blocks_reserved from -+ * ->f_bfree, but changing available space seems less intrusive than -+ * letting user to see 5% of disk space to be used directly after -+ * mkfs. -+ */ -+ total = reiser4_block_count(super); -+ reserved = get_super_private(super)->blocks_reserved; -+ deleted = txnmgr_count_deleted_blocks(); -+ free = reiser4_free_blocks(super) + deleted; -+ forroot = reiser4_reserved_blocks(super, 0, 0); -+ -+ /* -+ * These counters may be in inconsistent state because we take the -+ * values without keeping any global spinlock. Here we do a sanity -+ * check that free block counter does not exceed the number of all -+ * blocks. 
-+ */ -+ if (free > total) -+ free = total; -+ statfs->f_blocks = total - reserved; -+ /* make sure statfs->f_bfree is never larger than statfs->f_blocks */ -+ if (free > reserved) -+ free -= reserved; -+ else -+ free = 0; -+ statfs->f_bfree = free; -+ -+ if (free > forroot) -+ free -= forroot; -+ else -+ free = 0; -+ statfs->f_bavail = free; -+ -+ statfs->f_files = 0; -+ statfs->f_ffree = 0; -+ -+ /* maximal acceptable name length depends on directory plugin. */ -+ assert("nikita-3351", super->s_root->d_inode != NULL); -+ statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode); -+ reiser4_exit_context(ctx); -+ return 0; -+} -+ -+/** -+ * reiser4_clear_inode - clear_inode of super operations -+ * @inode: inode about to be destroyed -+ * -+ * Does sanity checks: an inode being destroyed should have all jnodes detached. -+ */ -+static void reiser4_clear_inode(struct inode *inode) -+{ -+#if REISER4_DEBUG -+ reiser4_inode *r4_inode; -+ -+ r4_inode = reiser4_inode_data(inode); -+ if (!inode_has_no_jnodes(r4_inode)) -+ warning("vs-1732", "reiser4 inode has %ld jnodes\n", -+ r4_inode->nr_jnodes); -+#endif -+} -+ -+/** -+ * reiser4_sync_inodes - sync_inodes of super operations -+ * @super: super block whose dirty inodes are written out -+ * @wbc: writeback control passed on to generic_sync_sb_inodes and reiser4_writeout -+ * -+ * This method is called by background and non-background writeback. Reiser4's -+ * implementation uses generic_sync_sb_inodes to call reiser4_writepages for -+ * each of dirty inodes. Reiser4_writepages handles pages dirtied via shared -+ * mapping - dirty pages get into atoms. Writeout is called to flush some -+ * atoms. 
-+ */ -+static void reiser4_sync_inodes(struct super_block *super, -+ struct writeback_control *wbc) -+{ -+ reiser4_context *ctx; -+ long to_write; -+ -+ if (wbc->for_kupdate) -+ /* reiser4 has its own means of periodical write-out */ -+ return; -+ -+ to_write = wbc->nr_to_write; -+ assert("vs-49", wbc->older_than_this == NULL); -+ -+ ctx = reiser4_init_context(super); -+ if (IS_ERR(ctx)) { -+ warning("vs-13", "failed to init context"); -+ return; -+ } -+ -+ /* -+ * call reiser4_writepages for each of dirty inodes to turn dirty pages -+ * into transactions if they were not yet. -+ */ -+ generic_sync_sb_inodes(super, wbc); -+ -+ /* flush goes here */ -+ wbc->nr_to_write = to_write; -+ reiser4_writeout(super, wbc); -+ -+ /* avoid recursive calls to ->sync_inodes */ -+ context_set_commit_async(ctx); -+ reiser4_exit_context(ctx); -+} -+ -+/** -+ * reiser4_show_options - show_options of super operations -+ * @m: file where to write information -+ * @mnt: mount structure -+ * -+ * Makes reiser4 mount options visible in /proc/mounts. 
-+ */ -+static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt) -+{ -+ struct super_block *super; -+ reiser4_super_info_data *sbinfo; -+ -+ super = mnt->mnt_sb; -+ sbinfo = get_super_private(super); -+ -+ seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size); -+ seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age); -+ seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size); -+ seq_printf(m, ",atom_max_flushers=0x%x", -+ sbinfo->tmgr.atom_max_flushers); -+ seq_printf(m, ",cbk_cache_slots=0x%x", -+ sbinfo->tree.cbk_cache.nr_slots); -+ -+ return 0; -+} -+ -+struct super_operations reiser4_super_operations = { -+ .alloc_inode = reiser4_alloc_inode, -+ .destroy_inode = reiser4_destroy_inode, -+ .dirty_inode = reiser4_dirty_inode, -+ .delete_inode = reiser4_delete_inode, -+ .put_super = reiser4_put_super, -+ .write_super = reiser4_write_super, -+ .statfs = reiser4_statfs, -+ .clear_inode = reiser4_clear_inode, -+ .sync_inodes = reiser4_sync_inodes, -+ .show_options = reiser4_show_options -+}; -+ -+/** -+ * fill_super - initialize super block on mount -+ * @super: super block to fill -+ * @data: reiser4 specific mount option -+ * @silent: -+ * -+ * This is to be called by reiser4_get_sb. Mounts filesystem. 
-+ */ -+static int fill_super(struct super_block *super, void *data, int silent) -+{ -+ reiser4_context ctx; -+ int result; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("zam-989", super != NULL); -+ -+ super->s_op = NULL; -+ init_stack_context(&ctx, super); -+ -+ /* allocate reiser4 specific super block */ -+ if ((result = reiser4_init_fs_info(super)) != 0) -+ goto failed_init_sinfo; -+ -+ sbinfo = get_super_private(super); -+ /* initialize various reiser4 parameters, parse mount options */ -+ if ((result = reiser4_init_super_data(super, data)) != 0) -+ goto failed_init_super_data; -+ -+ /* read reiser4 master super block, initialize disk format plugin */ -+ if ((result = reiser4_init_read_super(super, silent)) != 0) -+ goto failed_init_read_super; -+ -+ /* initialize transaction manager */ -+ reiser4_init_txnmgr(&sbinfo->tmgr); -+ -+ /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */ -+ if ((result = reiser4_init_ktxnmgrd(super)) != 0) -+ goto failed_init_ktxnmgrd; -+ -+ /* initialize entd context and start kernel thread entd */ -+ if ((result = reiser4_init_entd(super)) != 0) -+ goto failed_init_entd; -+ -+ /* initialize address spaces for formatted nodes and bitmaps */ -+ if ((result = reiser4_init_formatted_fake(super)) != 0) -+ goto failed_init_formatted_fake; -+ -+ /* initialize disk format plugin */ -+ if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 ) -+ goto failed_init_disk_format; -+ -+ /* -+ * There are some 'committed' versions of reiser4 super block counters, -+ * which correspond to reiser4 on-disk state. 
These counters are -+ * initialized here -+ */ -+ sbinfo->blocks_free_committed = sbinfo->blocks_free; -+ sbinfo->nr_files_committed = oids_used(super); -+ -+ /* get inode of root directory */ -+ if ((result = reiser4_init_root_inode(super)) != 0) -+ goto failed_init_root_inode; -+ -+ if ((result = get_super_private(super)->df_plug->version_update(super)) != 0 ) -+ goto failed_update_format_version; -+ -+ process_safelinks(super); -+ reiser4_exit_context(&ctx); -+ -+ sbinfo->debugfs_root = debugfs_create_dir(super->s_id, -+ reiser4_debugfs_root); -+ if (sbinfo->debugfs_root) { -+ sbinfo->tmgr.debugfs_atom_count = -+ debugfs_create_u32("atom_count", S_IFREG|S_IRUSR, -+ sbinfo->debugfs_root, -+ &sbinfo->tmgr.atom_count); -+ sbinfo->tmgr.debugfs_id_count = -+ debugfs_create_u32("id_count", S_IFREG|S_IRUSR, -+ sbinfo->debugfs_root, -+ &sbinfo->tmgr.id_count); -+ } -+ return 0; -+ -+ failed_update_format_version: -+ failed_init_root_inode: -+ if (sbinfo->df_plug->release) -+ sbinfo->df_plug->release(super); -+ failed_init_disk_format: -+ reiser4_done_formatted_fake(super); -+ failed_init_formatted_fake: -+ reiser4_done_entd(super); -+ failed_init_entd: -+ reiser4_done_ktxnmgrd(super); -+ failed_init_ktxnmgrd: -+ reiser4_done_txnmgr(&sbinfo->tmgr); -+ failed_init_read_super: -+ failed_init_super_data: -+ reiser4_done_fs_info(super); -+ failed_init_sinfo: -+ reiser4_exit_context(&ctx); -+ return result; -+} -+ -+/** -+ * reiser4_get_sb - get_sb of file_system_type operations -+ * @fs_type: -+ * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc -+ * @dev_name: block device file name -+ * @data: specific mount options -+ * -+ * Reiser4 mount entry. 
-+ */ -+static int reiser4_get_sb(struct file_system_type *fs_type, int flags, -+ const char *dev_name, void *data, struct vfsmount *mnt) -+{ -+ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); -+} -+ -+/* structure describing the reiser4 filesystem implementation */ -+static struct file_system_type reiser4_fs_type = { -+ .owner = THIS_MODULE, -+ .name = "reiser4", -+ .fs_flags = FS_REQUIRES_DEV, -+ .get_sb = reiser4_get_sb, -+ .kill_sb = kill_block_super, -+ .next = NULL -+}; -+ -+void destroy_reiser4_cache(struct kmem_cache **cachep) -+{ -+ BUG_ON(*cachep == NULL); -+ kmem_cache_destroy(*cachep); -+ *cachep = NULL; -+} -+ -+/** -+ * init_reiser4 - reiser4 initialization entry point -+ * -+ * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called -+ * on kernel initialization or during reiser4 module load. -+ */ -+static int __init init_reiser4(void) -+{ -+ int result; -+ -+ printk(KERN_INFO -+ "Loading Reiser4. " -+ "See www.namesys.com for a description of Reiser4.\n"); -+ -+ /* initialize slab cache of inodes */ -+ if ((result = init_inodes()) != 0) -+ goto failed_inode_cache; -+ -+ /* initialize cache of znodes */ -+ if ((result = init_znodes()) != 0) -+ goto failed_init_znodes; -+ -+ /* initialize all plugins */ -+ if ((result = init_plugins()) != 0) -+ goto failed_init_plugins; -+ -+ /* initialize cache of plugin_set-s and plugin_set's hash table */ -+ if ((result = init_plugin_set()) != 0) -+ goto failed_init_plugin_set; -+ -+ /* initialize caches of txn_atom-s and txn_handle-s */ -+ if ((result = init_txnmgr_static()) != 0) -+ goto failed_init_txnmgr_static; -+ -+ /* initialize cache of jnodes */ -+ if ((result = init_jnodes()) != 0) -+ goto failed_init_jnodes; -+ -+ /* initialize cache of flush queues */ -+ if ((result = reiser4_init_fqs()) != 0) -+ goto failed_init_fqs; -+ -+ /* initialize cache of structures attached to dentry->d_fsdata */ -+ if ((result = reiser4_init_dentry_fsdata()) != 0) -+ goto 
failed_init_dentry_fsdata; -+ -+ /* initialize cache of structures attached to file->private_data */ -+ if ((result = reiser4_init_file_fsdata()) != 0) -+ goto failed_init_file_fsdata; -+ -+ /* -+ * initialize cache of d_cursors. See plugin/file_ops_readdir.c for -+ * more details -+ */ -+ if ((result = reiser4_init_d_cursor()) != 0) -+ goto failed_init_d_cursor; -+ -+ if ((result = register_filesystem(&reiser4_fs_type)) == 0) { -+ reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL); -+ return 0; -+ } -+ -+ reiser4_done_d_cursor(); -+ failed_init_d_cursor: -+ reiser4_done_file_fsdata(); -+ failed_init_file_fsdata: -+ reiser4_done_dentry_fsdata(); -+ failed_init_dentry_fsdata: -+ reiser4_done_fqs(); -+ failed_init_fqs: -+ done_jnodes(); -+ failed_init_jnodes: -+ done_txnmgr_static(); -+ failed_init_txnmgr_static: -+ done_plugin_set(); -+ failed_init_plugin_set: -+ failed_init_plugins: -+ done_znodes(); -+ failed_init_znodes: -+ done_inodes(); -+ failed_inode_cache: -+ return result; -+} -+ -+/** -+ * done_reiser4 - reiser4 exit entry point -+ * -+ * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown -+ * or at module unload. 
-+ */ -+static void __exit done_reiser4(void) -+{ -+ int result; -+ -+ debugfs_remove(reiser4_debugfs_root); -+ result = unregister_filesystem(&reiser4_fs_type); -+ BUG_ON(result != 0); -+ reiser4_done_d_cursor(); -+ reiser4_done_file_fsdata(); -+ reiser4_done_dentry_fsdata(); -+ reiser4_done_fqs(); -+ done_jnodes(); -+ done_txnmgr_static(); -+ done_plugin_set(); -+ done_znodes(); -+ destroy_reiser4_cache(&inode_cache); -+} -+ -+module_init(init_reiser4); -+module_exit(done_reiser4); -+ -+MODULE_DESCRIPTION("Reiser4 filesystem"); -+MODULE_AUTHOR("Hans Reiser "); -+ -+MODULE_LICENSE("GPL"); -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/tap.c linux-2.6.24/fs/reiser4/tap.c ---- linux-2.6.24.orig/fs/reiser4/tap.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/tap.c 2008-01-25 11:39:07.092247874 +0300 -@@ -0,0 +1,377 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ Tree Access Pointer (tap). -+ -+ tap is data structure combining coord and lock handle (mostly). It is -+ useful when one has to scan tree nodes (for example, in readdir, or flush), -+ for tap functions allow to move tap in either direction transparently -+ crossing unit/item/node borders. -+ -+ Tap doesn't provide automatic synchronization of its fields as it is -+ supposed to be per-thread object. 
-+*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "coord.h" -+#include "tree.h" -+#include "context.h" -+#include "tap.h" -+#include "znode.h" -+#include "tree_walk.h" -+ -+#if REISER4_DEBUG -+static int tap_invariant(const tap_t * tap); -+static void tap_check(const tap_t * tap); -+#else -+#define tap_check(tap) noop -+#endif -+ -+/** load node tap is pointing to, if not loaded already */ -+int reiser4_tap_load(tap_t * tap) -+{ -+ tap_check(tap); -+ if (tap->loaded == 0) { -+ int result; -+ -+ result = zload_ra(tap->coord->node, &tap->ra_info); -+ if (result != 0) -+ return result; -+ coord_clear_iplug(tap->coord); -+ } -+ ++tap->loaded; -+ tap_check(tap); -+ return 0; -+} -+ -+/** release node tap is pointing to. Dual to tap_load() */ -+void reiser4_tap_relse(tap_t * tap) -+{ -+ tap_check(tap); -+ if (tap->loaded > 0) { -+ --tap->loaded; -+ if (tap->loaded == 0) { -+ zrelse(tap->coord->node); -+ } -+ } -+ tap_check(tap); -+} -+ -+/** -+ * init tap to consist of @coord and @lh. Locks on nodes will be acquired with -+ * @mode -+ */ -+void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh, -+ znode_lock_mode mode) -+{ -+ tap->coord = coord; -+ tap->lh = lh; -+ tap->mode = mode; -+ tap->loaded = 0; -+ INIT_LIST_HEAD(&tap->linkage); -+ reiser4_init_ra_info(&tap->ra_info); -+} -+ -+/** add @tap to the per-thread list of all taps */ -+void reiser4_tap_monitor(tap_t * tap) -+{ -+ assert("nikita-2623", tap != NULL); -+ tap_check(tap); -+ list_add(&tap->linkage, reiser4_taps_list()); -+ tap_check(tap); -+} -+ -+/* duplicate @src into @dst. Copy lock handle. @dst is not initially -+ * loaded. 
*/ -+void reiser4_tap_copy(tap_t * dst, tap_t * src) -+{ -+ assert("nikita-3193", src != NULL); -+ assert("nikita-3194", dst != NULL); -+ -+ *dst->coord = *src->coord; -+ if (src->lh->node) -+ copy_lh(dst->lh, src->lh); -+ dst->mode = src->mode; -+ dst->loaded = 0; -+ INIT_LIST_HEAD(&dst->linkage); -+ dst->ra_info = src->ra_info; -+} -+ -+/** finish with @tap */ -+void reiser4_tap_done(tap_t * tap) -+{ -+ assert("nikita-2565", tap != NULL); -+ tap_check(tap); -+ if (tap->loaded > 0) -+ zrelse(tap->coord->node); -+ done_lh(tap->lh); -+ tap->loaded = 0; -+ list_del_init(&tap->linkage); -+ tap->coord->node = NULL; -+} -+ -+/** -+ * move @tap to the new node, locked with @target. Load @target, if @tap was -+ * already loaded. -+ */ -+int reiser4_tap_move(tap_t * tap, lock_handle * target) -+{ -+ int result = 0; -+ -+ assert("nikita-2567", tap != NULL); -+ assert("nikita-2568", target != NULL); -+ assert("nikita-2570", target->node != NULL); -+ assert("nikita-2569", tap->coord->node == tap->lh->node); -+ -+ tap_check(tap); -+ if (tap->loaded > 0) -+ result = zload_ra(target->node, &tap->ra_info); -+ -+ if (result == 0) { -+ if (tap->loaded > 0) -+ zrelse(tap->coord->node); -+ done_lh(tap->lh); -+ copy_lh(tap->lh, target); -+ tap->coord->node = target->node; -+ coord_clear_iplug(tap->coord); -+ } -+ tap_check(tap); -+ return result; -+} -+ -+/** -+ * move @tap to @target. Acquire lock on @target, if @tap was already -+ * loaded. 
-+ */ -+static int tap_to(tap_t * tap, znode * target) -+{ -+ int result; -+ -+ assert("nikita-2624", tap != NULL); -+ assert("nikita-2625", target != NULL); -+ -+ tap_check(tap); -+ result = 0; -+ if (tap->coord->node != target) { -+ lock_handle here; -+ -+ init_lh(&here); -+ result = longterm_lock_znode(&here, target, -+ tap->mode, ZNODE_LOCK_HIPRI); -+ if (result == 0) { -+ result = reiser4_tap_move(tap, &here); -+ done_lh(&here); -+ } -+ } -+ tap_check(tap); -+ return result; -+} -+ -+/** -+ * move @tap to given @target, loading and locking @target->node if -+ * necessary -+ */ -+int tap_to_coord(tap_t * tap, coord_t * target) -+{ -+ int result; -+ -+ tap_check(tap); -+ result = tap_to(tap, target->node); -+ if (result == 0) -+ coord_dup(tap->coord, target); -+ tap_check(tap); -+ return result; -+} -+ -+/** return list of all taps */ -+struct list_head *reiser4_taps_list(void) -+{ -+ return &get_current_context()->taps; -+} -+ -+/** helper function for go_{next,prev}_{item,unit,node}() */ -+int go_dir_el(tap_t * tap, sideof dir, int units_p) -+{ -+ coord_t dup; -+ coord_t *coord; -+ int result; -+ -+ int (*coord_dir) (coord_t *); -+ int (*get_dir_neighbor) (lock_handle *, znode *, int, int); -+ void (*coord_init) (coord_t *, const znode *); -+ ON_DEBUG(int (*coord_check) (const coord_t *)); -+ -+ assert("nikita-2556", tap != NULL); -+ assert("nikita-2557", tap->coord != NULL); -+ assert("nikita-2558", tap->lh != NULL); -+ assert("nikita-2559", tap->coord->node != NULL); -+ -+ tap_check(tap); -+ if (dir == LEFT_SIDE) { -+ coord_dir = units_p ? coord_prev_unit : coord_prev_item; -+ get_dir_neighbor = reiser4_get_left_neighbor; -+ coord_init = coord_init_last_unit; -+ } else { -+ coord_dir = units_p ? coord_next_unit : coord_next_item; -+ get_dir_neighbor = reiser4_get_right_neighbor; -+ coord_init = coord_init_first_unit; -+ } -+ ON_DEBUG(coord_check = -+ units_p ? 
coord_is_existing_unit : coord_is_existing_item); -+ assert("nikita-2560", coord_check(tap->coord)); -+ -+ coord = tap->coord; -+ coord_dup(&dup, coord); -+ if (coord_dir(&dup) != 0) { -+ do { -+ /* move to the left neighboring node */ -+ lock_handle dup; -+ -+ init_lh(&dup); -+ result = -+ get_dir_neighbor(&dup, coord->node, (int)tap->mode, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result == 0) { -+ result = reiser4_tap_move(tap, &dup); -+ if (result == 0) -+ coord_init(tap->coord, dup.node); -+ done_lh(&dup); -+ } -+ /* skip empty nodes */ -+ } while ((result == 0) && node_is_empty(coord->node)); -+ } else { -+ result = 0; -+ coord_dup(coord, &dup); -+ } -+ assert("nikita-2564", ergo(!result, coord_check(tap->coord))); -+ tap_check(tap); -+ return result; -+} -+ -+/** -+ * move @tap to the next unit, transparently crossing item and node -+ * boundaries -+ */ -+int go_next_unit(tap_t * tap) -+{ -+ return go_dir_el(tap, RIGHT_SIDE, 1); -+} -+ -+/** -+ * move @tap to the previous unit, transparently crossing item and node -+ * boundaries -+ */ -+int go_prev_unit(tap_t * tap) -+{ -+ return go_dir_el(tap, LEFT_SIDE, 1); -+} -+ -+/** -+ * @shift times apply @actor to the @tap. This is used to move @tap by -+ * @shift units (or items, or nodes) in either direction. 
-+ */ -+static int rewind_to(tap_t * tap, go_actor_t actor, int shift) -+{ -+ int result; -+ -+ assert("nikita-2555", shift >= 0); -+ assert("nikita-2562", tap->coord->node == tap->lh->node); -+ -+ tap_check(tap); -+ result = reiser4_tap_load(tap); -+ if (result != 0) -+ return result; -+ -+ for (; shift > 0; --shift) { -+ result = actor(tap); -+ assert("nikita-2563", tap->coord->node == tap->lh->node); -+ if (result != 0) -+ break; -+ } -+ reiser4_tap_relse(tap); -+ tap_check(tap); -+ return result; -+} -+ -+/** move @tap @shift units rightward */ -+int rewind_right(tap_t * tap, int shift) -+{ -+ return rewind_to(tap, go_next_unit, shift); -+} -+ -+/** move @tap @shift units leftward */ -+int rewind_left(tap_t * tap, int shift) -+{ -+ return rewind_to(tap, go_prev_unit, shift); -+} -+ -+#if REISER4_DEBUG -+/** debugging function: print @tap content in human readable form */ -+static void print_tap(const char *prefix, const tap_t * tap) -+{ -+ if (tap == NULL) { -+ printk("%s: null tap\n", prefix); -+ return; -+ } -+ printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix, -+ tap->loaded, (&tap->linkage == tap->linkage.next && -+ &tap->linkage == tap->linkage.prev), -+ tap->lh->node, -+ lock_mode_name(tap->mode)); -+ print_coord("\tcoord", tap->coord, 0); -+} -+ -+/** check [tap-sane] invariant */ -+static int tap_invariant(const tap_t * tap) -+{ -+ /* [tap-sane] invariant */ -+ -+ if (tap == NULL) -+ return 1; -+ /* tap->mode is one of -+ * -+ * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and -+ */ -+ if (tap->mode != ZNODE_NO_LOCK && -+ tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK) -+ return 2; -+ /* tap->coord != NULL, and */ -+ if (tap->coord == NULL) -+ return 3; -+ /* tap->lh != NULL, and */ -+ if (tap->lh == NULL) -+ return 4; -+ /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */ -+ if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node))) -+ return 5; -+ /* tap->coord->node == tap->lh->node if tap->lh->node 
is not 0 */ -+ if (tap->lh->node != NULL && tap->coord->node != tap->lh->node) -+ return 6; -+ return 0; -+} -+ -+/** debugging function: check internal @tap consistency */ -+static void tap_check(const tap_t * tap) -+{ -+ int result; -+ -+ result = tap_invariant(tap); -+ if (result != 0) { -+ print_tap("broken", tap); -+ reiser4_panic("nikita-2831", "tap broken: %i\n", result); -+ } -+} -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/tap.h linux-2.6.24/fs/reiser4/tap.h ---- linux-2.6.24.orig/fs/reiser4/tap.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/tap.h 2008-01-25 11:39:07.092247874 +0300 -@@ -0,0 +1,70 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* Tree Access Pointers. See tap.c for more details. */ -+ -+#if !defined( __REISER4_TAP_H__ ) -+#define __REISER4_TAP_H__ -+ -+#include "forward.h" -+#include "readahead.h" -+ -+/** -+ tree_access_pointer aka tap. Data structure combining coord_t and lock -+ handle. -+ Invariants involving this data-type, see doc/lock-ordering for details: -+ -+ [tap-sane] -+ */ -+struct tree_access_pointer { -+ /* coord tap is at */ -+ coord_t *coord; -+ /* lock handle on ->coord->node */ -+ lock_handle *lh; -+ /* mode of lock acquired by this tap */ -+ znode_lock_mode mode; -+ /* incremented by reiser4_tap_load(). -+ Decremented by reiser4_tap_relse(). 
*/ -+ int loaded; -+ /* list of taps */ -+ struct list_head linkage; -+ /* read-ahead hint */ -+ ra_info_t ra_info; -+}; -+ -+typedef int (*go_actor_t) (tap_t * tap); -+ -+extern int reiser4_tap_load(tap_t * tap); -+extern void reiser4_tap_relse(tap_t * tap); -+extern void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh, -+ znode_lock_mode mode); -+extern void reiser4_tap_monitor(tap_t * tap); -+extern void reiser4_tap_copy(tap_t * dst, tap_t * src); -+extern void reiser4_tap_done(tap_t * tap); -+extern int reiser4_tap_move(tap_t * tap, lock_handle * target); -+extern int tap_to_coord(tap_t * tap, coord_t * target); -+ -+extern int go_dir_el(tap_t * tap, sideof dir, int units_p); -+extern int go_next_unit(tap_t * tap); -+extern int go_prev_unit(tap_t * tap); -+extern int rewind_right(tap_t * tap, int shift); -+extern int rewind_left(tap_t * tap, int shift); -+ -+extern struct list_head *reiser4_taps_list(void); -+ -+#define for_all_taps(tap) \ -+ for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \ -+ reiser4_taps_list() != &tap->linkage; \ -+ tap = list_entry(tap->linkage.next, tap_t, linkage)) -+ -+/* __REISER4_TAP_H__ */ -+#endif -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/tree.c linux-2.6.24/fs/reiser4/tree.c ---- linux-2.6.24.orig/fs/reiser4/tree.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/tree.c 2008-01-25 11:39:07.096248905 +0300 -@@ -0,0 +1,1876 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ * KEYS IN A TREE. -+ * -+ * The tree consists of nodes located on the disk. Node in the tree is either -+ * formatted or unformatted. Formatted node is one that has structure -+ * understood by the tree balancing and traversal code. 
Formatted nodes are -+ * further classified into leaf and internal nodes. Latter distinctions is -+ * (almost) of only historical importance: general structure of leaves and -+ * internal nodes is the same in Reiser4. Unformatted nodes contain raw data -+ * that are part of bodies of ordinary files and attributes. -+ * -+ * Each node in the tree spawns some interval in the key space. Key ranges for -+ * all nodes in the tree are disjoint. Actually, this only holds in some weak -+ * sense, because of the non-unique keys: intersection of key ranges for -+ * different nodes is either empty, or consists of exactly one key. -+ * -+ * Formatted node consists of a sequence of items. Each item spawns some -+ * interval in key space. Key ranges for all items in a tree are disjoint, -+ * modulo non-unique keys again. Items within nodes are ordered in the key -+ * order of the smallest key in a item. -+ * -+ * Particular type of item can be further split into units. Unit is piece of -+ * item that can be cut from item and moved into another item of the same -+ * time. Units are used by balancing code to repack data during balancing. -+ * -+ * Unit can be further split into smaller entities (for example, extent unit -+ * represents several pages, and it is natural for extent code to operate on -+ * particular pages and even bytes within one unit), but this is of no -+ * relevance to the generic balancing and lookup code. -+ * -+ * Although item is said to "spawn" range or interval of keys, it is not -+ * necessary that item contains piece of data addressable by each and every -+ * key in this range. For example, compound directory item, consisting of -+ * units corresponding to directory entries and keyed by hashes of file names, -+ * looks more as having "discrete spectrum": only some disjoint keys inside -+ * range occupied by this item really address data. 
-+ * -+ * No than less, each item always has well-defined least (minimal) key, that -+ * is recorded in item header, stored in the node this item is in. Also, item -+ * plugin can optionally define method ->max_key_inside() returning maximal -+ * key that can _possibly_ be located within this item. This method is used -+ * (mainly) to determine when given piece of data should be merged into -+ * existing item, in stead of creating new one. Because of this, even though -+ * ->max_key_inside() can be larger that any key actually located in the item, -+ * intervals -+ * -+ * [ reiser4_min_key( item ), ->max_key_inside( item ) ] -+ * -+ * are still disjoint for all items within the _same_ node. -+ * -+ * In memory node is represented by znode. It plays several roles: -+ * -+ * . something locks are taken on -+ * -+ * . something tracked by transaction manager (this is going to change) -+ * -+ * . something used to access node data -+ * -+ * . something used to maintain tree structure in memory: sibling and -+ * parental linkage. -+ * -+ * . something used to organize nodes into "slums" -+ * -+ * More on znodes see in znode.[ch] -+ * -+ * DELIMITING KEYS -+ * -+ * To simplify balancing, allow some flexibility in locking and speed up -+ * important coord cache optimization, we keep delimiting keys of nodes in -+ * memory. Depending on disk format (implemented by appropriate node plugin) -+ * node on disk can record both left and right delimiting key, only one of -+ * them, or none. Still, our balancing and tree traversal code keep both -+ * delimiting keys for a node that is in memory stored in the znode. When -+ * node is first brought into memory during tree traversal, its left -+ * delimiting key is taken from its parent, and its right delimiting key is -+ * either next key in its parent, or is right delimiting key of parent if -+ * node is the rightmost child of parent. -+ * -+ * Physical consistency of delimiting key is protected by special dk -+ * read-write lock. 
That is, delimiting keys can only be inspected or -+ * modified under this lock. But dk lock is only sufficient for fast -+ * "pessimistic" check, because to simplify code and to decrease lock -+ * contention, balancing (carry) only updates delimiting keys right before -+ * unlocking all locked nodes on the given tree level. For example, -+ * coord-by-key cache scans LRU list of recently accessed znodes. For each -+ * node it first does fast check under dk spin lock. If key looked for is -+ * not between delimiting keys for this node, next node is inspected and so -+ * on. If key is inside of the key range, long term lock is taken on node -+ * and key range is rechecked. -+ * -+ * COORDINATES -+ * -+ * To find something in the tree, you supply a key, and the key is resolved -+ * by coord_by_key() into a coord (coordinate) that is valid as long as the -+ * node the coord points to remains locked. As mentioned above trees -+ * consist of nodes that consist of items that consist of units. A unit is -+ * the smallest and indivisible piece of tree as far as balancing and tree -+ * search are concerned. Each node, item, and unit can be addressed by -+ * giving its level in the tree and the key occupied by this entity. A node -+ * knows what the key ranges are of the items within it, and how to find its -+ * items and invoke their item handlers, but it does not know how to access -+ * individual units within its items except through the item handlers. -+ * coord is a structure containing a pointer to the node, the ordinal number -+ * of the item within this node (a sort of item offset), and the ordinal -+ * number of the unit within this item. -+ * -+ * TREE LOOKUP -+ * -+ * There are two types of access to the tree: lookup and modification. -+ * -+ * Lookup is a search for the key in the tree. Search can look for either -+ * exactly the key given to it, or for the largest key that is not greater -+ * than the key given to it. 
This distinction is determined by "bias" -+ * parameter of search routine (coord_by_key()). coord_by_key() either -+ * returns error (key is not in the tree, or some kind of external error -+ * occurred), or successfully resolves key into coord. -+ * -+ * This resolution is done by traversing tree top-to-bottom from root level -+ * to the desired level. On levels above twig level (level one above the -+ * leaf level) nodes consist exclusively of internal items. Internal item is -+ * nothing more than pointer to the tree node on the child level. On twig -+ * level nodes consist of internal items intermixed with extent -+ * items. Internal items form normal search tree structure used by traversal -+ * to descent through the tree. -+ * -+ * TREE LOOKUP OPTIMIZATIONS -+ * -+ * Tree lookup described above is expensive even if all nodes traversed are -+ * already in the memory: for each node binary search within it has to be -+ * performed and binary searches are CPU consuming and tend to destroy CPU -+ * caches. -+ * -+ * Several optimizations are used to work around this: -+ * -+ * . cbk_cache (look-aside cache for tree traversals, see search.c for -+ * details) -+ * -+ * . seals (see seal.[ch]) -+ * -+ * . 
vroot (see search.c) -+ * -+ * General search-by-key is layered thusly: -+ * -+ * [check seal, if any] --ok--> done -+ * | -+ * failed -+ * | -+ * V -+ * [vroot defined] --no--> node = tree_root -+ * | | -+ * yes | -+ * | | -+ * V | -+ * node = vroot | -+ * | | -+ * | | -+ * | | -+ * V V -+ * [check cbk_cache for key] --ok--> done -+ * | -+ * failed -+ * | -+ * V -+ * [start tree traversal from node] -+ * -+ */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/item/static_stat.h" -+#include "plugin/item/item.h" -+#include "plugin/node/node.h" -+#include "plugin/plugin.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "carry.h" -+#include "carry_ops.h" -+#include "tap.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "page_cache.h" -+#include "super.h" -+#include "reiser4.h" -+#include "inode.h" -+ -+#include /* for struct super_block */ -+#include -+ -+/* Disk address (block number) never ever used for any real tree node. This is -+ used as block number of "uber" znode. -+ -+ Invalid block addresses are 0 by tradition. -+ -+*/ -+const reiser4_block_nr UBER_TREE_ADDR = 0ull; -+ -+#define CUT_TREE_MIN_ITERATIONS 64 -+ -+static int find_child_by_addr(znode * parent, znode * child, coord_t * result); -+ -+/* return node plugin of coord->node */ -+node_plugin *node_plugin_by_coord(const coord_t * coord) -+{ -+ assert("vs-1", coord != NULL); -+ assert("vs-2", coord->node != NULL); -+ -+ return coord->node->nplug; -+} -+ -+/* insert item into tree. Fields of @coord are updated so that they can be -+ * used by consequent insert operation. 
*/ -+insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item -+ * into */ , -+ const reiser4_key * key /* key of new item */ , -+ reiser4_item_data * data /* parameters for item -+ * creation */ , -+ coord_t * coord /* resulting insertion coord */ , -+ lock_handle * lh /* resulting lock -+ * handle */ , -+ tree_level stop_level /** level where to insert */ , -+ __u32 flags /* insertion flags */ ) -+{ -+ int result; -+ -+ assert("nikita-358", tree != NULL); -+ assert("nikita-360", coord != NULL); -+ -+ result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK, -+ FIND_EXACT, stop_level, stop_level, -+ flags | CBK_FOR_INSERT, NULL /*ra_info */ ); -+ switch (result) { -+ default: -+ break; -+ case CBK_COORD_FOUND: -+ result = IBK_ALREADY_EXISTS; -+ break; -+ case CBK_COORD_NOTFOUND: -+ assert("nikita-2017", coord->node != NULL); -+ result = insert_by_coord(coord, data, key, lh, 0 /*flags */ ); -+ break; -+ } -+ return result; -+} -+ -+/* insert item by calling carry. Helper function called if short-cut -+ insertion failed */ -+static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */ -+ lock_handle * lh, /* lock handle of insertion -+ * node */ -+ reiser4_item_data * data, /* parameters of new -+ * item */ -+ const reiser4_key * key, /* key of new item */ -+ carry_opcode cop, /* carry operation to perform */ -+ cop_insert_flag flags -+ /* carry flags */ ) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ carry_insert_data *cdata; -+ carry_op *op; -+ -+ assert("umka-314", coord != NULL); -+ -+ /* allocate carry_pool and 3 carry_level-s */ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*cdata)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ op = reiser4_post_carry(lowest_level, cop, coord->node, 0); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_carry_pool(pool); -+ 
return RETERR(op ? PTR_ERR(op) : -EIO);
-+	}
-+	cdata = (carry_insert_data *) (lowest_level + 3);
-+	cdata->coord = coord;
-+	cdata->data = data;
-+	cdata->key = key;
-+	op->u.insert.d = cdata;
-+	if (flags == 0)
-+		flags = znode_get_tree(coord->node)->carry.insert_flags;
-+	op->u.insert.flags = flags;
-+	op->u.insert.type = COPT_ITEM_DATA;
-+	op->u.insert.child = NULL;
-+	if (lh != NULL) {
-+		assert("nikita-3245", lh->node == coord->node);
-+		lowest_level->track_type = CARRY_TRACK_CHANGE;
-+		lowest_level->tracked = lh;
-+	}
-+
-+	result = reiser4_carry(lowest_level, NULL);
-+	done_carry_pool(pool);
-+
-+	return result;
-+}
-+
-+/* form carry queue to perform paste of @data with @key at @coord, and launch
-+   its execution by calling carry().
-+
-+   Instruct carry to update @lh if, after balancing, insertion coord moves
-+   into different block.
-+
-+   One carry pool is allocated with room for the pool header, three carry
-+   levels and one carry_insert_data; the insert data is placed right behind
-+   the third level (lowest_level + 3). When @flags is 0 the per-tree default
-+   carry.paste_flags is used instead.
-+
-+   Returns the result of reiser4_carry(), PTR_ERR() of a failed pool or
-+   operation allocation, or -EIO when reiser4_post_carry() returned NULL.
-+   The pool is released on every path that got past its allocation.
-+
-+*/
-+static int paste_with_carry(coord_t * coord,	/* coord of paste */
-+			    lock_handle * lh,	/* lock handle of node
-+						 * where item is
-+						 * pasted; may be NULL, in
-+						 * which case nothing is
-+						 * tracked */
-+			    reiser4_item_data * data,	/* parameters of new
-+							 * item */
-+			    const reiser4_key * key,	/* key of new item */
-+			    unsigned flags /* paste flags */ )
-+{
-+	int result;
-+	carry_pool *pool;
-+	carry_level *lowest_level;
-+	carry_insert_data *cdata;
-+	carry_op *op;
-+
-+	assert("umka-315", coord != NULL);
-+	assert("umka-316", key != NULL);
-+
-+	pool =
-+	    init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
-+			    sizeof(*cdata));
-+	if (IS_ERR(pool))
-+		return PTR_ERR(pool);
-+	lowest_level = (carry_level *) (pool + 1);	/* levels follow the pool header */
-+	init_carry_level(lowest_level, pool);
-+
-+	op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
-+	if (IS_ERR(op) || (op == NULL)) {
-+		done_carry_pool(pool);
-+		return RETERR(op ? PTR_ERR(op) : -EIO);
-+	}
-+	cdata = (carry_insert_data *) (lowest_level + 3);	/* behind the 3 levels */
-+	cdata->coord = coord;
-+	cdata->data = data;
-+	cdata->key = key;
-+	op->u.paste.d = cdata;
-+	if (flags == 0)
-+		flags = znode_get_tree(coord->node)->carry.paste_flags;
-+	op->u.paste.flags = flags;
-+	op->u.paste.type = COPT_ITEM_DATA;
-+	if (lh != NULL) {
-+		lowest_level->track_type = CARRY_TRACK_CHANGE;
-+		lowest_level->tracked = lh;
-+	}
-+
-+	result = reiser4_carry(lowest_level, NULL);
-+	done_carry_pool(pool);
-+
-+	return result;
-+}
-+
-+/* insert item at the given coord.
-+
-+   First try to skip carry by directly calling ->create_item() method of node
-+   plugin. If this is impossible (there is not enough free space in the node,
-+   or leftmost item in the node is created), call insert_with_carry_by_coord()
-+   that will do full carry().
-+
-+   The node is zload()-ed for the duration of the call and zrelse()-d before
-+   returning. Returns the zload() error, -E_NODE_FULL when the item does not
-+   fit and @flags forbid shifting left, shifting right and allocation, or
-+   otherwise the result of ->create_item() / insert_with_carry_by_coord().
-+
-+*/
-+insert_result insert_by_coord(coord_t * coord	/* coord where to
-+						 * insert. coord->node has
-+						 * to be write locked by
-+						 * caller */ ,
-+			      reiser4_item_data * data	/* data to be
-+							 * inserted */ ,
-+			      const reiser4_key * key /* key of new item */ ,
-+			      lock_handle * lh	/* lock handle of write
-+						 * lock on node */ ,
-+			      __u32 flags /* insertion flags */ )
-+{
-+	unsigned item_size;
-+	int result;
-+	znode *node;
-+
-+	assert("vs-247", coord != NULL);
-+	assert("vs-248", data != NULL);
-+	assert("vs-249", data->length >= 0);
-+	assert("nikita-1191", znode_is_write_locked(coord->node));
-+
-+	node = coord->node;
-+	coord_clear_iplug(coord);
-+	result = zload(node);
-+	if (result != 0)
-+		return result;
-+
-+	item_size = space_needed(node, NULL, data, 1);
-+	if (item_size > znode_free_space(node) &&
-+	    (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
-+	    && (flags & COPI_DONT_ALLOCATE)) {
-+		/* we are forced to use free space of coord->node and new item
-+		   does not fit into it.
-+
-+		   Currently we get here only when we allocate and copy units
-+		   of extent item from a node to its left neighbor during
-+		   "squalloc"-ing. If @node (this is left neighbor) does not
-+		   have enough free space - we do not want to attempt any
-+		   shifting and allocations because we are in squeezing and
-+		   everything to the left of @node is tightly packed.
-+		 */
-+		result = -E_NODE_FULL;
-+	} else if ((item_size <= znode_free_space(node)) &&
-+		   !coord_is_before_leftmost(coord) &&
-+		   (node_plugin_by_node(node)->fast_insert != NULL)
-+		   && node_plugin_by_node(node)->fast_insert(coord)) {
-+		/* shortcut insertion without carry() overhead.
-+
-+		   Only possible if:
-+
-+		   - there is enough free space
-+
-+		   - insertion is not into the leftmost position in a node
-+		   (otherwise it would require updating of delimiting key in a
-+		   parent)
-+
-+		   - node plugin agrees with this
-+
-+		 */
-+		result =
-+		    node_plugin_by_node(node)->create_item(coord, key, data,
-+							   NULL);
-+		znode_make_dirty(node);	/* done regardless of what create_item() returned */
-+	} else {
-+		/* otherwise do full-fledged carry(). */
-+		result =
-+		    insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
-+					       flags);
-+	}
-+	zrelse(node);
-+	return result;
-+}
-+
-+/* @coord is set to leaf level and @data is to be inserted to twig level.
-+
-+   Always goes through insert_with_carry_by_coord() with COP_EXTENT and
-+   flags 0 (there is no fast path here) and returns its result. */
-+insert_result
-+insert_extent_by_coord(coord_t *
-+		       coord
-+		       /* coord where to insert. coord->node has to be write locked by caller */
-+		       ,
-+		       reiser4_item_data * data /* data to be inserted */ ,
-+		       const reiser4_key * key /* key of new item */ ,
-+		       lock_handle *
-+		       lh /* lock handle of write lock on node */ )
-+{
-+	assert("vs-405", coord != NULL);
-+	assert("vs-406", data != NULL);
-+	assert("vs-407", data->length > 0);
-+	assert("vs-408", znode_is_write_locked(coord->node));
-+	assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
-+
-+	return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
-+					  0 /*flags */ );
-+}
-+
-+/* Insert into the item at the given coord.
-+
-+   First try to skip carry by directly calling ->paste() method of item
-+   plugin.
If this is impossible (there is not enough free space in the node,
-+   or we are pasting into leftmost position in the node), call
-+   paste_with_carry() that will do full carry().
-+
-+   Returns -E_NODE_FULL when the data does not fit and @flags forbid
-+   shifting left, shifting right and allocation; otherwise the result of
-+   ->paste() (fast path) or paste_with_carry().
-+
-+*/
-+/* paste_into_item */
-+int insert_into_item(coord_t * coord /* coord of pasting */ ,
-+		     lock_handle * lh /* lock handle on node involved */ ,
-+		     const reiser4_key * key /* key of unit being pasted */ ,
-+		     reiser4_item_data * data /* parameters for new unit */ ,
-+		     unsigned flags /* insert/paste flags */ )
-+{
-+	int result;
-+	int size_change;
-+	node_plugin *nplug;
-+	item_plugin *iplug;
-+
-+	assert("umka-317", coord != NULL);
-+	assert("umka-318", key != NULL);
-+
-+	iplug = item_plugin_by_coord(coord);
-+	nplug = node_plugin_by_coord(coord);
-+
-+	assert("nikita-1480", iplug == data->iplug);
-+
-+	size_change = space_needed(coord->node, coord, data, 0);
-+	if (size_change > (int)znode_free_space(coord->node) &&
-+	    (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
-+	    && (flags & COPI_DONT_ALLOCATE)) {
-+		/* we are forced to use free space of coord->node and new data
-+		   does not fit into it. */
-+		return -E_NODE_FULL;
-+	}
-+
-+	/* shortcut paste without carry() overhead.
-+
-+	   Only possible if:
-+
-+	   - there is enough free space
-+
-+	   - paste is not into the leftmost unit in a node (otherwise
-+	   it would require updating of delimiting key in a parent)
-+
-+	   - node plugin agrees with this
-+
-+	   - item plugin agrees with us
-+
-+	   NOTE(review): the stand-alone "coord->unit_pos != 0" test below
-+	   makes the parenthesized leftmost-unit check before it redundant
-+	   and also keeps every paste at unit_pos == 0 off the fast path,
-+	   including AFTER_UNIT ones - confirm this is intended.
-+	 */
-+	if (size_change <= (int)znode_free_space(coord->node) &&
-+	    (coord->item_pos != 0 ||
-+	     coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
-+	    coord->unit_pos != 0 && nplug->fast_paste != NULL &&
-+	    nplug->fast_paste(coord) &&
-+	    iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
-+		if (size_change > 0)
-+			nplug->change_item_size(coord, size_change);	/* grow before pasting */
-+		/* NOTE-NIKITA: huh? where @key is used? */
-+		result = iplug->b.paste(coord, data, NULL);
-+		if (size_change < 0)
-+			nplug->change_item_size(coord, size_change);	/* shrink after pasting */
-+		znode_make_dirty(coord->node);
-+	} else
-+		/* otherwise do full-fledged carry(). */
-+		result = paste_with_carry(coord, lh, data, key, flags);
-+	return result;
-+}
-+
-+/* this either appends or truncates item @coord
-+
-+   A negative data->length shrinks the item by -data->length through the
-+   node plugin's ->shrink_item(); a positive one goes to insert_into_item().
-+   coord->node is zload()-ed around the operation. Returns the zload()
-+   error or the result of the chosen operation. */
-+int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
-+			reiser4_item_data * data /* parameters of resize */ ,
-+			reiser4_key * key /* key of new unit */ ,
-+			lock_handle * lh	/* lock handle of node
-+						 * being modified */ ,
-+			cop_insert_flag flags /* carry flags */ )
-+{
-+	int result;
-+	znode *node;
-+
-+	assert("nikita-362", coord != NULL);
-+	assert("nikita-363", data != NULL);
-+	assert("vs-245", data->length != 0);
-+
-+	node = coord->node;
-+	coord_clear_iplug(coord);
-+	result = zload(node);
-+	if (result != 0)
-+		return result;
-+
-+	if (data->length < 0)
-+		result = node_plugin_by_coord(coord)->shrink_item(coord,
-+								  -data->length);
-+	else
-+		result = insert_into_item(coord, lh, key, data, flags);
-+
-+	zrelse(node);
-+	return result;
-+}
-+
-+/* insert flow @f
-+
-+   Posts a single COP_INSERT_FLOW operation on coord->node and runs carry.
-+   The reiser4_item_data handed to the operation lives in the carry pool,
-+   right behind the three carry levels. Returns the result of
-+   reiser4_carry(), PTR_ERR() of a failed pool or operation allocation, or
-+   -EIO when reiser4_post_carry() returned NULL.
-+
-+   NOTE(review): unlike paste_with_carry(), @lh is stored as the tracked
-+   handle without a NULL check - presumably callers always pass one;
-+   confirm. */
-+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
-+{
-+	int result;
-+	carry_pool *pool;
-+	carry_level *lowest_level;
-+	reiser4_item_data *data;
-+	carry_op *op;
-+
-+	pool =
-+	    init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
-+			    sizeof(*data));
-+	if (IS_ERR(pool))
-+		return PTR_ERR(pool);
-+	lowest_level = (carry_level *) (pool + 1);
-+	init_carry_level(lowest_level, pool);
-+
-+	op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
-+				0 /* operate directly on coord -> node */ );
-+	if (IS_ERR(op) || (op == NULL)) {
-+		done_carry_pool(pool);
-+		return RETERR(op ? PTR_ERR(op) : -EIO);
-+	}
-+
-+	/* these are permanent during insert_flow */
-+	data = (reiser4_item_data *) (lowest_level + 3);
-+	data->user = 1;
-+	data->iplug = item_plugin_by_id(FORMATTING_ID);
-+	data->arg = NULL;
-+	/* data.length and data.data will be set before calling paste or
-+	   insert */
-+	data->length = 0;
-+	data->data = NULL;
-+
-+	op->u.insert_flow.flags = 0;
-+	op->u.insert_flow.insert_point = coord;
-+	op->u.insert_flow.flow = f;
-+	op->u.insert_flow.data = data;
-+	op->u.insert_flow.new_nodes = 0;
-+
-+	lowest_level->track_type = CARRY_TRACK_CHANGE;
-+	lowest_level->tracked = lh;
-+
-+	result = reiser4_carry(lowest_level, NULL);
-+	done_carry_pool(pool);
-+
-+	return result;
-+}
-+
-+/* Given a coord in parent node, obtain a znode for the corresponding child.
-+
-+   Returns ERR_PTR(-EIO) when @parent is at or below leaf level or when the
-+   item at @parent_coord is not an internal item. Otherwise the block
-+   address is read through the item plugin's ->down_link() and the child is
-+   looked up with zlook() (@incore_p set) or obtained with zget(); the
-+   result of that call is returned unchanged (the code allows for it being
-+   NULL or an ERR_PTR value). */
-+znode *child_znode(const coord_t * parent_coord	/* coord of pointer to
-+						 * child */ ,
-+		   znode * parent /* parent of child */ ,
-+		   int incore_p	/* if !0 only return child if already in
-+				 * memory */ ,
-+		   int setup_dkeys_p	/* if !0 update delimiting keys of
-+					 * child */ )
-+{
-+	znode *child;
-+
-+	assert("nikita-1374", parent_coord != NULL);
-+	assert("nikita-1482", parent != NULL);
-+#if REISER4_DEBUG
-+	if (setup_dkeys_p)
-+		assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
-+#endif
-+	assert("nikita-2947", znode_is_any_locked(parent));
-+
-+	if (znode_get_level(parent) <= LEAF_LEVEL) {
-+		/* trying to get child of leaf node */
-+		warning("nikita-1217", "Child of maize?");
-+		return ERR_PTR(RETERR(-EIO));
-+	}
-+	if (item_is_internal(parent_coord)) {
-+		reiser4_block_nr addr;
-+		item_plugin *iplug;
-+		reiser4_tree *tree;
-+
-+		iplug = item_plugin_by_coord(parent_coord);
-+		assert("vs-512", iplug->s.internal.down_link);
-+		iplug->s.internal.down_link(parent_coord, NULL, &addr);
-+
-+		tree = znode_get_tree(parent);
-+		if (incore_p)
-+			child = zlook(tree, &addr);
-+		else
-+			child =
-+			    zget(tree, &addr, parent,
-+				 znode_get_level(parent) - 1,
-+				 reiser4_ctx_gfp_mask_get());
-+		if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
-+			set_child_delimiting_keys(parent, parent_coord, child);
-+	} else {
-+		warning("nikita-1483", "Internal item expected");
-+		child = ERR_PTR(RETERR(-EIO));
-+	}
-+	return child;
-+}
-+
-+/* remove znode from transaction
-+
-+   Two steps. First the block is given back: a real (non-fake) block number
-+   is passed to reiser4_dealloc_block() with BA_DEFER | BA_FORMATTED and,
-+   if JNODE_FLUSH_RESERVED is set, one flush-reserved block is converted
-+   with flush_reserved2grabbed(); a fake block number is returned with
-+   fake_allocated2free(). Second the node is uncaptured: through its page
-+   if it has one, otherwise directly with reiser4_uncapture_block() after
-+   waiting while the node is flush-queued and the atom still has running
-+   queues.
-+
-+   The znode spin lock is taken and dropped several times; the order of
-+   the lock/unlock calls below is significant. */
-+static void uncapture_znode(znode * node)
-+{
-+	struct page *page;
-+
-+	assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
-+
-+	if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
-+		int ret;
-+
-+		/* An already allocated block goes right to the atom's delete set. */
-+		ret =
-+		    reiser4_dealloc_block(znode_get_block(node), 0,
-+					  BA_DEFER | BA_FORMATTED);
-+		if (ret)	/* failure is only reported, not propagated */
-+			warning("zam-942",
-+				"can\'t add a block (%llu) number to atom's delete set\n",
-+				(unsigned long long)(*znode_get_block(node)));
-+
-+		spin_lock_znode(node);
-+		/* Here we return flush reserved block which was reserved at the
-+		 * moment when this allocated node was marked dirty and still
-+		 * not used by flush in node relocation procedure. */
-+		if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
-+			txn_atom *atom;
-+
-+			atom = jnode_get_atom(ZJNODE(node));
-+			assert("zam-939", atom != NULL);
-+			spin_unlock_znode(node);
-+			flush_reserved2grabbed(atom, (__u64) 1);
-+			spin_unlock_atom(atom);	/* presumably jnode_get_atom() returned the atom locked - confirm */
-+		} else
-+			spin_unlock_znode(node);
-+	} else {
-+		/* znode has assigned block which is counted as "fake
-+		   allocated". Return it back to "free blocks") */
-+		fake_allocated2free((__u64) 1, BA_FORMATTED);
-+	}
-+
-+	/*
-+	 * uncapture page from transaction. There is a possibility of a race
-+	 * with ->releasepage(): reiser4_releasepage() detaches page from this
-+	 * jnode and we have nothing to uncapture. To avoid this, get
-+	 * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
-+	 * will deal with released page itself.
-+	 */
-+	spin_lock_znode(node);
-+	page = znode_page(node);
-+	if (likely(page != NULL)) {
-+		/*
-+		 * reiser4_uncapture_page() can only be called when we are sure
-+		 * that znode is pinned in memory, which we are, because
-+		 * forget_znode() is only called from longterm_unlock_znode().
-+		 */
-+		page_cache_get(page);
-+		spin_unlock_znode(node);
-+		lock_page(page);
-+		reiser4_uncapture_page(page);
-+		unlock_page(page);
-+		page_cache_release(page);
-+	} else {
-+		txn_atom *atom;
-+
-+		/* handle "flush queued" znodes */
-+		while (1) {
-+			atom = jnode_get_atom(ZJNODE(node));
-+			assert("zam-943", atom != NULL);
-+
-+			if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
-+			    || !atom->nr_running_queues)
-+				break;
-+
-+			spin_unlock_znode(node);
-+			reiser4_atom_wait_event(atom);
-+			spin_lock_znode(node);
-+		}
-+
-+		reiser4_uncapture_block(ZJNODE(node));
-+		spin_unlock_atom(atom);
-+		zput(node);
-+	}
-+}
-+
-+/* This is called from longterm_unlock_znode() when last lock is released from
-+   the node that has been removed from the tree. At this point node is removed
-+   from sibling list and its lock is invalidated.
-+
-+   NOTE(review): the body itself performs sibling_list_remove() and
-+   reiser4_invalidate_lock(), so the sentence above presumably describes
-+   the state on return rather than on entry - confirm.
-+
-+   Steps, in this order: unlink the node from the sibling list and call
-+   znode_remove() under the tree write lock, invalidate the lock behind
-+   @handle, then uncapture_znode(). */
-+void forget_znode(lock_handle * handle)
-+{
-+	znode *node;
-+	reiser4_tree *tree;
-+
-+	assert("umka-319", handle != NULL);
-+
-+	node = handle->node;
-+	tree = znode_get_tree(node);
-+
-+	assert("vs-164", znode_is_write_locked(node));
-+	assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
-+	assert_rw_locked(&(node->lock.guard));
-+
-+	/* We assume that this node was detached from its parent before
-+	 * unlocking, it gives no way to reach this node from parent through a
-+	 * down link.  The node should have no children and, thereby, can't be
-+	 * reached from them by their parent pointers.  The only way to obtain a
-+	 * reference to the node is to use sibling pointers from its left and
-+	 * right neighbors.  In the next several lines we remove the node from
-+	 * the sibling list. */
-+
-+	write_lock_tree(tree);
-+	sibling_list_remove(node);
-+	znode_remove(node, tree);
-+	write_unlock_tree(tree);
-+
-+	/* Here we set JNODE_DYING and cancel all pending lock requests.  It
-+	 * forces all lock requestor threads to repeat iterations of getting
-+	 * lock on a child, neighbor or parent node.  But, those threads can't
-+	 * come to this node again, because this node is no longer a child,
-+	 * neighbor or parent of any other node.  This order of znode
-+	 * invalidation does not allow other threads to waste cpu time in a busy
-+	 * loop, trying to lock dying object.  The exception is in the flush
-+	 * code when we take node directly from atom's capture list.*/
-+	reiser4_invalidate_lock(handle);
-+	uncapture_znode(node);
-+}
-+
-+/* Check that internal item at @pointer really contains pointer to @child.
-+
-+   Returns NS_FOUND when @pointer is an existing unit of an internal item
-+   whose down link equals the block number of @child, NS_NOT_FOUND in every
-+   other case. Note that @pointer is declared const but its cached item
-+   plugin is cleared through a cast. */
-+int check_tree_pointer(const coord_t * pointer	/* would-be pointer to
-+						 * @child */ ,
-+		       const znode * child /* child znode */ )
-+{
-+	assert("nikita-1016", pointer != NULL);
-+	assert("nikita-1017", child != NULL);
-+	assert("nikita-1018", pointer->node != NULL);
-+
-+	assert("nikita-1325", znode_is_any_locked(pointer->node));
-+
-+	assert("nikita-2985",
-+	       znode_get_level(pointer->node) == znode_get_level(child) + 1);
-+
-+	coord_clear_iplug((coord_t *) pointer);
-+
-+	if (coord_is_existing_unit(pointer)) {
-+		item_plugin *iplug;
-+		reiser4_block_nr addr;
-+
-+		if (item_is_internal(pointer)) {
-+			iplug = item_plugin_by_coord(pointer);
-+			assert("vs-513", iplug->s.internal.down_link);
-+			iplug->s.internal.down_link(pointer, NULL, &addr);
-+			/* check that cached value is correct */
-+			if (disk_addr_eq(&addr, znode_get_block(child))) {
-+				return NS_FOUND;
-+			}
-+		}
-+	}
-+	/* warning ("jmacd-1002", "tree pointer incorrect"); */
-+	return NS_NOT_FOUND;
-+}
-+
-+/* find coord of pointer to new @child in @parent.
-+
-+   Find the &coord_t in the @parent where pointer to a given @child will
-+   be in.
-+
-+   The position is derived from @left: find_child_ptr() locates the pointer
-+   to @left in @parent and @result is then marked AFTER_UNIT. @child itself
-+   is only checked by an assertion.
-+
-+   Returns -EIO when the pointer to @left cannot be found; otherwise
-+   NS_NOT_FOUND (passed through RETERR()), with @result describing the
-+   place where the new pointer goes.
-+
-+*/
-+int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
-+		       znode *
-+		       child UNUSED_ARG /* child znode, passed locked */ ,
-+		       znode * left /* left brother of new node */ ,
-+		       coord_t * result /* where result is stored in */ )
-+{
-+	int ret;
-+
-+	assert("nikita-1486", parent != NULL);
-+	assert("nikita-1487", child != NULL);
-+	assert("nikita-1488", result != NULL);
-+
-+	ret = find_child_ptr(parent, left, result);
-+	if (ret != NS_FOUND) {
-+		warning("nikita-1489", "Cannot find brother position: %i", ret);
-+		return RETERR(-EIO);
-+	} else {
-+		result->between = AFTER_UNIT;
-+		return RETERR(NS_NOT_FOUND);
-+	}
-+}
-+
-+/* find coord of pointer to @child in @parent.
-+
-+   Find the &coord_t in the @parent where pointer to a given @child is in.
-+
-+   Three attempts, in order: the position cached in child->in_parent
-+   (verified with check_tree_pointer()), a node plugin ->lookup() by the
-+   left delimiting key of @child (the cache is refreshed on a hit), and
-+   finally a scan of @parent by block number in find_child_by_addr().
-+
-+   Returns NS_FOUND with @result set, or whatever the last attempt made
-+   returned.
-+
-+*/
-+int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
-+		   znode * child /* child znode, passed locked */ ,
-+		   coord_t * result /* where result is stored in */ )
-+{
-+	int lookup_res;
-+	node_plugin *nplug;
-+	/* left delimiting key of a child */
-+	reiser4_key ld;
-+	reiser4_tree *tree;
-+
-+	assert("nikita-934", parent != NULL);
-+	assert("nikita-935", child != NULL);
-+	assert("nikita-936", result != NULL);
-+	assert("zam-356", znode_is_loaded(parent));
-+
-+	coord_init_zero(result);
-+	result->node = parent;
-+
-+	nplug = parent->nplug;
-+	assert("nikita-939", nplug != NULL);
-+
-+	tree = znode_get_tree(parent);
-+	/* NOTE-NIKITA taking read-lock on tree here assumes that @result is
-+	 * not aliased to ->in_parent of some znode. Otherwise,
-+	 * parent_coord_to_coord() below would modify data protected by tree
-+	 * lock. */
-+	read_lock_tree(tree);
-+	/* fast path. Try to use cached value. Lock tree to keep
-+	   node->pos_in_parent and pos->*_blocknr consistent.
-+
-+	   NOTE(review): if item_pos is narrower than int (the reset below
-+	   casts to unsigned short), integer promotion makes
-+	   "item_pos + 1 != 0" always true, so an invalidated cache entry is
-+	   never skipped here, only rejected by check_tree_pointer() -
-+	   confirm the field type. The reset below is also a store made while
-+	   holding only the read lock. */
-+	if (child->in_parent.item_pos + 1 != 0) {
-+		parent_coord_to_coord(&child->in_parent, result);
-+		if (check_tree_pointer(result, child) == NS_FOUND) {
-+			read_unlock_tree(tree);
-+			return NS_FOUND;
-+		}
-+
-+		child->in_parent.item_pos = (unsigned short)~0;	/* cached position was stale */
-+	}
-+	read_unlock_tree(tree);
-+
-+	/* if above failed, find some key from @child. We are looking for the
-+	   least key in a child. */
-+	read_lock_dk(tree);
-+	ld = *znode_get_ld_key(child);
-+	read_unlock_dk(tree);
-+	/*
-+	 * now, lookup parent with key just found. Note, that left delimiting
-+	 * key doesn't identify node uniquely, because (in extremely rare
-+	 * case) two nodes can have equal left delimiting keys, if one of them
-+	 * is completely filled with directory entries that all happened to be
-+	 * hash collision. But, we check block number in check_tree_pointer()
-+	 * and, so, are safe.
-+	 */
-+	lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
-+	/* update cached pos_in_node */
-+	if (lookup_res == NS_FOUND) {
-+		write_lock_tree(tree);
-+		coord_to_parent_coord(result, &child->in_parent);
-+		write_unlock_tree(tree);
-+		lookup_res = check_tree_pointer(result, child);
-+	}
-+	if (lookup_res == NS_NOT_FOUND)
-+		lookup_res = find_child_by_addr(parent, child, result);
-+	return lookup_res;
-+}
-+
-+/* find coord of pointer to @child in @parent by scanning
-+
-+   Find the &coord_t in the @parent where pointer to a given @child
-+   is in by scanning all internal items in @parent and comparing block
-+   numbers in them with that of @child.
-+ -+*/ -+static int find_child_by_addr(znode * parent /* parent znode, passed locked */ , -+ znode * child /* child znode, passed locked */ , -+ coord_t * result /* where result is stored in */ ) -+{ -+ int ret; -+ -+ assert("nikita-1320", parent != NULL); -+ assert("nikita-1321", child != NULL); -+ assert("nikita-1322", result != NULL); -+ -+ ret = NS_NOT_FOUND; -+ -+ for_all_units(result, parent) { -+ if (check_tree_pointer(result, child) == NS_FOUND) { -+ write_lock_tree(znode_get_tree(parent)); -+ coord_to_parent_coord(result, &child->in_parent); -+ write_unlock_tree(znode_get_tree(parent)); -+ ret = NS_FOUND; -+ break; -+ } -+ } -+ return ret; -+} -+ -+/* true, if @addr is "unallocated block number", which is just address, with -+ highest bit set. */ -+int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to -+ * check */ ) -+{ -+ assert("nikita-1766", addr != NULL); -+ cassert(sizeof(reiser4_block_nr) == 8); -+ return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) == -+ REISER4_UNALLOCATED_STATUS_VALUE; -+} -+ -+/* returns true if removing bytes of given range of key [from_key, to_key] -+ causes removing of whole item @from */ -+static int -+item_removed_completely(coord_t * from, const reiser4_key * from_key, -+ const reiser4_key * to_key) -+{ -+ item_plugin *iplug; -+ reiser4_key key_in_item; -+ -+ assert("umka-325", from != NULL); -+ assert("", item_is_extent(from)); -+ -+ /* check first key just for case */ -+ item_key_by_coord(from, &key_in_item); -+ if (keygt(from_key, &key_in_item)) -+ return 0; -+ -+ /* check last key */ -+ iplug = item_plugin_by_coord(from); -+ assert("vs-611", iplug && iplug->s.file.append_key); -+ -+ iplug->s.file.append_key(from, &key_in_item); -+ set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1); -+ -+ if (keylt(to_key, &key_in_item)) -+ /* last byte is not removed */ -+ return 0; -+ return 1; -+} -+ -+/* helper function for prepare_twig_kill(): @left and @right are formatted -+ * neighbors of extent item 
being completely removed. Load and lock neighbors -+ * and store lock handles into @cdata for later use by kill_hook_extent() */ -+static int -+prepare_children(znode * left, znode * right, carry_kill_data * kdata) -+{ -+ int result; -+ int left_loaded; -+ int right_loaded; -+ -+ result = 0; -+ left_loaded = right_loaded = 0; -+ -+ if (left != NULL) { -+ result = zload(left); -+ if (result == 0) { -+ left_loaded = 1; -+ result = longterm_lock_znode(kdata->left, left, -+ ZNODE_READ_LOCK, -+ ZNODE_LOCK_LOPRI); -+ } -+ } -+ if (result == 0 && right != NULL) { -+ result = zload(right); -+ if (result == 0) { -+ right_loaded = 1; -+ result = longterm_lock_znode(kdata->right, right, -+ ZNODE_READ_LOCK, -+ ZNODE_LOCK_HIPRI | -+ ZNODE_LOCK_NONBLOCK); -+ } -+ } -+ if (result != 0) { -+ done_lh(kdata->left); -+ done_lh(kdata->right); -+ if (left_loaded != 0) -+ zrelse(left); -+ if (right_loaded != 0) -+ zrelse(right); -+ } -+ return result; -+} -+ -+static void done_children(carry_kill_data * kdata) -+{ -+ if (kdata->left != NULL && kdata->left->node != NULL) { -+ zrelse(kdata->left->node); -+ done_lh(kdata->left); -+ } -+ if (kdata->right != NULL && kdata->right->node != NULL) { -+ zrelse(kdata->right->node); -+ done_lh(kdata->right); -+ } -+} -+ -+/* part of cut_node. It is called when cut_node is called to remove or cut part -+ of extent item. When head of that item is removed - we have to update right -+ delimiting of left neighbor of extent. When item is removed completely - we -+ have to set sibling link between left and right neighbor of removed -+ extent. This may return -E_DEADLOCK because of trying to get left neighbor -+ locked. 
So, caller should repeat an attempt -+*/ -+/* Audited by: umka (2002.06.16) */ -+static int -+prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor) -+{ -+ int result; -+ reiser4_key key; -+ lock_handle left_lh; -+ lock_handle right_lh; -+ coord_t left_coord; -+ coord_t *from; -+ znode *left_child; -+ znode *right_child; -+ reiser4_tree *tree; -+ int left_zloaded_here, right_zloaded_here; -+ -+ from = kdata->params.from; -+ assert("umka-326", from != NULL); -+ assert("umka-327", kdata->params.to != NULL); -+ -+ /* for one extent item only yet */ -+ assert("vs-591", item_is_extent(from)); -+ assert("vs-592", from->item_pos == kdata->params.to->item_pos); -+ -+ if ((kdata->params.from_key -+ && keygt(kdata->params.from_key, item_key_by_coord(from, &key))) -+ || from->unit_pos != 0) { -+ /* head of item @from is not removed, there is nothing to -+ worry about */ -+ return 0; -+ } -+ -+ result = 0; -+ left_zloaded_here = 0; -+ right_zloaded_here = 0; -+ -+ left_child = right_child = NULL; -+ -+ coord_dup(&left_coord, from); -+ init_lh(&left_lh); -+ init_lh(&right_lh); -+ if (coord_prev_unit(&left_coord)) { -+ /* @from is leftmost item in its node */ -+ if (!locked_left_neighbor) { -+ result = -+ reiser4_get_left_neighbor(&left_lh, from->node, -+ ZNODE_READ_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ switch (result) { -+ case 0: -+ break; -+ case -E_NO_NEIGHBOR: -+ /* there is no formatted node to the left of -+ from->node */ -+ warning("vs-605", -+ "extent item has smallest key in " -+ "the tree and it is about to be removed"); -+ return 0; -+ case -E_DEADLOCK: -+ /* need to restart */ -+ default: -+ return result; -+ } -+ -+ /* we have acquired left neighbor of from->node */ -+ result = zload(left_lh.node); -+ if (result) -+ goto done; -+ -+ locked_left_neighbor = left_lh.node; -+ } else { -+ /* squalloc_right_twig_cut should have supplied locked -+ * left neighbor */ -+ assert("vs-834", -+ znode_is_write_locked(locked_left_neighbor)); -+ result = 
zload(locked_left_neighbor); -+ if (result) -+ return result; -+ } -+ -+ left_zloaded_here = 1; -+ coord_init_last_unit(&left_coord, locked_left_neighbor); -+ } -+ -+ if (!item_is_internal(&left_coord)) { -+ /* what else but extent can be on twig level */ -+ assert("vs-606", item_is_extent(&left_coord)); -+ -+ /* there is no left formatted child */ -+ if (left_zloaded_here) -+ zrelse(locked_left_neighbor); -+ done_lh(&left_lh); -+ return 0; -+ } -+ -+ tree = znode_get_tree(left_coord.node); -+ left_child = child_znode(&left_coord, left_coord.node, 1, 0); -+ -+ if (IS_ERR(left_child)) { -+ result = PTR_ERR(left_child); -+ goto done; -+ } -+ -+ /* left child is acquired, calculate new right delimiting key for it -+ and get right child if it is necessary */ -+ if (item_removed_completely -+ (from, kdata->params.from_key, kdata->params.to_key)) { -+ /* try to get right child of removed item */ -+ coord_t right_coord; -+ -+ assert("vs-607", -+ kdata->params.to->unit_pos == -+ coord_last_unit_pos(kdata->params.to)); -+ coord_dup(&right_coord, kdata->params.to); -+ if (coord_next_unit(&right_coord)) { -+ /* @to is rightmost unit in the node */ -+ result = -+ reiser4_get_right_neighbor(&right_lh, from->node, -+ ZNODE_READ_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ switch (result) { -+ case 0: -+ result = zload(right_lh.node); -+ if (result) -+ goto done; -+ -+ right_zloaded_here = 1; -+ coord_init_first_unit(&right_coord, -+ right_lh.node); -+ item_key_by_coord(&right_coord, &key); -+ break; -+ -+ case -E_NO_NEIGHBOR: -+ /* there is no formatted node to the right of -+ from->node */ -+ read_lock_dk(tree); -+ key = *znode_get_rd_key(from->node); -+ read_unlock_dk(tree); -+ right_coord.node = NULL; -+ result = 0; -+ break; -+ default: -+ /* real error */ -+ goto done; -+ } -+ } else { -+ /* there is an item to the right of @from - take its key */ -+ item_key_by_coord(&right_coord, &key); -+ } -+ -+ /* try to get right child of @from */ -+ if (right_coord.node && /* there is right 
neighbor of @from */ -+ item_is_internal(&right_coord)) { /* it is internal item */ -+ right_child = child_znode(&right_coord, -+ right_coord.node, 1, 0); -+ -+ if (IS_ERR(right_child)) { -+ result = PTR_ERR(right_child); -+ goto done; -+ } -+ -+ } -+ /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and -+ update of right delimiting key of left_child */ -+ result = prepare_children(left_child, right_child, kdata); -+ } else { -+ /* head of item @to is removed. left_child has to get right delimting key update. Prepare it for that */ -+ result = prepare_children(left_child, NULL, kdata); -+ } -+ -+ done: -+ if (right_child) -+ zput(right_child); -+ if (right_zloaded_here) -+ zrelse(right_lh.node); -+ done_lh(&right_lh); -+ -+ if (left_child) -+ zput(left_child); -+ if (left_zloaded_here) -+ zrelse(locked_left_neighbor); -+ done_lh(&left_lh); -+ return result; -+} -+ -+/* this is used to remove part of node content between coordinates @from and @to. 
Units to which @from and @to are set -+ are to be cut completely */ -+/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */ -+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */ -+ const reiser4_key * to_key, /* last key to be removed */ -+ reiser4_key * -+ smallest_removed /* smallest key actually removed */ ) -+{ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ carry_cut_data *cut_data; -+ carry_op *op; -+ -+ assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT); -+ -+ pool = -+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(*cut_data)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0); -+ assert("vs-1509", op != 0); -+ if (IS_ERR(op)) { -+ done_carry_pool(pool); -+ return PTR_ERR(op); -+ } -+ -+ cut_data = (carry_cut_data *) (lowest_level + 3); -+ cut_data->params.from = from; -+ cut_data->params.to = to; -+ cut_data->params.from_key = from_key; -+ cut_data->params.to_key = to_key; -+ cut_data->params.smallest_removed = smallest_removed; -+ -+ op->u.cut_or_kill.is_cut = 1; -+ op->u.cut_or_kill.u.cut = cut_data; -+ -+ result = reiser4_carry(lowest_level, NULL); -+ done_carry_pool(pool); -+ -+ return result; -+} -+ -+/* cut part of the node -+ -+ Cut part or whole content of node. -+ -+ cut data between @from and @to of @from->node and call carry() to make -+ corresponding changes in the tree. @from->node may become empty. If so - -+ pointer to it will be removed. Neighboring nodes are not changed. 
Smallest -+ removed key is stored in @smallest_removed -+ -+*/ -+int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */ -+ coord_t * to, /* coord of the last unit/item that will be eliminated */ -+ const reiser4_key * from_key, /* first key to be removed */ -+ const reiser4_key * to_key, /* last key to be removed */ -+ reiser4_key * smallest_removed, /* smallest key actually removed */ -+ znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor -+ * locked (in squalloc_right_twig_cut, namely) */ -+ struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to -+ invalidate pages together with item pointing to them */ -+ int truncate) -+{ /* this call is made for file truncate) */ -+ int result; -+ carry_pool *pool; -+ carry_level *lowest_level; -+ carry_kill_data *kdata; -+ lock_handle *left_child; -+ lock_handle *right_child; -+ carry_op *op; -+ -+ assert("umka-328", from != NULL); -+ assert("vs-316", !node_is_empty(from->node)); -+ assert("nikita-1812", coord_is_existing_unit(from) -+ && coord_is_existing_unit(to)); -+ -+ /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */ -+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) + -+ sizeof(carry_kill_data) + -+ 2 * sizeof(lock_handle) + -+ 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t)); -+ if (IS_ERR(pool)) -+ return PTR_ERR(pool); -+ -+ lowest_level = (carry_level *) (pool + 1); -+ init_carry_level(lowest_level, pool); -+ -+ kdata = (carry_kill_data *) (lowest_level + 3); -+ left_child = (lock_handle *) (kdata + 1); -+ right_child = left_child + 1; -+ -+ init_lh(left_child); -+ init_lh(right_child); -+ -+ kdata->params.from = from; -+ kdata->params.to = to; -+ kdata->params.from_key = from_key; -+ kdata->params.to_key = to_key; -+ kdata->params.smallest_removed = smallest_removed; -+ kdata->params.truncate = truncate; -+ kdata->flags = 0; 
-+ kdata->inode = inode; -+ kdata->left = left_child; -+ kdata->right = right_child; -+ /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */ -+ kdata->buf = (char *)(right_child + 1); -+ -+ if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) { -+ /* left child of extent item may have to get updated right -+ delimiting key and to get linked with right child of extent -+ @from if it will be removed completely */ -+ result = prepare_twig_kill(kdata, locked_left_neighbor); -+ if (result) { -+ done_children(kdata); -+ done_carry_pool(pool); -+ return result; -+ } -+ } -+ -+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0); -+ if (IS_ERR(op) || (op == NULL)) { -+ done_children(kdata); -+ done_carry_pool(pool); -+ return RETERR(op ? PTR_ERR(op) : -EIO); -+ } -+ -+ op->u.cut_or_kill.is_cut = 0; -+ op->u.cut_or_kill.u.kill = kdata; -+ -+ result = reiser4_carry(lowest_level, NULL); -+ -+ done_children(kdata); -+ done_carry_pool(pool); -+ return result; -+} -+ -+void -+fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate) -+{ -+ if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) { -+ pgoff_t start_pg, end_pg; -+ -+ start_pg = start >> PAGE_CACHE_SHIFT; -+ end_pg = (end - 1) >> PAGE_CACHE_SHIFT; -+ -+ if ((start & (PAGE_CACHE_SIZE - 1)) == 0) { -+ /* -+ * kill up to the page boundary. -+ */ -+ assert("vs-123456", start_pg == end_pg); -+ reiser4_invalidate_pages(inode->i_mapping, start_pg, 1, -+ truncate); -+ } else if (start_pg != end_pg) { -+ /* -+ * page boundary is within killed portion of node. -+ */ -+ assert("vs-654321", end_pg - start_pg == 1); -+ reiser4_invalidate_pages(inode->i_mapping, end_pg, -+ end_pg - start_pg, 1); -+ } -+ } -+ inode_sub_bytes(inode, end - start); -+} -+ -+/** -+ * Delete whole @node from the reiser4 tree without loading it. 
-+ *
-+ * NOTE(review): an earlier version of this header listed "@left: locked
-+ * left neighbor", but the function takes no such argument; the left
-+ * sibling is reached through node->left.
-+ *
-+ * @node: node to be deleted (asserted to be write locked),
-+ * @smallest_removed: leftmost key of deleted node,
-+ * @object: inode pointer, if we truncate a file body.
-+ * @truncate: true if called for file truncate.
-+ *
-+ * @return: 0 if success, error code otherwise.
-+ *
-+ * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
-+ * contains the right value of the smallest removed key from the previous
-+ * cut_worker() iteration.  This is needed for proper accounting of
-+ * "i_blocks" and "i_bytes" fields of the @object.
-+ *
-+ * Order of work: write-lock and load the parent, find the pointer to @node
-+ * in it, detach @node from the parent (in_parent reset, c_count
-+ * decremented) under the tree write lock, set JNODE_HEARD_BANSHEE, cut the
-+ * pointer out of the parent, then - under tree read lock and dk write lock -
-+ * hand the right delimiting key of @node to its left sibling and store the
-+ * left delimiting key of @node in *@smallest_removed.
-+ */
-+int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
-+			struct inode *object, int truncate)
-+{
-+	lock_handle parent_lock;
-+	coord_t cut_from;
-+	coord_t cut_to;
-+	reiser4_tree *tree;
-+	int ret;
-+
-+	assert("zam-937", node != NULL);
-+	assert("zam-933", znode_is_write_locked(node));
-+	assert("zam-999", smallest_removed != NULL);
-+
-+	init_lh(&parent_lock);
-+
-+	ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
-+	if (ret)
-+		return ret;
-+
-+	assert("zam-934", !znode_above_root(parent_lock.node));
-+
-+	ret = zload(parent_lock.node);
-+	if (ret)
-+		goto failed_nozrelse;
-+
-+	ret = find_child_ptr(parent_lock.node, node, &cut_from);
-+	if (ret)
-+		goto failed;
-+
-+	/* decrement child counter and set parent pointer to NULL before
-+	   deleting the list from parent node because of checks in
-+	   internal_kill_item_hook (we can delete the last item from the parent
-+	   node, the parent node is going to be deleted and its c_count should
-+	   be zero). */
-+
-+	tree = znode_get_tree(node);
-+	write_lock_tree(tree);
-+	init_parent_coord(&node->in_parent, NULL);
-+	--parent_lock.node->c_count;
-+	write_unlock_tree(tree);
-+
-+	assert("zam-989", item_is_internal(&cut_from));
-+
-+	/* @node should be deleted after unlocking. */
-+	ZF_SET(node, JNODE_HEARD_BANSHEE);
-+
-+	/* remove a pointer from the parent node to the node being deleted. */
-+	coord_dup(&cut_to, &cut_from);
-+	/* FIXME: shouldn't this be kill_node_content */
-+	ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
-+	if (ret)
-+		/* FIXME(Zam): Should we re-connect the node to its parent if
-+		 * cut_node fails? */
-+		goto failed;
-+
-+	{
-+		/* NOTE(review): this inner @tree shadows the outer one and is
-+		 * taken from current_tree rather than znode_get_tree(node);
-+		 * presumably both name the same tree - confirm. */
-+		reiser4_tree *tree = current_tree;
-+		__u64 start_offset = 0, end_offset = 0;
-+
-+		read_lock_tree(tree);
-+		write_lock_dk(tree);
-+		if (object) {
-+			/* We use @smallest_removed and the left delimiting of
-+			 * the current node for @object->i_blocks, i_bytes
-+			 * calculation.  We assume that the items after the
-+			 * *@smallest_removed key have been deleted from the
-+			 * file body. */
-+			start_offset = get_key_offset(znode_get_ld_key(node));
-+			end_offset = get_key_offset(smallest_removed);
-+		}
-+
-+		assert("zam-1021", znode_is_connected(node));
-+		if (node->left)
-+			znode_set_rd_key(node->left, znode_get_rd_key(node));
-+
-+		*smallest_removed = *znode_get_ld_key(node);	/* must follow the end_offset read above */
-+
-+		write_unlock_dk(tree);
-+		read_unlock_tree(tree);
-+
-+		if (object) {
-+			/* we used to perform actions which are to be performed on items on their removal from tree in
-+			   special item method - kill_hook. Here for optimization reasons we avoid reading node
-+			   containing item we remove and can not call item's kill hook. Instead we call function which
-+			   does exactly the same things as tail kill hook in assumption that node we avoid reading
-+			   contains only one item and that item is a tail one. */
-+			fake_kill_hook_tail(object, start_offset, end_offset,
-+					    truncate);
-+		}
-+	}
-+      failed:
-+	zrelse(parent_lock.node);
-+      failed_nozrelse:
-+	done_lh(&parent_lock);
-+
-+	return ret;
-+}
-+/* Returns the result of keyle(@key, left delimiting key of @node),
-+   evaluated while holding the dk read lock of current_tree. */
-+static int can_delete(const reiser4_key *key, znode *node)
-+{
-+	int result;
-+
-+	read_lock_dk(current_tree);
-+	result = keyle(key, znode_get_ld_key(node));
-+	read_unlock_dk(current_tree);
-+	return result;
-+}
-+
-+/**
-+ * This subroutine is not optimal but implementation seems to
-+ * be easier).
-+ * -+ * @tap: the point deletion process begins from, -+ * @from_key: the beginning of the deleted key range, -+ * @to_key: the end of the deleted key range, -+ * @smallest_removed: the smallest removed key, -+ * @truncate: true if called for file truncate. -+ * @progress: return true if a progress in file items deletions was made, -+ * @smallest_removed value is actual in that case. -+ * -+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long -+ * reiser4_cut_tree operation was interrupted for allowing atom commit. -+ */ -+int -+cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed, struct inode *object, -+ int truncate, int *progress) -+{ -+ lock_handle next_node_lock; -+ coord_t left_coord; -+ int result; -+ -+ assert("zam-931", tap->coord->node != NULL); -+ assert("zam-932", znode_is_write_locked(tap->coord->node)); -+ -+ *progress = 0; -+ init_lh(&next_node_lock); -+ -+ while (1) { -+ znode *node; /* node from which items are cut */ -+ node_plugin *nplug; /* node plugin for @node */ -+ -+ node = tap->coord->node; -+ -+ /* Move next_node_lock to the next node on the left. */ -+ result = -+ reiser4_get_left_neighbor(&next_node_lock, node, -+ ZNODE_WRITE_LOCK, -+ GN_CAN_USE_UPPER_LEVELS); -+ if (result != 0 && result != -E_NO_NEIGHBOR) -+ break; -+ /* Check can we delete the node as a whole. 
*/ -+ if (*progress && znode_get_level(node) == LEAF_LEVEL && -+ can_delete(from_key, node)) { -+ result = reiser4_delete_node(node, smallest_removed, -+ object, truncate); -+ } else { -+ result = reiser4_tap_load(tap); -+ if (result) -+ return result; -+ -+ /* Prepare the second (right) point for cut_node() */ -+ if (*progress) -+ coord_init_last_unit(tap->coord, node); -+ -+ else if (item_plugin_by_coord(tap->coord)->b.lookup == -+ NULL) -+ /* set rightmost unit for the items without lookup method */ -+ tap->coord->unit_pos = -+ coord_last_unit_pos(tap->coord); -+ -+ nplug = node->nplug; -+ -+ assert("vs-686", nplug); -+ assert("vs-687", nplug->lookup); -+ -+ /* left_coord is leftmost unit cut from @node */ -+ result = nplug->lookup(node, from_key, -+ FIND_MAX_NOT_MORE_THAN, -+ &left_coord); -+ -+ if (IS_CBKERR(result)) -+ break; -+ -+ /* adjust coordinates so that they are set to existing units */ -+ if (coord_set_to_right(&left_coord) -+ || coord_set_to_left(tap->coord)) { -+ result = 0; -+ break; -+ } -+ -+ if (coord_compare(&left_coord, tap->coord) == -+ COORD_CMP_ON_RIGHT) { -+ /* keys from @from_key to @to_key are not in the tree */ -+ result = 0; -+ break; -+ } -+ -+ if (left_coord.item_pos != tap->coord->item_pos) { -+ /* do not allow to cut more than one item. It is added to solve problem of truncating -+ partially converted files. If file is partially converted there may exist a twig node -+ containing both internal item or items pointing to leaf nodes with formatting items -+ and extent item. 
We do not want to kill internal items being at twig node here -+ because cut_tree_worker assumes killing them from level level */ -+ coord_dup(&left_coord, tap->coord); -+ assert("vs-1652", -+ coord_is_existing_unit(&left_coord)); -+ left_coord.unit_pos = 0; -+ } -+ -+ /* cut data from one node */ -+ // *smallest_removed = *reiser4_min_key(); -+ result = -+ kill_node_content(&left_coord, tap->coord, from_key, -+ to_key, smallest_removed, -+ next_node_lock.node, object, -+ truncate); -+ reiser4_tap_relse(tap); -+ } -+ if (result) -+ break; -+ -+ ++(*progress); -+ -+ /* Check whether all items with keys >= from_key were removed -+ * from the tree. */ -+ if (keyle(smallest_removed, from_key)) -+ /* result = 0; */ -+ break; -+ -+ if (next_node_lock.node == NULL) -+ break; -+ -+ result = reiser4_tap_move(tap, &next_node_lock); -+ done_lh(&next_node_lock); -+ if (result) -+ break; -+ -+ /* Break long reiser4_cut_tree operation (deletion of a large -+ file) if atom requires commit. */ -+ if (*progress > CUT_TREE_MIN_ITERATIONS -+ && current_atom_should_commit()) { -+ result = -E_REPEAT; -+ break; -+ } -+ } -+ done_lh(&next_node_lock); -+ // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key())); -+ return result; -+} -+ -+/* there is a fundamental problem with optimizing deletes: VFS does it -+ one file at a time. Another problem is that if an item can be -+ anything, then deleting items must be done one at a time. It just -+ seems clean to writes this to specify a from and a to key, and cut -+ everything between them though. */ -+ -+/* use this function with care if deleting more than what is part of a single file. */ -+/* do not use this when cutting a single item, it is suboptimal for that */ -+ -+/* You are encouraged to write plugin specific versions of this. It -+ cannot be optimal for all plugins because it works item at a time, -+ and some plugins could sometimes work node at a time. 
Regular files -+ however are not optimizable to work node at a time because of -+ extents needing to free the blocks they point to. -+ -+ Optimizations compared to v3 code: -+ -+ It does not balance (that task is left to memory pressure code). -+ -+ Nodes are deleted only if empty. -+ -+ Uses extents. -+ -+ Performs read-ahead of formatted nodes whose contents are part of -+ the deletion. -+*/ -+ -+/** -+ * Delete everything from the reiser4 tree between two keys: @from_key and -+ * @to_key. -+ * -+ * @from_key: the beginning of the deleted key range, -+ * @to_key: the end of the deleted key range, -+ * @smallest_removed: the smallest removed key, -+ * @object: owner of cutting items. -+ * @truncate: true if called for file truncate. -+ * @progress: return true if a progress in file items deletions was made, -+ * @smallest_removed value is actual in that case. -+ * -+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree -+ * operation was interrupted for allowing atom commit . -+ */ -+ -+int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed_p, -+ struct inode *object, int truncate, int *progress) -+{ -+ lock_handle lock; -+ int result; -+ tap_t tap; -+ coord_t right_coord; -+ reiser4_key smallest_removed; -+ int (*cut_tree_worker) (tap_t *, const reiser4_key *, -+ const reiser4_key *, reiser4_key *, -+ struct inode *, int, int *); -+ STORE_COUNTERS; -+ -+ assert("umka-329", tree != NULL); -+ assert("umka-330", from_key != NULL); -+ assert("umka-331", to_key != NULL); -+ assert("zam-936", keyle(from_key, to_key)); -+ -+ if (smallest_removed_p == NULL) -+ smallest_removed_p = &smallest_removed; -+ -+ init_lh(&lock); -+ -+ do { -+ /* Find rightmost item to cut away from the tree. 
*/ -+ result = reiser4_object_lookup(object, to_key, &right_coord, -+ &lock, ZNODE_WRITE_LOCK, -+ FIND_MAX_NOT_MORE_THAN, -+ TWIG_LEVEL, LEAF_LEVEL, -+ CBK_UNIQUE, NULL /*ra_info */); -+ if (result != CBK_COORD_FOUND) -+ break; -+ if (object == NULL -+ || inode_file_plugin(object)->cut_tree_worker == NULL) -+ cut_tree_worker = cut_tree_worker_common; -+ else -+ cut_tree_worker = -+ inode_file_plugin(object)->cut_tree_worker; -+ reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK); -+ result = -+ cut_tree_worker(&tap, from_key, to_key, smallest_removed_p, -+ object, truncate, progress); -+ reiser4_tap_done(&tap); -+ -+ reiser4_preempt_point(); -+ -+ } while (0); -+ -+ done_lh(&lock); -+ -+ if (result) { -+ switch (result) { -+ case -E_NO_NEIGHBOR: -+ result = 0; -+ break; -+ case -E_DEADLOCK: -+ result = -E_REPEAT; -+ case -E_REPEAT: -+ case -ENOMEM: -+ case -ENOENT: -+ break; -+ default: -+ warning("nikita-2861", "failure: %i", result); -+ } -+ } -+ -+ CHECK_COUNTERS; -+ return result; -+} -+ -+/* repeat reiser4_cut_tree_object until everything is deleted. -+ * unlike cut_file_items, it does not end current transaction if -E_REPEAT -+ * is returned by cut_tree_object. 
*/ -+int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from, -+ const reiser4_key * to, struct inode *inode, int truncate) -+{ -+ int result; -+ int progress; -+ -+ do { -+ result = reiser4_cut_tree_object(tree, from, to, NULL, -+ inode, truncate, &progress); -+ } while (result == -E_REPEAT); -+ -+ return result; -+} -+ -+/* finishing reiser4 initialization */ -+int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being -+ * initialized */ , -+ const reiser4_block_nr * root_block /* address of a root block -+ * on a disk */ , -+ tree_level height /* height of a tree */ , -+ node_plugin * nplug /* default node plugin */ ) -+{ -+ int result; -+ -+ assert("nikita-306", tree != NULL); -+ assert("nikita-307", root_block != NULL); -+ assert("nikita-308", height > 0); -+ assert("nikita-309", nplug != NULL); -+ assert("zam-587", tree->super != NULL); -+ -+ tree->root_block = *root_block; -+ tree->height = height; -+ tree->estimate_one_insert = calc_estimate_one_insert(height); -+ tree->nplug = nplug; -+ -+ tree->znode_epoch = 1ull; -+ -+ cbk_cache_init(&tree->cbk_cache); -+ -+ result = znodes_tree_init(tree); -+ if (result == 0) -+ result = jnodes_tree_init(tree); -+ if (result == 0) { -+ tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0, -+ reiser4_ctx_gfp_mask_get()); -+ if (IS_ERR(tree->uber)) { -+ result = PTR_ERR(tree->uber); -+ tree->uber = NULL; -+ } -+ } -+ return result; -+} -+ -+/* release resources associated with @tree */ -+void reiser4_done_tree(reiser4_tree * tree /* tree to release */ ) -+{ -+ if (tree == NULL) -+ return; -+ -+ if (tree->uber != NULL) { -+ zput(tree->uber); -+ tree->uber = NULL; -+ } -+ znodes_tree_done(tree); -+ jnodes_tree_done(tree); -+ cbk_cache_done(&tree->cbk_cache); -+} -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/tree.h linux-2.6.24/fs/reiser4/tree.h ---- linux-2.6.24.orig/fs/reiser4/tree.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/tree.h 2008-01-25 11:39:07.096248905 +0300 -@@ -0,0 +1,577 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Tree operations. See fs/reiser4/tree.c for comments */ -+ -+#if !defined( __REISER4_TREE_H__ ) -+#define __REISER4_TREE_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "plugin/node/node.h" -+#include "plugin/plugin.h" -+#include "znode.h" -+#include "tap.h" -+ -+#include /* for __u?? */ -+#include /* for struct super_block */ -+#include -+#include /* for struct task_struct */ -+ -+/* fictive block number never actually used */ -+extern const reiser4_block_nr UBER_TREE_ADDR; -+ -+/* &cbk_cache_slot - entry in a coord cache. -+ -+ This is entry in a coord_by_key (cbk) cache, represented by -+ &cbk_cache. -+ -+*/ -+typedef struct cbk_cache_slot { -+ /* cached node */ -+ znode *node; -+ /* linkage to the next cbk cache slot in a LRU order */ -+ struct list_head lru; -+} cbk_cache_slot; -+ -+/* &cbk_cache - coord cache. This is part of reiser4_tree. -+ -+ cbk_cache is supposed to speed up tree lookups by caching results of recent -+ successful lookups (we don't cache negative results as dentry cache -+ does). Cache consists of relatively small number of entries kept in a LRU -+ order. Each entry (&cbk_cache_slot) contains a pointer to znode, from -+ which we can obtain a range of keys that covered by this znode. Before -+ embarking into real tree traversal we scan cbk_cache slot by slot and for -+ each slot check whether key we are looking for is between minimal and -+ maximal keys for node pointed to by this slot. 
If no match is found, real -+ tree traversal is performed and if result is successful, appropriate entry -+ is inserted into cache, possibly pulling least recently used entry out of -+ it. -+ -+ Tree spin lock is used to protect coord cache. If contention for this -+ lock proves to be too high, more finer grained locking can be added. -+ -+ Invariants involving parts of this data-type: -+ -+ [cbk-cache-invariant] -+*/ -+typedef struct cbk_cache { -+ /* serializator */ -+ rwlock_t guard; -+ int nr_slots; -+ /* head of LRU list of cache slots */ -+ struct list_head lru; -+ /* actual array of slots */ -+ cbk_cache_slot *slot; -+} cbk_cache; -+ -+/* level_lookup_result - possible outcome of looking up key at some level. -+ This is used by coord_by_key when traversing tree downward. */ -+typedef enum { -+ /* continue to the next level */ -+ LOOKUP_CONT, -+ /* done. Either required item was found, or we can prove it -+ doesn't exist, or some error occurred. */ -+ LOOKUP_DONE, -+ /* restart traversal from the root. Infamous "repetition". */ -+ LOOKUP_REST -+} level_lookup_result; -+ -+/* This is representation of internal reiser4 tree where all file-system -+ data and meta-data are stored. This structure is passed to all tree -+ manipulation functions. It's different from the super block because: -+ we don't want to limit ourselves to strictly one to one mapping -+ between super blocks and trees, and, because they are logically -+ different: there are things in a super block that have no relation to -+ the tree (bitmaps, journalling area, mount options, etc.) and there -+ are things in a tree that bear no relation to the super block, like -+ tree of znodes. -+ -+ At this time, there is only one tree -+ per filesystem, and this struct is part of the super block. We only -+ call the super block the super block for historical reasons (most -+ other filesystems call the per filesystem metadata the super block). 
-+*/ -+ -+struct reiser4_tree { -+ /* block_nr == 0 is fake znode. Write lock it, while changing -+ tree height. */ -+ /* disk address of root node of a tree */ -+ reiser4_block_nr root_block; -+ -+ /* level of the root node. If this is 1, tree consists of root -+ node only */ -+ tree_level height; -+ -+ /* -+ * this is cached here avoid calling plugins through function -+ * dereference all the time. -+ */ -+ __u64 estimate_one_insert; -+ -+ /* cache of recent tree lookup results */ -+ cbk_cache cbk_cache; -+ -+ /* hash table to look up znodes by block number. */ -+ z_hash_table zhash_table; -+ z_hash_table zfake_table; -+ /* hash table to look up jnodes by inode and offset. */ -+ j_hash_table jhash_table; -+ -+ /* lock protecting: -+ - parent pointers, -+ - sibling pointers, -+ - znode hash table -+ - coord cache -+ */ -+ /* NOTE: The "giant" tree lock can be replaced by more spin locks, -+ hoping they will be less contented. We can use one spin lock per one -+ znode hash bucket. With adding of some code complexity, sibling -+ pointers can be protected by both znode spin locks. However it looks -+ more SMP scalable we should test this locking change on n-ways (n > -+ 4) SMP machines. Current 4-ways machine test does not show that tree -+ lock is contented and it is a bottleneck (2003.07.25). */ -+ -+ rwlock_t tree_lock; -+ -+ /* lock protecting delimiting keys */ -+ rwlock_t dk_lock; -+ -+ /* spin lock protecting znode_epoch */ -+ spinlock_t epoch_lock; -+ /* version stamp used to mark znode updates. See seal.[ch] for more -+ * information. 
*/ -+ __u64 znode_epoch; -+ -+ znode *uber; -+ node_plugin *nplug; -+ struct super_block *super; -+ struct { -+ /* carry flags used for insertion of new nodes */ -+ __u32 new_node_flags; -+ /* carry flags used for insertion of new extents */ -+ __u32 new_extent_flags; -+ /* carry flags used for paste operations */ -+ __u32 paste_flags; -+ /* carry flags used for insert operations */ -+ __u32 insert_flags; -+ } carry; -+}; -+ -+extern int reiser4_init_tree(reiser4_tree * tree, -+ const reiser4_block_nr * root_block, -+ tree_level height, node_plugin * default_plugin); -+extern void reiser4_done_tree(reiser4_tree * tree); -+ -+/* cbk flags: options for coord_by_key() */ -+typedef enum { -+ /* coord_by_key() is called for insertion. This is necessary because -+ of extents being located at the twig level. For explanation, see -+ comment just above is_next_item_internal(). -+ */ -+ CBK_FOR_INSERT = (1 << 0), -+ /* coord_by_key() is called with key that is known to be unique */ -+ CBK_UNIQUE = (1 << 1), -+ /* coord_by_key() can trust delimiting keys. This options is not user -+ accessible. coord_by_key() will set it automatically. It will be -+ only cleared by special-case in extents-on-the-twig-level handling -+ where it is necessary to insert item with a key smaller than -+ leftmost key in a node. This is necessary because of extents being -+ located at the twig level. For explanation, see comment just above -+ is_next_item_internal(). -+ */ -+ CBK_TRUST_DK = (1 << 2), -+ CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */ -+ CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */ -+ CBK_DKSET = (1 << 5), -+ CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */ -+ CBK_IN_CACHE = (1 << 7), /* node is already in cache */ -+ CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock in stead of long term -+ * lock */ -+} cbk_flags; -+ -+/* insertion outcome. 
IBK = insert by key */ -+typedef enum { -+ IBK_INSERT_OK = 0, -+ IBK_ALREADY_EXISTS = -EEXIST, -+ IBK_IO_ERROR = -EIO, -+ IBK_NO_SPACE = -E_NODE_FULL, -+ IBK_OOM = -ENOMEM -+} insert_result; -+ -+#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND) -+ -+typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord, -+ lock_handle * lh, void *arg); -+extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord, -+ lock_handle * lh, -+ tree_iterate_actor_t actor, void *arg, -+ znode_lock_mode mode, int through_units_p); -+extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode, -+ znode_lock_request pri, lock_handle * lh); -+ -+/* return node plugin of @node */ -+static inline node_plugin *node_plugin_by_node(const znode * -+ node /* node to query */ ) -+{ -+ assert("vs-213", node != NULL); -+ assert("vs-214", znode_is_loaded(node)); -+ -+ return node->nplug; -+} -+ -+/* number of items in @node */ -+static inline pos_in_node_t node_num_items(const znode * node) -+{ -+ assert("nikita-2754", znode_is_loaded(node)); -+ assert("nikita-2468", -+ node_plugin_by_node(node)->num_of_items(node) == node->nr_items); -+ -+ return node->nr_items; -+} -+ -+/* Return the number of items at the present node. Asserts coord->node != -+ NULL. 
*/ -+static inline unsigned coord_num_items(const coord_t * coord) -+{ -+ assert("jmacd-9805", coord->node != NULL); -+ -+ return node_num_items(coord->node); -+} -+ -+/* true if @node is empty */ -+static inline int node_is_empty(const znode * node) -+{ -+ return node_num_items(node) == 0; -+} -+ -+typedef enum { -+ SHIFTED_SOMETHING = 0, -+ SHIFT_NO_SPACE = -E_NODE_FULL, -+ SHIFT_IO_ERROR = -EIO, -+ SHIFT_OOM = -ENOMEM, -+} shift_result; -+ -+extern node_plugin *node_plugin_by_coord(const coord_t * coord); -+extern int is_coord_in_node(const coord_t * coord); -+extern int key_in_node(const reiser4_key *, const coord_t *); -+extern void coord_item_move_to(coord_t * coord, int items); -+extern void coord_unit_move_to(coord_t * coord, int units); -+ -+/* there are two types of repetitive accesses (ra): intra-syscall -+ (local) and inter-syscall (global). Local ra is used when -+ during single syscall we add/delete several items and units in the -+ same place in a tree. Note that plan-A fragments local ra by -+ separating stat-data and file body in key-space. Global ra is -+ used when user does repetitive modifications in the same place in a -+ tree. -+ -+ Our ra implementation serves following purposes: -+ 1 it affects balancing decisions so that next operation in a row -+ can be performed faster; -+ 2 it affects lower-level read-ahead in page-cache; -+ 3 it allows to avoid unnecessary lookups by maintaining some state -+ across several operations (this is only for local ra); -+ 4 it leaves room for lazy-micro-balancing: when we start a sequence of -+ operations they are performed without actually doing any intra-node -+ shifts, until we finish sequence or scope of sequence leaves -+ current node, only then we really pack node (local ra only). -+*/ -+ -+/* another thing that can be useful is to keep per-tree and/or -+ per-process cache of recent lookups. 
This cache can be organised as a -+ list of block numbers of formatted nodes sorted by starting key in -+ this node. Balancings should invalidate appropriate parts of this -+ cache. -+*/ -+ -+lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key, -+ coord_t * coord, lock_handle * handle, -+ znode_lock_mode lock, lookup_bias bias, -+ tree_level lock_level, tree_level stop_level, -+ __u32 flags, ra_info_t *); -+ -+lookup_result reiser4_object_lookup(struct inode *object, -+ const reiser4_key * key, -+ coord_t * coord, -+ lock_handle * lh, -+ znode_lock_mode lock_mode, -+ lookup_bias bias, -+ tree_level lock_level, -+ tree_level stop_level, -+ __u32 flags, ra_info_t * info); -+ -+insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key, -+ reiser4_item_data * data, coord_t * coord, -+ lock_handle * lh, -+ tree_level stop_level, __u32 flags); -+insert_result insert_by_coord(coord_t * coord, -+ reiser4_item_data * data, const reiser4_key * key, -+ lock_handle * lh, __u32); -+insert_result insert_extent_by_coord(coord_t * coord, -+ reiser4_item_data * data, -+ const reiser4_key * key, lock_handle * lh); -+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, -+ const reiser4_key * to_key, -+ reiser4_key * smallest_removed); -+int kill_node_content(coord_t * from, coord_t * to, -+ const reiser4_key * from_key, const reiser4_key * to_key, -+ reiser4_key * smallest_removed, -+ znode * locked_left_neighbor, struct inode *inode, -+ int truncate); -+ -+int reiser4_resize_item(coord_t * coord, reiser4_item_data * data, -+ reiser4_key * key, lock_handle * lh, cop_insert_flag); -+int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key, -+ reiser4_item_data * data, unsigned); -+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f); -+int find_new_child_ptr(znode * parent, znode * child, znode * left, -+ coord_t * result); -+ -+int shift_right_of_but_excluding_insert_coord(coord_t 
* insert_coord); -+int shift_left_of_and_including_insert_coord(coord_t * insert_coord); -+ -+void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int); -+ -+extern int cut_tree_worker_common(tap_t *, const reiser4_key *, -+ const reiser4_key *, reiser4_key *, -+ struct inode *, int, int *); -+extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *, -+ const reiser4_key *, reiser4_key *, -+ struct inode *, int, int *); -+extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from, -+ const reiser4_key * to, struct inode *, int); -+ -+extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int); -+extern int check_tree_pointer(const coord_t * pointer, const znode * child); -+extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG, -+ znode * left, coord_t * result); -+extern int find_child_ptr(znode * parent, znode * child, coord_t * result); -+extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent, -+ znode * child); -+extern znode *child_znode(const coord_t * in_parent, znode * parent, -+ int incore_p, int setup_dkeys_p); -+ -+extern int cbk_cache_init(cbk_cache * cache); -+extern void cbk_cache_done(cbk_cache * cache); -+extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree); -+ -+extern char *sprint_address(const reiser4_block_nr * block); -+ -+#if REISER4_DEBUG -+extern void print_coord_content(const char *prefix, coord_t * p); -+extern void reiser4_print_address(const char *prefix, -+ const reiser4_block_nr * block); -+extern void print_tree_rec(const char *prefix, reiser4_tree * tree, -+ __u32 flags); -+extern void check_dkeys(znode *node); -+#else -+#define print_coord_content(p, c) noop -+#define reiser4_print_address(p, b) noop -+#endif -+ -+extern void forget_znode(lock_handle * handle); -+extern int deallocate_znode(znode * node); -+ -+extern int is_disk_addr_unallocated(const reiser4_block_nr * addr); -+ -+/* struct used internally to pack 
all numerous arguments of tree lookup. -+ Used to avoid passing a lot of arguments to helper functions. */ -+typedef struct cbk_handle { -+ /* tree we are in */ -+ reiser4_tree *tree; -+ /* key we are going after */ -+ const reiser4_key *key; -+ /* coord we will store result in */ -+ coord_t *coord; -+ /* type of lock to take on target node */ -+ znode_lock_mode lock_mode; -+ /* lookup bias. See comments at the declaration of lookup_bias */ -+ lookup_bias bias; -+ /* lock level: level starting from which tree traversal starts taking -+ * write locks. */ -+ tree_level lock_level; -+ /* level where search will stop. Either item will be found between -+ lock_level and stop_level, or CBK_COORD_NOTFOUND will be -+ returned. -+ */ -+ tree_level stop_level; -+ /* level we are currently at */ -+ tree_level level; -+ /* block number of @active node. Tree traversal operates on two -+ nodes: active and parent. */ -+ reiser4_block_nr block; -+ /* put here error message to be printed by caller */ -+ const char *error; -+ /* result passed back to caller */ -+ lookup_result result; -+ /* lock handles for active and parent */ -+ lock_handle *parent_lh; -+ lock_handle *active_lh; -+ reiser4_key ld_key; -+ reiser4_key rd_key; -+ /* flags, passed to the cbk routine. Bits of this bitmask are defined -+ in tree.h:cbk_flags enum. 
*/ -+ __u32 flags; -+ ra_info_t *ra_info; -+ struct inode *object; -+} cbk_handle; -+ -+extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h); -+ -+/* eottl.c */ -+extern int handle_eottl(cbk_handle *h, int *outcome); -+ -+int lookup_multikey(cbk_handle * handle, int nr_keys); -+int lookup_couple(reiser4_tree * tree, -+ const reiser4_key * key1, const reiser4_key * key2, -+ coord_t * coord1, coord_t * coord2, -+ lock_handle * lh1, lock_handle * lh2, -+ znode_lock_mode lock_mode, lookup_bias bias, -+ tree_level lock_level, tree_level stop_level, __u32 flags, -+ int *result1, int *result2); -+ -+static inline void read_lock_tree(reiser4_tree *tree) -+{ -+ /* check that tree is not locked */ -+ assert("", (LOCK_CNT_NIL(rw_locked_tree) && -+ LOCK_CNT_NIL(read_locked_tree) && -+ LOCK_CNT_NIL(write_locked_tree))); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(spin_locked_stack))); -+ -+ read_lock(&(tree->tree_lock)); -+ -+ LOCK_CNT_INC(read_locked_tree); -+ LOCK_CNT_INC(rw_locked_tree); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void read_unlock_tree(reiser4_tree *tree) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree)); -+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(read_locked_tree); -+ LOCK_CNT_DEC(rw_locked_tree); -+ LOCK_CNT_DEC(spin_locked); -+ -+ read_unlock(&(tree->tree_lock)); -+} -+ -+static inline void write_lock_tree(reiser4_tree *tree) -+{ -+ /* check that tree is not locked */ -+ assert("", (LOCK_CNT_NIL(rw_locked_tree) && -+ LOCK_CNT_NIL(read_locked_tree) && -+ LOCK_CNT_NIL(write_locked_tree))); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(spin_locked_stack))); -+ -+ write_lock(&(tree->tree_lock)); -+ -+ 
LOCK_CNT_INC(write_locked_tree); -+ LOCK_CNT_INC(rw_locked_tree); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void write_unlock_tree(reiser4_tree *tree) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree)); -+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(write_locked_tree); -+ LOCK_CNT_DEC(rw_locked_tree); -+ LOCK_CNT_DEC(spin_locked); -+ -+ write_unlock(&(tree->tree_lock)); -+} -+ -+static inline void read_lock_dk(reiser4_tree *tree) -+{ -+ /* check that dk is not locked */ -+ assert("", (LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(read_locked_dk) && -+ LOCK_CNT_NIL(write_locked_dk))); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", LOCK_CNT_NIL(spin_locked_stack)); -+ -+ read_lock(&((tree)->dk_lock)); -+ -+ LOCK_CNT_INC(read_locked_dk); -+ LOCK_CNT_INC(rw_locked_dk); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void read_unlock_dk(reiser4_tree *tree) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk)); -+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(read_locked_dk); -+ LOCK_CNT_DEC(rw_locked_dk); -+ LOCK_CNT_DEC(spin_locked); -+ -+ read_unlock(&(tree->dk_lock)); -+} -+ -+static inline void write_lock_dk(reiser4_tree *tree) -+{ -+ /* check that dk is not locked */ -+ assert("", (LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(read_locked_dk) && -+ LOCK_CNT_NIL(write_locked_dk))); -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", LOCK_CNT_NIL(spin_locked_stack)); -+ -+ write_lock(&((tree)->dk_lock)); -+ -+ LOCK_CNT_INC(write_locked_dk); -+ LOCK_CNT_INC(rw_locked_dk); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void write_unlock_dk(reiser4_tree *tree) -+{ -+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk)); -+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk)); -+ assert("nikita-1376", 
LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(write_locked_dk); -+ LOCK_CNT_DEC(rw_locked_dk); -+ LOCK_CNT_DEC(spin_locked); -+ -+ write_unlock(&(tree->dk_lock)); -+} -+ -+/* estimate api. Implementation is in estimate.c */ -+reiser4_block_nr estimate_one_insert_item(reiser4_tree *); -+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *); -+reiser4_block_nr estimate_insert_flow(tree_level); -+reiser4_block_nr estimate_one_item_removal(reiser4_tree *); -+reiser4_block_nr calc_estimate_one_insert(tree_level); -+reiser4_block_nr estimate_dirty_cluster(struct inode *); -+reiser4_block_nr estimate_insert_cluster(struct inode *); -+reiser4_block_nr estimate_update_cluster(struct inode *); -+ -+/* __REISER4_TREE_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/tree_mod.c linux-2.6.24/fs/reiser4/tree_mod.c ---- linux-2.6.24.orig/fs/reiser4/tree_mod.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/tree_mod.c 2008-01-25 11:39:07.100249935 +0300 -@@ -0,0 +1,386 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* -+ * Functions to add/delete new nodes to/from the tree. -+ * -+ * Functions from this file are used by carry (see carry*) to handle: -+ * -+ * . insertion of new formatted node into tree -+ * -+ * . addition of new tree root, increasing tree height -+ * -+ * . 
removing tree root, decreasing tree height -+ * -+ */ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/plugin.h" -+#include "jnode.h" -+#include "znode.h" -+#include "tree_mod.h" -+#include "block_alloc.h" -+#include "tree_walk.h" -+#include "tree.h" -+#include "super.h" -+ -+#include -+ -+static int add_child_ptr(znode * parent, znode * child); -+/* warning only issued if error is not -E_REPEAT */ -+#define ewarning( error, ... ) \ -+ if( ( error ) != -E_REPEAT ) \ -+ warning( __VA_ARGS__ ) -+ -+/* allocate new node on the @level and immediately on the right of @brother. */ -+znode * reiser4_new_node(znode * brother /* existing left neighbor -+ * of new node */, -+ tree_level level /* tree level at which new node is to -+ * be allocated */) -+{ -+ znode *result; -+ int retcode; -+ reiser4_block_nr blocknr; -+ -+ assert("nikita-930", brother != NULL); -+ assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT); -+ -+ retcode = assign_fake_blocknr_formatted(&blocknr); -+ if (retcode == 0) { -+ result = -+ zget(znode_get_tree(brother), &blocknr, NULL, level, -+ reiser4_ctx_gfp_mask_get()); -+ if (IS_ERR(result)) { -+ ewarning(PTR_ERR(result), "nikita-929", -+ "Cannot allocate znode for carry: %li", -+ PTR_ERR(result)); -+ return result; -+ } -+ /* cheap test, can be executed even when debugging is off */ -+ if (!znode_just_created(result)) { -+ warning("nikita-2213", -+ "Allocated already existing block: %llu", -+ (unsigned long long)blocknr); -+ zput(result); -+ return ERR_PTR(RETERR(-EIO)); -+ } -+ -+ assert("nikita-931", result != NULL); -+ result->nplug = znode_get_tree(brother)->nplug; -+ assert("nikita-933", result->nplug != NULL); -+ -+ retcode = zinit_new(result, reiser4_ctx_gfp_mask_get()); -+ if (retcode == 0) { -+ ZF_SET(result, JNODE_CREATED); -+ zrelse(result); -+ } else { -+ zput(result); -+ result = ERR_PTR(retcode); -+ } -+ } else { -+ /* failure to allocate new node during 
balancing. -+ This should never happen. Ever. Returning -E_REPEAT -+ is not viable solution, because "out of disk space" -+ is not transient error that will go away by itself. -+ */ -+ ewarning(retcode, "nikita-928", -+ "Cannot allocate block for carry: %i", retcode); -+ result = ERR_PTR(retcode); -+ } -+ assert("nikita-1071", result != NULL); -+ return result; -+} -+ -+/* allocate new root and add it to the tree -+ -+ This helper function is called by add_new_root(). -+ -+*/ -+znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ , -+ znode * fake /* "fake" znode */ ) -+{ -+ reiser4_tree *tree = znode_get_tree(old_root); -+ znode *new_root = NULL; /* to shut gcc up */ -+ int result; -+ -+ assert("nikita-1069", old_root != NULL); -+ assert("umka-262", fake != NULL); -+ assert("umka-263", tree != NULL); -+ -+ /* "fake" znode---one always hanging just above current root. This -+ node is locked when new root is created or existing root is -+ deleted. Downward tree traversal takes lock on it before taking -+ lock on a root node. This avoids race conditions with root -+ manipulations. -+ -+ */ -+ assert("nikita-1348", znode_above_root(fake)); -+ assert("nikita-1211", znode_is_root(old_root)); -+ -+ result = 0; -+ if (tree->height >= REAL_MAX_ZTREE_HEIGHT) { -+ warning("nikita-1344", "Tree is too tall: %i", tree->height); -+ /* ext2 returns -ENOSPC when it runs out of free inodes with a -+ following comment (fs/ext2/ialloc.c:441): Is it really -+ ENOSPC? -+ -+ -EXFULL? -EINVAL? -+ */ -+ result = RETERR(-ENOSPC); -+ } else { -+ /* Allocate block for new root. It's not that -+ important where it will be allocated, as root is -+ almost always in memory. Moreover, allocate on -+ flush can be going here. 
-+ */ -+ assert("nikita-1448", znode_is_root(old_root)); -+ new_root = reiser4_new_node(fake, tree->height + 1); -+ if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) { -+ lock_handle rlh; -+ -+ init_lh(&rlh); -+ result = -+ longterm_lock_znode(&rlh, new_root, -+ ZNODE_WRITE_LOCK, -+ ZNODE_LOCK_LOPRI); -+ if (result == 0) { -+ parent_coord_t *in_parent; -+ -+ znode_make_dirty(fake); -+ -+ /* new root is a child of "fake" node */ -+ write_lock_tree(tree); -+ -+ ++tree->height; -+ -+ /* recalculate max balance overhead */ -+ tree->estimate_one_insert = -+ estimate_one_insert_item(tree); -+ -+ tree->root_block = *znode_get_block(new_root); -+ in_parent = &new_root->in_parent; -+ init_parent_coord(in_parent, fake); -+ /* manually insert new root into sibling -+ * list. With this all nodes involved into -+ * balancing are connected after balancing is -+ * done---useful invariant to check. */ -+ sibling_list_insert_nolock(new_root, NULL); -+ write_unlock_tree(tree); -+ -+ /* insert into new root pointer to the -+ @old_root. */ -+ assert("nikita-1110", -+ WITH_DATA(new_root, -+ node_is_empty(new_root))); -+ write_lock_dk(tree); -+ znode_set_ld_key(new_root, reiser4_min_key()); -+ znode_set_rd_key(new_root, reiser4_max_key()); -+ write_unlock_dk(tree); -+ if (REISER4_DEBUG) { -+ ZF_CLR(old_root, JNODE_LEFT_CONNECTED); -+ ZF_CLR(old_root, JNODE_RIGHT_CONNECTED); -+ ZF_SET(old_root, JNODE_ORPHAN); -+ } -+ result = add_child_ptr(new_root, old_root); -+ done_lh(&rlh); -+ } -+ zrelse(new_root); -+ } -+ } -+ if (result != 0) -+ new_root = ERR_PTR(result); -+ return new_root; -+} -+ -+/* build &reiser4_item_data for inserting child pointer -+ -+ Build &reiser4_item_data that can be later used to insert pointer to @child -+ in its parent. 
-+ -+*/ -+void build_child_ptr_data(znode * child /* node pointer to which will be -+ * inserted */ , -+ reiser4_item_data * data /* where to store result */ ) -+{ -+ assert("nikita-1116", child != NULL); -+ assert("nikita-1117", data != NULL); -+ -+ /* -+ * NOTE: use address of child's blocknr as address of data to be -+ * inserted. As result of this data gets into on-disk structure in cpu -+ * byte order. internal's create_hook converts it to little endian byte -+ * order. -+ */ -+ data->data = (char *)znode_get_block(child); -+ /* data -> data is kernel space */ -+ data->user = 0; -+ data->length = sizeof(reiser4_block_nr); -+ /* FIXME-VS: hardcoded internal item? */ -+ -+ /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */ -+ data->iplug = item_plugin_by_id(NODE_POINTER_ID); -+} -+ -+/* add pointer to @child into empty @parent. -+ -+ This is used when pointer to old root is inserted into new root which is -+ empty. -+*/ -+static int add_child_ptr(znode * parent, znode * child) -+{ -+ coord_t coord; -+ reiser4_item_data data; -+ int result; -+ reiser4_key key; -+ -+ assert("nikita-1111", parent != NULL); -+ assert("nikita-1112", child != NULL); -+ assert("nikita-1115", -+ znode_get_level(parent) == znode_get_level(child) + 1); -+ -+ result = zload(parent); -+ if (result != 0) -+ return result; -+ assert("nikita-1113", node_is_empty(parent)); -+ coord_init_first_unit(&coord, parent); -+ -+ build_child_ptr_data(child, &data); -+ data.arg = NULL; -+ -+ read_lock_dk(znode_get_tree(parent)); -+ key = *znode_get_ld_key(child); -+ read_unlock_dk(znode_get_tree(parent)); -+ -+ result = node_plugin_by_node(parent)->create_item(&coord, &key, &data, -+ NULL); -+ znode_make_dirty(parent); -+ zrelse(parent); -+ return result; -+} -+ -+/* actually remove tree root */ -+static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is -+ * being removed */, -+ znode * old_root /* root node that is being -+ * removed */ , -+ znode * new_root /* new 
root---sole child of -+ * @old_root */, -+ const reiser4_block_nr * new_root_blk /* disk address of -+ * @new_root */) -+{ -+ znode *uber; -+ int result; -+ lock_handle handle_for_uber; -+ -+ assert("umka-265", tree != NULL); -+ assert("nikita-1198", new_root != NULL); -+ assert("nikita-1199", -+ znode_get_level(new_root) + 1 == znode_get_level(old_root)); -+ -+ assert("nikita-1201", znode_is_write_locked(old_root)); -+ -+ assert("nikita-1203", -+ disk_addr_eq(new_root_blk, znode_get_block(new_root))); -+ -+ init_lh(&handle_for_uber); -+ /* obtain and lock "fake" znode protecting changes in tree height. */ -+ result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI, -+ &handle_for_uber); -+ if (result == 0) { -+ uber = handle_for_uber.node; -+ -+ znode_make_dirty(uber); -+ -+ /* don't take long term lock a @new_root. Take spinlock. */ -+ -+ write_lock_tree(tree); -+ -+ tree->root_block = *new_root_blk; -+ --tree->height; -+ -+ /* recalculate max balance overhead */ -+ tree->estimate_one_insert = estimate_one_insert_item(tree); -+ -+ assert("nikita-1202", -+ tree->height == znode_get_level(new_root)); -+ -+ /* new root is child on "fake" node */ -+ init_parent_coord(&new_root->in_parent, uber); -+ ++uber->c_count; -+ -+ /* sibling_list_insert_nolock(new_root, NULL); */ -+ write_unlock_tree(tree); -+ -+ /* reinitialise old root. */ -+ result = node_plugin_by_node(old_root)->init(old_root); -+ znode_make_dirty(old_root); -+ if (result == 0) { -+ assert("nikita-1279", node_is_empty(old_root)); -+ ZF_SET(old_root, JNODE_HEARD_BANSHEE); -+ old_root->c_count = 0; -+ } -+ } -+ done_lh(&handle_for_uber); -+ -+ return result; -+} -+ -+/* remove tree root -+ -+ This function removes tree root, decreasing tree height by one. Tree root -+ and its only child (that is going to become new tree root) are write locked -+ at the entry. -+ -+ To remove tree root we need to take lock on special "fake" znode that -+ protects changes of tree height. 
See comments in reiser4_add_tree_root() for -+ more on this. -+ -+ Also parent pointers have to be updated in -+ old and new root. To simplify code, function is split into two parts: outer -+ reiser4_kill_tree_root() collects all necessary arguments and calls -+ reiser4_kill_root() to do the actual job. -+ -+*/ -+int reiser4_kill_tree_root(znode * old_root /* tree root that we are -+ removing*/) -+{ -+ int result; -+ coord_t down_link; -+ znode *new_root; -+ reiser4_tree *tree; -+ -+ assert("umka-266", current_tree != NULL); -+ assert("nikita-1194", old_root != NULL); -+ assert("nikita-1196", znode_is_root(old_root)); -+ assert("nikita-1200", node_num_items(old_root) == 1); -+ assert("nikita-1401", znode_is_write_locked(old_root)); -+ -+ coord_init_first_unit(&down_link, old_root); -+ -+ tree = znode_get_tree(old_root); -+ new_root = child_znode(&down_link, old_root, 0, 1); -+ if (!IS_ERR(new_root)) { -+ result = -+ reiser4_kill_root(tree, old_root, new_root, -+ znode_get_block(new_root)); -+ zput(new_root); -+ } else -+ result = PTR_ERR(new_root); -+ -+ return result; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/tree_mod.h linux-2.6.24/fs/reiser4/tree_mod.h ---- linux-2.6.24.orig/fs/reiser4/tree_mod.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/tree_mod.h 2008-01-25 11:39:07.100249935 +0300 -@@ -0,0 +1,29 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for -+ * comments. 
*/ -+ -+#if !defined( __REISER4_TREE_MOD_H__ ) -+#define __REISER4_TREE_MOD_H__ -+ -+#include "forward.h" -+ -+znode *reiser4_new_node(znode * brother, tree_level level); -+znode *reiser4_add_tree_root(znode * old_root, znode * fake); -+int reiser4_kill_tree_root(znode * old_root); -+void build_child_ptr_data(znode * child, reiser4_item_data * data); -+ -+/* __REISER4_TREE_MOD_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/tree_walk.c linux-2.6.24/fs/reiser4/tree_walk.c ---- linux-2.6.24.orig/fs/reiser4/tree_walk.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/tree_walk.c 2008-01-25 11:39:07.100249935 +0300 -@@ -0,0 +1,927 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Routines and macros to: -+ -+ get_left_neighbor() -+ -+ get_right_neighbor() -+ -+ get_parent() -+ -+ get_first_child() -+ -+ get_last_child() -+ -+ various routines to walk the whole tree and do things to it like -+ repack it, or move it to tertiary storage. Please make them as -+ generic as is reasonable. 
-+ -+*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "jnode.h" -+#include "znode.h" -+#include "tree_walk.h" -+#include "tree.h" -+#include "super.h" -+ -+/* These macros are used internally in tree_walk.c in attempt to make -+ lock_neighbor() code usable to build lock_parent(), lock_right_neighbor, -+ lock_left_neighbor */ -+#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off))) -+#define FIELD_OFFSET(name) offsetof(znode, name) -+#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node) -+#define LEFT_PTR_OFFSET FIELD_OFFSET(left) -+#define RIGHT_PTR_OFFSET FIELD_OFFSET(right) -+ -+/* This is the generic procedure to get and lock `generic' neighbor (left or -+ right neighbor or parent). It implements common algorithm for all cases of -+ getting lock on neighbor node, only znode structure field is different in -+ each case. This is parameterized by ptr_offset argument, which is byte -+ offset for the pointer to the desired neighbor within the current node's -+ znode structure. 
This function should be called with the tree lock held */ -+static int lock_neighbor( -+ /* resulting lock handle */ -+ lock_handle * result, -+ /* znode to lock */ -+ znode * node, -+ /* pointer to neighbor (or parent) znode field offset, in bytes from -+ the base address of znode structure */ -+ int ptr_offset, -+ /* lock mode for longterm_lock_znode call */ -+ znode_lock_mode mode, -+ /* lock request for longterm_lock_znode call */ -+ znode_lock_request req, -+ /* GN_* flags */ -+ int flags, int rlocked) -+{ -+ reiser4_tree *tree = znode_get_tree(node); -+ znode *neighbor; -+ int ret; -+ -+ assert("umka-236", node != NULL); -+ assert("umka-237", tree != NULL); -+ assert_rw_locked(&(tree->tree_lock)); -+ -+ if (flags & GN_TRY_LOCK) -+ req |= ZNODE_LOCK_NONBLOCK; -+ if (flags & GN_SAME_ATOM) -+ req |= ZNODE_LOCK_DONT_FUSE; -+ -+ /* get neighbor's address by using of sibling link, quit while loop -+ (and return) if link is not available. */ -+ while (1) { -+ neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset); -+ -+ /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if -+ * node pointed by it is not connected. -+ * -+ * However, GN_ALLOW_NOT_CONNECTED option masks "connected" -+ * check and allows passing reference to not connected znode to -+ * subsequent longterm_lock_znode() call. This kills possible -+ * busy loop if we are trying to get longterm lock on locked but -+ * not yet connected parent node. */ -+ if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED) -+ || znode_is_connected(neighbor))) { -+ return RETERR(-E_NO_NEIGHBOR); -+ } -+ -+ /* protect it from deletion. */ -+ zref(neighbor); -+ -+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree); -+ -+ ret = longterm_lock_znode(result, neighbor, mode, req); -+ -+ /* The lock handle obtains its own reference, release the one from above. */ -+ zput(neighbor); -+ -+ rlocked ? 
read_lock_tree(tree) : write_lock_tree(tree); -+ -+ /* restart if node we got reference to is being -+ invalidated. we should not get reference to this node -+ again. */ -+ if (ret == -EINVAL) -+ continue; -+ if (ret) -+ return ret; -+ -+ /* check if neighbor link still points to just locked znode; -+ the link could have been changed while the process slept. */ -+ if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset)) -+ return 0; -+ -+ /* znode was locked by mistake; unlock it and restart locking -+ process from beginning. */ -+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree); -+ longterm_unlock_znode(result); -+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree); -+ } -+} -+ -+/* get parent node with longterm lock, accepts GN* flags. */ -+int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ , -+ znode * node /* child node */ , -+ znode_lock_mode mode -+ /* type of lock: read or write */ , -+ int flags /* GN_* flags */ ) -+{ -+ int result; -+ -+ read_lock_tree(znode_get_tree(node)); -+ result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode, -+ ZNODE_LOCK_HIPRI, flags, 1); -+ read_unlock_tree(znode_get_tree(node)); -+ return result; -+} -+ -+/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT -+ bit in @flags parameter */ -+/* Audited by: umka (2002.06.14) */ -+static inline int -+lock_side_neighbor(lock_handle * result, -+ znode * node, znode_lock_mode mode, int flags, int rlocked) -+{ -+ int ret; -+ int ptr_offset; -+ znode_lock_request req; -+ -+ if (flags & GN_GO_LEFT) { -+ ptr_offset = LEFT_PTR_OFFSET; -+ req = ZNODE_LOCK_LOPRI; -+ } else { -+ ptr_offset = RIGHT_PTR_OFFSET; -+ req = ZNODE_LOCK_HIPRI; -+ } -+ -+ ret = -+ lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked); -+ -+ if (ret == -E_NO_NEIGHBOR) /* if we walk left or right -E_NO_NEIGHBOR does not -+ * guarantee that neighbor is absent in the -+ * tree; in this case we return -ENOENT -- -+ * means neighbor at least not found 
in -+ * cache */ -+ return RETERR(-ENOENT); -+ -+ return ret; -+} -+ -+#if REISER4_DEBUG -+ -+int check_sibling_list(znode * node) -+{ -+ znode *scan; -+ znode *next; -+ -+ assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree)); -+ -+ if (node == NULL) -+ return 1; -+ -+ if (ZF_ISSET(node, JNODE_RIP)) -+ return 1; -+ -+ assert("nikita-3270", node != NULL); -+ assert_rw_write_locked(&(znode_get_tree(node)->tree_lock)); -+ -+ for (scan = node; znode_is_left_connected(scan); scan = next) { -+ next = scan->left; -+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) { -+ assert("nikita-3271", znode_is_right_connected(next)); -+ assert("nikita-3272", next->right == scan); -+ } else -+ break; -+ } -+ for (scan = node; znode_is_right_connected(scan); scan = next) { -+ next = scan->right; -+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) { -+ assert("nikita-3273", znode_is_left_connected(next)); -+ assert("nikita-3274", next->left == scan); -+ } else -+ break; -+ } -+ return 1; -+} -+ -+#endif -+ -+/* Znode sibling pointers maintenence. */ -+ -+/* Znode sibling pointers are established between any neighbored nodes which are -+ in cache. There are two znode state bits (JNODE_LEFT_CONNECTED, -+ JNODE_RIGHT_CONNECTED), if left or right sibling pointer contains actual -+ value (even NULL), corresponded JNODE_*_CONNECTED bit is set. -+ -+ Reiser4 tree operations which may allocate new znodes (CBK, tree balancing) -+ take care about searching (hash table lookup may be required) of znode -+ neighbors, establishing sibling pointers between them and setting -+ JNODE_*_CONNECTED state bits. */ -+ -+/* adjusting of sibling pointers and `connected' states for two -+ neighbors; works if one neighbor is NULL (was not found). 
*/ -+ -+/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */ -+void link_left_and_right(znode * left, znode * right) -+{ -+ assert("nikita-3275", check_sibling_list(left)); -+ assert("nikita-3275", check_sibling_list(right)); -+ -+ if (left != NULL) { -+ if (left->right == NULL) { -+ left->right = right; -+ ZF_SET(left, JNODE_RIGHT_CONNECTED); -+ -+ ON_DEBUG(left->right_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ -+ } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE) -+ && left->right != right) { -+ -+ ON_DEBUG(left->right->left_version = -+ atomic_inc_return(&delim_key_version); -+ left->right_version = -+ atomic_inc_return(&delim_key_version);); -+ -+ left->right->left = NULL; -+ left->right = right; -+ ZF_SET(left, JNODE_RIGHT_CONNECTED); -+ } else -+ /* -+ * there is a race condition in renew_sibling_link() -+ * and assertions below check that it is only one -+ * there. Thread T1 calls renew_sibling_link() without -+ * GN_NO_ALLOC flag. zlook() doesn't find neighbor -+ * node, but before T1 gets to the -+ * link_left_and_right(), another thread T2 creates -+ * neighbor node and connects it. check for -+ * left->right == NULL above protects T1 from -+ * overwriting correct left->right pointer installed -+ * by T2. 
-+ */ -+ assert("nikita-3302", -+ right == NULL || left->right == right); -+ } -+ if (right != NULL) { -+ if (right->left == NULL) { -+ right->left = left; -+ ZF_SET(right, JNODE_LEFT_CONNECTED); -+ -+ ON_DEBUG(right->left_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ -+ } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE) -+ && right->left != left) { -+ -+ ON_DEBUG(right->left->right_version = -+ atomic_inc_return(&delim_key_version); -+ right->left_version = -+ atomic_inc_return(&delim_key_version);); -+ -+ right->left->right = NULL; -+ right->left = left; -+ ZF_SET(right, JNODE_LEFT_CONNECTED); -+ -+ } else -+ assert("nikita-3303", -+ left == NULL || right->left == left); -+ } -+ assert("nikita-3275", check_sibling_list(left)); -+ assert("nikita-3275", check_sibling_list(right)); -+} -+ -+/* Audited by: umka (2002.06.14) */ -+static void link_znodes(znode * first, znode * second, int to_left) -+{ -+ if (to_left) -+ link_left_and_right(second, first); -+ else -+ link_left_and_right(first, second); -+} -+ -+/* getting of next (to left or to right, depend on gn_to_left bit in flags) -+ coord's unit position in horizontal direction, even across node -+ boundary. Should be called under tree lock, it protects nonexistence of -+ sibling link on parent level, if lock_side_neighbor() fails with -+ -ENOENT. */ -+static int far_next_coord(coord_t * coord, lock_handle * handle, int flags) -+{ -+ int ret; -+ znode *node; -+ reiser4_tree *tree; -+ -+ assert("umka-243", coord != NULL); -+ assert("umka-244", handle != NULL); -+ assert("zam-1069", handle->node == NULL); -+ -+ ret = -+ (flags & GN_GO_LEFT) ? 
coord_prev_unit(coord) : -+ coord_next_unit(coord); -+ if (!ret) -+ return 0; -+ -+ ret = -+ lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0); -+ if (ret) -+ return ret; -+ -+ node = handle->node; -+ tree = znode_get_tree(node); -+ write_unlock_tree(tree); -+ -+ coord_init_zero(coord); -+ -+ /* We avoid synchronous read here if it is specified by flag. */ -+ if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) { -+ ret = jstartio(ZJNODE(handle->node)); -+ if (!ret) -+ ret = -E_REPEAT; -+ goto error_locked; -+ } -+ -+ /* corresponded zrelse() should be called by the clients of -+ far_next_coord(), in place when this node gets unlocked. */ -+ ret = zload(handle->node); -+ if (ret) -+ goto error_locked; -+ -+ if (flags & GN_GO_LEFT) -+ coord_init_last_unit(coord, node); -+ else -+ coord_init_first_unit(coord, node); -+ -+ if (0) { -+ error_locked: -+ longterm_unlock_znode(handle); -+ } -+ write_lock_tree(tree); -+ return ret; -+} -+ -+/* Very significant function which performs a step in horizontal direction -+ when sibling pointer is not available. Actually, it is only function which -+ does it. 
-+ Note: this function does not restore locking status at exit, -+ caller should does care about proper unlocking and zrelsing */ -+static int -+renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child, -+ tree_level level, int flags, int *nr_locked) -+{ -+ int ret; -+ int to_left = flags & GN_GO_LEFT; -+ reiser4_block_nr da; -+ /* parent of the neighbor node; we set it to parent until not sharing -+ of one parent between child and neighbor node is detected */ -+ znode *side_parent = coord->node; -+ reiser4_tree *tree = znode_get_tree(child); -+ znode *neighbor = NULL; -+ -+ assert("umka-245", coord != NULL); -+ assert("umka-246", handle != NULL); -+ assert("umka-247", child != NULL); -+ assert("umka-303", tree != NULL); -+ -+ init_lh(handle); -+ write_lock_tree(tree); -+ ret = far_next_coord(coord, handle, flags); -+ -+ if (ret) { -+ if (ret != -ENOENT) { -+ write_unlock_tree(tree); -+ return ret; -+ } -+ } else { -+ item_plugin *iplug; -+ -+ if (handle->node != NULL) { -+ (*nr_locked)++; -+ side_parent = handle->node; -+ } -+ -+ /* does coord object points to internal item? We do not -+ support sibling pointers between znode for formatted and -+ unformatted nodes and return -E_NO_NEIGHBOR in that case. 
*/ -+ iplug = item_plugin_by_coord(coord); -+ if (!item_is_internal(coord)) { -+ link_znodes(child, NULL, to_left); -+ write_unlock_tree(tree); -+ /* we know there can't be formatted neighbor */ -+ return RETERR(-E_NO_NEIGHBOR); -+ } -+ write_unlock_tree(tree); -+ -+ iplug->s.internal.down_link(coord, NULL, &da); -+ -+ if (flags & GN_NO_ALLOC) { -+ neighbor = zlook(tree, &da); -+ } else { -+ neighbor = -+ zget(tree, &da, side_parent, level, -+ reiser4_ctx_gfp_mask_get()); -+ } -+ -+ if (IS_ERR(neighbor)) { -+ ret = PTR_ERR(neighbor); -+ return ret; -+ } -+ -+ if (neighbor) -+ /* update delimiting keys */ -+ set_child_delimiting_keys(coord->node, coord, neighbor); -+ -+ write_lock_tree(tree); -+ } -+ -+ if (likely(neighbor == NULL || -+ (znode_get_level(child) == znode_get_level(neighbor) -+ && child != neighbor))) -+ link_znodes(child, neighbor, to_left); -+ else { -+ warning("nikita-3532", -+ "Sibling nodes on the different levels: %i != %i\n", -+ znode_get_level(child), znode_get_level(neighbor)); -+ ret = RETERR(-EIO); -+ } -+ -+ write_unlock_tree(tree); -+ -+ /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */ -+ if (neighbor != NULL && (flags & GN_NO_ALLOC)) -+ /* atomic_dec(&ZJNODE(neighbor)->x_count); */ -+ zput(neighbor); -+ -+ return ret; -+} -+ -+/* This function is for establishing of one side relation. 
*/ -+/* Audited by: umka (2002.06.14) */ -+static int connect_one_side(coord_t * coord, znode * node, int flags) -+{ -+ coord_t local; -+ lock_handle handle; -+ int nr_locked; -+ int ret; -+ -+ assert("umka-248", coord != NULL); -+ assert("umka-249", node != NULL); -+ -+ coord_dup_nocheck(&local, coord); -+ -+ init_lh(&handle); -+ -+ ret = -+ renew_sibling_link(&local, &handle, node, znode_get_level(node), -+ flags | GN_NO_ALLOC, &nr_locked); -+ -+ if (handle.node != NULL) { -+ /* complementary operations for zload() and lock() in far_next_coord() */ -+ zrelse(handle.node); -+ longterm_unlock_znode(&handle); -+ } -+ -+ /* we catch error codes which are not interesting for us because we -+ run renew_sibling_link() only for znode connection. */ -+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR) -+ return 0; -+ -+ return ret; -+} -+ -+/* if @child is not in `connected' state, performs hash searches for left and -+ right neighbor nodes and establishes horizontal sibling links */ -+/* Audited by: umka (2002.06.14), umka (2002.06.15) */ -+int connect_znode(coord_t * parent_coord, znode * child) -+{ -+ reiser4_tree *tree = znode_get_tree(child); -+ int ret = 0; -+ -+ assert("zam-330", parent_coord != NULL); -+ assert("zam-331", child != NULL); -+ assert("zam-332", parent_coord->node != NULL); -+ assert("umka-305", tree != NULL); -+ -+ /* it is trivial to `connect' root znode because it can't have -+ neighbors */ -+ if (znode_above_root(parent_coord->node)) { -+ child->left = NULL; -+ child->right = NULL; -+ ZF_SET(child, JNODE_LEFT_CONNECTED); -+ ZF_SET(child, JNODE_RIGHT_CONNECTED); -+ -+ ON_DEBUG(child->left_version = -+ atomic_inc_return(&delim_key_version); -+ child->right_version = -+ atomic_inc_return(&delim_key_version);); -+ -+ return 0; -+ } -+ -+ /* load parent node */ -+ coord_clear_iplug(parent_coord); -+ ret = zload(parent_coord->node); -+ -+ if (ret != 0) -+ return ret; -+ -+ /* protect `connected' state check by tree_lock */ -+ read_lock_tree(tree); -+ -+ if 
(!znode_is_right_connected(child)) { -+ read_unlock_tree(tree); -+ /* connect right (default is right) */ -+ ret = connect_one_side(parent_coord, child, GN_NO_ALLOC); -+ if (ret) -+ goto zrelse_and_ret; -+ -+ read_lock_tree(tree); -+ } -+ -+ ret = znode_is_left_connected(child); -+ -+ read_unlock_tree(tree); -+ -+ if (!ret) { -+ ret = -+ connect_one_side(parent_coord, child, -+ GN_NO_ALLOC | GN_GO_LEFT); -+ } else -+ ret = 0; -+ -+ zrelse_and_ret: -+ zrelse(parent_coord->node); -+ -+ return ret; -+} -+ -+/* this function is like renew_sibling_link() but allocates neighbor node if -+ it doesn't exist and `connects' it. It may require making two steps in -+ horizontal direction, first one for neighbor node finding/allocation, -+ second one is for finding neighbor of neighbor to connect freshly allocated -+ znode. */ -+/* Audited by: umka (2002.06.14), umka (2002.06.15) */ -+static int -+renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags) -+{ -+ coord_t local; -+ lock_handle empty[2]; -+ reiser4_tree *tree = znode_get_tree(node); -+ znode *neighbor = NULL; -+ int nr_locked = 0; -+ int ret; -+ -+ assert("umka-250", coord != NULL); -+ assert("umka-251", node != NULL); -+ assert("umka-307", tree != NULL); -+ assert("umka-308", level <= tree->height); -+ -+ /* umka (2002.06.14) -+ Here probably should be a check for given "level" validness. -+ Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT); -+ */ -+ -+ coord_dup(&local, coord); -+ -+ ret = -+ renew_sibling_link(&local, &empty[0], node, level, -+ flags & ~GN_NO_ALLOC, &nr_locked); -+ if (ret) -+ goto out; -+ -+ /* tree lock is not needed here because we keep parent node(s) locked -+ and reference to neighbor znode incremented */ -+ neighbor = (flags & GN_GO_LEFT) ? 
node->left : node->right; -+ -+ read_lock_tree(tree); -+ ret = znode_is_connected(neighbor); -+ read_unlock_tree(tree); -+ if (ret) { -+ ret = 0; -+ goto out; -+ } -+ -+ ret = -+ renew_sibling_link(&local, &empty[nr_locked], neighbor, level, -+ flags | GN_NO_ALLOC, &nr_locked); -+ /* second renew_sibling_link() call is used for znode connection only, -+ so we can live with these errors */ -+ if (-ENOENT == ret || -E_NO_NEIGHBOR == ret) -+ ret = 0; -+ -+ out: -+ -+ for (--nr_locked; nr_locked >= 0; --nr_locked) { -+ zrelse(empty[nr_locked].node); -+ longterm_unlock_znode(&empty[nr_locked]); -+ } -+ -+ if (neighbor != NULL) -+ /* decrement znode reference counter without actually -+ releasing it. */ -+ atomic_dec(&ZJNODE(neighbor)->x_count); -+ -+ return ret; -+} -+ -+/* -+ reiser4_get_neighbor() -- lock node's neighbor. -+ -+ reiser4_get_neighbor() locks node's neighbor (left or right one, depends on -+ given parameter) using sibling link to it. If sibling link is not available -+ (i.e. neighbor znode is not in cache) and flags allow read blocks, we go one -+ level up for information about neighbor's disk address. We lock node's -+ parent, if it is common parent for both 'node' and its neighbor, neighbor's -+ disk address is in next (to left or to right) down link from link that points -+ to original node. If not, we need to lock parent's neighbor, read its content -+ and take first(last) downlink with neighbor's disk address. That locking -+ could be done by using sibling link and lock_neighbor() function, if sibling -+ link exists. In another case we have to go level up again until we find -+ common parent or valid sibling link. Then go down -+ allocating/connecting/locking/reading nodes until neighbor of first one is -+ locked. -+ -+ @neighbor: result lock handle, -+ @node: a node which we lock neighbor of, -+ @lock_mode: lock mode {LM_READ, LM_WRITE}, -+ @flags: logical OR of {GN_*} (see description above) subset. 
-+ -+ @return: 0 if success, negative value if lock was impossible due to an error -+ or lack of neighbor node. -+*/ -+ -+/* Audited by: umka (2002.06.14), umka (2002.06.15) */ -+int -+reiser4_get_neighbor(lock_handle * neighbor, znode * node, -+ znode_lock_mode lock_mode, int flags) -+{ -+ reiser4_tree *tree = znode_get_tree(node); -+ lock_handle path[REAL_MAX_ZTREE_HEIGHT]; -+ -+ coord_t coord; -+ -+ tree_level base_level; -+ tree_level h = 0; -+ int ret; -+ -+ assert("umka-252", tree != NULL); -+ assert("umka-253", neighbor != NULL); -+ assert("umka-254", node != NULL); -+ -+ base_level = znode_get_level(node); -+ -+ assert("umka-310", base_level <= tree->height); -+ -+ coord_init_zero(&coord); -+ -+ again: -+ /* first, we try to use simple lock_neighbor() which requires sibling -+ link existence */ -+ read_lock_tree(tree); -+ ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1); -+ read_unlock_tree(tree); -+ if (!ret) { -+ /* load znode content if it was specified */ -+ if (flags & GN_LOAD_NEIGHBOR) { -+ ret = zload(node); -+ if (ret) -+ longterm_unlock_znode(neighbor); -+ } -+ return ret; -+ } -+ -+ /* only -ENOENT means we may look upward and try to connect -+ @node with its neighbor (if @flags allow us to do it) */ -+ if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS)) -+ return ret; -+ -+ /* before establishing of sibling link we lock parent node; it is -+ required by renew_neighbor() to work. */ -+ init_lh(&path[0]); -+ ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK); -+ if (ret) -+ return ret; -+ if (znode_above_root(path[0].node)) { -+ longterm_unlock_znode(&path[0]); -+ return RETERR(-E_NO_NEIGHBOR); -+ } -+ -+ while (1) { -+ znode *child = (h == 0) ? 
node : path[h - 1].node; -+ znode *parent = path[h].node; -+ -+ ret = zload(parent); -+ if (ret) -+ break; -+ -+ ret = find_child_ptr(parent, child, &coord); -+ -+ if (ret) { -+ zrelse(parent); -+ break; -+ } -+ -+ /* try to establish missing sibling link */ -+ ret = renew_neighbor(&coord, child, h + base_level, flags); -+ -+ zrelse(parent); -+ -+ switch (ret) { -+ case 0: -+ /* unlocking of parent znode prevents simple -+ deadlock situation */ -+ done_lh(&path[h]); -+ -+ /* depend on tree level we stay on we repeat first -+ locking attempt ... */ -+ if (h == 0) -+ goto again; -+ -+ /* ... or repeat establishing of sibling link at -+ one level below. */ -+ --h; -+ break; -+ -+ case -ENOENT: -+ /* sibling link is not available -- we go -+ upward. */ -+ init_lh(&path[h + 1]); -+ ret = -+ reiser4_get_parent(&path[h + 1], parent, -+ ZNODE_READ_LOCK); -+ if (ret) -+ goto fail; -+ ++h; -+ if (znode_above_root(path[h].node)) { -+ ret = RETERR(-E_NO_NEIGHBOR); -+ goto fail; -+ } -+ break; -+ -+ case -E_DEADLOCK: -+ /* there was lock request from hi-pri locker. if -+ it is possible we unlock last parent node and -+ re-lock it again. */ -+ for (; reiser4_check_deadlock(); h--) { -+ done_lh(&path[h]); -+ if (h == 0) -+ goto fail; -+ } -+ -+ break; -+ -+ default: /* other errors. 
*/ -+ goto fail; -+ } -+ } -+ fail: -+ ON_DEBUG(check_lock_node_data(node)); -+ ON_DEBUG(check_lock_data()); -+ -+ /* unlock path */ -+ do { -+ /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto -+ fail; path[0] is already done_lh-ed, therefore -+ longterm_unlock_znode(&path[h]); is not applicable */ -+ done_lh(&path[h]); -+ --h; -+ } while (h + 1 != 0); -+ -+ return ret; -+} -+ -+/* remove node from sibling list */ -+/* Audited by: umka (2002.06.14) */ -+void sibling_list_remove(znode * node) -+{ -+ reiser4_tree *tree; -+ -+ tree = znode_get_tree(node); -+ assert("umka-255", node != NULL); -+ assert_rw_write_locked(&(tree->tree_lock)); -+ assert("nikita-3275", check_sibling_list(node)); -+ -+ write_lock_dk(tree); -+ if (znode_is_right_connected(node) && node->right != NULL && -+ znode_is_left_connected(node) && node->left != NULL) { -+ assert("zam-32245", -+ keyeq(znode_get_rd_key(node), -+ znode_get_ld_key(node->right))); -+ znode_set_rd_key(node->left, znode_get_ld_key(node->right)); -+ } -+ write_unlock_dk(tree); -+ -+ if (znode_is_right_connected(node) && node->right != NULL) { -+ assert("zam-322", znode_is_left_connected(node->right)); -+ node->right->left = node->left; -+ ON_DEBUG(node->right->left_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ if (znode_is_left_connected(node) && node->left != NULL) { -+ assert("zam-323", znode_is_right_connected(node->left)); -+ node->left->right = node->right; -+ ON_DEBUG(node->left->right_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ -+ ZF_CLR(node, JNODE_LEFT_CONNECTED); -+ ZF_CLR(node, JNODE_RIGHT_CONNECTED); -+ ON_DEBUG(node->left = node->right = NULL; -+ node->left_version = atomic_inc_return(&delim_key_version); -+ node->right_version = atomic_inc_return(&delim_key_version);); -+ assert("nikita-3276", check_sibling_list(node)); -+} -+ -+/* disconnect node from sibling list */ -+void sibling_list_drop(znode * node) -+{ -+ znode *right; -+ znode *left; -+ -+ 
assert("nikita-2464", node != NULL); -+ assert("nikita-3277", check_sibling_list(node)); -+ -+ right = node->right; -+ if (right != NULL) { -+ assert("nikita-2465", znode_is_left_connected(right)); -+ right->left = NULL; -+ ON_DEBUG(right->left_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ left = node->left; -+ if (left != NULL) { -+ assert("zam-323", znode_is_right_connected(left)); -+ left->right = NULL; -+ ON_DEBUG(left->right_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ ZF_CLR(node, JNODE_LEFT_CONNECTED); -+ ZF_CLR(node, JNODE_RIGHT_CONNECTED); -+ ON_DEBUG(node->left = node->right = NULL; -+ node->left_version = atomic_inc_return(&delim_key_version); -+ node->right_version = atomic_inc_return(&delim_key_version);); -+} -+ -+/* Insert new node into sibling list. Regular balancing inserts new node -+ after (at right side) existing and locked node (@before), except one case -+ of adding new tree root node. @before should be NULL in that case. */ -+void sibling_list_insert_nolock(znode * new, znode * before) -+{ -+ assert("zam-334", new != NULL); -+ assert("nikita-3298", !znode_is_left_connected(new)); -+ assert("nikita-3299", !znode_is_right_connected(new)); -+ assert("nikita-3300", new->left == NULL); -+ assert("nikita-3301", new->right == NULL); -+ assert("nikita-3278", check_sibling_list(new)); -+ assert("nikita-3279", check_sibling_list(before)); -+ -+ if (before != NULL) { -+ assert("zam-333", znode_is_connected(before)); -+ new->right = before->right; -+ new->left = before; -+ ON_DEBUG(new->right_version = -+ atomic_inc_return(&delim_key_version); -+ new->left_version = -+ atomic_inc_return(&delim_key_version);); -+ if (before->right != NULL) { -+ before->right->left = new; -+ ON_DEBUG(before->right->left_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } -+ before->right = new; -+ ON_DEBUG(before->right_version = -+ atomic_inc_return(&delim_key_version); -+ ); -+ } else { -+ new->right = NULL; -+ 
new->left = NULL; -+ ON_DEBUG(new->right_version = -+ atomic_inc_return(&delim_key_version); -+ new->left_version = -+ atomic_inc_return(&delim_key_version);); -+ } -+ ZF_SET(new, JNODE_LEFT_CONNECTED); -+ ZF_SET(new, JNODE_RIGHT_CONNECTED); -+ assert("nikita-3280", check_sibling_list(new)); -+ assert("nikita-3281", check_sibling_list(before)); -+} -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/tree_walk.h linux-2.6.24/fs/reiser4/tree_walk.h ---- linux-2.6.24.orig/fs/reiser4/tree_walk.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/tree_walk.h 2008-01-25 11:39:07.100249935 +0300 -@@ -0,0 +1,125 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+/* definitions of reiser4 tree walk functions */ -+ -+#ifndef __FS_REISER4_TREE_WALK_H__ -+#define __FS_REISER4_TREE_WALK_H__ -+ -+#include "debug.h" -+#include "forward.h" -+ -+/* establishes horizontal links between cached znodes */ -+int connect_znode(coord_t * coord, znode * node); -+ -+/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor()) -+ have the following common arguments: -+ -+ return codes: -+ -+ @return : 0 - OK, -+ -+ZAM-FIXME-HANS: wrong return code name. Change them all. -+ -ENOENT - neighbor is not in cache, what is detected by sibling -+ link absence. -+ -+ -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be -+ found (because we are left-/right- most node of the -+ tree, for example). Also, this return code is for -+ reiser4_get_parent() when we see no parent link -- it -+ means that our node is root node. -+ -+ -E_DEADLOCK - deadlock detected (request from high-priority process -+ received), other error codes are conformed to -+ /usr/include/asm/errno.h . 
-+*/ -+ -+int -+reiser4_get_parent_flags(lock_handle * result, znode * node, -+ znode_lock_mode mode, int flags); -+ -+/* bits definition for reiser4_get_neighbor function `flags' arg. */ -+typedef enum { -+ /* If sibling pointer is NULL, this flag allows get_neighbor() to try to -+ * find not allocated not connected neigbor by going though upper -+ * levels */ -+ GN_CAN_USE_UPPER_LEVELS = 0x1, -+ /* locking left neighbor instead of right one */ -+ GN_GO_LEFT = 0x2, -+ /* automatically load neighbor node content */ -+ GN_LOAD_NEIGHBOR = 0x4, -+ /* return -E_REPEAT if can't lock */ -+ GN_TRY_LOCK = 0x8, -+ /* used internally in tree_walk.c, causes renew_sibling to not -+ allocate neighbor znode, but only search for it in znode cache */ -+ GN_NO_ALLOC = 0x10, -+ /* do not go across atom boundaries */ -+ GN_SAME_ATOM = 0x20, -+ /* allow to lock not connected nodes */ -+ GN_ALLOW_NOT_CONNECTED = 0x40, -+ /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */ -+ GN_ASYNC = 0x80 -+} znode_get_neigbor_flags; -+ -+/* A commonly used wrapper for reiser4_get_parent_flags(). 
*/ -+static inline int reiser4_get_parent(lock_handle * result, znode * node, -+ znode_lock_mode mode) -+{ -+ return reiser4_get_parent_flags(result, node, mode, -+ GN_ALLOW_NOT_CONNECTED); -+} -+ -+int reiser4_get_neighbor(lock_handle * neighbor, znode * node, -+ znode_lock_mode lock_mode, int flags); -+ -+/* there are wrappers for most common usages of reiser4_get_neighbor() */ -+static inline int -+reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode, -+ int flags) -+{ -+ return reiser4_get_neighbor(result, node, lock_mode, -+ flags | GN_GO_LEFT); -+} -+ -+static inline int -+reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode, -+ int flags) -+{ -+ ON_DEBUG(check_lock_node_data(node)); -+ ON_DEBUG(check_lock_data()); -+ return reiser4_get_neighbor(result, node, lock_mode, -+ flags & (~GN_GO_LEFT)); -+} -+ -+extern void sibling_list_remove(znode * node); -+extern void sibling_list_drop(znode * node); -+extern void sibling_list_insert_nolock(znode * new, znode * before); -+extern void link_left_and_right(znode * left, znode * right); -+ -+/* Functions called by tree_walk() when tree_walk() ... */ -+struct tree_walk_actor { -+ /* ... meets a formatted node, */ -+ int (*process_znode) (tap_t *, void *); -+ /* ... meets an extent, */ -+ int (*process_extent) (tap_t *, void *); -+ /* ... begins tree traversal or repeats it after -E_REPEAT was returned by -+ * node or extent processing functions. 
*/ -+ int (*before) (void *); -+}; -+ -+#if REISER4_DEBUG -+int check_sibling_list(znode * node); -+#else -+#define check_sibling_list(n) (1) -+#endif -+ -+#endif /* __FS_REISER4_TREE_WALK_H__ */ -+ -+/* -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/txnmgr.c linux-2.6.24/fs/reiser4/txnmgr.c ---- linux-2.6.24.orig/fs/reiser4/txnmgr.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/txnmgr.c 2008-01-25 11:39:07.108251996 +0300 -@@ -0,0 +1,3164 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Joshua MacDonald wrote the first draft of this code. */ -+ -+/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a -+filesystem scales only as well as its worst locking design. You need to -+substantially restructure this code. Josh was not as experienced a programmer -+as you. Particularly review how the locking style differs from what you did -+for znodes usingt hi-lo priority locking, and present to me an opinion on -+whether the differences are well founded. */ -+ -+/* I cannot help but to disagree with the sentiment above. Locking of -+ * transaction manager is _not_ badly designed, and, at the very least, is not -+ * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority -+ * locking on znodes, especially on the root node of the tree. --nikita, -+ * 2003.10.13 */ -+ -+/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The -+ txnmgr processes capture_block requests and manages the relationship between jnodes and -+ atoms through the various stages of a transcrash, and it also oversees the fusion and -+ capture-on-copy processes. The main difficulty with this task is maintaining a -+ deadlock-free lock ordering between atoms and jnodes/handles. 
The reason for the -+ difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle -+ must be broken. The main requirement is that atom-fusion be deadlock free, so once you -+ hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies -+ that any time you check the atom-pointer of a jnode or handle and then try to lock that -+ atom, you must use trylock() and possibly reverse the order. -+ -+ This code implements the design documented at: -+ -+ http://namesys.com/txn-doc.html -+ -+ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the -+above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this -+topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12 -+year old --- define all technical terms used. -+ -+*/ -+ -+/* Thoughts on the external transaction interface: -+ -+ In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which -+ creates state that lasts for the duration of a system call and is called at the start -+ of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(), -+ occupying the scope of a single system call. We wish to give certain applications an -+ interface to begin and close (commit) transactions. Since our implementation of -+ transactions does not yet support isolation, allowing an application to open a -+ transaction implies trusting it to later close the transaction. Part of the -+ transaction interface will be aimed at enabling that trust, but the interface for -+ actually using transactions is fairly narrow. -+ -+ BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate -+ this identifier into a string that a shell-script could use, allowing you to start a -+ transaction by issuing a command. 
Once open, the transcrash should be set in the task -+ structure, and there should be options (I suppose) to allow it to be carried across -+ fork/exec. A transcrash has several options: -+ -+ - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only -+ on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to -+ capture on reads as well, it should set READ_FUSING. -+ -+ - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must -+ eventually close (or else the machine must crash). If the application dies an -+ unexpected death with an open transcrash, for example, or if it hangs for a long -+ duration, one solution (to avoid crashing the machine) is to simply close it anyway. -+ This is a dangerous option, but it is one way to solve the problem until isolated -+ transcrashes are available for untrusted applications. -+ -+ It seems to be what databases do, though it is unclear how one avoids a DoS attack -+ creating a vulnerability based on resource starvation. Guaranteeing that some -+ minimum amount of computational resources are made available would seem more correct -+ than guaranteeing some amount of time. When we again have someone to code the work, -+ this issue should be considered carefully. -Hans -+ -+ RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how -+ many dirty blocks it expects. The reserve_blocks interface should be called at a point -+ where it is safe for the application to fail, because the system may not be able to -+ grant the allocation and the application must be able to back-out. For this reason, -+ the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but -+ the application may also wish to extend the allocation after beginning its transcrash. -+ -+ CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making -+ modifications that require transaction protection. 
When isolated transactions are -+ supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a -+ RESERVE_BLOCKS call fails for the application, it should "abort" by calling -+ CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is -+ why, for safety, the application should call RESERVE_BLOCKS before making any changes). -+ -+ For actually implementing these out-of-system-call-scopped transcrashes, the -+ reiser4_context has a "txn_handle *trans" pointer that may be set to an open -+ transcrash. Currently there are no dynamically-allocated transcrashes, but there is a -+ "struct kmem_cache *_txnh_slab" created for that purpose in this file. -+*/ -+ -+/* Extending the other system call interfaces for future transaction features: -+ -+ Specialized applications may benefit from passing flags to the ordinary system call -+ interface such as read(), write(), or stat(). For example, the application specifies -+ WRITE_FUSING by default but wishes to add that a certain read() command should be -+ treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data -+ read, or the file-data read? These issues are straight-forward, but there are a lot of -+ them and adding the necessary flags-passing code will be tedious. -+ -+ When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW) -+ flag, which specifies that although it is a read operation being requested, a -+ write-lock should be taken. The reason is that read-locks are shared while write-locks -+ are exclusive, so taking a read-lock when a later-write is known in advance will often -+ leads to deadlock. If a reader knows it will write later, it should issue read -+ requests with the RMW flag set. -+*/ -+ -+/* -+ The znode/atom deadlock avoidance. -+ -+ FIXME(Zam): writing of this comment is in progress. 
-+ -+ The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of atom's -+ long-term locking, which makes reiser4 locking scheme more complex. It had -+ deadlocks until we implement deadlock avoidance algorithms. That deadlocks -+ looked as the following: one stopped thread waits for a long-term lock on -+ znode, the thread who owns that lock waits when fusion with another atom will -+ be allowed. -+ -+ The source of the deadlocks is an optimization of not capturing index nodes -+ for read. Let's prove it. Suppose we have dumb node capturing scheme which -+ unconditionally captures each block before locking it. -+ -+ That scheme has no deadlocks. Let's begin with the thread which stage is -+ ASTAGE_CAPTURE_WAIT and it waits for a znode lock. The thread can't wait for -+ a capture because it's stage allows fusion with any atom except which are -+ being committed currently. A process of atom commit can't deadlock because -+ atom commit procedure does not acquire locks and does not fuse with other -+ atoms. Reiser4 does capturing right before going to sleep inside the -+ longtertm_lock_znode() function, it means the znode which we want to lock is -+ already captured and its atom is in ASTAGE_CAPTURE_WAIT stage. If we -+ continue the analysis we understand that no one process in the sequence may -+ waits atom fusion. Thereby there are no deadlocks of described kind. -+ -+ The capturing optimization makes the deadlocks possible. A thread can wait a -+ lock which owner did not captured that node. The lock owner's current atom -+ is not fused with the first atom and it does not get a ASTAGE_CAPTURE_WAIT -+ state. A deadlock is possible when that atom meets another one which is in -+ ASTAGE_CAPTURE_WAIT already. -+ -+ The deadlock avoidance scheme includes two algorithms: -+ -+ First algorithm is used when a thread captures a node which is locked but not -+ captured by another thread. Those nodes are marked MISSED_IN_CAPTURE at the -+ moment we skip their capturing. 
If such a node (marked MISSED_IN_CAPTURE) is -+ being captured by a thread with current atom is in ASTAGE_CAPTURE_WAIT, the -+ routine which forces all lock owners to join with current atom is executed. -+ -+ Second algorithm does not allow to skip capturing of already captured nodes. -+ -+ Both algorithms together prevent waiting a longterm lock without atom fusion -+ with atoms of all lock owners, which is a key thing for getting atom/znode -+ locking deadlocks. -+*/ -+ -+/* -+ * Transactions and mmap(2). -+ * -+ * 1. Transactions are not supported for accesses through mmap(2), because -+ * this would effectively amount to user-level transactions whose duration -+ * is beyond control of the kernel. -+ * -+ * 2. That said, we still want to preserve some decency with regard to -+ * mmap(2). During normal write(2) call, following sequence of events -+ * happens: -+ * -+ * 1. page is created; -+ * -+ * 2. jnode is created, dirtied and captured into current atom. -+ * -+ * 3. extent is inserted and modified. -+ * -+ * Steps (2) and (3) take place under long term lock on the twig node. -+ * -+ * When file is accessed through mmap(2) page is always created during -+ * page fault. -+ * After this (in reiser4_readpage()->reiser4_readpage_extent()): -+ * -+ * 1. if access is made to non-hole page new jnode is created, (if -+ * necessary) -+ * -+ * 2. if access is made to the hole page, jnode is not created (XXX -+ * not clear why). -+ * -+ * Also, even if page is created by write page fault it is not marked -+ * dirty immediately by handle_mm_fault(). Probably this is to avoid races -+ * with page write-out. -+ * -+ * Dirty bit installed by hardware is only transferred to the struct page -+ * later, when page is unmapped (in zap_pte_range(), or -+ * try_to_unmap_one()). -+ * -+ * So, with mmap(2) we have to handle following irksome situations: -+ * -+ * 1. there exists modified page (clean or dirty) without jnode -+ * -+ * 2. 
there exists modified page (clean or dirty) with clean jnode -+ * -+ * 3. clean page which is a part of atom can be transparently modified -+ * at any moment through mapping without becoming dirty. -+ * -+ * (1) and (2) can lead to the out-of-memory situation: ->writepage() -+ * doesn't know what to do with such pages and ->sync_sb()/->writepages() -+ * don't see them, because these methods operate on atoms. -+ * -+ * (3) can lead to the loss of data: suppose we have dirty page with dirty -+ * captured jnode captured by some atom. As part of early flush (for -+ * example) page was written out. Dirty bit was cleared on both page and -+ * jnode. After this page is modified through mapping, but kernel doesn't -+ * notice and just discards page and jnode as part of commit. (XXX -+ * actually it doesn't, because to reclaim page ->releasepage() has to be -+ * called and before this dirty bit will be transferred to the struct -+ * page). -+ * -+ */ -+ -+#include "debug.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "wander.h" -+#include "ktxnmgrd.h" -+#include "super.h" -+#include "page_cache.h" -+#include "reiser4.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "flush.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include /* for totalram_pages */ -+ -+static void atom_free(txn_atom * atom); -+ -+static int commit_txnh(txn_handle * txnh); -+ -+static void wakeup_atom_waitfor_list(txn_atom * atom); -+static void wakeup_atom_waiting_list(txn_atom * atom); -+ -+static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh); -+ -+static void capture_assign_block_nolock(txn_atom * atom, jnode * node); -+ -+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node); -+ -+static int capture_init_fusion(jnode * node, txn_handle * txnh, -+ txn_capture mode); -+ -+static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, 
txn_capture); -+ -+static void capture_fuse_into(txn_atom * small, txn_atom * large); -+ -+void reiser4_invalidate_list(struct list_head *); -+ -+/* GENERIC STRUCTURES */ -+ -+typedef struct _txn_wait_links txn_wait_links; -+ -+struct _txn_wait_links { -+ lock_stack *_lock_stack; -+ struct list_head _fwaitfor_link; -+ struct list_head _fwaiting_link; -+ int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks); -+ int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks); -+}; -+ -+/* FIXME: In theory, we should be using the slab cache init & destructor -+ methods instead of, e.g., jnode_init, etc. */ -+static struct kmem_cache *_atom_slab = NULL; -+/* this is for user-visible, cross system-call transactions. */ -+static struct kmem_cache *_txnh_slab = NULL; -+ -+/** -+ * init_txnmgr_static - create transaction manager slab caches -+ * -+ * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module -+ * initialization. -+ */ -+int init_txnmgr_static(void) -+{ -+ assert("jmacd-600", _atom_slab == NULL); -+ assert("jmacd-601", _txnh_slab == NULL); -+ -+ ON_DEBUG(atomic_set(&flush_cnt, 0)); -+ -+ _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, NULL); -+ if (_atom_slab == NULL) -+ return RETERR(-ENOMEM); -+ -+ _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0, -+ SLAB_HWCACHE_ALIGN, NULL); -+ if (_txnh_slab == NULL) { -+ kmem_cache_destroy(_atom_slab); -+ _atom_slab = NULL; -+ return RETERR(-ENOMEM); -+ } -+ -+ return 0; -+} -+ -+/** -+ * done_txnmgr_static - delete txn_atom and txn_handle caches -+ * -+ * This is called on reiser4 module unloading or system shutdown. 
-+ */ -+void done_txnmgr_static(void) -+{ -+ destroy_reiser4_cache(&_atom_slab); -+ destroy_reiser4_cache(&_txnh_slab); -+} -+ -+/** -+ * init_txnmgr - initialize a new transaction manager -+ * @mgr: pointer to transaction manager embedded in reiser4 super block -+ * -+ * This is called on mount. Makes necessary initializations. -+ */ -+void reiser4_init_txnmgr(txn_mgr *mgr) -+{ -+ assert("umka-169", mgr != NULL); -+ -+ mgr->atom_count = 0; -+ mgr->id_count = 1; -+ INIT_LIST_HEAD(&mgr->atoms_list); -+ spin_lock_init(&mgr->tmgr_lock); -+ mutex_init(&mgr->commit_mutex); -+} -+ -+/** -+ * reiser4_done_txnmgr - stop transaction manager -+ * @mgr: pointer to transaction manager embedded in reiser4 super block -+ * -+ * This is called on umount. Does sanity checks. -+ */ -+void reiser4_done_txnmgr(txn_mgr *mgr) -+{ -+ assert("umka-170", mgr != NULL); -+ assert("umka-1701", list_empty_careful(&mgr->atoms_list)); -+ assert("umka-1702", mgr->atom_count == 0); -+} -+ -+/* Initialize a transaction handle. */ -+/* Audited by: umka (2002.06.13) */ -+static void txnh_init(txn_handle * txnh, txn_mode mode) -+{ -+ assert("umka-171", txnh != NULL); -+ -+ txnh->mode = mode; -+ txnh->atom = NULL; -+ reiser4_ctx_gfp_mask_set(); -+ txnh->flags = 0; -+ spin_lock_init(&txnh->hlock); -+ INIT_LIST_HEAD(&txnh->txnh_link); -+} -+ -+#if REISER4_DEBUG -+/* Check if a transaction handle is clean. */ -+static int txnh_isclean(txn_handle * txnh) -+{ -+ assert("umka-172", txnh != NULL); -+ return txnh->atom == NULL && -+ LOCK_CNT_NIL(spin_locked_txnh); -+} -+#endif -+ -+/* Initialize an atom. 
*/ -+static void atom_init(txn_atom * atom) -+{ -+ int level; -+ -+ assert("umka-173", atom != NULL); -+ -+ memset(atom, 0, sizeof(txn_atom)); -+ -+ atom->stage = ASTAGE_FREE; -+ atom->start_time = jiffies; -+ -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) -+ INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level)); -+ -+ INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom)); -+ INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom)); -+ INIT_LIST_HEAD(ATOM_WB_LIST(atom)); -+ INIT_LIST_HEAD(&atom->inodes); -+ spin_lock_init(&(atom->alock)); -+ /* list of transaction handles */ -+ INIT_LIST_HEAD(&atom->txnh_list); -+ /* link to transaction manager's list of atoms */ -+ INIT_LIST_HEAD(&atom->atom_link); -+ INIT_LIST_HEAD(&atom->fwaitfor_list); -+ INIT_LIST_HEAD(&atom->fwaiting_list); -+ blocknr_set_init(&atom->delete_set); -+ blocknr_set_init(&atom->wandered_map); -+ -+ init_atom_fq_parts(atom); -+} -+ -+#if REISER4_DEBUG -+/* Check if an atom is clean. */ -+static int atom_isclean(txn_atom * atom) -+{ -+ int level; -+ -+ assert("umka-174", atom != NULL); -+ -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { -+ if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) { -+ return 0; -+ } -+ } -+ -+ return atom->stage == ASTAGE_FREE && -+ atom->txnh_count == 0 && -+ atom->capture_count == 0 && -+ atomic_read(&atom->refcount) == 0 && -+ (&atom->atom_link == atom->atom_link.next && -+ &atom->atom_link == atom->atom_link.prev) && -+ list_empty_careful(&atom->txnh_list) && -+ list_empty_careful(ATOM_CLEAN_LIST(atom)) && -+ list_empty_careful(ATOM_OVRWR_LIST(atom)) && -+ list_empty_careful(ATOM_WB_LIST(atom)) && -+ list_empty_careful(&atom->fwaitfor_list) && -+ list_empty_careful(&atom->fwaiting_list) && -+ atom_fq_parts_are_clean(atom); -+} -+#endif -+ -+/* Begin a transaction in this context. Currently this uses the reiser4_context's -+ trans_in_ctx, which means that transaction handles are stack-allocated. 
Eventually -+ this will be extended to allow transaction handles to span several contexts. */ -+/* Audited by: umka (2002.06.13) */ -+void reiser4_txn_begin(reiser4_context * context) -+{ -+ assert("jmacd-544", context->trans == NULL); -+ -+ context->trans = &context->trans_in_ctx; -+ -+ /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING -+ transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is -+ stack allocated right now, but we would like to allow for dynamically allocated -+ transcrashes that span multiple system calls. -+ */ -+ txnh_init(context->trans, TXN_WRITE_FUSING); -+} -+ -+/* Finish a transaction handle context. */ -+int reiser4_txn_end(reiser4_context * context) -+{ -+ long ret = 0; -+ txn_handle *txnh; -+ -+ assert("umka-283", context != NULL); -+ assert("nikita-3012", reiser4_schedulable()); -+ assert("vs-24", context == get_current_context()); -+ assert("nikita-2967", lock_stack_isclean(get_current_lock_stack())); -+ -+ txnh = context->trans; -+ if (txnh != NULL) { -+ if (txnh->atom != NULL) -+ ret = commit_txnh(txnh); -+ assert("jmacd-633", txnh_isclean(txnh)); -+ context->trans = NULL; -+ } -+ return ret; -+} -+ -+void reiser4_txn_restart(reiser4_context * context) -+{ -+ reiser4_txn_end(context); -+ reiser4_preempt_point(); -+ reiser4_txn_begin(context); -+} -+ -+void reiser4_txn_restart_current(void) -+{ -+ reiser4_txn_restart(get_current_context()); -+} -+ -+/* TXN_ATOM */ -+ -+/* Get the atom belonging to a txnh, which is not locked. Return txnh locked. Locks atom, if atom -+ is not NULL. This performs the necessary spin_trylock to break the lock-ordering cycle. May -+ return NULL. 
*/ -+static txn_atom *txnh_get_atom(txn_handle * txnh) -+{ -+ txn_atom *atom; -+ -+ assert("umka-180", txnh != NULL); -+ assert_spin_not_locked(&(txnh->hlock)); -+ -+ while (1) { -+ spin_lock_txnh(txnh); -+ atom = txnh->atom; -+ -+ if (atom == NULL) -+ break; -+ -+ if (spin_trylock_atom(atom)) -+ break; -+ -+ atomic_inc(&atom->refcount); -+ -+ spin_unlock_txnh(txnh); -+ spin_lock_atom(atom); -+ spin_lock_txnh(txnh); -+ -+ if (txnh->atom == atom) { -+ atomic_dec(&atom->refcount); -+ break; -+ } -+ -+ spin_unlock_txnh(txnh); -+ atom_dec_and_unlock(atom); -+ } -+ -+ return atom; -+} -+ -+/* Get the current atom and spinlock it if current atom present. May return NULL */ -+txn_atom *get_current_atom_locked_nocheck(void) -+{ -+ reiser4_context *cx; -+ txn_atom *atom; -+ txn_handle *txnh; -+ -+ cx = get_current_context(); -+ assert("zam-437", cx != NULL); -+ -+ txnh = cx->trans; -+ assert("zam-435", txnh != NULL); -+ -+ atom = txnh_get_atom(txnh); -+ -+ spin_unlock_txnh(txnh); -+ return atom; -+} -+ -+/* Get the atom belonging to a jnode, which is initially locked. Return with -+ both jnode and atom locked. This performs the necessary spin_trylock to -+ break the lock-ordering cycle. Assumes the jnode is already locked, and -+ returns NULL if atom is not set. */ -+txn_atom *jnode_get_atom(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert("umka-181", node != NULL); -+ -+ while (1) { -+ assert_spin_locked(&(node->guard)); -+ -+ atom = node->atom; -+ /* node is not in any atom */ -+ if (atom == NULL) -+ break; -+ -+ /* If atom is not locked, grab the lock and return */ -+ if (spin_trylock_atom(atom)) -+ break; -+ -+ /* At least one jnode belongs to this atom it guarantees that -+ * atom->refcount > 0, we can safely increment refcount. */ -+ atomic_inc(&atom->refcount); -+ spin_unlock_jnode(node); -+ -+ /* re-acquire spin locks in the right order */ -+ spin_lock_atom(atom); -+ spin_lock_jnode(node); -+ -+ /* check if node still points to the same atom. 
*/ -+ if (node->atom == atom) { -+ atomic_dec(&atom->refcount); -+ break; -+ } -+ -+ /* releasing of atom lock and reference requires not holding -+ * locks on jnodes. */ -+ spin_unlock_jnode(node); -+ -+ /* We do not sure that this atom has extra references except our -+ * one, so we should call proper function which may free atom if -+ * last reference is released. */ -+ atom_dec_and_unlock(atom); -+ -+ /* lock jnode again for getting valid node->atom pointer -+ * value. */ -+ spin_lock_jnode(node); -+ } -+ -+ return atom; -+} -+ -+/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used -+ by flush code to indicate whether the next node (in some direction) is suitable for -+ flushing. */ -+int -+same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value) -+{ -+ int compat; -+ txn_atom *atom; -+ -+ assert("umka-182", node != NULL); -+ assert("umka-183", check != NULL); -+ -+ /* Not sure what this function is supposed to do if supplied with @check that is -+ neither formatted nor unformatted (bitmap or so). */ -+ assert("nikita-2373", jnode_is_znode(check) -+ || jnode_is_unformatted(check)); -+ -+ /* Need a lock on CHECK to get its atom and to check various state bits. -+ Don't need a lock on NODE once we get the atom lock. */ -+ /* It is not enough to lock two nodes and check (node->atom == -+ check->atom) because atom could be locked and being fused at that -+ moment, jnodes of the atom of that state (being fused) can point to -+ different objects, but the atom is the same. 
*/ -+ spin_lock_jnode(check); -+ -+ atom = jnode_get_atom(check); -+ -+ if (atom == NULL) { -+ compat = 0; -+ } else { -+ compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY)); -+ -+ if (compat && jnode_is_znode(check)) { -+ compat &= znode_is_connected(JZNODE(check)); -+ } -+ -+ if (compat && alloc_check) { -+ compat &= (alloc_value == jnode_is_flushprepped(check)); -+ } -+ -+ spin_unlock_atom(atom); -+ } -+ -+ spin_unlock_jnode(check); -+ -+ return compat; -+} -+ -+/* Decrement the atom's reference count and if it falls to zero, free it. */ -+void atom_dec_and_unlock(txn_atom * atom) -+{ -+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr; -+ -+ assert("umka-186", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ assert("zam-1039", atomic_read(&atom->refcount) > 0); -+ -+ if (atomic_dec_and_test(&atom->refcount)) { -+ /* take txnmgr lock and atom lock in proper order. */ -+ if (!spin_trylock_txnmgr(mgr)) { -+ /* This atom should exist after we re-acquire its -+ * spinlock, so we increment its reference counter. */ -+ atomic_inc(&atom->refcount); -+ spin_unlock_atom(atom); -+ spin_lock_txnmgr(mgr); -+ spin_lock_atom(atom); -+ -+ if (!atomic_dec_and_test(&atom->refcount)) { -+ spin_unlock_atom(atom); -+ spin_unlock_txnmgr(mgr); -+ return; -+ } -+ } -+ assert_spin_locked(&(mgr->tmgr_lock)); -+ atom_free(atom); -+ spin_unlock_txnmgr(mgr); -+ } else -+ spin_unlock_atom(atom); -+} -+ -+/* Create new atom and connect it to given transaction handle. This adds the -+ atom to the transaction manager's list and sets its reference count to 1, an -+ artificial reference which is kept until it commits. 
We play strange games -+ to avoid allocation under jnode & txnh spinlocks.*/ -+ -+static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh) -+{ -+ txn_atom *atom; -+ txn_mgr *mgr; -+ -+ if (REISER4_DEBUG && rofs_tree(current_tree)) { -+ warning("nikita-3366", "Creating atom on rofs"); -+ dump_stack(); -+ } -+ -+ if (*atom_alloc == NULL) { -+ (*atom_alloc) = kmem_cache_alloc(_atom_slab, -+ reiser4_ctx_gfp_mask_get()); -+ -+ if (*atom_alloc == NULL) -+ return RETERR(-ENOMEM); -+ } -+ -+ /* and, also, txnmgr spin lock should be taken before jnode and txnh -+ locks. */ -+ mgr = &get_super_private(reiser4_get_current_sb())->tmgr; -+ spin_lock_txnmgr(mgr); -+ spin_lock_txnh(txnh); -+ -+ /* Check whether new atom still needed */ -+ if (txnh->atom != NULL) { -+ /* NOTE-NIKITA probably it is rather better to free -+ * atom_alloc here than thread it up to reiser4_try_capture() */ -+ -+ spin_unlock_txnh(txnh); -+ spin_unlock_txnmgr(mgr); -+ -+ return -E_REPEAT; -+ } -+ -+ atom = *atom_alloc; -+ *atom_alloc = NULL; -+ -+ atom_init(atom); -+ -+ assert("jmacd-17", atom_isclean(atom)); -+ -+ /* -+ * lock ordering is broken here. It is ok, as long as @atom is new -+ * and inaccessible for others. We can't use spin_lock_atom or -+ * spin_lock(&atom->alock) because they care about locking -+ * dependencies. spin_trylock_lock doesn't. -+ */ -+ check_me("", spin_trylock_atom(atom)); -+ -+ /* add atom to the end of transaction manager's list of atoms */ -+ list_add_tail(&atom->atom_link, &mgr->atoms_list); -+ atom->atom_id = mgr->id_count++; -+ mgr->atom_count += 1; -+ -+ /* Release txnmgr lock */ -+ spin_unlock_txnmgr(mgr); -+ -+ /* One reference until it commits. */ -+ atomic_inc(&atom->refcount); -+ atom->stage = ASTAGE_CAPTURE_FUSE; -+ atom->super = reiser4_get_current_sb(); -+ capture_assign_txnh_nolock(atom, txnh); -+ -+ spin_unlock_atom(atom); -+ spin_unlock_txnh(txnh); -+ -+ return -E_REPEAT; -+} -+ -+/* Return true if an atom is currently "open". 
*/ -+static int atom_isopen(const txn_atom * atom) -+{ -+ assert("umka-185", atom != NULL); -+ -+ return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT; -+} -+ -+/* Return the number of pointers to this atom that must be updated during fusion. This -+ approximates the amount of work to be done. Fusion chooses the atom with fewer -+ pointers to fuse into the atom with more pointers. */ -+static int atom_pointer_count(const txn_atom * atom) -+{ -+ assert("umka-187", atom != NULL); -+ -+ /* This is a measure of the amount of work needed to fuse this atom -+ * into another. */ -+ return atom->txnh_count + atom->capture_count; -+} -+ -+/* Called holding the atom lock, this removes the atom from the transaction manager list -+ and frees it. */ -+static void atom_free(txn_atom * atom) -+{ -+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr; -+ -+ assert("umka-188", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ /* Remove from the txn_mgr's atom list */ -+ assert_spin_locked(&(mgr->tmgr_lock)); -+ mgr->atom_count -= 1; -+ list_del_init(&atom->atom_link); -+ -+ /* Clean the atom */ -+ assert("jmacd-16", -+ (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE)); -+ atom->stage = ASTAGE_FREE; -+ -+ blocknr_set_destroy(&atom->delete_set); -+ blocknr_set_destroy(&atom->wandered_map); -+ -+ assert("jmacd-16", atom_isclean(atom)); -+ -+ spin_unlock_atom(atom); -+ -+ kmem_cache_free(_atom_slab, atom); -+} -+ -+static int atom_is_dotard(const txn_atom * atom) -+{ -+ return time_after(jiffies, atom->start_time + -+ get_current_super_private()->tmgr.atom_max_age); -+} -+ -+static int atom_can_be_committed(txn_atom * atom) -+{ -+ assert_spin_locked(&(atom->alock)); -+ assert("zam-885", atom->txnh_count > atom->nr_waiters); -+ return atom->txnh_count == atom->nr_waiters + 1; -+} -+ -+/* Return true if an atom should commit now. This is determined by aging, atom -+ size or atom flags. 
*/ -+static int atom_should_commit(const txn_atom * atom) -+{ -+ assert("umka-189", atom != NULL); -+ return -+ (atom->flags & ATOM_FORCE_COMMIT) || -+ ((unsigned)atom_pointer_count(atom) > -+ get_current_super_private()->tmgr.atom_max_size) -+ || atom_is_dotard(atom); -+} -+ -+/* return 1 if current atom exists and requires commit. */ -+int current_atom_should_commit(void) -+{ -+ txn_atom *atom; -+ int result = 0; -+ -+ atom = get_current_atom_locked_nocheck(); -+ if (atom) { -+ result = atom_should_commit(atom); -+ spin_unlock_atom(atom); -+ } -+ return result; -+} -+ -+static int atom_should_commit_asap(const txn_atom * atom) -+{ -+ unsigned int captured; -+ unsigned int pinnedpages; -+ -+ assert("nikita-3309", atom != NULL); -+ -+ captured = (unsigned)atom->capture_count; -+ pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode); -+ -+ return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100); -+} -+ -+static jnode *find_first_dirty_in_list(struct list_head *head, int flags) -+{ -+ jnode *first_dirty; -+ -+ list_for_each_entry(first_dirty, head, capture_link) { -+ if (!(flags & JNODE_FLUSH_COMMIT)) { -+ /* -+ * skip jnodes which "heard banshee" or having active -+ * I/O -+ */ -+ if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) || -+ JF_ISSET(first_dirty, JNODE_WRITEBACK)) -+ continue; -+ } -+ return first_dirty; -+ } -+ return NULL; -+} -+ -+/* Get first dirty node from the atom's dirty_nodes[n] lists; return NULL if atom has no dirty -+ nodes on atom's lists */ -+jnode *find_first_dirty_jnode(txn_atom * atom, int flags) -+{ -+ jnode *first_dirty; -+ tree_level level; -+ -+ assert_spin_locked(&(atom->alock)); -+ -+ /* The flush starts from LEAF_LEVEL (=1). 
*/ -+ for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { -+ if (list_empty_careful(ATOM_DIRTY_LIST(atom, level))) -+ continue; -+ -+ first_dirty = -+ find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level), -+ flags); -+ if (first_dirty) -+ return first_dirty; -+ } -+ -+ /* znode-above-root is on the list #0. */ -+ return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags); -+} -+ -+static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq) -+{ -+ jnode *cur; -+ -+ assert("zam-905", atom_is_protected(atom)); -+ -+ cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link); -+ while (ATOM_WB_LIST(atom) != &cur->capture_link) { -+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link); -+ -+ spin_lock_jnode(cur); -+ if (!JF_ISSET(cur, JNODE_WRITEBACK)) { -+ if (JF_ISSET(cur, JNODE_DIRTY)) { -+ queue_jnode(fq, cur); -+ } else { -+ /* move jnode to atom's clean list */ -+ list_move_tail(&cur->capture_link, -+ ATOM_CLEAN_LIST(atom)); -+ } -+ } -+ spin_unlock_jnode(cur); -+ -+ cur = next; -+ } -+} -+ -+/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback -+ * jnodes to disk. */ -+static int submit_wb_list(void) -+{ -+ int ret; -+ flush_queue_t *fq; -+ -+ fq = get_fq_for_current_atom(); -+ if (IS_ERR(fq)) -+ return PTR_ERR(fq); -+ -+ dispatch_wb_list(fq->atom, fq); -+ spin_unlock_atom(fq->atom); -+ -+ ret = reiser4_write_fq(fq, NULL, 1); -+ reiser4_fq_put(fq); -+ -+ return ret; -+} -+ -+/* Wait completion of all writes, re-submit atom writeback list if needed. */ -+static int current_atom_complete_writes(void) -+{ -+ int ret; -+ -+ /* Each jnode from that list was modified and dirtied when it had i/o -+ * request running already. 
After i/o completion we have to resubmit -+ * them to disk again.*/ -+ ret = submit_wb_list(); -+ if (ret < 0) -+ return ret; -+ -+ /* Wait all i/o completion */ -+ ret = current_atom_finish_all_fq(); -+ if (ret) -+ return ret; -+ -+ /* Scan wb list again; all i/o should be completed, we re-submit dirty -+ * nodes to disk */ -+ ret = submit_wb_list(); -+ if (ret < 0) -+ return ret; -+ -+ /* Wait all nodes we just submitted */ -+ return current_atom_finish_all_fq(); -+} -+ -+#if REISER4_DEBUG -+ -+static void reiser4_info_atom(const char *prefix, const txn_atom * atom) -+{ -+ if (atom == NULL) { -+ printk("%s: no atom\n", prefix); -+ return; -+ } -+ -+ printk("%s: refcount: %i id: %i flags: %x txnh_count: %i" -+ " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix, -+ atomic_read(&atom->refcount), atom->atom_id, atom->flags, -+ atom->txnh_count, atom->capture_count, atom->stage, -+ atom->start_time, atom->flushed); -+} -+ -+#else /* REISER4_DEBUG */ -+ -+static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {} -+ -+#endif /* REISER4_DEBUG */ -+ -+#define TOOMANYFLUSHES (1 << 13) -+ -+/* Called with the atom locked and no open "active" transaction handlers except -+ ours, this function calls flush_current_atom() until all dirty nodes are -+ processed. Then it initiates commit processing. -+ -+ Called by the single remaining open "active" txnh, which is closing. Other -+ open txnhs belong to processes which wait atom commit in commit_txnh() -+ routine. They are counted as "waiters" in atom->nr_waiters. Therefore as -+ long as we hold the atom lock none of the jnodes can be captured and/or -+ locked. -+ -+ Return value is an error code if commit fails. -+*/ -+static int commit_current_atom(long *nr_submitted, txn_atom ** atom) -+{ -+ reiser4_super_info_data *sbinfo = get_current_super_private(); -+ long ret = 0; -+ /* how many times jnode_flush() was called as a part of attempt to -+ * commit this atom. 
*/ -+ int flushiters; -+ -+ assert("zam-888", atom != NULL && *atom != NULL); -+ assert_spin_locked(&((*atom)->alock)); -+ assert("zam-887", get_current_context()->trans->atom == *atom); -+ assert("jmacd-151", atom_isopen(*atom)); -+ -+ assert("nikita-3184", -+ get_current_super_private()->delete_mutex_owner != current); -+ -+ for (flushiters = 0;; ++flushiters) { -+ ret = -+ flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS | -+ JNODE_FLUSH_COMMIT, -+ LONG_MAX /* nr_to_write */ , -+ nr_submitted, atom, NULL); -+ if (ret != -E_REPEAT) -+ break; -+ -+ /* if atom's dirty list contains one znode which is -+ HEARD_BANSHEE and is locked we have to allow lock owner to -+ continue and uncapture that znode */ -+ reiser4_preempt_point(); -+ -+ *atom = get_current_atom_locked(); -+ if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) { -+ warning("nikita-3176", -+ "Flushing like mad: %i", flushiters); -+ reiser4_info_atom("atom", *atom); -+ DEBUGON(flushiters > (1 << 20)); -+ } -+ } -+ -+ if (ret) -+ return ret; -+ -+ assert_spin_locked(&((*atom)->alock)); -+ -+ if (!atom_can_be_committed(*atom)) { -+ spin_unlock_atom(*atom); -+ return RETERR(-E_REPEAT); -+ } -+ -+ if ((*atom)->capture_count == 0) -+ goto done; -+ -+ /* Up to this point we have been flushing and after flush is called we -+ return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT -+ at this point, commit should be successful. 
*/ -+ reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT); -+ ON_DEBUG(((*atom)->committer = current)); -+ spin_unlock_atom(*atom); -+ -+ ret = current_atom_complete_writes(); -+ if (ret) -+ return ret; -+ -+ assert("zam-906", list_empty(ATOM_WB_LIST(*atom))); -+ -+ /* isolate critical code path which should be executed by only one -+ * thread using tmgr mutex */ -+ mutex_lock(&sbinfo->tmgr.commit_mutex); -+ -+ ret = reiser4_write_logs(nr_submitted); -+ if (ret < 0) -+ reiser4_panic("zam-597", "write log failed (%ld)\n", ret); -+ -+ /* The atom->ovrwr_nodes list is processed under commit mutex held -+ because of bitmap nodes which are captured by special way in -+ reiser4_pre_commit_hook_bitmap(), that way does not include -+ capture_fuse_wait() as a capturing of other nodes does -- the commit -+ mutex is used for transaction isolation instead. */ -+ reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom)); -+ mutex_unlock(&sbinfo->tmgr.commit_mutex); -+ -+ reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom)); -+ reiser4_invalidate_list(ATOM_WB_LIST(*atom)); -+ assert("zam-927", list_empty(&(*atom)->inodes)); -+ -+ spin_lock_atom(*atom); -+ done: -+ reiser4_atom_set_stage(*atom, ASTAGE_DONE); -+ ON_DEBUG((*atom)->committer = NULL); -+ -+ /* Atom's state changes, so wake up everybody waiting for this -+ event. */ -+ wakeup_atom_waiting_list(*atom); -+ -+ /* Decrement the "until commit" reference, at least one txnh (the caller) is -+ still open. */ -+ atomic_dec(&(*atom)->refcount); -+ -+ assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0); -+ assert("jmacd-1062", (*atom)->capture_count == 0); -+ BUG_ON((*atom)->capture_count != 0); -+ assert_spin_locked(&((*atom)->alock)); -+ -+ return ret; -+} -+ -+/* TXN_TXNH */ -+ -+/** -+ * force_commit_atom - commit current atom and wait commit completion -+ * @txnh: -+ * -+ * Commits current atom and wait commit completion; current atom and @txnh have -+ * to be spinlocked before call, this function unlocks them on exit. 
-+ */ -+int force_commit_atom(txn_handle *txnh) -+{ -+ txn_atom *atom; -+ -+ assert("zam-837", txnh != NULL); -+ assert_spin_locked(&(txnh->hlock)); -+ assert("nikita-2966", lock_stack_isclean(get_current_lock_stack())); -+ -+ atom = txnh->atom; -+ -+ assert("zam-834", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ /* -+ * Set flags for atom and txnh: forcing atom commit and waiting for -+ * commit completion -+ */ -+ txnh->flags |= TXNH_WAIT_COMMIT; -+ atom->flags |= ATOM_FORCE_COMMIT; -+ -+ spin_unlock_txnh(txnh); -+ spin_unlock_atom(atom); -+ -+ /* commit is here */ -+ reiser4_txn_restart_current(); -+ return 0; -+} -+ -+/* Called to force commit of any outstanding atoms. @commit_all_atoms controls -+ * should we commit all atoms including new ones which are created after this -+ * functions is called. */ -+int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms) -+{ -+ int ret; -+ txn_atom *atom; -+ txn_mgr *mgr; -+ txn_handle *txnh; -+ unsigned long start_time = jiffies; -+ reiser4_context *ctx = get_current_context(); -+ -+ assert("nikita-2965", lock_stack_isclean(get_current_lock_stack())); -+ assert("nikita-3058", reiser4_commit_check_locks()); -+ -+ reiser4_txn_restart_current(); -+ -+ mgr = &get_super_private(super)->tmgr; -+ -+ txnh = ctx->trans; -+ -+ again: -+ -+ spin_lock_txnmgr(mgr); -+ -+ list_for_each_entry(atom, &mgr->atoms_list, atom_link) { -+ spin_lock_atom(atom); -+ -+ /* Commit any atom which can be committed. If @commit_new_atoms -+ * is not set we commit only atoms which were created before -+ * this call is started. 
*/ -+ if (commit_all_atoms -+ || time_before_eq(atom->start_time, start_time)) { -+ if (atom->stage <= ASTAGE_POST_COMMIT) { -+ spin_unlock_txnmgr(mgr); -+ -+ if (atom->stage < ASTAGE_PRE_COMMIT) { -+ spin_lock_txnh(txnh); -+ /* Add force-context txnh */ -+ capture_assign_txnh_nolock(atom, txnh); -+ ret = force_commit_atom(txnh); -+ if (ret) -+ return ret; -+ } else -+ /* wait atom commit */ -+ reiser4_atom_wait_event(atom); -+ -+ goto again; -+ } -+ } -+ -+ spin_unlock_atom(atom); -+ } -+ -+#if REISER4_DEBUG -+ if (commit_all_atoms) { -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ spin_lock_reiser4_super(sbinfo); -+ assert("zam-813", -+ sbinfo->blocks_fake_allocated_unformatted == 0); -+ assert("zam-812", sbinfo->blocks_fake_allocated == 0); -+ spin_unlock_reiser4_super(sbinfo); -+ } -+#endif -+ -+ spin_unlock_txnmgr(mgr); -+ -+ return 0; -+} -+ -+/* check whether commit_some_atoms() can commit @atom. Locking is up to the -+ * caller */ -+static int atom_is_committable(txn_atom * atom) -+{ -+ return -+ atom->stage < ASTAGE_PRE_COMMIT && -+ atom->txnh_count == atom->nr_waiters && atom_should_commit(atom); -+} -+ -+/* called periodically from ktxnmgrd to commit old atoms. 
Releases ktxnmgrd spin -+ * lock at exit */ -+int commit_some_atoms(txn_mgr * mgr) -+{ -+ int ret = 0; -+ txn_atom *atom; -+ txn_handle *txnh; -+ reiser4_context *ctx; -+ struct list_head *pos, *tmp; -+ -+ ctx = get_current_context(); -+ assert("nikita-2444", ctx != NULL); -+ -+ txnh = ctx->trans; -+ spin_lock_txnmgr(mgr); -+ -+ /* -+ * this is to avoid gcc complain that atom might be used -+ * uninitialized -+ */ -+ atom = NULL; -+ -+ /* look for atom to commit */ -+ list_for_each_safe(pos, tmp, &mgr->atoms_list) { -+ atom = list_entry(pos, txn_atom, atom_link); -+ /* -+ * first test without taking atom spin lock, whether it is -+ * eligible for committing at all -+ */ -+ if (atom_is_committable(atom)) { -+ /* now, take spin lock and re-check */ -+ spin_lock_atom(atom); -+ if (atom_is_committable(atom)) -+ break; -+ spin_unlock_atom(atom); -+ } -+ } -+ -+ ret = (&mgr->atoms_list == pos); -+ spin_unlock_txnmgr(mgr); -+ -+ if (ret) { -+ /* nothing found */ -+ spin_unlock(&mgr->daemon->guard); -+ return 0; -+ } -+ -+ spin_lock_txnh(txnh); -+ -+ BUG_ON(atom == NULL); -+ /* Set the atom to force committing */ -+ atom->flags |= ATOM_FORCE_COMMIT; -+ -+ /* Add force-context txnh */ -+ capture_assign_txnh_nolock(atom, txnh); -+ -+ spin_unlock_txnh(txnh); -+ spin_unlock_atom(atom); -+ -+ /* we are about to release daemon spin lock, notify daemon it -+ has to rescan atoms */ -+ mgr->daemon->rescan = 1; -+ spin_unlock(&mgr->daemon->guard); -+ reiser4_txn_restart_current(); -+ return 0; -+} -+ -+static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom) -+{ -+ int atom_stage; -+ txn_atom *atom_2; -+ int repeat; -+ -+ assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT); -+ -+ atom_stage = atom->stage; -+ repeat = 0; -+ -+ if (!spin_trylock_txnmgr(tmgr)) { -+ atomic_inc(&atom->refcount); -+ spin_unlock_atom(atom); -+ spin_lock_txnmgr(tmgr); -+ spin_lock_atom(atom); -+ repeat = 1; -+ if (atom->stage != atom_stage) { -+ spin_unlock_txnmgr(tmgr); -+ 
atom_dec_and_unlock(atom); -+ return -E_REPEAT; -+ } -+ atomic_dec(&atom->refcount); -+ } -+ -+ list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) { -+ if (atom == atom_2) -+ continue; -+ /* -+ * if trylock does not succeed we just do not fuse with that -+ * atom. -+ */ -+ if (spin_trylock_atom(atom_2)) { -+ if (atom_2->stage < ASTAGE_PRE_COMMIT) { -+ spin_unlock_txnmgr(tmgr); -+ capture_fuse_into(atom_2, atom); -+ /* all locks are lost we can only repeat here */ -+ return -E_REPEAT; -+ } -+ spin_unlock_atom(atom_2); -+ } -+ } -+ atom->flags |= ATOM_CANCEL_FUSION; -+ spin_unlock_txnmgr(tmgr); -+ if (repeat) { -+ spin_unlock_atom(atom); -+ return -E_REPEAT; -+ } -+ return 0; -+} -+ -+/* Calls jnode_flush for current atom if it exists; if not, just take another -+ atom and call jnode_flush() for him. If current transaction handle has -+ already assigned atom (current atom) we have to close current transaction -+ prior to switch to another atom or do something with current atom. This -+ code tries to flush current atom. -+ -+ flush_some_atom() is called as part of memory clearing process. It is -+ invoked from balance_dirty_pages(), pdflushd, and entd. -+ -+ If we can flush no nodes, atom is committed, because this frees memory. -+ -+ If atom is too large or too old it is committed also. 
-+*/ -+int -+flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc, -+ int flags) -+{ -+ reiser4_context *ctx = get_current_context(); -+ txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr; -+ txn_handle *txnh = ctx->trans; -+ txn_atom *atom; -+ int ret; -+ -+ BUG_ON(wbc->nr_to_write == 0); -+ BUG_ON(*nr_submitted != 0); -+ assert("zam-1042", txnh != NULL); -+ repeat: -+ if (txnh->atom == NULL) { -+ /* current atom is not available, take first from txnmgr */ -+ spin_lock_txnmgr(tmgr); -+ -+ /* traverse the list of all atoms */ -+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { -+ /* lock atom before checking its state */ -+ spin_lock_atom(atom); -+ -+ /* -+ * we need an atom which is not being committed and -+ * which has no flushers (jnode_flush() add one flusher -+ * at the beginning and subtract one at the end). -+ */ -+ if (atom->stage < ASTAGE_PRE_COMMIT && -+ atom->nr_flushers == 0) { -+ spin_lock_txnh(txnh); -+ capture_assign_txnh_nolock(atom, txnh); -+ spin_unlock_txnh(txnh); -+ -+ goto found; -+ } -+ -+ spin_unlock_atom(atom); -+ } -+ -+ /* -+ * Write throttling is case of no one atom can be -+ * flushed/committed. -+ */ -+ if (!current_is_pdflush() && !wbc->nonblocking) { -+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { -+ spin_lock_atom(atom); -+ /* Repeat the check from the above. 
*/ -+ if (atom->stage < ASTAGE_PRE_COMMIT -+ && atom->nr_flushers == 0) { -+ spin_lock_txnh(txnh); -+ capture_assign_txnh_nolock(atom, txnh); -+ spin_unlock_txnh(txnh); -+ -+ goto found; -+ } -+ if (atom->stage <= ASTAGE_POST_COMMIT) { -+ spin_unlock_txnmgr(tmgr); -+ /* -+ * we just wait until atom's flusher -+ * makes a progress in flushing or -+ * committing the atom -+ */ -+ reiser4_atom_wait_event(atom); -+ goto repeat; -+ } -+ spin_unlock_atom(atom); -+ } -+ } -+ spin_unlock_txnmgr(tmgr); -+ return 0; -+ found: -+ spin_unlock_txnmgr(tmgr); -+ } else -+ atom = get_current_atom_locked(); -+ -+ BUG_ON(atom->super != ctx->super); -+ assert("vs-35", atom->super == ctx->super); -+ if (start) { -+ spin_lock_jnode(start); -+ ret = (atom == start->atom) ? 1 : 0; -+ spin_unlock_jnode(start); -+ if (ret == 0) -+ start = NULL; -+ } -+ ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start); -+ if (ret == 0) { -+ /* flush_current_atom returns 0 only if it submitted for write -+ nothing */ -+ BUG_ON(*nr_submitted != 0); -+ if (*nr_submitted == 0 || atom_should_commit_asap(atom)) { -+ if (atom->capture_count < tmgr->atom_min_size && -+ !(atom->flags & ATOM_CANCEL_FUSION)) { -+ ret = txn_try_to_fuse_small_atom(tmgr, atom); -+ if (ret == -E_REPEAT) { -+ reiser4_preempt_point(); -+ goto repeat; -+ } -+ } -+ /* if early flushing could not make more nodes clean, -+ * or atom is too old/large, -+ * we force current atom to commit */ -+ /* wait for commit completion but only if this -+ * wouldn't stall pdflushd and ent thread. 
*/ -+ if (!wbc->nonblocking && !ctx->entd) -+ txnh->flags |= TXNH_WAIT_COMMIT; -+ atom->flags |= ATOM_FORCE_COMMIT; -+ } -+ spin_unlock_atom(atom); -+ } else if (ret == -E_REPEAT) { -+ if (*nr_submitted == 0) { -+ /* let others who hampers flushing (hold longterm locks, -+ for instance) to free the way for flush */ -+ reiser4_preempt_point(); -+ goto repeat; -+ } -+ ret = 0; -+ } -+/* -+ if (*nr_submitted > wbc->nr_to_write) -+ warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted); -+*/ -+ reiser4_txn_restart(ctx); -+ -+ return ret; -+} -+ -+/* Remove processed nodes from atom's clean list (thereby remove them from transaction). */ -+void reiser4_invalidate_list(struct list_head *head) -+{ -+ while (!list_empty(head)) { -+ jnode *node; -+ -+ node = list_entry(head->next, jnode, capture_link); -+ spin_lock_jnode(node); -+ reiser4_uncapture_block(node); -+ jput(node); -+ } -+} -+ -+static void init_wlinks(txn_wait_links * wlinks) -+{ -+ wlinks->_lock_stack = get_current_lock_stack(); -+ INIT_LIST_HEAD(&wlinks->_fwaitfor_link); -+ INIT_LIST_HEAD(&wlinks->_fwaiting_link); -+ wlinks->waitfor_cb = NULL; -+ wlinks->waiting_cb = NULL; -+} -+ -+/* Add atom to the atom's waitfor list and wait for somebody to wake us up; */ -+void reiser4_atom_wait_event(txn_atom * atom) -+{ -+ txn_wait_links _wlinks; -+ -+ assert_spin_locked(&(atom->alock)); -+ assert("nikita-3156", -+ lock_stack_isclean(get_current_lock_stack()) || -+ atom->nr_running_queues > 0); -+ -+ init_wlinks(&_wlinks); -+ list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list); -+ atomic_inc(&atom->refcount); -+ spin_unlock_atom(atom); -+ -+ reiser4_prepare_to_sleep(_wlinks._lock_stack); -+ reiser4_go_to_sleep(_wlinks._lock_stack); -+ -+ spin_lock_atom(atom); -+ list_del(&_wlinks._fwaitfor_link); -+ atom_dec_and_unlock(atom); -+} -+ -+void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage) -+{ -+ assert("nikita-3535", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ 
assert("nikita-3536", stage <= ASTAGE_INVALID); -+ /* Excelsior! */ -+ assert("nikita-3537", stage >= atom->stage); -+ if (atom->stage != stage) { -+ atom->stage = stage; -+ reiser4_atom_send_event(atom); -+ } -+} -+ -+/* wake all threads which wait for an event */ -+void reiser4_atom_send_event(txn_atom * atom) -+{ -+ assert_spin_locked(&(atom->alock)); -+ wakeup_atom_waitfor_list(atom); -+} -+ -+/* Informs txn manager code that owner of this txn_handle should wait atom commit completion (for -+ example, because it does fsync(2)) */ -+static int should_wait_commit(txn_handle * h) -+{ -+ return h->flags & TXNH_WAIT_COMMIT; -+} -+ -+typedef struct commit_data { -+ txn_atom *atom; -+ txn_handle *txnh; -+ long nr_written; -+ /* as an optimization we start committing atom by first trying to -+ * flush it few times without switching into ASTAGE_CAPTURE_WAIT. This -+ * allows to reduce stalls due to other threads waiting for atom in -+ * ASTAGE_CAPTURE_WAIT stage. ->preflush is counter of these -+ * preliminary flushes. */ -+ int preflush; -+ /* have we waited on atom. */ -+ int wait; -+ int failed; -+ int wake_ktxnmgrd_up; -+} commit_data; -+ -+/* -+ * Called from commit_txnh() repeatedly, until either error happens, or atom -+ * commits successfully. -+ */ -+static int try_commit_txnh(commit_data * cd) -+{ -+ int result; -+ -+ assert("nikita-2968", lock_stack_isclean(get_current_lock_stack())); -+ -+ /* Get the atom and txnh locked. */ -+ cd->atom = txnh_get_atom(cd->txnh); -+ assert("jmacd-309", cd->atom != NULL); -+ spin_unlock_txnh(cd->txnh); -+ -+ if (cd->wait) { -+ cd->atom->nr_waiters--; -+ cd->wait = 0; -+ } -+ -+ if (cd->atom->stage == ASTAGE_DONE) -+ return 0; -+ -+ if (cd->failed) -+ return 0; -+ -+ if (atom_should_commit(cd->atom)) { -+ /* if atom is _very_ large schedule it for commit as soon as -+ * possible. 
*/ -+ if (atom_should_commit_asap(cd->atom)) { -+ /* -+ * When atom is in PRE_COMMIT or later stage following -+ * invariant (encoded in atom_can_be_committed()) -+ * holds: there is exactly one non-waiter transaction -+ * handle opened on this atom. When thread wants to -+ * wait until atom commits (for example sync()) it -+ * waits on atom event after increasing -+ * atom->nr_waiters (see blow in this function). It -+ * cannot be guaranteed that atom is already committed -+ * after receiving event, so loop has to be -+ * re-started. But if atom switched into PRE_COMMIT -+ * stage and became too large, we cannot change its -+ * state back to CAPTURE_WAIT (atom stage can only -+ * increase monotonically), hence this check. -+ */ -+ if (cd->atom->stage < ASTAGE_CAPTURE_WAIT) -+ reiser4_atom_set_stage(cd->atom, -+ ASTAGE_CAPTURE_WAIT); -+ cd->atom->flags |= ATOM_FORCE_COMMIT; -+ } -+ if (cd->txnh->flags & TXNH_DONT_COMMIT) { -+ /* -+ * this thread (transaction handle that is) doesn't -+ * want to commit atom. Notify waiters that handle is -+ * closed. This can happen, for example, when we are -+ * under VFS directory lock and don't want to commit -+ * atom right now to avoid stalling other threads -+ * working in the same directory. -+ */ -+ -+ /* Wake the ktxnmgrd up if the ktxnmgrd is needed to -+ * commit this atom: no atom waiters and only one -+ * (our) open transaction handle. */ -+ cd->wake_ktxnmgrd_up = -+ cd->atom->txnh_count == 1 && -+ cd->atom->nr_waiters == 0; -+ reiser4_atom_send_event(cd->atom); -+ result = 0; -+ } else if (!atom_can_be_committed(cd->atom)) { -+ if (should_wait_commit(cd->txnh)) { -+ /* sync(): wait for commit */ -+ cd->atom->nr_waiters++; -+ cd->wait = 1; -+ reiser4_atom_wait_event(cd->atom); -+ result = RETERR(-E_REPEAT); -+ } else { -+ result = 0; -+ } -+ } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) { -+ /* -+ * optimization: flush atom without switching it into -+ * ASTAGE_CAPTURE_WAIT. 
-+ * -+ * But don't do this for ktxnmgrd, because ktxnmgrd -+ * should never block on atom fusion. -+ */ -+ result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS, -+ LONG_MAX, &cd->nr_written, -+ &cd->atom, NULL); -+ if (result == 0) { -+ spin_unlock_atom(cd->atom); -+ cd->preflush = 0; -+ result = RETERR(-E_REPEAT); -+ } else /* Atoms wasn't flushed -+ * completely. Rinse. Repeat. */ -+ --cd->preflush; -+ } else { -+ /* We change atom state to ASTAGE_CAPTURE_WAIT to -+ prevent atom fusion and count ourself as an active -+ flusher */ -+ reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT); -+ cd->atom->flags |= ATOM_FORCE_COMMIT; -+ -+ result = -+ commit_current_atom(&cd->nr_written, &cd->atom); -+ if (result != 0 && result != -E_REPEAT) -+ cd->failed = 1; -+ } -+ } else -+ result = 0; -+ -+#if REISER4_DEBUG -+ if (result == 0) -+ assert_spin_locked(&(cd->atom->alock)); -+#endif -+ -+ /* perfectly valid assertion, except that when atom/txnh is not locked -+ * fusion can take place, and cd->atom points nowhere. */ -+ /* -+ assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom))); -+ */ -+ return result; -+} -+ -+/* Called to commit a transaction handle. This decrements the atom's number of open -+ handles and if it is the last handle to commit and the atom should commit, initiates -+ atom commit. 
if commit does not fail, return number of written blocks */ -+static int commit_txnh(txn_handle * txnh) -+{ -+ commit_data cd; -+ assert("umka-192", txnh != NULL); -+ -+ memset(&cd, 0, sizeof cd); -+ cd.txnh = txnh; -+ cd.preflush = 10; -+ -+ /* calls try_commit_txnh() until either atom commits, or error -+ * happens */ -+ while (try_commit_txnh(&cd) != 0) -+ reiser4_preempt_point(); -+ -+ spin_lock_txnh(txnh); -+ -+ cd.atom->txnh_count -= 1; -+ txnh->atom = NULL; -+ /* remove transaction handle from atom's list of transaction handles */ -+ list_del_init(&txnh->txnh_link); -+ -+ spin_unlock_txnh(txnh); -+ atom_dec_and_unlock(cd.atom); -+ /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably -+ * because it takes time) by current thread, we do that work -+ * asynchronously by ktxnmgrd daemon. */ -+ if (cd.wake_ktxnmgrd_up) -+ ktxnmgrd_kick(&get_current_super_private()->tmgr); -+ -+ return 0; -+} -+ -+/* TRY_CAPTURE */ -+ -+/* This routine attempts a single block-capture request. It may return -E_REPEAT if some -+ condition indicates that the request should be retried, and it may block if the -+ txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag. -+ -+ This routine encodes the basic logic of block capturing described by: -+ -+ http://namesys.com/v4/v4.html -+ -+ Our goal here is to ensure that any two blocks that contain dependent modifications -+ should commit at the same time. This function enforces this discipline by initiating -+ fusion whenever a transaction handle belonging to one atom requests to read or write a -+ block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC). -+ -+ In addition, this routine handles the initial assignment of atoms to blocks and -+ transaction handles. These are possible outcomes of this function: -+ -+ 1. The block and handle are already part of the same atom: return immediate success -+ -+ 2. 
The block is assigned but the handle is not: call capture_assign_txnh to assign -+ the handle to the block's atom. -+ -+ 3. The handle is assigned but the block is not: call capture_assign_block to assign -+ the block to the handle's atom. -+ -+ 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion -+ to fuse atoms. -+ -+ 5. Neither block nor handle are assigned: create a new atom and assign them both. -+ -+ 6. A read request for a non-captured block: return immediate success. -+ -+ This function acquires and releases the handle's spinlock. This function is called -+ under the jnode lock and if the return value is 0, it returns with the jnode lock still -+ held. If the return is -E_REPEAT or some other error condition, the jnode lock is -+ released. The external interface (reiser4_try_capture) manages re-aquiring the jnode -+ lock in the failure case. -+*/ -+static int try_capture_block( -+ txn_handle * txnh, jnode * node, txn_capture mode, -+ txn_atom ** atom_alloc) -+{ -+ txn_atom *block_atom; -+ txn_atom *txnh_atom; -+ -+ /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */ -+ assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM); -+ -+ /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree == -+ * node->tree somewhere. */ -+ assert("umka-194", txnh != NULL); -+ assert("umka-195", node != NULL); -+ -+ /* The jnode is already locked! Being called from reiser4_try_capture(). */ -+ assert_spin_locked(&(node->guard)); -+ block_atom = node->atom; -+ -+ /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't -+ let us touch the atoms themselves. */ -+ spin_lock_txnh(txnh); -+ txnh_atom = txnh->atom; -+ /* Process of capturing continues into one of four branches depends on -+ which atoms from (block atom (node->atom), current atom (txnh->atom)) -+ exist. 
*/ -+ if (txnh_atom == NULL) { -+ if (block_atom == NULL) { -+ spin_unlock_txnh(txnh); -+ spin_unlock_jnode(node); -+ /* assign empty atom to the txnh and repeat */ -+ return atom_begin_and_assign_to_txnh(atom_alloc, txnh); -+ } else { -+ atomic_inc(&block_atom->refcount); -+ /* node spin-lock isn't needed anymore */ -+ spin_unlock_jnode(node); -+ if (!spin_trylock_atom(block_atom)) { -+ spin_unlock_txnh(txnh); -+ spin_lock_atom(block_atom); -+ spin_lock_txnh(txnh); -+ } -+ /* re-check state after getting txnh and the node -+ * atom spin-locked */ -+ if (node->atom != block_atom || txnh->atom != NULL) { -+ spin_unlock_txnh(txnh); -+ atom_dec_and_unlock(block_atom); -+ return RETERR(-E_REPEAT); -+ } -+ atomic_dec(&block_atom->refcount); -+ if (block_atom->stage > ASTAGE_CAPTURE_WAIT || -+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && -+ block_atom->txnh_count != 0)) -+ return capture_fuse_wait(txnh, block_atom, NULL, mode); -+ capture_assign_txnh_nolock(block_atom, txnh); -+ spin_unlock_txnh(txnh); -+ spin_unlock_atom(block_atom); -+ return RETERR(-E_REPEAT); -+ } -+ } else { -+ /* It is time to perform deadlock prevention check over the -+ node we want to capture. It is possible this node was locked -+ for read without capturing it. The optimization which allows -+ to do it helps us in keeping atoms independent as long as -+ possible but it may cause lock/fuse deadlock problems. -+ -+ A number of similar deadlock situations with locked but not -+ captured nodes were found. In each situation there are two -+ or more threads: one of them does flushing while another one -+ does routine balancing or tree lookup. The flushing thread -+ (F) sleeps in long term locking request for node (N), another -+ thread (A) sleeps in trying to capture some node already -+ belonging the atom F, F has a state which prevents -+ immediately fusion . -+ -+ Deadlocks of this kind cannot happen if node N was properly -+ captured by thread A. 
The F thread fuse atoms before locking -+ therefore current atom of thread F and current atom of thread -+ A became the same atom and thread A may proceed. This does -+ not work if node N was not captured because the fusion of -+ atom does not happens. -+ -+ The following scheme solves the deadlock: If -+ longterm_lock_znode locks and does not capture a znode, that -+ znode is marked as MISSED_IN_CAPTURE. A node marked this way -+ is processed by the code below which restores the missed -+ capture and fuses current atoms of all the node lock owners -+ by calling the fuse_not_fused_lock_owners() function. */ -+ if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) { -+ JF_CLR(node, JNODE_MISSED_IN_CAPTURE); -+ if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) { -+ spin_unlock_txnh(txnh); -+ spin_unlock_jnode(node); -+ fuse_not_fused_lock_owners(txnh, JZNODE(node)); -+ return RETERR(-E_REPEAT); -+ } -+ } -+ if (block_atom == NULL) { -+ atomic_inc(&txnh_atom->refcount); -+ spin_unlock_txnh(txnh); -+ if (!spin_trylock_atom(txnh_atom)) { -+ spin_unlock_jnode(node); -+ spin_lock_atom(txnh_atom); -+ spin_lock_jnode(node); -+ } -+ if (txnh->atom != txnh_atom || node->atom != NULL -+ || JF_ISSET(node, JNODE_IS_DYING)) { -+ spin_unlock_jnode(node); -+ atom_dec_and_unlock(txnh_atom); -+ return RETERR(-E_REPEAT); -+ } -+ atomic_dec(&txnh_atom->refcount); -+ capture_assign_block_nolock(txnh_atom, node); -+ spin_unlock_atom(txnh_atom); -+ } else { -+ if (txnh_atom != block_atom) { -+ if (mode & TXN_CAPTURE_DONT_FUSE) { -+ spin_unlock_txnh(txnh); -+ spin_unlock_jnode(node); -+ /* we are in a "no-fusion" mode and @node is -+ * already part of transaction. 
*/ -+ return RETERR(-E_NO_NEIGHBOR); -+ } -+ return capture_init_fusion(node, txnh, mode); -+ } -+ spin_unlock_txnh(txnh); -+ } -+ } -+ return 0; -+} -+ -+static txn_capture -+build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags) -+{ -+ txn_capture cap_mode; -+ -+ assert_spin_locked(&(node->guard)); -+ -+ /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */ -+ -+ if (lock_mode == ZNODE_WRITE_LOCK) { -+ cap_mode = TXN_CAPTURE_WRITE; -+ } else if (node->atom != NULL) { -+ cap_mode = TXN_CAPTURE_WRITE; -+ } else if (0 && /* txnh->mode == TXN_READ_FUSING && */ -+ jnode_get_level(node) == LEAF_LEVEL) { -+ /* NOTE-NIKITA TXN_READ_FUSING is not currently used */ -+ /* We only need a READ_FUSING capture at the leaf level. This -+ is because the internal levels of the tree (twigs included) -+ are redundant from the point of the user that asked for a -+ read-fusing transcrash. The user only wants to read-fuse -+ atoms due to reading uncommitted data that another user has -+ written. It is the file system that reads/writes the -+ internal tree levels, the user only reads/writes leaves. */ -+ cap_mode = TXN_CAPTURE_READ_ATOMIC; -+ } else { -+ /* In this case (read lock at a non-leaf) there's no reason to -+ * capture. */ -+ /* cap_mode = TXN_CAPTURE_READ_NONCOM; */ -+ return 0; -+ } -+ -+ cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE)); -+ assert("nikita-3186", cap_mode != 0); -+ return cap_mode; -+} -+ -+/* This is an external interface to try_capture_block(), it calls -+ try_capture_block() repeatedly as long as -E_REPEAT is returned. -+ -+ @node: node to capture, -+ @lock_mode: read or write lock is used in capture mode calculation, -+ @flags: see txn_capture flags enumeration, -+ @can_coc : can copy-on-capture -+ -+ @return: 0 - node was successfully captured, -E_REPEAT - capture request -+ cannot be processed immediately as it was requested in flags, -+ < 0 - other errors. 
-+*/ -+int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode, -+ txn_capture flags) -+{ -+ txn_atom *atom_alloc = NULL; -+ txn_capture cap_mode; -+ txn_handle *txnh = get_current_context()->trans; -+ int ret; -+ -+ assert_spin_locked(&(node->guard)); -+ -+ repeat: -+ if (JF_ISSET(node, JNODE_IS_DYING)) -+ return RETERR(-EINVAL); -+ if (node->atom != NULL && txnh->atom == node->atom) -+ return 0; -+ cap_mode = build_capture_mode(node, lock_mode, flags); -+ if (cap_mode == 0 || -+ (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) { -+ /* Mark this node as "MISSED". It helps in further deadlock -+ * analysis */ -+ if (jnode_is_znode(node)) -+ JF_SET(node, JNODE_MISSED_IN_CAPTURE); -+ return 0; -+ } -+ /* Repeat try_capture as long as -E_REPEAT is returned. */ -+ ret = try_capture_block(txnh, node, cap_mode, &atom_alloc); -+ /* Regardless of non_blocking: -+ -+ If ret == 0 then jnode is still locked. -+ If ret != 0 then jnode is unlocked. -+ */ -+#if REISER4_DEBUG -+ if (ret == 0) -+ assert_spin_locked(&(node->guard)); -+ else -+ assert_spin_not_locked(&(node->guard)); -+#endif -+ assert_spin_not_locked(&(txnh->guard)); -+ -+ if (ret == -E_REPEAT) { -+ /* E_REPEAT implies all locks were released, therefore we need -+ to take the jnode's lock again. */ -+ spin_lock_jnode(node); -+ -+ /* Although this may appear to be a busy loop, it is not. -+ There are several conditions that cause E_REPEAT to be -+ returned by the call to try_capture_block, all cases -+ indicating some kind of state change that means you should -+ retry the request and will get a different result. In some -+ cases this could be avoided with some extra code, but -+ generally it is done because the necessary locks were -+ released as a result of the operation and repeating is the -+ simplest thing to do (less bug potential). 
The cases are: -+ atom fusion returns E_REPEAT after it completes (jnode and -+ txnh were unlocked); race conditions in assign_block, -+ assign_txnh, and init_fusion return E_REPEAT (trylock -+ failure); after going to sleep in capture_fuse_wait -+ (request was blocked but may now succeed). I'm not quite -+ sure how capture_copy works yet, but it may also return -+ E_REPEAT. When the request is legitimately blocked, the -+ requestor goes to sleep in fuse_wait, so this is not a busy -+ loop. */ -+ /* NOTE-NIKITA: still don't understand: -+ -+ try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT -+ -+ looks like busy loop? -+ */ -+ goto repeat; -+ } -+ -+ /* free extra atom object that was possibly allocated by -+ try_capture_block(). -+ -+ Do this before acquiring jnode spin lock to -+ minimize time spent under lock. --nikita */ -+ if (atom_alloc != NULL) { -+ kmem_cache_free(_atom_slab, atom_alloc); -+ } -+ -+ if (ret != 0) { -+ if (ret == -E_BLOCK) { -+ assert("nikita-3360", -+ cap_mode & TXN_CAPTURE_NONBLOCKING); -+ ret = -E_REPEAT; -+ } -+ -+ /* Failure means jnode is not locked. FIXME_LATER_JMACD May -+ want to fix the above code to avoid releasing the lock and -+ re-acquiring it, but there are cases were failure occurs -+ when the lock is not held, and those cases would need to be -+ modified to re-take the lock. */ -+ spin_lock_jnode(node); -+ } -+ -+ /* Jnode is still locked. */ -+ assert_spin_locked(&(node->guard)); -+ return ret; -+} -+ -+static void release_two_atoms(txn_atom *one, txn_atom *two) -+{ -+ spin_unlock_atom(one); -+ atom_dec_and_unlock(two); -+ spin_lock_atom(one); -+ atom_dec_and_unlock(one); -+} -+ -+/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is -+ returned by that routine. The txn_capture request mode is computed here depending on -+ the transaction handle's type and the lock request. 
This is called from the depths of -+ the lock manager with the jnode lock held and it always returns with the jnode lock -+ held. -+*/ -+ -+/* fuse all 'active' atoms of lock owners of given node. */ -+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node) -+{ -+ lock_handle *lh; -+ int repeat; -+ txn_atom *atomh, *atomf; -+ reiser4_context *me = get_current_context(); -+ reiser4_context *ctx = NULL; -+ -+ assert_spin_not_locked(&(ZJNODE(node)->guard)); -+ assert_spin_not_locked(&(txnh->hlock)); -+ -+ repeat: -+ repeat = 0; -+ atomh = txnh_get_atom(txnh); -+ spin_unlock_txnh(txnh); -+ assert("zam-692", atomh != NULL); -+ -+ spin_lock_zlock(&node->lock); -+ /* inspect list of lock owners */ -+ list_for_each_entry(lh, &node->lock.owners, owners_link) { -+ ctx = get_context_by_lock_stack(lh->owner); -+ if (ctx == me) -+ continue; -+ /* below we use two assumptions to avoid addition spin-locks -+ for checking the condition : -+ -+ 1) if the lock stack has lock, the transaction should be -+ opened, i.e. ctx->trans != NULL; -+ -+ 2) reading of well-aligned ctx->trans->atom is atomic, if it -+ equals to the address of spin-locked atomh, we take that -+ the atoms are the same, nothing has to be captured. */ -+ if (atomh != ctx->trans->atom) { -+ reiser4_wake_up(lh->owner); -+ repeat = 1; -+ break; -+ } -+ } -+ if (repeat) { -+ if (!spin_trylock_txnh(ctx->trans)) { -+ spin_unlock_zlock(&node->lock); -+ spin_unlock_atom(atomh); -+ goto repeat; -+ } -+ atomf = ctx->trans->atom; -+ if (atomf == NULL) { -+ capture_assign_txnh_nolock(atomh, ctx->trans); -+ /* release zlock lock _after_ assigning the atom to the -+ * transaction handle, otherwise the lock owner thread -+ * may unlock all znodes, exit kernel context and here -+ * we would access an invalid transaction handle. 
*/ -+ spin_unlock_zlock(&node->lock); -+ spin_unlock_atom(atomh); -+ spin_unlock_txnh(ctx->trans); -+ goto repeat; -+ } -+ assert("zam-1059", atomf != atomh); -+ spin_unlock_zlock(&node->lock); -+ atomic_inc(&atomh->refcount); -+ atomic_inc(&atomf->refcount); -+ spin_unlock_txnh(ctx->trans); -+ if (atomf > atomh) { -+ spin_lock_atom_nested(atomf); -+ } else { -+ spin_unlock_atom(atomh); -+ spin_lock_atom(atomf); -+ spin_lock_atom_nested(atomh); -+ } -+ if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) { -+ release_two_atoms(atomf, atomh); -+ goto repeat; -+ } -+ atomic_dec(&atomh->refcount); -+ atomic_dec(&atomf->refcount); -+ capture_fuse_into(atomf, atomh); -+ goto repeat; -+ } -+ spin_unlock_zlock(&node->lock); -+ spin_unlock_atom(atomh); -+} -+ -+/* This is the interface to capture unformatted nodes via their struct page -+ reference. Currently it is only used in reiser4_invalidatepage */ -+int try_capture_page_to_invalidate(struct page *pg) -+{ -+ int ret; -+ jnode *node; -+ -+ assert("umka-292", pg != NULL); -+ assert("nikita-2597", PageLocked(pg)); -+ -+ if (IS_ERR(node = jnode_of_page(pg))) { -+ return PTR_ERR(node); -+ } -+ -+ spin_lock_jnode(node); -+ unlock_page(pg); -+ -+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0); -+ spin_unlock_jnode(node); -+ jput(node); -+ lock_page(pg); -+ return ret; -+} -+ -+/* This informs the transaction manager when a node is deleted. Add the block to the -+ atom's delete set and uncapture the block. -+ -+VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for -+explanations. find all the functions that use it, and unless there is some very -+good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....), -+move the loop to inside the function. -+ -+VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times? 
-+ */ -+void reiser4_uncapture_page(struct page *pg) -+{ -+ jnode *node; -+ txn_atom *atom; -+ -+ assert("umka-199", pg != NULL); -+ assert("nikita-3155", PageLocked(pg)); -+ -+ clear_page_dirty_for_io(pg); -+ -+ reiser4_wait_page_writeback(pg); -+ -+ node = jprivate(pg); -+ BUG_ON(node == NULL); -+ -+ spin_lock_jnode(node); -+ -+ atom = jnode_get_atom(node); -+ if (atom == NULL) { -+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); -+ spin_unlock_jnode(node); -+ return; -+ } -+ -+ /* We can remove jnode from transaction even if it is on flush queue -+ * prepped list, we only need to be sure that flush queue is not being -+ * written by reiser4_write_fq(). reiser4_write_fq() does not use atom -+ * spin lock for protection of the prepped nodes list, instead -+ * write_fq() increments atom's nr_running_queues counters for the time -+ * when prepped list is not protected by spin lock. Here we check this -+ * counter if we want to remove jnode from flush queue and, if the -+ * counter is not zero, wait all reiser4_write_fq() for this atom to -+ * complete. This is not significant overhead. */ -+ while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) { -+ spin_unlock_jnode(node); -+ /* -+ * at this moment we want to wait for "atom event", viz. wait -+ * until @node can be removed from flush queue. But -+ * reiser4_atom_wait_event() cannot be called with page locked, -+ * because it deadlocks with jnode_extent_write(). Unlock page, -+ * after making sure (through page_cache_get()) that it cannot -+ * be released from memory. -+ */ -+ page_cache_get(pg); -+ unlock_page(pg); -+ reiser4_atom_wait_event(atom); -+ lock_page(pg); -+ /* -+ * page may has been detached by ->writepage()->releasepage(). 
-+ */ -+ reiser4_wait_page_writeback(pg); -+ spin_lock_jnode(node); -+ page_cache_release(pg); -+ atom = jnode_get_atom(node); -+/* VS-FIXME-HANS: improve the commenting in this function */ -+ if (atom == NULL) { -+ spin_unlock_jnode(node); -+ return; -+ } -+ } -+ reiser4_uncapture_block(node); -+ spin_unlock_atom(atom); -+ jput(node); -+} -+ -+/* this is used in extent's kill hook to uncapture and unhash jnodes attached to -+ * inode's tree of jnodes */ -+void reiser4_uncapture_jnode(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert_spin_locked(&(node->guard)); -+ assert("", node->pg == 0); -+ -+ atom = jnode_get_atom(node); -+ if (atom == NULL) { -+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY)); -+ spin_unlock_jnode(node); -+ return; -+ } -+ -+ reiser4_uncapture_block(node); -+ spin_unlock_atom(atom); -+ jput(node); -+} -+ -+/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer, -+ increases atom refcount and txnh_count, adds to txnh_list. */ -+static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh) -+{ -+ assert("umka-200", atom != NULL); -+ assert("umka-201", txnh != NULL); -+ -+ assert_spin_locked(&(txnh->hlock)); -+ assert_spin_locked(&(atom->alock)); -+ assert("jmacd-824", txnh->atom == NULL); -+ assert("nikita-3540", atom_isopen(atom)); -+ BUG_ON(txnh->atom != NULL); -+ -+ atomic_inc(&atom->refcount); -+ txnh->atom = atom; -+ reiser4_ctx_gfp_mask_set(); -+ list_add_tail(&txnh->txnh_link, &atom->txnh_list); -+ atom->txnh_count += 1; -+} -+ -+/* No-locking version of assign_block. Sets the block's atom pointer, references the -+ block, adds it to the clean or dirty capture_jnode list, increments capture_count. 
*/ -+static void capture_assign_block_nolock(txn_atom *atom, jnode *node) -+{ -+ assert("umka-202", atom != NULL); -+ assert("umka-203", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ assert_spin_locked(&(atom->alock)); -+ assert("jmacd-323", node->atom == NULL); -+ BUG_ON(!list_empty_careful(&node->capture_link)); -+ assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY)); -+ -+ /* Pointer from jnode to atom is not counted in atom->refcount. */ -+ node->atom = atom; -+ -+ list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom)); -+ atom->capture_count += 1; -+ /* reference to jnode is acquired by atom. */ -+ jref(node); -+ -+ ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1)); -+ -+ LOCK_CNT_INC(t_refs); -+} -+ -+/* common code for dirtying both unformatted jnodes and formatted znodes. */ -+static void do_jnode_make_dirty(jnode * node, txn_atom * atom) -+{ -+ assert_spin_locked(&(node->guard)); -+ assert_spin_locked(&(atom->alock)); -+ assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY)); -+ -+ JF_SET(node, JNODE_DIRTY); -+ -+ get_current_context()->nr_marked_dirty++; -+ -+ /* We grab2flush_reserve one additional block only if node was -+ not CREATED and jnode_flush did not sort it into neither -+ relocate set nor overwrite one. If node is in overwrite or -+ relocate set we assume that atom's flush reserved counter was -+ already adjusted. */ -+ if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC) -+ && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node) -+ && !jnode_is_cluster_page(node)) { -+ assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr)); -+ assert("vs-1506", *jnode_get_block(node) != 0); -+ grabbed2flush_reserved_nolock(atom, (__u64) 1); -+ JF_SET(node, JNODE_FLUSH_RESERVED); -+ } -+ -+ if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) { -+ /* If the atom is not set yet, it will be added to the appropriate list in -+ capture_assign_block_nolock. 
*/ -+ /* Sometimes a node is set dirty before being captured -- the case for new -+ jnodes. In that case the jnode will be added to the appropriate list -+ in capture_assign_block_nolock. Another reason not to re-link jnode is -+ that jnode is on a flush queue (see flush.c for details) */ -+ -+ int level = jnode_get_level(node); -+ -+ assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR)); -+ assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT); -+ assert("nikita-2607", 0 <= level); -+ assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT); -+ -+ /* move node to atom's dirty list */ -+ list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level)); -+ ON_DEBUG(count_jnode -+ (atom, node, NODE_LIST(node), DIRTY_LIST, 1)); -+ } -+} -+ -+/* Set the dirty status for this (spin locked) jnode. */ -+void jnode_make_dirty_locked(jnode * node) -+{ -+ assert("umka-204", node != NULL); -+ assert_spin_locked(&(node->guard)); -+ -+ if (REISER4_DEBUG && rofs_jnode(node)) { -+ warning("nikita-3365", "Dirtying jnode on rofs"); -+ dump_stack(); -+ } -+ -+ /* Fast check for already dirty node */ -+ if (!JF_ISSET(node, JNODE_DIRTY)) { -+ txn_atom *atom; -+ -+ atom = jnode_get_atom(node); -+ assert("vs-1094", atom); -+ /* Check jnode dirty status again because node spin lock might -+ * be released inside jnode_get_atom(). */ -+ if (likely(!JF_ISSET(node, JNODE_DIRTY))) -+ do_jnode_make_dirty(node, atom); -+ spin_unlock_atom(atom); -+ } -+} -+ -+/* Set the dirty status for this znode. */ -+void znode_make_dirty(znode * z) -+{ -+ jnode *node; -+ struct page *page; -+ -+ assert("umka-204", z != NULL); -+ assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z)); -+ assert("nikita-3560", znode_is_write_locked(z)); -+ -+ node = ZJNODE(z); -+ /* znode is longterm locked, we can check dirty bit without spinlock */ -+ if (JF_ISSET(node, JNODE_DIRTY)) { -+ /* znode is dirty already. 
All we have to do is to change znode version */ -+ z->version = znode_build_version(jnode_get_tree(node)); -+ return; -+ } -+ -+ spin_lock_jnode(node); -+ jnode_make_dirty_locked(node); -+ page = jnode_page(node); -+ if (page != NULL) { -+ /* this is useful assertion (allows one to check that no -+ * modifications are lost due to update of in-flight page), -+ * but it requires locking on page to check PG_writeback -+ * bit. */ -+ /* assert("nikita-3292", -+ !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */ -+ page_cache_get(page); -+ -+ /* jnode lock is not needed for the rest of -+ * znode_set_dirty(). */ -+ spin_unlock_jnode(node); -+ /* reiser4 file write code calls set_page_dirty for -+ * unformatted nodes, for formatted nodes we do it here. */ -+ reiser4_set_page_dirty_internal(page); -+ page_cache_release(page); -+ /* bump version counter in znode */ -+ z->version = znode_build_version(jnode_get_tree(node)); -+ } else { -+ assert("zam-596", znode_above_root(JZNODE(node))); -+ spin_unlock_jnode(node); -+ } -+ -+ assert("nikita-1900", znode_is_write_locked(z)); -+ assert("jmacd-9777", node->atom != NULL); -+} -+ -+int reiser4_sync_atom(txn_atom * atom) -+{ -+ int result; -+ txn_handle *txnh; -+ -+ txnh = get_current_context()->trans; -+ -+ result = 0; -+ if (atom != NULL) { -+ if (atom->stage < ASTAGE_PRE_COMMIT) { -+ spin_lock_txnh(txnh); -+ capture_assign_txnh_nolock(atom, txnh); -+ result = force_commit_atom(txnh); -+ } else if (atom->stage < ASTAGE_POST_COMMIT) { -+ /* wait atom commit */ -+ reiser4_atom_wait_event(atom); -+ /* try once more */ -+ result = RETERR(-E_REPEAT); -+ } else -+ spin_unlock_atom(atom); -+ } -+ return result; -+} -+ -+#if REISER4_DEBUG -+ -+/* move jnode form one list to another -+ call this after atom->capture_count is updated */ -+void -+count_jnode(txn_atom * atom, jnode * node, atom_list old_list, -+ atom_list new_list, int check_lists) -+{ -+ struct list_head *pos; -+ -+ assert("zam-1018", atom_is_protected(atom)); -+ 
assert_spin_locked(&(node->guard)); -+ assert("", NODE_LIST(node) == old_list); -+ -+ switch (NODE_LIST(node)) { -+ case NOT_CAPTURED: -+ break; -+ case DIRTY_LIST: -+ assert("", atom->dirty > 0); -+ atom->dirty--; -+ break; -+ case CLEAN_LIST: -+ assert("", atom->clean > 0); -+ atom->clean--; -+ break; -+ case FQ_LIST: -+ assert("", atom->fq > 0); -+ atom->fq--; -+ break; -+ case WB_LIST: -+ assert("", atom->wb > 0); -+ atom->wb--; -+ break; -+ case OVRWR_LIST: -+ assert("", atom->ovrwr > 0); -+ atom->ovrwr--; -+ break; -+ default: -+ impossible("", ""); -+ } -+ -+ switch (new_list) { -+ case NOT_CAPTURED: -+ break; -+ case DIRTY_LIST: -+ atom->dirty++; -+ break; -+ case CLEAN_LIST: -+ atom->clean++; -+ break; -+ case FQ_LIST: -+ atom->fq++; -+ break; -+ case WB_LIST: -+ atom->wb++; -+ break; -+ case OVRWR_LIST: -+ atom->ovrwr++; -+ break; -+ default: -+ impossible("", ""); -+ } -+ ASSIGN_NODE_LIST(node, new_list); -+ if (0 && check_lists) { -+ int count; -+ tree_level level; -+ -+ count = 0; -+ -+ /* flush queue list */ -+ /* reiser4_check_fq(atom); */ -+ -+ /* dirty list */ -+ count = 0; -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { -+ list_for_each(pos, ATOM_DIRTY_LIST(atom, level)) -+ count++; -+ } -+ if (count != atom->dirty) -+ warning("", "dirty counter %d, real %d\n", atom->dirty, -+ count); -+ -+ /* clean list */ -+ count = 0; -+ list_for_each(pos, ATOM_CLEAN_LIST(atom)) -+ count++; -+ if (count != atom->clean) -+ warning("", "clean counter %d, real %d\n", atom->clean, -+ count); -+ -+ /* wb list */ -+ count = 0; -+ list_for_each(pos, ATOM_WB_LIST(atom)) -+ count++; -+ if (count != atom->wb) -+ warning("", "wb counter %d, real %d\n", atom->wb, -+ count); -+ -+ /* overwrite list */ -+ count = 0; -+ list_for_each(pos, ATOM_OVRWR_LIST(atom)) -+ count++; -+ -+ if (count != atom->ovrwr) -+ warning("", "ovrwr counter %d, real %d\n", atom->ovrwr, -+ count); -+ } -+ assert("vs-1624", atom->num_queued == atom->fq); -+ if (atom->capture_count 
!= -+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) { -+ printk -+ ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n", -+ atom->capture_count, atom->dirty, atom->clean, atom->ovrwr, -+ atom->wb, atom->fq); -+ assert("vs-1622", -+ atom->capture_count == -+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + -+ atom->fq); -+ } -+} -+ -+#endif -+ -+/* Make node OVRWR and put it on atom->overwrite_nodes list, atom lock and jnode -+ * lock should be taken before calling this function. */ -+void jnode_make_wander_nolock(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert("nikita-2431", node != NULL); -+ assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC)); -+ assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY)); -+ assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); -+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node))); -+ -+ atom = node->atom; -+ -+ assert("zam-895", atom != NULL); -+ assert("zam-894", atom_is_protected(atom)); -+ -+ JF_SET(node, JNODE_OVRWR); -+ /* move node to atom's overwrite list */ -+ list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom)); -+ ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1)); -+} -+ -+/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside -+ * this function. 
*/ -+void jnode_make_wander(jnode * node) -+{ -+ txn_atom *atom; -+ -+ spin_lock_jnode(node); -+ atom = jnode_get_atom(node); -+ assert("zam-913", atom != NULL); -+ assert("zam-914", !JF_ISSET(node, JNODE_RELOC)); -+ -+ jnode_make_wander_nolock(node); -+ spin_unlock_atom(atom); -+ spin_unlock_jnode(node); -+} -+ -+/* this just sets RELOC bit */ -+static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node) -+{ -+ assert_spin_locked(&(node->guard)); -+ assert("zam-916", JF_ISSET(node, JNODE_DIRTY)); -+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC)); -+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR)); -+ assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED)); -+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node))); -+ jnode_set_reloc(node); -+} -+ -+/* Make znode RELOC and put it on flush queue */ -+void znode_make_reloc(znode * z, flush_queue_t * fq) -+{ -+ jnode *node; -+ txn_atom *atom; -+ -+ node = ZJNODE(z); -+ spin_lock_jnode(node); -+ -+ atom = jnode_get_atom(node); -+ assert("zam-919", atom != NULL); -+ -+ jnode_make_reloc_nolock(fq, node); -+ queue_jnode(fq, node); -+ -+ spin_unlock_atom(atom); -+ spin_unlock_jnode(node); -+ -+} -+ -+/* Make unformatted node RELOC and put it on flush queue */ -+void unformatted_make_reloc(jnode *node, flush_queue_t *fq) -+{ -+ assert("vs-1479", jnode_is_unformatted(node)); -+ -+ jnode_make_reloc_nolock(fq, node); -+ queue_jnode(fq, node); -+} -+ -+int reiser4_capture_super_block(struct super_block *s) -+{ -+ int result; -+ znode *uber; -+ lock_handle lh; -+ -+ init_lh(&lh); -+ result = get_uber_znode(reiser4_get_tree(s), -+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh); -+ if (result) -+ return result; -+ -+ uber = lh.node; -+ /* Grabbing one block for superblock */ -+ result = reiser4_grab_space_force((__u64) 1, BA_RESERVED); -+ if (result != 0) -+ return result; -+ -+ znode_make_dirty(uber); -+ -+ done_lh(&lh); -+ return 0; -+} -+ -+/* Wakeup every handle on the atom's WAITFOR list */ -+static void 
wakeup_atom_waitfor_list(txn_atom * atom) -+{ -+ txn_wait_links *wlinks; -+ -+ assert("umka-210", atom != NULL); -+ -+ /* atom is locked */ -+ list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) { -+ if (wlinks->waitfor_cb == NULL || -+ wlinks->waitfor_cb(atom, wlinks)) -+ /* Wake up. */ -+ reiser4_wake_up(wlinks->_lock_stack); -+ } -+} -+ -+/* Wakeup every handle on the atom's WAITING list */ -+static void wakeup_atom_waiting_list(txn_atom * atom) -+{ -+ txn_wait_links *wlinks; -+ -+ assert("umka-211", atom != NULL); -+ -+ /* atom is locked */ -+ list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) { -+ if (wlinks->waiting_cb == NULL || -+ wlinks->waiting_cb(atom, wlinks)) -+ /* Wake up. */ -+ reiser4_wake_up(wlinks->_lock_stack); -+ } -+} -+ -+/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */ -+static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks) -+{ -+ assert("nikita-3330", atom != NULL); -+ assert_spin_locked(&(atom->alock)); -+ -+ /* atom->txnh_count == 1 is for waking waiters up if we are releasing -+ * last transaction handle. */ -+ return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1; -+} -+ -+/* The general purpose of this function is to wait on the first of two possible events. -+ The situation is that a handle (and its atom atomh) is blocked trying to capture a -+ block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The -+ handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with -+ another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it -+ needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will -+ proceed and fuse the two atoms in the CAPTURE_WAIT state. -+ -+ In other words, if either atomh or atomf change state, the handle will be awakened, -+ thus there are two lists per atom: WAITING and WAITFOR. 
-+ -+ This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to -+ close but it is not assigned to an atom of its own. -+ -+ Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK, -+ BOTH_ATOM_LOCKS. Result: all four locks are released. -+*/ -+static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf, -+ txn_atom * atomh, txn_capture mode) -+{ -+ int ret; -+ txn_wait_links wlinks; -+ -+ assert("umka-213", txnh != NULL); -+ assert("umka-214", atomf != NULL); -+ -+ if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) { -+ spin_unlock_txnh(txnh); -+ spin_unlock_atom(atomf); -+ -+ if (atomh) { -+ spin_unlock_atom(atomh); -+ } -+ -+ return RETERR(-E_BLOCK); -+ } -+ -+ /* Initialize the waiting list links. */ -+ init_wlinks(&wlinks); -+ -+ /* Add txnh to atomf's waitfor list, unlock atomf. */ -+ list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list); -+ wlinks.waitfor_cb = wait_for_fusion; -+ atomic_inc(&atomf->refcount); -+ spin_unlock_atom(atomf); -+ -+ if (atomh) { -+ /* Add txnh to atomh's waiting list, unlock atomh. */ -+ list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list); -+ atomic_inc(&atomh->refcount); -+ spin_unlock_atom(atomh); -+ } -+ -+ /* Go to sleep. */ -+ spin_unlock_txnh(txnh); -+ -+ ret = reiser4_prepare_to_sleep(wlinks._lock_stack); -+ if (ret == 0) { -+ reiser4_go_to_sleep(wlinks._lock_stack); -+ ret = RETERR(-E_REPEAT); -+ } -+ -+ /* Remove from the waitfor list. */ -+ spin_lock_atom(atomf); -+ -+ list_del(&wlinks._fwaitfor_link); -+ atom_dec_and_unlock(atomf); -+ -+ if (atomh) { -+ /* Remove from the waiting list. 
*/ -+ spin_lock_atom(atomh); -+ list_del(&wlinks._fwaiting_link); -+ atom_dec_and_unlock(atomh); -+ } -+ return ret; -+} -+ -+static void lock_two_atoms(txn_atom * one, txn_atom * two) -+{ -+ assert("zam-1067", one != two); -+ -+ /* lock the atom with lesser address first */ -+ if (one < two) { -+ spin_lock_atom(one); -+ spin_lock_atom_nested(two); -+ } else { -+ spin_lock_atom(two); -+ spin_lock_atom_nested(one); -+ } -+} -+ -+/* Perform the necessary work to prepare for fusing two atoms, which involves -+ * acquiring two atom locks in the proper order. If one of the node's atom is -+ * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's -+ * atom is not then the handle's request is put to sleep. If the node's atom -+ * is committing, then the node can be copy-on-captured. Otherwise, pick the -+ * atom with fewer pointers to be fused into the atom with more pointer and -+ * call capture_fuse_into. -+ */ -+static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode) -+{ -+ txn_atom * txnh_atom = txnh->atom; -+ txn_atom * block_atom = node->atom; -+ -+ atomic_inc(&txnh_atom->refcount); -+ atomic_inc(&block_atom->refcount); -+ -+ spin_unlock_txnh(txnh); -+ spin_unlock_jnode(node); -+ -+ lock_two_atoms(txnh_atom, block_atom); -+ -+ if (txnh->atom != txnh_atom || node->atom != block_atom ) { -+ release_two_atoms(txnh_atom, block_atom); -+ return RETERR(-E_REPEAT); -+ } -+ -+ atomic_dec(&txnh_atom->refcount); -+ atomic_dec(&block_atom->refcount); -+ -+ assert ("zam-1066", atom_isopen(txnh_atom)); -+ -+ if (txnh_atom->stage >= block_atom->stage || -+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) { -+ capture_fuse_into(txnh_atom, block_atom); -+ return RETERR(-E_REPEAT); -+ } -+ spin_lock_txnh(txnh); -+ return capture_fuse_wait(txnh, block_atom, txnh_atom, mode); -+} -+ -+/* This function splices together two jnode lists (small and large) and sets all jnodes in -+ the small list to point to the large 
atom. Returns the length of the list. */ -+static int -+capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head, -+ struct list_head *small_head) -+{ -+ int count = 0; -+ jnode *node; -+ -+ assert("umka-218", large != NULL); -+ assert("umka-219", large_head != NULL); -+ assert("umka-220", small_head != NULL); -+ /* small atom should be locked also. */ -+ assert_spin_locked(&(large->alock)); -+ -+ /* For every jnode on small's capture list... */ -+ list_for_each_entry(node, small_head, capture_link) { -+ count += 1; -+ -+ /* With the jnode lock held, update atom pointer. */ -+ spin_lock_jnode(node); -+ node->atom = large; -+ spin_unlock_jnode(node); -+ } -+ -+ /* Splice the lists. */ -+ list_splice_init(small_head, large_head->prev); -+ -+ return count; -+} -+ -+/* This function splices together two txnh lists (small and large) and sets all txn handles in -+ the small list to point to the large atom. Returns the length of the list. */ -+static int -+capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head, -+ struct list_head *small_head) -+{ -+ int count = 0; -+ txn_handle *txnh; -+ -+ assert("umka-221", large != NULL); -+ assert("umka-222", large_head != NULL); -+ assert("umka-223", small_head != NULL); -+ -+ /* Adjust every txnh to the new atom. */ -+ list_for_each_entry(txnh, small_head, txnh_link) { -+ count += 1; -+ -+ /* With the txnh lock held, update atom pointer. */ -+ spin_lock_txnh(txnh); -+ txnh->atom = large; -+ spin_unlock_txnh(txnh); -+ } -+ -+ /* Splice the txn_handle list. */ -+ list_splice_init(small_head, large_head->prev); -+ -+ return count; -+} -+ -+/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are -+ added to LARGE and their ->atom pointers are all updated. The associated counts are -+ updated as well, and any waiting handles belonging to either are awakened. Finally the -+ smaller atom's refcount is decremented. 
-+*/ -+static void capture_fuse_into(txn_atom * small, txn_atom * large) -+{ -+ int level; -+ unsigned zcount = 0; -+ unsigned tcount = 0; -+ -+ assert("umka-224", small != NULL); -+ assert("umka-225", small != NULL); -+ -+ assert_spin_locked(&(large->alock)); -+ assert_spin_locked(&(small->alock)); -+ -+ assert("jmacd-201", atom_isopen(small)); -+ assert("jmacd-202", atom_isopen(large)); -+ -+ /* Splice and update the per-level dirty jnode lists */ -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) { -+ zcount += -+ capture_fuse_jnode_lists(large, -+ ATOM_DIRTY_LIST(large, level), -+ ATOM_DIRTY_LIST(small, level)); -+ } -+ -+ /* Splice and update the [clean,dirty] jnode and txnh lists */ -+ zcount += -+ capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large), -+ ATOM_CLEAN_LIST(small)); -+ zcount += -+ capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large), -+ ATOM_OVRWR_LIST(small)); -+ zcount += -+ capture_fuse_jnode_lists(large, ATOM_WB_LIST(large), -+ ATOM_WB_LIST(small)); -+ zcount += -+ capture_fuse_jnode_lists(large, &large->inodes, &small->inodes); -+ tcount += -+ capture_fuse_txnh_lists(large, &large->txnh_list, -+ &small->txnh_list); -+ -+ /* Check our accounting. 
*/ -+ assert("jmacd-1063", -+ zcount + small->num_queued == small->capture_count); -+ assert("jmacd-1065", tcount == small->txnh_count); -+ -+ /* sum numbers of waiters threads */ -+ large->nr_waiters += small->nr_waiters; -+ small->nr_waiters = 0; -+ -+ /* splice flush queues */ -+ reiser4_fuse_fq(large, small); -+ -+ /* update counter of jnode on every atom' list */ -+ ON_DEBUG(large->dirty += small->dirty; -+ small->dirty = 0; -+ large->clean += small->clean; -+ small->clean = 0; -+ large->ovrwr += small->ovrwr; -+ small->ovrwr = 0; -+ large->wb += small->wb; -+ small->wb = 0; -+ large->fq += small->fq; -+ small->fq = 0;); -+ -+ /* count flushers in result atom */ -+ large->nr_flushers += small->nr_flushers; -+ small->nr_flushers = 0; -+ -+ /* update counts of flushed nodes */ -+ large->flushed += small->flushed; -+ small->flushed = 0; -+ -+ /* Transfer list counts to large. */ -+ large->txnh_count += small->txnh_count; -+ large->capture_count += small->capture_count; -+ -+ /* Add all txnh references to large. */ -+ atomic_add(small->txnh_count, &large->refcount); -+ atomic_sub(small->txnh_count, &small->refcount); -+ -+ /* Reset small counts */ -+ small->txnh_count = 0; -+ small->capture_count = 0; -+ -+ /* Assign the oldest start_time, merge flags. */ -+ large->start_time = min(large->start_time, small->start_time); -+ large->flags |= small->flags; -+ -+ /* Merge blocknr sets. 
*/ -+ blocknr_set_merge(&small->delete_set, &large->delete_set); -+ blocknr_set_merge(&small->wandered_map, &large->wandered_map); -+ -+ /* Merge allocated/deleted file counts */ -+ large->nr_objects_deleted += small->nr_objects_deleted; -+ large->nr_objects_created += small->nr_objects_created; -+ -+ small->nr_objects_deleted = 0; -+ small->nr_objects_created = 0; -+ -+ /* Merge allocated blocks counts */ -+ large->nr_blocks_allocated += small->nr_blocks_allocated; -+ -+ large->nr_running_queues += small->nr_running_queues; -+ small->nr_running_queues = 0; -+ -+ /* Merge blocks reserved for overwrite set. */ -+ large->flush_reserved += small->flush_reserved; -+ small->flush_reserved = 0; -+ -+ if (large->stage < small->stage) { -+ /* Large only needs to notify if it has changed state. */ -+ reiser4_atom_set_stage(large, small->stage); -+ wakeup_atom_waiting_list(large); -+ } -+ -+ reiser4_atom_set_stage(small, ASTAGE_INVALID); -+ -+ /* Notify any waiters--small needs to unload its wait lists. Waiters -+ actually remove themselves from the list before returning from the -+ fuse_wait function. */ -+ wakeup_atom_waiting_list(small); -+ -+ /* Unlock atoms */ -+ spin_unlock_atom(large); -+ atom_dec_and_unlock(small); -+} -+ -+/* TXNMGR STUFF */ -+ -+/* Release a block from the atom, reversing the effects of being captured, -+ do not release atom's reference to jnode due to holding spin-locks. -+ Currently this is only called when the atom commits. -+ -+ NOTE: this function does not release a (journal) reference to jnode -+ due to locking optimizations, you should call jput() somewhere after -+ calling reiser4_uncapture_block(). 
*/ -+void reiser4_uncapture_block(jnode * node) -+{ -+ txn_atom *atom; -+ -+ assert("umka-226", node != NULL); -+ atom = node->atom; -+ assert("umka-228", atom != NULL); -+ -+ assert("jmacd-1021", node->atom == atom); -+ assert_spin_locked(&(node->guard)); -+ assert("jmacd-1023", atom_is_protected(atom)); -+ -+ JF_CLR(node, JNODE_DIRTY); -+ JF_CLR(node, JNODE_RELOC); -+ JF_CLR(node, JNODE_OVRWR); -+ JF_CLR(node, JNODE_CREATED); -+ JF_CLR(node, JNODE_WRITEBACK); -+ JF_CLR(node, JNODE_REPACK); -+ -+ list_del_init(&node->capture_link); -+ if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) { -+ assert("zam-925", atom_isopen(atom)); -+ assert("vs-1623", NODE_LIST(node) == FQ_LIST); -+ ON_DEBUG(atom->num_queued--); -+ JF_CLR(node, JNODE_FLUSH_QUEUED); -+ } -+ atom->capture_count -= 1; -+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1)); -+ node->atom = NULL; -+ -+ spin_unlock_jnode(node); -+ LOCK_CNT_DEC(t_refs); -+} -+ -+/* Unconditional insert of jnode into atom's overwrite list. Currently used in -+ bitmap-based allocator code for adding modified bitmap blocks the -+ transaction. 
@atom and @node are spin locked */ -+void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node) -+{ -+ assert("zam-538", atom_is_protected(atom)); -+ assert_spin_locked(&(node->guard)); -+ assert("zam-899", JF_ISSET(node, JNODE_OVRWR)); -+ assert("zam-543", node->atom == NULL); -+ assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node)); -+ -+ list_add(&node->capture_link, ATOM_OVRWR_LIST(atom)); -+ jref(node); -+ node->atom = atom; -+ atom->capture_count++; -+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1)); -+} -+ -+static int count_deleted_blocks_actor(txn_atom * atom, -+ const reiser4_block_nr * a, -+ const reiser4_block_nr * b, void *data) -+{ -+ reiser4_block_nr *counter = data; -+ -+ assert("zam-995", data != NULL); -+ assert("zam-996", a != NULL); -+ if (b == NULL) -+ *counter += 1; -+ else -+ *counter += *b; -+ return 0; -+} -+ -+reiser4_block_nr txnmgr_count_deleted_blocks(void) -+{ -+ reiser4_block_nr result; -+ txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr; -+ txn_atom *atom; -+ -+ result = 0; -+ -+ spin_lock_txnmgr(tmgr); -+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) { -+ spin_lock_atom(atom); -+ if (atom_isopen(atom)) -+ blocknr_set_iterator( -+ atom, &atom->delete_set, -+ count_deleted_blocks_actor, &result, 0); -+ spin_unlock_atom(atom); -+ } -+ spin_unlock_txnmgr(tmgr); -+ -+ return result; -+} -+ -+/* -+ * Local variables: -+ * c-indentation-style: "K&R" -+ * mode-name: "LC" -+ * c-basic-offset: 8 -+ * tab-width: 8 -+ * fill-column: 79 -+ * End: -+ */ -diff -urN linux-2.6.24.orig/fs/reiser4/txnmgr.h linux-2.6.24/fs/reiser4/txnmgr.h ---- linux-2.6.24.orig/fs/reiser4/txnmgr.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/txnmgr.h 2008-01-25 11:39:07.112253026 +0300 -@@ -0,0 +1,701 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* data-types and function declarations for transaction manager. 
See txnmgr.c -+ * for details. */ -+ -+#ifndef __REISER4_TXNMGR_H__ -+#define __REISER4_TXNMGR_H__ -+ -+#include "forward.h" -+#include "dformat.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* TYPE DECLARATIONS */ -+ -+/* This enumeration describes the possible types of a capture request (reiser4_try_capture). -+ A capture request dynamically assigns a block to the calling thread's transaction -+ handle. */ -+typedef enum { -+ /* A READ_ATOMIC request indicates that a block will be read and that the caller's -+ atom should fuse in order to ensure that the block commits atomically with the -+ caller. */ -+ TXN_CAPTURE_READ_ATOMIC = (1 << 0), -+ -+ /* A READ_NONCOM request indicates that a block will be read and that the caller is -+ willing to read a non-committed block without causing atoms to fuse. */ -+ TXN_CAPTURE_READ_NONCOM = (1 << 1), -+ -+ /* A READ_MODIFY request indicates that a block will be read but that the caller -+ wishes for the block to be captured as it will be written. This capture request -+ mode is not currently used, but eventually it will be useful for preventing -+ deadlock in read-modify-write cycles. */ -+ TXN_CAPTURE_READ_MODIFY = (1 << 2), -+ -+ /* A WRITE capture request indicates that a block will be modified and that atoms -+ should fuse to make the commit atomic. */ -+ TXN_CAPTURE_WRITE = (1 << 3), -+ -+ /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the -+ exclusive type designation from extra bits that may be supplied -- see -+ below. */ -+ TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC | -+ TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY | -+ TXN_CAPTURE_WRITE), -+ -+ /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that -+ indicate modification will occur. 
*/ -+ TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE), -+ -+ /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would -+ prefer not to sleep waiting for an aging atom to commit. */ -+ TXN_CAPTURE_NONBLOCKING = (1 << 4), -+ -+ /* An option to reiser4_try_capture to prevent atom fusion, just simple -+ capturing is allowed */ -+ TXN_CAPTURE_DONT_FUSE = (1 << 5) -+ -+ /* This macro selects only the exclusive capture request types, stripping out any -+ options that were supplied (i.e., NONBLOCKING). */ -+#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES) -+} txn_capture; -+ -+/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only -+ difference is in the handling of read requests. A WRITE_FUSING transaction handle -+ defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSIONG -+ transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */ -+typedef enum { -+ TXN_WRITE_FUSING = (1 << 0), -+ TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */ -+} txn_mode; -+ -+/* Every atom has a stage, which is one of these exclusive values: */ -+typedef enum { -+ /* Initially an atom is free. */ -+ ASTAGE_FREE = 0, -+ -+ /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture -+ blocks and fuse with other atoms. */ -+ ASTAGE_CAPTURE_FUSE = 1, -+ -+ /* We need to have a ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk where X > 1. */ -+ -+ /* When an atom reaches a certain age it must do all it can to commit. An atom in -+ the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from -+ atoms in the CAPTURE_FUSE stage. */ -+ ASTAGE_CAPTURE_WAIT = 2, -+ -+ /* Waiting for I/O before commit. Copy-on-capture (see -+ http://namesys.com/v4/v4.html). */ -+ ASTAGE_PRE_COMMIT = 3, -+ -+ /* Post-commit overwrite I/O. Steal-on-capture. 
*/ -+ ASTAGE_POST_COMMIT = 4, -+ -+ /* Atom which waits for the removal of the last reference to (it? ) to -+ * be deleted from memory */ -+ ASTAGE_DONE = 5, -+ -+ /* invalid atom. */ -+ ASTAGE_INVALID = 6, -+ -+} txn_stage; -+ -+/* Certain flags may be set in the txn_atom->flags field. */ -+typedef enum { -+ /* Indicates that the atom should commit as soon as possible. */ -+ ATOM_FORCE_COMMIT = (1 << 0), -+ /* to avoid endless loop, mark the atom (which was considered as too -+ * small) after failed attempt to fuse it. */ -+ ATOM_CANCEL_FUSION = (1 << 1) -+} txn_flags; -+ -+/* Flags for controlling commit_txnh */ -+typedef enum { -+ /* Wait commit atom completion in commit_txnh */ -+ TXNH_WAIT_COMMIT = 0x2, -+ /* Don't commit atom when this handle is closed */ -+ TXNH_DONT_COMMIT = 0x4 -+} txn_handle_flags_t; -+ -+/* TYPE DEFINITIONS */ -+ -+/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom -+ fields, so typically an operation on the atom through either of these objects must (1) -+ lock the object, (2) read the atom pointer, (3) lock the atom. -+ -+ During atom fusion, the process holds locks on both atoms at once. Then, it iterates -+ through the list of handles and pages held by the smaller of the two atoms. For each -+ handle and page referencing the smaller atom, the fusing process must: (1) lock the -+ object, and (2) update the atom pointer. -+ -+ You can see that there is a conflict of lock ordering here, so the more-complex -+ procedure should have priority, i.e., the fusing process has priority so that it is -+ guaranteed to make progress and to avoid restarts. -+ -+ This decision, however, means additional complexity for aquiring the atom lock in the -+ first place. -+ -+ The general original procedure followed in the code was: -+ -+ TXN_OBJECT *obj = ...; -+ TXN_ATOM *atom; -+ -+ spin_lock (& obj->_lock); -+ -+ atom = obj->_atom; -+ -+ if (! 
spin_trylock_atom (atom)) -+ { -+ spin_unlock (& obj->_lock); -+ RESTART OPERATION, THERE WAS A RACE; -+ } -+ -+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED -+ -+ It has however been found that this wastes CPU a lot in a manner that is -+ hard to profile. So, proper refcounting was added to atoms, and new -+ standard locking sequence is like following: -+ -+ TXN_OBJECT *obj = ...; -+ TXN_ATOM *atom; -+ -+ spin_lock (& obj->_lock); -+ -+ atom = obj->_atom; -+ -+ if (! spin_trylock_atom (atom)) -+ { -+ atomic_inc (& atom->refcount); -+ spin_unlock (& obj->_lock); -+ spin_lock (&atom->_lock); -+ atomic_dec (& atom->refcount); -+ // HERE atom is locked -+ spin_unlock (&atom->_lock); -+ RESTART OPERATION, THERE WAS A RACE; -+ } -+ -+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED -+ -+ (core of this is implemented in trylock_throttle() function) -+ -+ See the jnode_get_atom() function for a common case. -+ -+ As an additional (and important) optimization allowing to avoid restarts, -+ it is possible to re-check required pre-conditions at the HERE point in -+ code above and proceed without restarting if they are still satisfied. -+*/ -+ -+/* An atomic transaction: this is the underlying system representation -+ of a transaction, not the one seen by clients. -+ -+ Invariants involving this data-type: -+ -+ [sb-fake-allocated] -+*/ -+struct txn_atom { -+ /* The spinlock protecting the atom, held during fusion and various other state -+ changes. */ -+ spinlock_t alock; -+ -+ /* The atom's reference counter, increasing (in case of a duplication -+ of an existing reference or when we are sure that some other -+ reference exists) may be done without taking spinlock, decrementing -+ of the ref. counter requires a spinlock to be held. -+ -+ Each transaction handle counts in ->refcount. All jnodes count as -+ one reference acquired in atom_begin_andlock(), released in -+ commit_current_atom(). 
-+ */ -+ atomic_t refcount; -+ -+ /* The atom_id identifies the atom in persistent records such as the log. */ -+ __u32 atom_id; -+ -+ /* Flags holding any of the txn_flags enumerated values (e.g., -+ ATOM_FORCE_COMMIT). */ -+ __u32 flags; -+ -+ /* Number of open handles. */ -+ __u32 txnh_count; -+ -+ /* The number of znodes captured by this atom. Equal to the sum of lengths of the -+ dirty_nodes[level] and clean_nodes lists. */ -+ __u32 capture_count; -+ -+#if REISER4_DEBUG -+ int clean; -+ int dirty; -+ int ovrwr; -+ int wb; -+ int fq; -+#endif -+ -+ __u32 flushed; -+ -+ /* Current transaction stage. */ -+ txn_stage stage; -+ -+ /* Start time. */ -+ unsigned long start_time; -+ -+ /* The atom's delete set. It collects block numbers of the nodes -+ which were deleted during the transaction. */ -+ struct list_head delete_set; -+ -+ /* The atom's wandered_block mapping. */ -+ struct list_head wandered_map; -+ -+ /* The transaction's list of dirty captured nodes--per level. Index -+ by (level). dirty_nodes[0] is for znode-above-root */ -+ struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1]; -+ -+ /* The transaction's list of clean captured nodes. */ -+ struct list_head clean_nodes; -+ -+ /* The atom's overwrite set */ -+ struct list_head ovrwr_nodes; -+ -+ /* nodes which are being written to disk */ -+ struct list_head writeback_nodes; -+ -+ /* list of inodes */ -+ struct list_head inodes; -+ -+ /* List of handles associated with this atom. */ -+ struct list_head txnh_list; -+ -+ /* Transaction list link: list of atoms in the transaction manager. */ -+ struct list_head atom_link; -+ -+ /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */ -+ struct list_head fwaitfor_list; -+ -+ /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */ -+ struct list_head fwaiting_list; -+ -+ /* Numbers of objects which were deleted/created in this transaction -+ thereby numbers of objects IDs which were released/deallocated. 
*/ -+ int nr_objects_deleted; -+ int nr_objects_created; -+ /* number of blocks allocated during the transaction */ -+ __u64 nr_blocks_allocated; -+ /* All atom's flush queue objects are on this list */ -+ struct list_head flush_queues; -+#if REISER4_DEBUG -+ /* number of flush queues for this atom. */ -+ int nr_flush_queues; -+ /* Number of jnodes which were removed from atom's lists and put -+ on flush_queue */ -+ int num_queued; -+#endif -+ /* number of threads who wait for this atom to complete commit */ -+ int nr_waiters; -+ /* number of threads which do jnode_flush() over this atom */ -+ int nr_flushers; -+ /* number of flush queues which are IN_USE and jnodes from fq->prepped -+ are submitted to disk by the reiser4_write_fq() routine. */ -+ int nr_running_queues; -+ /* A counter of grabbed unformatted nodes, see a description of the -+ * reiser4 space reservation scheme at block_alloc.c */ -+ reiser4_block_nr flush_reserved; -+#if REISER4_DEBUG -+ void *committer; -+#endif -+ struct super_block *super; -+}; -+ -+#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level]) -+#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes) -+#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes) -+#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes) -+#define ATOM_FQ_LIST(fq) (&(fq)->prepped) -+ -+#define NODE_LIST(node) (node)->list -+#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list) -+ON_DEBUG(void -+ count_jnode(txn_atom *, jnode *, atom_list old_list, -+ atom_list new_list, int check_lists)); -+ -+/* A transaction handle: the client obtains and commits this handle which is assigned by -+ the system to a txn_atom. */ -+struct txn_handle { -+ /* Spinlock protecting ->atom pointer */ -+ spinlock_t hlock; -+ -+ /* Flags for controlling commit_txnh() behavior */ -+ /* from txn_handle_flags_t */ -+ txn_handle_flags_t flags; -+ -+ /* Whether it is READ_FUSING or WRITE_FUSING. */ -+ txn_mode mode; -+ -+ /* If assigned, the atom it is part of. 
*/ -+ txn_atom *atom; -+ -+ /* Transaction list link. Head is in txn_atom. */ -+ struct list_head txnh_link; -+}; -+ -+/* The transaction manager: one is contained in the reiser4_super_info_data */ -+struct txn_mgr { -+ /* A spinlock protecting the atom list, id_count, flush_control */ -+ spinlock_t tmgr_lock; -+ -+ /* List of atoms. */ -+ struct list_head atoms_list; -+ -+ /* Number of atoms. */ -+ int atom_count; -+ -+ /* A counter used to assign atom->atom_id values. */ -+ __u32 id_count; -+ -+ /* a mutex object for commit serialization */ -+ struct mutex commit_mutex; -+ -+ /* a list of all txnmrgs served by particular daemon. */ -+ struct list_head linkage; -+ -+ /* description of daemon for this txnmgr */ -+ ktxnmgrd_context *daemon; -+ -+ /* parameters. Adjustable through mount options. */ -+ unsigned int atom_max_size; -+ unsigned int atom_max_age; -+ unsigned int atom_min_size; -+ /* max number of concurrent flushers for one atom, 0 - unlimited. */ -+ unsigned int atom_max_flushers; -+ struct dentry *debugfs_atom_count; -+ struct dentry *debugfs_id_count; -+}; -+ -+/* FUNCTION DECLARATIONS */ -+ -+/* These are the externally (within Reiser4) visible transaction functions, therefore they -+ are prefixed with "txn_". For comments, see txnmgr.c. 
*/ -+ -+extern int init_txnmgr_static(void); -+extern void done_txnmgr_static(void); -+ -+extern void reiser4_init_txnmgr(txn_mgr *); -+extern void reiser4_done_txnmgr(txn_mgr *); -+ -+extern int reiser4_txn_reserve(int reserved); -+ -+extern void reiser4_txn_begin(reiser4_context * context); -+extern int reiser4_txn_end(reiser4_context * context); -+ -+extern void reiser4_txn_restart(reiser4_context * context); -+extern void reiser4_txn_restart_current(void); -+ -+extern int txnmgr_force_commit_all(struct super_block *, int); -+extern int current_atom_should_commit(void); -+ -+extern jnode *find_first_dirty_jnode(txn_atom *, int); -+ -+extern int commit_some_atoms(txn_mgr *); -+extern int force_commit_atom(txn_handle *); -+extern int flush_current_atom(int, long, long *, txn_atom **, jnode *); -+ -+extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int); -+ -+extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage); -+ -+extern int same_slum_check(jnode * base, jnode * check, int alloc_check, -+ int alloc_value); -+extern void atom_dec_and_unlock(txn_atom * atom); -+ -+extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags); -+extern int try_capture_page_to_invalidate(struct page *pg); -+ -+extern void reiser4_uncapture_page(struct page *pg); -+extern void reiser4_uncapture_block(jnode *); -+extern void reiser4_uncapture_jnode(jnode *); -+ -+extern int reiser4_capture_inode(struct inode *); -+extern int reiser4_uncapture_inode(struct inode *); -+ -+extern txn_atom *get_current_atom_locked_nocheck(void); -+ -+#if REISER4_DEBUG -+ -+/** -+ * atom_is_protected - make sure that nobody but us can do anything with atom -+ * @atom: atom to be checked -+ * -+ * This is used to assert that atom either entered commit stages or is spin -+ * locked. 
-+ */ -+static inline int atom_is_protected(txn_atom *atom) -+{ -+ if (atom->stage >= ASTAGE_PRE_COMMIT) -+ return 1; -+ assert_spin_locked(&(atom->alock)); -+ return 1; -+} -+ -+#endif -+ -+/* Get the current atom and spinlock it if current atom present. May not return NULL */ -+static inline txn_atom *get_current_atom_locked(void) -+{ -+ txn_atom *atom; -+ -+ atom = get_current_atom_locked_nocheck(); -+ assert("zam-761", atom != NULL); -+ -+ return atom; -+} -+ -+extern txn_atom *jnode_get_atom(jnode *); -+ -+extern void reiser4_atom_wait_event(txn_atom *); -+extern void reiser4_atom_send_event(txn_atom *); -+ -+extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node); -+extern int reiser4_capture_super_block(struct super_block *s); -+int capture_bulk(jnode **, int count); -+ -+/* See the comment on the function blocknrset.c:blocknr_set_add for the -+ calling convention of these three routines. */ -+extern void blocknr_set_init(struct list_head * bset); -+extern void blocknr_set_destroy(struct list_head * bset); -+extern void blocknr_set_merge(struct list_head * from, struct list_head * into); -+extern int blocknr_set_add_extent(txn_atom * atom, -+ struct list_head * bset, -+ blocknr_set_entry ** new_bsep, -+ const reiser4_block_nr * start, -+ const reiser4_block_nr * len); -+extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset, -+ blocknr_set_entry ** new_bsep, -+ const reiser4_block_nr * a, -+ const reiser4_block_nr * b); -+ -+typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *, -+ const reiser4_block_nr *, void *); -+ -+extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset, -+ blocknr_set_actor_f actor, void *data, -+ int delete); -+ -+/* flush code takes care about how to fuse flush queues */ -+extern void flush_init_atom(txn_atom * atom); -+extern void flush_fuse_queues(txn_atom * large, txn_atom * small); -+ -+static inline void spin_lock_atom(txn_atom *atom) -+{ -+ /* check that 
spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(spin_locked_atom) && -+ LOCK_CNT_NIL(spin_locked_jnode) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(rw_locked_tree))); -+ -+ spin_lock(&(atom->alock)); -+ -+ LOCK_CNT_INC(spin_locked_atom); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline void spin_lock_atom_nested(txn_atom *atom) -+{ -+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(spin_locked_jnode) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(rw_locked_tree))); -+ -+ spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING); -+ -+ LOCK_CNT_INC(spin_locked_atom); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline int spin_trylock_atom(txn_atom *atom) -+{ -+ if (spin_trylock(&(atom->alock))) { -+ LOCK_CNT_INC(spin_locked_atom); -+ LOCK_CNT_INC(spin_locked); -+ return 1; -+ } -+ return 0; -+} -+ -+static inline void spin_unlock_atom(txn_atom *atom) -+{ -+ assert_spin_locked(&(atom->alock)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_atom); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&(atom->alock)); -+} -+ -+static inline void spin_lock_txnh(txn_handle *txnh) -+{ -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_tree))); -+ -+ spin_lock(&(txnh->hlock)); -+ -+ LOCK_CNT_INC(spin_locked_txnh); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline int spin_trylock_txnh(txn_handle *txnh) -+{ -+ if (spin_trylock(&(txnh->hlock))) { -+ LOCK_CNT_INC(spin_locked_txnh); -+ LOCK_CNT_INC(spin_locked); -+ return 1; -+ } -+ return 0; -+} -+ -+static inline void spin_unlock_txnh(txn_handle *txnh) -+{ -+ assert_spin_locked(&(txnh->hlock)); -+ assert("nikita-1375", 
LOCK_CNT_GTZ(spin_locked_txnh)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_txnh); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&(txnh->hlock)); -+} -+ -+#define spin_ordering_pred_txnmgr(tmgr) \ -+ ( LOCK_CNT_NIL(spin_locked_atom) && \ -+ LOCK_CNT_NIL(spin_locked_txnh) && \ -+ LOCK_CNT_NIL(spin_locked_jnode) && \ -+ LOCK_CNT_NIL(rw_locked_zlock) && \ -+ LOCK_CNT_NIL(rw_locked_dk) && \ -+ LOCK_CNT_NIL(rw_locked_tree) ) -+ -+static inline void spin_lock_txnmgr(txn_mgr *mgr) -+{ -+ /* check that spinlocks of lower priorities are not held */ -+ assert("", (LOCK_CNT_NIL(spin_locked_atom) && -+ LOCK_CNT_NIL(spin_locked_txnh) && -+ LOCK_CNT_NIL(spin_locked_jnode) && -+ LOCK_CNT_NIL(spin_locked_zlock) && -+ LOCK_CNT_NIL(rw_locked_dk) && -+ LOCK_CNT_NIL(rw_locked_tree))); -+ -+ spin_lock(&(mgr->tmgr_lock)); -+ -+ LOCK_CNT_INC(spin_locked_txnmgr); -+ LOCK_CNT_INC(spin_locked); -+} -+ -+static inline int spin_trylock_txnmgr(txn_mgr *mgr) -+{ -+ if (spin_trylock(&(mgr->tmgr_lock))) { -+ LOCK_CNT_INC(spin_locked_txnmgr); -+ LOCK_CNT_INC(spin_locked); -+ return 1; -+ } -+ return 0; -+} -+ -+static inline void spin_unlock_txnmgr(txn_mgr *mgr) -+{ -+ assert_spin_locked(&(mgr->tmgr_lock)); -+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr)); -+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); -+ -+ LOCK_CNT_DEC(spin_locked_txnmgr); -+ LOCK_CNT_DEC(spin_locked); -+ -+ spin_unlock(&(mgr->tmgr_lock)); -+} -+ -+typedef enum { -+ FQ_IN_USE = 0x1 -+} flush_queue_state_t; -+ -+typedef struct flush_queue flush_queue_t; -+ -+/* This is an accumulator for jnodes prepared for writing to disk. A flush queue -+ is filled by the jnode_flush() routine, and written to disk under memory -+ pressure or at atom commit time. */ -+/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued -+ field and fq->prepped list can be modified if atom is spin-locked and fq -+ object is "in-use" state. 
For read-only traversal of the fq->prepped list -+ and reading of the fq->nr_queued field it is enough to keep fq "in-use" or -+ only have atom spin-locked. */ -+struct flush_queue { -+ /* linkage element is the first in this structure to make debugging -+ easier. See field in atom struct for description of list. */ -+ struct list_head alink; -+ /* A spinlock to protect changes of fq state and fq->atom pointer */ -+ spinlock_t guard; -+ /* flush_queue state: [in_use | ready] */ -+ flush_queue_state_t state; -+ /* A list which contains queued nodes, queued nodes are removed from any -+ * atom's list and put on this ->prepped one. */ -+ struct list_head prepped; -+ /* number of submitted i/o requests */ -+ atomic_t nr_submitted; -+ /* number of i/o errors */ -+ atomic_t nr_errors; -+ /* An atom this flush queue is attached to */ -+ txn_atom *atom; -+ /* A wait queue head to wait on i/o completion */ -+ wait_queue_head_t wait; -+#if REISER4_DEBUG -+ /* A thread which took this fq in exclusive use, NULL if fq is free, -+ * used for debugging. 
*/ -+ struct task_struct *owner; -+#endif -+}; -+ -+extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **); -+extern void reiser4_fq_put_nolock(flush_queue_t *); -+extern void reiser4_fq_put(flush_queue_t *); -+extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from); -+extern void queue_jnode(flush_queue_t *, jnode *); -+ -+extern int reiser4_write_fq(flush_queue_t *, long *, int); -+extern int current_atom_finish_all_fq(void); -+extern void init_atom_fq_parts(txn_atom *); -+ -+extern reiser4_block_nr txnmgr_count_deleted_blocks(void); -+ -+extern void znode_make_dirty(znode * node); -+extern void jnode_make_dirty_locked(jnode * node); -+ -+extern int reiser4_sync_atom(txn_atom * atom); -+ -+#if REISER4_DEBUG -+extern int atom_fq_parts_are_clean(txn_atom *); -+#endif -+ -+extern void add_fq_to_bio(flush_queue_t *, struct bio *); -+extern flush_queue_t *get_fq_for_current_atom(void); -+ -+void reiser4_invalidate_list(struct list_head * head); -+ -+# endif /* __REISER4_TXNMGR_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/type_safe_hash.h linux-2.6.24/fs/reiser4/type_safe_hash.h ---- linux-2.6.24.orig/fs/reiser4/type_safe_hash.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/type_safe_hash.h 2008-01-25 11:39:07.112253026 +0300 -@@ -0,0 +1,320 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* A hash table class that uses hash chains (singly-linked) and is -+ parametrized to provide type safety. */ -+ -+#ifndef __REISER4_TYPE_SAFE_HASH_H__ -+#define __REISER4_TYPE_SAFE_HASH_H__ -+ -+#include "debug.h" -+ -+#include -+/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects -+ based on the object type. You need to declare the item type before -+ this definition, define it after this definition. 
*/ -+#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \ -+ \ -+typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \ -+typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \ -+ \ -+struct PREFIX##_hash_table_ \ -+{ \ -+ ITEM_TYPE **_table; \ -+ __u32 _buckets; \ -+}; \ -+ \ -+struct PREFIX##_hash_link_ \ -+{ \ -+ ITEM_TYPE *_next; \ -+} -+ -+/* Step 2: Define the object type of the hash: give it field of type -+ PREFIX_hash_link. */ -+ -+/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using -+ the type and field name used in step 3. The arguments are: -+ -+ ITEM_TYPE The item type being hashed -+ KEY_TYPE The type of key being hashed -+ KEY_NAME The name of the key field within the item -+ LINK_NAME The name of the link field within the item, which you must make type PREFIX_hash_link) -+ HASH_FUNC The name of the hash function (or macro, takes const pointer to key) -+ EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys) -+ -+ It implements these functions: -+ -+ prefix_hash_init Initialize the table given its size. -+ prefix_hash_insert Insert an item -+ prefix_hash_insert_index Insert an item w/ precomputed hash_index -+ prefix_hash_find Find an item by key -+ prefix_hash_find_index Find an item w/ precomputed hash_index -+ prefix_hash_remove Remove an item, returns 1 if found, 0 if not found -+ prefix_hash_remove_index Remove an item w/ precomputed hash_index -+ -+ If you'd like something to be done differently, feel free to ask me -+ for modifications. Additional features that could be added but -+ have not been: -+ -+ prefix_hash_remove_key Find and remove an item by key -+ prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index -+ -+ The hash_function currently receives only the key as an argument, -+ meaning it must somehow know the number of buckets. If this is a -+ problem let me know. -+ -+ This hash table uses a single-linked hash chain. 
This means -+ insertion is fast but deletion requires searching the chain. -+ -+ There is also the doubly-linked hash chain approach, under which -+ deletion requires no search but the code is longer and it takes two -+ pointers per item. -+ -+ The circularly-linked approach has the shortest code but requires -+ two pointers per bucket, doubling the size of the bucket array (in -+ addition to two pointers per item). -+*/ -+#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \ -+ \ -+static __inline__ void \ -+PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \ -+ __u32 hash UNUSED_ARG) \ -+{ \ -+ assert("nikita-2780", hash < table->_buckets); \ -+} \ -+ \ -+static __inline__ int \ -+PREFIX##_hash_init (PREFIX##_hash_table *hash, \ -+ __u32 buckets) \ -+{ \ -+ hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \ -+ hash->_buckets = buckets; \ -+ if (hash->_table == NULL) \ -+ { \ -+ return RETERR(-ENOMEM); \ -+ } \ -+ memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \ -+ ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \ -+ return 0; \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_done (PREFIX##_hash_table *hash) \ -+{ \ -+ if (REISER4_DEBUG && hash->_table != NULL) { \ -+ __u32 i; \ -+ for (i = 0 ; i < hash->_buckets ; ++ i) \ -+ assert("nikita-2905", hash->_table[i] == NULL); \ -+ } \ -+ if (hash->_table != NULL) \ -+ KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \ -+ hash->_table = NULL; \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \ -+{ \ -+ prefetch(item->LINK_NAME._next); \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \ -+ __u32 index) \ -+{ \ -+ prefetch(hash->_table[index]); \ -+} \ -+ \ -+static __inline__ ITEM_TYPE* \ -+PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ KEY_TYPE const *find_key) \ -+{ \ -+ ITEM_TYPE 
*item; \ -+ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ for (item = hash->_table[hash_index]; \ -+ item != NULL; \ -+ item = item->LINK_NAME._next) \ -+ { \ -+ prefetch(item->LINK_NAME._next); \ -+ prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \ -+ if (EQ_FUNC (& item->KEY_NAME, find_key)) \ -+ { \ -+ return item; \ -+ } \ -+ } \ -+ \ -+ return NULL; \ -+} \ -+ \ -+static __inline__ ITEM_TYPE* \ -+PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ KEY_TYPE const *find_key) \ -+{ \ -+ ITEM_TYPE ** item = &hash->_table[hash_index]; \ -+ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ while (*item != NULL) { \ -+ prefetch(&(*item)->LINK_NAME._next); \ -+ if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \ -+ ITEM_TYPE *found; \ -+ \ -+ found = *item; \ -+ *item = found->LINK_NAME._next; \ -+ found->LINK_NAME._next = hash->_table[hash_index]; \ -+ hash->_table[hash_index] = found; \ -+ return found; \ -+ } \ -+ item = &(*item)->LINK_NAME._next; \ -+ } \ -+ return NULL; \ -+} \ -+ \ -+static __inline__ int \ -+PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ ITEM_TYPE *del_item) \ -+{ \ -+ ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \ -+ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ while (*hash_item_p != NULL) { \ -+ prefetch(&(*hash_item_p)->LINK_NAME._next); \ -+ if (*hash_item_p == del_item) { \ -+ *hash_item_p = (*hash_item_p)->LINK_NAME._next; \ -+ return 1; \ -+ } \ -+ hash_item_p = &(*hash_item_p)->LINK_NAME._next; \ -+ } \ -+ return 0; \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \ -+ __u32 hash_index, \ -+ ITEM_TYPE *ins_item) \ -+{ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \ -+ hash->_table[hash_index] = ins_item; \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \ -+ __u32 
hash_index, \ -+ ITEM_TYPE *ins_item) \ -+{ \ -+ PREFIX##_check_hash(hash, hash_index); \ -+ \ -+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \ -+ smp_wmb(); \ -+ hash->_table[hash_index] = ins_item; \ -+} \ -+ \ -+static __inline__ ITEM_TYPE* \ -+PREFIX##_hash_find (PREFIX##_hash_table *hash, \ -+ KEY_TYPE const *find_key) \ -+{ \ -+ return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \ -+} \ -+ \ -+static __inline__ ITEM_TYPE* \ -+PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \ -+ KEY_TYPE const *find_key) \ -+{ \ -+ return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \ -+} \ -+ \ -+static __inline__ int \ -+PREFIX##_hash_remove (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *del_item) \ -+{ \ -+ return PREFIX##_hash_remove_index (hash, \ -+ HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \ -+} \ -+ \ -+static __inline__ int \ -+PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *del_item) \ -+{ \ -+ return PREFIX##_hash_remove (hash, del_item); \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_insert (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *ins_item) \ -+{ \ -+ return PREFIX##_hash_insert_index (hash, \ -+ HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \ -+} \ -+ \ -+static __inline__ void \ -+PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *ins_item) \ -+{ \ -+ return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \ -+ ins_item); \ -+} \ -+ \ -+static __inline__ ITEM_TYPE * \ -+PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \ -+{ \ -+ ITEM_TYPE *first; \ -+ \ -+ for (first = NULL; ind < hash->_buckets; ++ ind) { \ -+ first = hash->_table[ind]; \ -+ if (first != NULL) \ -+ break; \ -+ } \ -+ return first; \ -+} \ -+ \ -+static __inline__ ITEM_TYPE * \ -+PREFIX##_hash_next (PREFIX##_hash_table *hash, \ -+ ITEM_TYPE *item) \ -+{ \ -+ ITEM_TYPE *next; \ -+ \ -+ if (item == NULL) \ -+ return NULL; \ -+ next = 
item->LINK_NAME._next; \ -+ if (next == NULL) \ -+ next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \ -+ return next; \ -+} \ -+ \ -+typedef struct {} PREFIX##_hash_dummy -+ -+#define for_all_ht_buckets(table, head) \ -+for ((head) = &(table) -> _table[ 0 ] ; \ -+ (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head)) -+ -+#define for_all_in_bucket(bucket, item, next, field) \ -+for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \ -+ (item) != NULL ; \ -+ (item) = (next), (next) = (item) ? (item) -> field._next : NULL ) -+ -+#define for_all_in_htable(table, prefix, item, next) \ -+for ((item) = prefix ## _hash_first ((table), 0), \ -+ (next) = prefix ## _hash_next ((table), (item)) ; \ -+ (item) != NULL ; \ -+ (item) = (next), \ -+ (next) = prefix ## _hash_next ((table), (item))) -+ -+/* __REISER4_TYPE_SAFE_HASH_H__ */ -+#endif -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/vfs_ops.c linux-2.6.24/fs/reiser4/vfs_ops.c ---- linux-2.6.24.orig/fs/reiser4/vfs_ops.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/vfs_ops.c 2008-01-25 11:39:07.112253026 +0300 -@@ -0,0 +1,259 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined -+ here. 
*/ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "coord.h" -+#include "plugin/item/item.h" -+#include "plugin/file/file.h" -+#include "plugin/security/perm.h" -+#include "plugin/disk_format/disk_format.h" -+#include "plugin/plugin.h" -+#include "plugin/plugin_set.h" -+#include "plugin/object.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "vfs_ops.h" -+#include "inode.h" -+#include "page_cache.h" -+#include "ktxnmgrd.h" -+#include "super.h" -+#include "reiser4.h" -+#include "entd.h" -+#include "status_flags.h" -+#include "flush.h" -+#include "dscale.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* update inode stat-data by calling plugin */ -+int reiser4_update_sd(struct inode *object) -+{ -+ file_plugin *fplug; -+ -+ assert("nikita-2338", object != NULL); -+ /* check for read-only file system. */ -+ if (IS_RDONLY(object)) -+ return 0; -+ -+ fplug = inode_file_plugin(object); -+ assert("nikita-2339", fplug != NULL); -+ return fplug->write_sd_by_inode(object); -+} -+ -+/* helper function: increase inode nlink count and call plugin method to save -+ updated stat-data. 
-+ -+ Used by link/create and during creation of dot and dotdot in mkdir -+*/ -+int reiser4_add_nlink(struct inode *object /* object to which link is added */ , -+ struct inode *parent /* parent where new entry will be */ -+ , -+ int write_sd_p /* true if stat-data has to be -+ * updated */ ) -+{ -+ file_plugin *fplug; -+ int result; -+ -+ assert("nikita-1351", object != NULL); -+ -+ fplug = inode_file_plugin(object); -+ assert("nikita-1445", fplug != NULL); -+ -+ /* ask plugin whether it can add yet another link to this -+ object */ -+ if (!fplug->can_add_link(object)) -+ return RETERR(-EMLINK); -+ -+ assert("nikita-2211", fplug->add_link != NULL); -+ /* call plugin to do actual addition of link */ -+ result = fplug->add_link(object, parent); -+ -+ /* optionally update stat data */ -+ if (result == 0 && write_sd_p) -+ result = fplug->write_sd_by_inode(object); -+ return result; -+} -+ -+/* helper function: decrease inode nlink count and call plugin method to save -+ updated stat-data. -+ -+ Used by unlink/create -+*/ -+int reiser4_del_nlink(struct inode *object /* object from which link is -+ * removed */ , -+ struct inode *parent /* parent where entry was */ , -+ int write_sd_p /* true is stat-data has to be -+ * updated */ ) -+{ -+ file_plugin *fplug; -+ int result; -+ -+ assert("nikita-1349", object != NULL); -+ -+ fplug = inode_file_plugin(object); -+ assert("nikita-1350", fplug != NULL); -+ assert("nikita-1446", object->i_nlink > 0); -+ assert("nikita-2210", fplug->rem_link != NULL); -+ -+ /* call plugin to do actual deletion of link */ -+ result = fplug->rem_link(object, parent); -+ -+ /* optionally update stat data */ -+ if (result == 0 && write_sd_p) -+ result = fplug->write_sd_by_inode(object); -+ return result; -+} -+ -+/* Release reiser4 dentry. This is d_op->d_release() method. 
*/ -+static void reiser4_d_release(struct dentry *dentry /* dentry released */ ) -+{ -+ reiser4_free_dentry_fsdata(dentry); -+} -+ -+/* -+ * Called by reiser4_sync_inodes(), during speculative write-back (through -+ * pdflush, or balance_dirty_pages()). -+ */ -+void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc) -+{ -+ long written = 0; -+ int repeats = 0; -+ int result; -+ struct address_space *mapping; -+ -+ /* -+ * Performs early flushing, trying to free some memory. If there is -+ * nothing to flush, commits some atoms. -+ */ -+ -+ /* Commit all atoms if reiser4_writepages() is called from sys_sync() or -+ sys_fsync(). */ -+ if (wbc->sync_mode != WB_SYNC_NONE) { -+ txnmgr_force_commit_all(sb, 0); -+ return; -+ } -+ -+ BUG_ON(reiser4_get_super_fake(sb) == NULL); -+ mapping = reiser4_get_super_fake(sb)->i_mapping; -+ do { -+ long nr_submitted = 0; -+ jnode *node = NULL; -+ -+ /* do not put more requests to overload write queue */ -+ if (wbc->nonblocking && -+ bdi_write_congested(mapping->backing_dev_info)) { -+ blk_run_address_space(mapping); -+ wbc->encountered_congestion = 1; -+ break; -+ } -+ repeats++; -+ BUG_ON(wbc->nr_to_write <= 0); -+ -+ if (get_current_context()->entd) { -+ entd_context *ent = get_entd_context(sb); -+ -+ if (ent->cur_request->node) -+ /* -+ * this is ent thread and it managed to capture -+ * requested page itself - start flush from -+ * that page -+ */ -+ node = jref(ent->cur_request->node); -+ } -+ -+ result = flush_some_atom(node, &nr_submitted, wbc, -+ JNODE_FLUSH_WRITE_BLOCKS); -+ if (result != 0) -+ warning("nikita-31001", "Flush failed: %i", result); -+ if (node) -+ jput(node); -+ if (!nr_submitted) -+ break; -+ -+ wbc->nr_to_write -= nr_submitted; -+ written += nr_submitted; -+ } while (wbc->nr_to_write > 0); -+} -+ -+void reiser4_throttle_write(struct inode *inode) -+{ -+ reiser4_txn_restart_current(); -+ balance_dirty_pages_ratelimited(inode->i_mapping); -+} -+ -+const char *REISER4_SUPER_MAGIC_STRING = 
"ReIsEr4"; -+const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the -+ * beginning of device */ -+ -+/* -+ * Reiser4 initialization/shutdown. -+ * -+ * Code below performs global reiser4 initialization that is done either as -+ * part of kernel initialization (when reiser4 is statically built-in), or -+ * during reiser4 module load (when compiled as module). -+ */ -+ -+void reiser4_handle_error(void) -+{ -+ struct super_block *sb = reiser4_get_current_sb(); -+ -+ if (!sb) -+ return; -+ reiser4_status_write(REISER4_STATUS_DAMAGED, 0, -+ "Filesystem error occured"); -+ switch (get_super_private(sb)->onerror) { -+ case 0: -+ reiser4_panic("foobar-42", "Filesystem error occured\n"); -+ case 1: -+ default: -+ if (sb->s_flags & MS_RDONLY) -+ return; -+ sb->s_flags |= MS_RDONLY; -+ break; -+ } -+} -+ -+struct dentry_operations reiser4_dentry_operations = { -+ .d_revalidate = NULL, -+ .d_hash = NULL, -+ .d_compare = NULL, -+ .d_delete = NULL, -+ .d_release = reiser4_d_release, -+ .d_iput = NULL, -+}; -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/vfs_ops.h linux-2.6.24/fs/reiser4/vfs_ops.h ---- linux-2.6.24.orig/fs/reiser4/vfs_ops.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/vfs_ops.h 2008-01-25 11:39:07.112253026 +0300 -@@ -0,0 +1,53 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* vfs_ops.c's exported symbols */ -+ -+#if !defined( __FS_REISER4_VFS_OPS_H__ ) -+#define __FS_REISER4_VFS_OPS_H__ -+ -+#include "forward.h" -+#include "coord.h" -+#include "seal.h" -+#include "plugin/file/file.h" -+#include "super.h" -+#include "readahead.h" -+ -+#include /* for loff_t */ -+#include /* for struct address_space */ -+#include /* for struct dentry */ -+#include -+#include -+ -+/* address space operations */ -+int reiser4_writepage(struct page *, struct writeback_control *); -+int reiser4_set_page_dirty(struct page *); -+void reiser4_invalidatepage(struct page *, unsigned long offset); -+int reiser4_releasepage(struct page *, gfp_t); -+ -+extern int reiser4_update_sd(struct inode *); -+extern int reiser4_add_nlink(struct inode *, struct inode *, int); -+extern int reiser4_del_nlink(struct inode *, struct inode *, int); -+ -+extern int reiser4_start_up_io(struct page *page); -+extern void reiser4_throttle_write(struct inode *); -+extern int jnode_is_releasable(jnode *); -+ -+#define CAPTURE_APAGE_BURST (1024l) -+void reiser4_writeout(struct super_block *, struct writeback_control *); -+ -+extern void reiser4_handle_error(void); -+ -+/* __FS_REISER4_VFS_OPS_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/wander.c linux-2.6.24/fs/reiser4/wander.c ---- linux-2.6.24.orig/fs/reiser4/wander.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/wander.c 2008-01-25 11:39:07.116254057 +0300 -@@ -0,0 +1,1797 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Reiser4 Wandering Log */ -+ -+/* You should read http://www.namesys.com/txn-doc.html -+ -+ That describes how filesystem operations are performed as atomic -+ transactions, and how we try to arrange it so that we can write most of the -+ data only once while performing the operation atomically. -+ -+ For the purposes of this code, it is enough for it to understand that it -+ has been told a given block should be written either once, or twice (if -+ twice then once to the wandered location and once to the real location). -+ -+ This code guarantees that those blocks that are defined to be part of an -+ atom either all take effect or none of them take effect. -+ -+ Relocate set nodes are submitted to write by the jnode_flush() routine, and -+ the overwrite set is submitted by reiser4_write_log(). This is because with -+ the overwrite set we seek to optimize writes, and with the relocate set we -+ seek to cause disk order to correlate with the parent first pre-order. -+ -+ reiser4_write_log() allocates and writes wandered blocks and maintains -+ additional on-disk structures of the atom as wander records (each wander -+ record occupies one block) for storing of the "wandered map" (a table which -+ contains a relation between wandered and real block numbers) and other -+ information which might be needed at transaction recovery time. 
-+ -+ The wander records are unidirectionally linked into a circle: each wander -+ record contains a block number of the next wander record, the last wander -+ record points to the first one. -+ -+ One wander record (named "tx head" in this file) has a format which is -+ different from the other wander records. The "tx head" has a reference to the -+ "tx head" block of the previously committed atom. Also, "tx head" contains -+ fs information (the free blocks counter, and the oid allocator state) which -+ is logged in a special way . -+ -+ There are two journal control blocks, named journal header and journal -+ footer which have fixed on-disk locations. The journal header has a -+ reference to the "tx head" block of the last committed atom. The journal -+ footer points to the "tx head" of the last flushed atom. The atom is -+ "played" when all blocks from its overwrite set are written to disk the -+ second time (i.e. written to their real locations). -+ -+ NOTE: People who know reiserfs internals and its journal structure might be -+ confused with these terms journal footer and journal header. There is a table -+ with terms of similar semantics in reiserfs (reiser3) and reiser4: -+ -+ REISER3 TERM | REISER4 TERM | DESCRIPTION -+ --------------------+-----------------------+---------------------------- -+ commit record | journal header | atomic write of this record -+ | | ends transaction commit -+ --------------------+-----------------------+---------------------------- -+ journal header | journal footer | atomic write of this record -+ | | ends post-commit writes. -+ | | After successful -+ | | writing of this journal -+ | | blocks (in reiser3) or -+ | | wandered blocks/records are -+ | | free for re-use. -+ --------------------+-----------------------+---------------------------- -+ -+ The atom commit process is the following: -+ -+ 1. The overwrite set is taken from atom's clean list, and its size is -+ counted. -+ -+ 2. 
The number of necessary wander records (including tx head) is calculated, -+ and the wander record blocks are allocated. -+ -+ 3. Allocate wandered blocks and populate wander records with the wandered map. -+ -+ 4. Submit write requests for wander records and wandered blocks. -+ -+ 5. Wait until submitted write requests complete. -+ -+ 6. Update journal header: change the pointer to the block number of just -+ written tx head, submit an i/o for modified journal header block and wait -+ for i/o completion. -+ -+ NOTE: The special logging for bitmap blocks and some reiser4 super block -+ fields makes processes of atom commit, flush and recovering a bit more -+ complex (see comments in the source code for details). -+ -+ The atom playing process is the following: -+ -+ 1. Write atom's overwrite set in-place. -+ -+ 2. Wait on i/o. -+ -+ 3. Update journal footer: change the pointer to block number of tx head -+ block of the atom we are currently flushing, submit an i/o, wait on i/o -+ completion. -+ -+ 4. Free disk space which was used for wandered blocks and wander records. -+ -+ After the freeing of wandered blocks and wander records we have that journal -+ footer points to the on-disk structure which might be overwritten soon. -+ Neither the log writer nor the journal recovery procedure use that pointer -+ for accessing the data. When the journal recovery procedure finds the oldest -+ transaction it compares the journal footer pointer value with the "prev_tx" -+ pointer value in tx head, if values are equal the oldest not flushed -+ transaction is found. -+ -+ NOTE on disk space leakage: the information about which blocks and how many -+ blocks are allocated for wandered blocks and wander records is not written to -+ the disk because of special logging for bitmaps and some super block -+ counters. After a system crash reiser4 does not remember the -+ allocation of those objects, thus there is no disk space leakage of this kind.
-+*/ -+ -+/* Special logging of reiser4 super block fields. */ -+ -+/* There are some reiser4 super block fields (free block count and OID allocator -+ state (number of files and next free OID)) which are logged separately from -+ super block to avoid unnecessary atom fusion. -+ -+ So, the reiser4 super block need not be captured by a transaction which -+ allocates/deallocates disk blocks or creates/deletes file objects. Moreover, -+ the reiser4 on-disk super block is not touched when such a transaction is -+ committed and flushed. Those "counters logged specially" are logged in "tx -+ head" blocks and in the journal footer block. -+ -+ A step-by-step description of special logging: -+ -+ 0. The per-atom information about deleted or created files and allocated or -+ freed blocks is collected during the transaction. The atom's -+ ->nr_objects_created and ->nr_objects_deleted are for object -+ deletion/creation tracking, the numbers of allocated and freed blocks are -+ calculated using atom's delete set and atom's capture list -- all new and -+ relocated nodes should be on atom's clean list and should have JNODE_RELOC -+ bit set. -+ -+ 1. The "logged specially" reiser4 super block fields have their "committed" -+ versions in the reiser4 in-memory super block. They get modified only at -+ atom commit time. The atom's commit thread has an exclusive access to those -+ "committed" fields because the log writer implementation supports only one -+ atom commit at a time (there is a per-fs "commit" mutex). At -+ that time "committed" counters are modified using per-atom information -+ collected during the transaction. These counters are stored on disk as a -+ part of tx head block when atom is committed. -+ -+ 2. When the atom is flushed the value of the free block counter and the OID -+ allocator state get written to the journal footer block.
A special journal -+ procedure (journal_recover_sb_data()) takes those values from the journal -+ footer and updates the reiser4 in-memory super block. -+ -+ NOTE: That means free block count and OID allocator state are logged -+ separately from the reiser4 super block regardless of the fact that the -+ reiser4 super block has fields to store both the free block counter and the -+ OID allocator. -+ -+ Writing the whole super block at commit time requires knowing true values of -+ all its fields without changes made by not yet committed transactions. It is -+ possible by having their "committed" version of the super block like the -+ reiser4 bitmap blocks have "committed" and "working" versions. However, -+ another scheme was implemented which stores special logged values in the -+ unused free space inside transaction head block. In my opinion it has an -+ advantage of not writing whole super block when only part of it was -+ modified. */ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "page_cache.h" -+#include "wander.h" -+#include "reiser4.h" -+#include "super.h" -+#include "vfs_ops.h" -+#include "writeout.h" -+#include "inode.h" -+#include "entd.h" -+ -+#include -+#include /* for struct super_block */ -+#include /* for struct page */ -+#include -+#include /* for struct bio */ -+#include -+ -+static int write_jnodes_to_disk_extent( -+ jnode *, int, const reiser4_block_nr *, flush_queue_t *, int); -+ -+/* The commit_handle is a container for objects needed at atom commit time */ -+struct commit_handle { -+ /* A pointer to atom's list of OVRWR nodes */ -+ struct list_head *overwrite_set; -+ /* atom's overwrite set size */ -+ int overwrite_set_size; -+ /* jnodes for wander record blocks */ -+ struct list_head tx_list; -+ /* number of wander records */ -+ __u32 tx_size; -+ /* 'committed' sb counters are saved here until atom is completely -+ flushed */ -+ __u64 
free_blocks; -+ __u64 nr_files; -+ __u64 next_oid; -+ /* A pointer to the atom which is being committed */ -+ txn_atom *atom; -+ /* A pointer to current super block */ -+ struct super_block *super; -+ /* The counter of modified bitmaps */ -+ reiser4_block_nr nr_bitmap; -+}; -+ -+static void init_commit_handle(struct commit_handle *ch, txn_atom *atom) -+{ -+ memset(ch, 0, sizeof(struct commit_handle)); -+ INIT_LIST_HEAD(&ch->tx_list); -+ -+ ch->atom = atom; -+ ch->super = reiser4_get_current_sb(); -+} -+ -+static void done_commit_handle(struct commit_handle *ch) -+{ -+ assert("zam-690", list_empty(&ch->tx_list)); -+} -+ -+static inline int reiser4_use_write_barrier(struct super_block * s) -+{ -+ return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER); -+} -+ -+static void disable_write_barrier(struct super_block * s) -+{ -+ notice("zam-1055", "%s does not support write barriers," -+ " using synchronous write instead.", s->s_id); -+ set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags); -+} -+ -+/* fill journal header block data */ -+static void format_journal_header(struct commit_handle *ch) -+{ -+ struct reiser4_super_info_data *sbinfo; -+ struct journal_header *header; -+ jnode *txhead; -+ -+ sbinfo = get_super_private(ch->super); -+ assert("zam-479", sbinfo != NULL); -+ assert("zam-480", sbinfo->journal_header != NULL); -+ -+ txhead = list_entry(ch->tx_list.next, jnode, capture_link); -+ -+ jload(sbinfo->journal_header); -+ -+ header = (struct journal_header *)jdata(sbinfo->journal_header); -+ assert("zam-484", header != NULL); -+ -+ put_unaligned(cpu_to_le64(*jnode_get_block(txhead)), -+ &header->last_committed_tx); -+ -+ jrelse(sbinfo->journal_header); -+} -+ -+/* fill journal footer block data */ -+static void format_journal_footer(struct commit_handle *ch) -+{ -+ struct reiser4_super_info_data *sbinfo; -+ struct journal_footer *footer; -+ jnode *tx_head; -+ -+ sbinfo = get_super_private(ch->super); -+ -+ tx_head = list_entry(ch->tx_list.next, 
jnode, capture_link); -+ -+ assert("zam-493", sbinfo != NULL); -+ assert("zam-494", sbinfo->journal_header != NULL); -+ -+ check_me("zam-691", jload(sbinfo->journal_footer) == 0); -+ -+ footer = (struct journal_footer *)jdata(sbinfo->journal_footer); -+ assert("zam-495", footer != NULL); -+ -+ put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)), -+ &footer->last_flushed_tx); -+ put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks); -+ -+ put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files); -+ put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid); -+ -+ jrelse(sbinfo->journal_footer); -+} -+ -+/* wander record capacity depends on current block size */ -+static int wander_record_capacity(const struct super_block *super) -+{ -+ return (super->s_blocksize - -+ sizeof(struct wander_record_header)) / -+ sizeof(struct wander_entry); -+} -+ -+/* Fill first wander record (tx head) in accordance with supplied given data */ -+static void format_tx_head(struct commit_handle *ch) -+{ -+ jnode *tx_head; -+ jnode *next; -+ struct tx_header *header; -+ -+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link); -+ assert("zam-692", &ch->tx_list != &tx_head->capture_link); -+ -+ next = list_entry(tx_head->capture_link.next, jnode, capture_link); -+ if (&ch->tx_list == &next->capture_link) -+ next = tx_head; -+ -+ header = (struct tx_header *)jdata(tx_head); -+ -+ assert("zam-460", header != NULL); -+ assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header)); -+ -+ memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize); -+ memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE); -+ -+ put_unaligned(cpu_to_le32(ch->tx_size), &header->total); -+ put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx), -+ &header->prev_tx); -+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block); -+ put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks); -+ put_unaligned(cpu_to_le64(ch->nr_files), 
&header->nr_files); -+ put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid); -+} -+ -+/* prepare ordinary wander record block (fill all service fields) */ -+static void -+format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial) -+{ -+ struct wander_record_header *LRH; -+ jnode *next; -+ -+ assert("zam-464", node != NULL); -+ -+ LRH = (struct wander_record_header *)jdata(node); -+ next = list_entry(node->capture_link.next, jnode, capture_link); -+ -+ if (&ch->tx_list == &next->capture_link) -+ next = list_entry(ch->tx_list.next, jnode, capture_link); -+ -+ assert("zam-465", LRH != NULL); -+ assert("zam-463", -+ ch->super->s_blocksize > sizeof(struct wander_record_header)); -+ -+ memset(jdata(node), 0, (size_t) ch->super->s_blocksize); -+ memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE); -+ -+ put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total); -+ put_unaligned(cpu_to_le32(serial), &LRH->serial); -+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block); -+} -+ -+/* add one wandered map entry to formatted wander record */ -+static void -+store_entry(jnode * node, int index, const reiser4_block_nr * a, -+ const reiser4_block_nr * b) -+{ -+ char *data; -+ struct wander_entry *pairs; -+ -+ data = jdata(node); -+ assert("zam-451", data != NULL); -+ -+ pairs = -+ (struct wander_entry *)(data + sizeof(struct wander_record_header)); -+ -+ put_unaligned(cpu_to_le64(*a), &pairs[index].original); -+ put_unaligned(cpu_to_le64(*b), &pairs[index].wandered); -+} -+ -+/* currently, wander records contains contain only wandered map, which depend on -+ overwrite set size */ -+static void get_tx_size(struct commit_handle *ch) -+{ -+ assert("zam-440", ch->overwrite_set_size != 0); -+ assert("zam-695", ch->tx_size == 0); -+ -+ /* count all ordinary wander records -+ ( - 1) / + 1 and add one -+ for tx head block */ -+ ch->tx_size = -+ (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) + -+ 2; -+} -+ -+/* A special 
structure for using in store_wmap_actor() for saving its state -+ between calls */ -+struct store_wmap_params { -+ jnode *cur; /* jnode of current wander record to fill */ -+ int idx; /* free element index in wander record */ -+ int capacity; /* capacity */ -+ -+#if REISER4_DEBUG -+ struct list_head *tx_list; -+#endif -+}; -+ -+/* an actor for use in blocknr_set_iterator routine which populates the list -+ of pre-formatted wander records by wandered map info */ -+static int -+store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, -+ const reiser4_block_nr * b, void *data) -+{ -+ struct store_wmap_params *params = data; -+ -+ if (params->idx >= params->capacity) { -+ /* a new wander record should be taken from the tx_list */ -+ params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link); -+ assert("zam-454", -+ params->tx_list != ¶ms->cur->capture_link); -+ -+ params->idx = 0; -+ } -+ -+ store_entry(params->cur, params->idx, a, b); -+ params->idx++; -+ -+ return 0; -+} -+ -+/* This function is called after Relocate set gets written to disk, Overwrite -+ set is written to wandered locations and all wander records are written -+ also. Updated journal header blocks contains a pointer (block number) to -+ first wander record of the just written transaction */ -+static int update_journal_header(struct commit_handle *ch, int use_barrier) -+{ -+ struct reiser4_super_info_data *sbinfo = get_super_private(ch->super); -+ jnode *jh = sbinfo->journal_header; -+ jnode *head = list_entry(ch->tx_list.next, jnode, capture_link); -+ int ret; -+ -+ format_journal_header(ch); -+ -+ ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL, -+ use_barrier ? 
WRITEOUT_BARRIER : 0); -+ if (ret) -+ return ret; -+ -+ // blk_run_address_space(sbinfo->fake->i_mapping); -+ /*blk_run_queues(); */ -+ -+ ret = jwait_io(jh, WRITE); -+ -+ if (ret) -+ return ret; -+ -+ sbinfo->last_committed_tx = *jnode_get_block(head); -+ -+ return 0; -+} -+ -+/* This function is called after write-back is finished. We update journal -+ footer block and free blocks which were occupied by wandered blocks and -+ transaction wander records */ -+static int update_journal_footer(struct commit_handle *ch, int use_barrier) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(ch->super); -+ -+ jnode *jf = sbinfo->journal_footer; -+ -+ int ret; -+ -+ format_journal_footer(ch); -+ -+ ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL, -+ use_barrier ? WRITEOUT_BARRIER : 0); -+ if (ret) -+ return ret; -+ -+ // blk_run_address_space(sbinfo->fake->i_mapping); -+ /*blk_run_queue(); */ -+ -+ ret = jwait_io(jf, WRITE); -+ if (ret) -+ return ret; -+ -+ return 0; -+} -+ -+/* free block numbers of wander records of already written in place transaction */ -+static void dealloc_tx_list(struct commit_handle *ch) -+{ -+ while (!list_empty(&ch->tx_list)) { -+ jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link); -+ list_del(&cur->capture_link); -+ ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link)); -+ reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED, -+ BA_FORMATTED); -+ -+ unpin_jnode_data(cur); -+ reiser4_drop_io_head(cur); -+ } -+} -+ -+/* An actor for use in block_nr_iterator() routine which frees wandered blocks -+ from atom's overwrite set. 
*/ -+static int -+dealloc_wmap_actor(txn_atom * atom UNUSED_ARG, -+ const reiser4_block_nr * a UNUSED_ARG, -+ const reiser4_block_nr * b, void *data UNUSED_ARG) -+{ -+ -+ assert("zam-499", b != NULL); -+ assert("zam-500", *b != 0); -+ assert("zam-501", !reiser4_blocknr_is_fake(b)); -+ -+ reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED); -+ return 0; -+} -+ -+/* free wandered block locations of already written in place transaction */ -+static void dealloc_wmap(struct commit_handle *ch) -+{ -+ assert("zam-696", ch->atom != NULL); -+ -+ blocknr_set_iterator(ch->atom, &ch->atom->wandered_map, -+ dealloc_wmap_actor, NULL, 1); -+} -+ -+/* helper function for alloc wandered blocks, which refill set of block -+ numbers needed for wandered blocks */ -+static int -+get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len) -+{ -+ reiser4_blocknr_hint hint; -+ int ret; -+ -+ reiser4_block_nr wide_len = count; -+ -+ /* FIXME-ZAM: A special policy needed for allocation of wandered blocks -+ ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed -+ reserved allocation area so as to get the best qualities of fixed -+ journals? */ -+ reiser4_blocknr_hint_init(&hint); -+ hint.block_stage = BLOCK_GRABBED; -+ -+ ret = reiser4_alloc_blocks(&hint, start, &wide_len, -+ BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START); -+ *len = (int)wide_len; -+ -+ return ret; -+} -+ -+/* -+ * roll back changes made before issuing BIO in the case of IO error. 
-+ */ -+static void undo_bio(struct bio *bio) -+{ -+ int i; -+ -+ for (i = 0; i < bio->bi_vcnt; ++i) { -+ struct page *pg; -+ jnode *node; -+ -+ pg = bio->bi_io_vec[i].bv_page; -+ end_page_writeback(pg); -+ node = jprivate(pg); -+ spin_lock_jnode(node); -+ JF_CLR(node, JNODE_WRITEBACK); -+ JF_SET(node, JNODE_DIRTY); -+ spin_unlock_jnode(node); -+ } -+ bio_put(bio); -+} -+ -+/* put overwrite set back to atom's clean list */ -+static void put_overwrite_set(struct commit_handle *ch) -+{ -+ jnode *cur; -+ -+ list_for_each_entry(cur, ch->overwrite_set, capture_link) -+ jrelse_tail(cur); -+} -+ -+/* Count overwrite set size, grab disk space for wandered blocks allocation. -+ Since we have a separate list for atom's overwrite set we just scan the list, -+ count bitmap and other not leaf nodes which wandered blocks allocation we -+ have to grab space for. */ -+static int get_overwrite_set(struct commit_handle *ch) -+{ -+ int ret; -+ jnode *cur; -+ __u64 nr_not_leaves = 0; -+#if REISER4_DEBUG -+ __u64 nr_formatted_leaves = 0; -+ __u64 nr_unformatted_leaves = 0; -+#endif -+ -+ assert("zam-697", ch->overwrite_set_size == 0); -+ -+ ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom); -+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link); -+ -+ while (ch->overwrite_set != &cur->capture_link) { -+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link); -+ -+ /* Count bitmap locks for getting correct statistics what number -+ * of blocks were cleared by the transaction commit. */ -+ if (jnode_get_type(cur) == JNODE_BITMAP) -+ ch->nr_bitmap++; -+ -+ assert("zam-939", JF_ISSET(cur, JNODE_OVRWR) -+ || jnode_get_type(cur) == JNODE_BITMAP); -+ -+ if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) { -+ /* we replace fake znode by another (real) -+ znode which is suggested by disk_layout -+ plugin */ -+ -+ /* FIXME: it looks like fake znode should be -+ replaced by jnode supplied by -+ disk_layout. 
*/ -+ -+ struct super_block *s = reiser4_get_current_sb(); -+ reiser4_super_info_data *sbinfo = -+ get_current_super_private(); -+ -+ if (sbinfo->df_plug->log_super) { -+ jnode *sj = sbinfo->df_plug->log_super(s); -+ -+ assert("zam-593", sj != NULL); -+ -+ if (IS_ERR(sj)) -+ return PTR_ERR(sj); -+ -+ spin_lock_jnode(sj); -+ JF_SET(sj, JNODE_OVRWR); -+ insert_into_atom_ovrwr_list(ch->atom, sj); -+ spin_unlock_jnode(sj); -+ -+ /* jload it as the rest of overwrite set */ -+ jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0); -+ -+ ch->overwrite_set_size++; -+ } -+ spin_lock_jnode(cur); -+ reiser4_uncapture_block(cur); -+ jput(cur); -+ -+ } else { -+ int ret; -+ ch->overwrite_set_size++; -+ ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0); -+ if (ret) -+ reiser4_panic("zam-783", -+ "cannot load e-flushed jnode back (ret = %d)\n", -+ ret); -+ } -+ -+ /* Count not leaves here because we have to grab disk space -+ * for wandered blocks. They were not counted as "flush -+ * reserved". Counting should be done _after_ nodes are pinned -+ * into memory by jload(). */ -+ if (!jnode_is_leaf(cur)) -+ nr_not_leaves++; -+ else { -+#if REISER4_DEBUG -+ /* at this point @cur either has JNODE_FLUSH_RESERVED -+ * or is eflushed. Locking is not strong enough to -+ * write an assertion checking for this. */ -+ if (jnode_is_znode(cur)) -+ nr_formatted_leaves++; -+ else -+ nr_unformatted_leaves++; -+#endif -+ JF_CLR(cur, JNODE_FLUSH_RESERVED); -+ } -+ -+ cur = next; -+ } -+ -+ /* Grab space for writing (wandered blocks) of not leaves found in -+ * overwrite set. */ -+ ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED); -+ if (ret) -+ return ret; -+ -+ /* Disk space for allocation of wandered blocks of leaf nodes already -+ * reserved as "flush reserved", move it to grabbed space counter. 
*/ -+ spin_lock_atom(ch->atom); -+ assert("zam-940", -+ nr_formatted_leaves + nr_unformatted_leaves <= -+ ch->atom->flush_reserved); -+ flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved); -+ spin_unlock_atom(ch->atom); -+ -+ return ch->overwrite_set_size; -+} -+ -+/** -+ * write_jnodes_to_disk_extent - submit write request -+ * @head: -+ * @first: first jnode of the list -+ * @nr: number of jnodes on the list -+ * @block_p: -+ * @fq: -+ * @flags: used to decide whether page is to get PG_reclaim flag -+ * -+ * Submits a write request for @nr jnodes beginning from the @first, other -+ * jnodes are after the @first on the double-linked "capture" list. All jnodes -+ * will be written to the disk region of @nr blocks starting with @block_p block -+ * number. If @fq is not NULL it means that waiting for i/o completion will be -+ * done more efficiently by using flush_queue_t objects. -+ * This function is the one which writes list of jnodes in batch mode. It does -+ * all low-level things as bio construction and page states manipulation. -+ * -+ * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are -+ * aggregated in this function instead of being left to the layers below -+ * -+ * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that? -+ * Why that layer needed? Why BIOs cannot be constructed here? -+ */ -+static int write_jnodes_to_disk_extent( -+ jnode *first, int nr, const reiser4_block_nr *block_p, -+ flush_queue_t *fq, int flags) -+{ -+ struct super_block *super = reiser4_get_current_sb(); -+ int write_op = ( flags & WRITEOUT_BARRIER ) ? 
WRITE_BARRIER : WRITE; -+ int max_blocks; -+ jnode *cur = first; -+ reiser4_block_nr block; -+ -+ assert("zam-571", first != NULL); -+ assert("zam-572", block_p != NULL); -+ assert("zam-570", nr > 0); -+ -+ block = *block_p; -+ max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES); -+ -+ while (nr > 0) { -+ struct bio *bio; -+ int nr_blocks = min(nr, max_blocks); -+ int i; -+ int nr_used; -+ -+ bio = bio_alloc(GFP_NOIO, nr_blocks); -+ if (!bio) -+ return RETERR(-ENOMEM); -+ -+ bio->bi_bdev = super->s_bdev; -+ bio->bi_sector = block * (super->s_blocksize >> 9); -+ for (nr_used = 0, i = 0; i < nr_blocks; i++) { -+ struct page *pg; -+ -+ pg = jnode_page(cur); -+ assert("zam-573", pg != NULL); -+ -+ page_cache_get(pg); -+ -+ lock_and_wait_page_writeback(pg); -+ -+ if (!bio_add_page(bio, pg, super->s_blocksize, 0)) { -+ /* -+ * underlying device is satiated. Stop adding -+ * pages to the bio. -+ */ -+ unlock_page(pg); -+ page_cache_release(pg); -+ break; -+ } -+ -+ spin_lock_jnode(cur); -+ assert("nikita-3166", -+ pg->mapping == jnode_get_mapping(cur)); -+ assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK)); -+#if REISER4_DEBUG -+ spin_lock(&cur->load); -+ assert("nikita-3165", !jnode_is_releasable(cur)); -+ spin_unlock(&cur->load); -+#endif -+ JF_SET(cur, JNODE_WRITEBACK); -+ JF_CLR(cur, JNODE_DIRTY); -+ ON_DEBUG(cur->written++); -+ spin_unlock_jnode(cur); -+ -+ ClearPageError(pg); -+ set_page_writeback(pg); -+ -+ if (get_current_context()->entd) { -+ /* this is ent thread */ -+ entd_context *ent = get_entd_context(super); -+ struct wbq *rq, *next; -+ -+ spin_lock(&ent->guard); -+ -+ if (pg == ent->cur_request->page) { -+ /* -+ * entd is called for this page. This -+ * request is not in th etodo list -+ */ -+ ent->cur_request->written = 1; -+ } else { -+ /* -+ * if we have written a page for which writepage -+ * is called for - move request to another list. 
-+ */ -+ list_for_each_entry_safe(rq, next, &ent->todo_list, link) { -+ assert("", rq->magic == WBQ_MAGIC); -+ if (pg == rq->page) { -+ /* -+ * remove request from -+ * entd's queue, but do -+ * not wake up a thread -+ * which put this -+ * request -+ */ -+ list_del_init(&rq->link); -+ ent->nr_todo_reqs --; -+ list_add_tail(&rq->link, &ent->done_list); -+ ent->nr_done_reqs ++; -+ rq->written = 1; -+ break; -+ } -+ } -+ } -+ spin_unlock(&ent->guard); -+ } -+ -+ clear_page_dirty_for_io(pg); -+ -+ unlock_page(pg); -+ -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ nr_used++; -+ } -+ if (nr_used > 0) { -+ assert("nikita-3453", -+ bio->bi_size == super->s_blocksize * nr_used); -+ assert("nikita-3454", bio->bi_vcnt == nr_used); -+ -+ /* Check if we are allowed to write at all */ -+ if (super->s_flags & MS_RDONLY) -+ undo_bio(bio); -+ else { -+ int not_supported; -+ -+ add_fq_to_bio(fq, bio); -+ bio_get(bio); -+ reiser4_submit_bio(write_op, bio); -+ not_supported = bio_flagged(bio, BIO_EOPNOTSUPP); -+ bio_put(bio); -+ if (not_supported) -+ return -EOPNOTSUPP; -+ } -+ -+ block += nr_used - 1; -+ update_blocknr_hint_default(super, &block); -+ block += 1; -+ } else { -+ bio_put(bio); -+ } -+ nr -= nr_used; -+ } -+ -+ return 0; -+} -+ -+/* This is a procedure which recovers a contiguous sequences of disk block -+ numbers in the given list of j-nodes and submits write requests on this -+ per-sequence basis */ -+int -+write_jnode_list(struct list_head *head, flush_queue_t *fq, -+ long *nr_submitted, int flags) -+{ -+ int ret; -+ jnode *beg = list_entry(head->next, jnode, capture_link); -+ -+ while (head != &beg->capture_link) { -+ int nr = 1; -+ jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link); -+ -+ while (head != &cur->capture_link) { -+ if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr) -+ break; -+ ++nr; -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ } -+ -+ ret = write_jnodes_to_disk_extent( -+ beg, nr, 
jnode_get_block(beg), fq, flags); -+ if (ret) -+ return ret; -+ -+ if (nr_submitted) -+ *nr_submitted += nr; -+ -+ beg = cur; -+ } -+ -+ return 0; -+} -+ -+/* add given wandered mapping to atom's wandered map */ -+static int -+add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p) -+{ -+ int ret; -+ blocknr_set_entry *new_bsep = NULL; -+ reiser4_block_nr block; -+ -+ txn_atom *atom; -+ -+ assert("zam-568", block_p != NULL); -+ block = *block_p; -+ assert("zam-569", len > 0); -+ -+ while ((len--) > 0) { -+ do { -+ atom = get_current_atom_locked(); -+ assert("zam-536", -+ !reiser4_blocknr_is_fake(jnode_get_block(cur))); -+ ret = -+ blocknr_set_add_pair(atom, &atom->wandered_map, -+ &new_bsep, -+ jnode_get_block(cur), &block); -+ } while (ret == -E_REPEAT); -+ -+ if (ret) { -+ /* deallocate blocks which were not added to wandered -+ map */ -+ reiser4_block_nr wide_len = len; -+ -+ reiser4_dealloc_blocks(&block, &wide_len, -+ BLOCK_NOT_COUNTED, -+ BA_FORMATTED -+ /* formatted, without defer */ ); -+ -+ return ret; -+ } -+ -+ spin_unlock_atom(atom); -+ -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ ++block; -+ } -+ -+ return 0; -+} -+ -+/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately -+ submit IO for allocated blocks. We assume that current atom is in a stage -+ when any atom fusion is impossible and atom is unlocked and it is safe. 
*/ -+static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq) -+{ -+ reiser4_block_nr block; -+ -+ int rest; -+ int len; -+ int ret; -+ -+ jnode *cur; -+ -+ assert("zam-534", ch->overwrite_set_size > 0); -+ -+ rest = ch->overwrite_set_size; -+ -+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link); -+ while (ch->overwrite_set != &cur->capture_link) { -+ assert("zam-567", JF_ISSET(cur, JNODE_OVRWR)); -+ -+ ret = get_more_wandered_blocks(rest, &block, &len); -+ if (ret) -+ return ret; -+ -+ rest -= len; -+ -+ ret = add_region_to_wmap(cur, len, &block); -+ if (ret) -+ return ret; -+ -+ ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0); -+ if (ret) -+ return ret; -+ -+ while ((len--) > 0) { -+ assert("zam-604", -+ ch->overwrite_set != &cur->capture_link); -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ } -+ } -+ -+ return 0; -+} -+ -+/* allocate given number of nodes over the journal area and link them into a -+ list, return pointer to the first jnode in the list */ -+static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq) -+{ -+ reiser4_blocknr_hint hint; -+ reiser4_block_nr allocated = 0; -+ reiser4_block_nr first, len; -+ jnode *cur; -+ jnode *txhead; -+ int ret; -+ reiser4_context *ctx; -+ reiser4_super_info_data *sbinfo; -+ -+ assert("zam-698", ch->tx_size > 0); -+ assert("zam-699", list_empty_careful(&ch->tx_list)); -+ -+ ctx = get_current_context(); -+ sbinfo = get_super_private(ctx->super); -+ -+ while (allocated < (unsigned)ch->tx_size) { -+ len = (ch->tx_size - allocated); -+ -+ reiser4_blocknr_hint_init(&hint); -+ -+ hint.block_stage = BLOCK_GRABBED; -+ -+ /* FIXME: there should be some block allocation policy for -+ nodes which contain wander records */ -+ -+ /* We assume that disk space for wandered record blocks can be -+ * taken from reserved area. 
*/ -+ ret = reiser4_alloc_blocks(&hint, &first, &len, -+ BA_FORMATTED | BA_RESERVED | -+ BA_USE_DEFAULT_SEARCH_START); -+ reiser4_blocknr_hint_done(&hint); -+ -+ if (ret) -+ return ret; -+ -+ allocated += len; -+ -+ /* create jnodes for all wander records */ -+ while (len--) { -+ cur = reiser4_alloc_io_head(&first); -+ -+ if (cur == NULL) { -+ ret = RETERR(-ENOMEM); -+ goto free_not_assigned; -+ } -+ -+ ret = jinit_new(cur, reiser4_ctx_gfp_mask_get()); -+ -+ if (ret != 0) { -+ jfree(cur); -+ goto free_not_assigned; -+ } -+ -+ pin_jnode_data(cur); -+ -+ list_add_tail(&cur->capture_link, &ch->tx_list); -+ -+ first++; -+ } -+ } -+ -+ { /* format a on-disk linked list of wander records */ -+ int serial = 1; -+ -+ txhead = list_entry(ch->tx_list.next, jnode, capture_link); -+ format_tx_head(ch); -+ -+ cur = list_entry(txhead->capture_link.next, jnode, capture_link); -+ while (&ch->tx_list != &cur->capture_link) { -+ format_wander_record(ch, cur, serial++); -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ } -+ } -+ -+ { /* Fill wander records with Wandered Set */ -+ struct store_wmap_params params; -+ txn_atom *atom; -+ -+ params.cur = list_entry(txhead->capture_link.next, jnode, capture_link); -+ -+ params.idx = 0; -+ params.capacity = -+ wander_record_capacity(reiser4_get_current_sb()); -+ -+ atom = get_current_atom_locked(); -+ blocknr_set_iterator(atom, &atom->wandered_map, -+ &store_wmap_actor, ¶ms, 0); -+ spin_unlock_atom(atom); -+ } -+ -+ { /* relse all jnodes from tx_list */ -+ cur = list_entry(ch->tx_list.next, jnode, capture_link); -+ while (&ch->tx_list != &cur->capture_link) { -+ jrelse(cur); -+ cur = list_entry(cur->capture_link.next, jnode, capture_link); -+ } -+ } -+ -+ ret = write_jnode_list(&ch->tx_list, fq, NULL, 0); -+ -+ return ret; -+ -+ free_not_assigned: -+ /* We deallocate blocks not yet assigned to jnodes on tx_list. 
The -+ caller takes care about invalidating of tx list */ -+ reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED); -+ -+ return ret; -+} -+ -+static int commit_tx(struct commit_handle *ch) -+{ -+ flush_queue_t *fq; -+ int barrier; -+ int ret; -+ -+ /* Grab more space for wandered records. */ -+ ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED); -+ if (ret) -+ return ret; -+ -+ fq = get_fq_for_current_atom(); -+ if (IS_ERR(fq)) -+ return PTR_ERR(fq); -+ -+ spin_unlock_atom(fq->atom); -+ do { -+ ret = alloc_wandered_blocks(ch, fq); -+ if (ret) -+ break; -+ ret = alloc_tx(ch, fq); -+ if (ret) -+ break; -+ } while (0); -+ -+ reiser4_fq_put(fq); -+ if (ret) -+ return ret; -+ repeat_wo_barrier: -+ barrier = reiser4_use_write_barrier(ch->super); -+ if (!barrier) { -+ ret = current_atom_finish_all_fq(); -+ if (ret) -+ return ret; -+ } -+ ret = update_journal_header(ch, barrier); -+ if (barrier) { -+ if (ret) { -+ if (ret == -EOPNOTSUPP) { -+ disable_write_barrier(ch->super); -+ goto repeat_wo_barrier; -+ } -+ return ret; -+ } -+ ret = current_atom_finish_all_fq(); -+ } -+ return ret; -+} -+ -+static int write_tx_back(struct commit_handle * ch) -+{ -+ flush_queue_t *fq; -+ int ret; -+ int barrier; -+ -+ reiser4_post_commit_hook(); -+ fq = get_fq_for_current_atom(); -+ if (IS_ERR(fq)) -+ return PTR_ERR(fq); -+ spin_unlock_atom(fq->atom); -+ ret = write_jnode_list( -+ ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM); -+ reiser4_fq_put(fq); -+ if (ret) -+ return ret; -+ repeat_wo_barrier: -+ barrier = reiser4_use_write_barrier(ch->super); -+ if (!barrier) { -+ ret = current_atom_finish_all_fq(); -+ if (ret) -+ return ret; -+ } -+ ret = update_journal_footer(ch, barrier); -+ if (barrier) { -+ if (ret) { -+ if (ret == -EOPNOTSUPP) { -+ disable_write_barrier(ch->super); -+ goto repeat_wo_barrier; -+ } -+ return ret; -+ } -+ ret = current_atom_finish_all_fq(); -+ } -+ if (ret) -+ return ret; -+ reiser4_post_write_back_hook(); -+ return 0; 
-+} -+ -+/* We assume that at this moment all captured blocks are marked as RELOC or -+ WANDER (belong to Relocate o Overwrite set), all nodes from Relocate set -+ are submitted to write. -+*/ -+ -+int reiser4_write_logs(long *nr_submitted) -+{ -+ txn_atom *atom; -+ struct super_block *super = reiser4_get_current_sb(); -+ reiser4_super_info_data *sbinfo = get_super_private(super); -+ struct commit_handle ch; -+ int ret; -+ -+ writeout_mode_enable(); -+ -+ /* block allocator may add j-nodes to the clean_list */ -+ ret = reiser4_pre_commit_hook(); -+ if (ret) -+ return ret; -+ -+ /* No locks are required if we take atom which stage >= -+ * ASTAGE_PRE_COMMIT */ -+ atom = get_current_context()->trans->atom; -+ assert("zam-965", atom != NULL); -+ -+ /* relocate set is on the atom->clean_nodes list after -+ * current_atom_complete_writes() finishes. It can be safely -+ * uncaptured after commit_mutex is locked, because any atom that -+ * captures these nodes is guaranteed to commit after current one. -+ * -+ * This can only be done after reiser4_pre_commit_hook(), because it is where -+ * early flushed jnodes with CREATED bit are transferred to the -+ * overwrite list. */ -+ reiser4_invalidate_list(ATOM_CLEAN_LIST(atom)); -+ spin_lock_atom(atom); -+ /* There might be waiters for the relocate nodes which we have -+ * released, wake them up. */ -+ reiser4_atom_send_event(atom); -+ spin_unlock_atom(atom); -+ -+ if (REISER4_DEBUG) { -+ int level; -+ -+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level) -+ assert("nikita-3352", -+ list_empty_careful(ATOM_DIRTY_LIST(atom, level))); -+ } -+ -+ sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created; -+ sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted; -+ -+ init_commit_handle(&ch, atom); -+ -+ ch.free_blocks = sbinfo->blocks_free_committed; -+ ch.nr_files = sbinfo->nr_files_committed; -+ /* ZAM-FIXME-HANS: email me what the contention level is for the super -+ * lock. 
*/ -+ ch.next_oid = oid_next(super); -+ -+ /* count overwrite set and place it in a separate list */ -+ ret = get_overwrite_set(&ch); -+ -+ if (ret <= 0) { -+ /* It is possible that overwrite set is empty here, it means -+ all captured nodes are clean */ -+ goto up_and_ret; -+ } -+ -+ /* Inform the caller about what number of dirty pages will be -+ * submitted to disk. */ -+ *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap; -+ -+ /* count all records needed for storing of the wandered set */ -+ get_tx_size(&ch); -+ -+ ret = commit_tx(&ch); -+ if (ret) -+ goto up_and_ret; -+ -+ spin_lock_atom(atom); -+ reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT); -+ spin_unlock_atom(atom); -+ -+ ret = write_tx_back(&ch); -+ reiser4_post_write_back_hook(); -+ -+ up_and_ret: -+ if (ret) { -+ /* there could be fq attached to current atom; the only way to -+ remove them is: */ -+ current_atom_finish_all_fq(); -+ } -+ -+ /* free blocks of flushed transaction */ -+ dealloc_tx_list(&ch); -+ dealloc_wmap(&ch); -+ -+ put_overwrite_set(&ch); -+ -+ done_commit_handle(&ch); -+ -+ writeout_mode_disable(); -+ -+ return ret; -+} -+ -+/* consistency checks for journal data/control blocks: header, footer, log -+ records, transactions head blocks. All functions return zero on success. */ -+ -+static int check_journal_header(const jnode * node UNUSED_ARG) -+{ -+ /* FIXME: journal header has no magic field yet. */ -+ return 0; -+} -+ -+/* wait for write completion for all jnodes from given list */ -+static int wait_on_jnode_list(struct list_head *head) -+{ -+ jnode *scan; -+ int ret = 0; -+ -+ list_for_each_entry(scan, head, capture_link) { -+ struct page *pg = jnode_page(scan); -+ -+ if (pg) { -+ if (PageWriteback(pg)) -+ wait_on_page_writeback(pg); -+ -+ if (PageError(pg)) -+ ret++; -+ } -+ } -+ -+ return ret; -+} -+ -+static int check_journal_footer(const jnode * node UNUSED_ARG) -+{ -+ /* FIXME: journal footer has no magic field yet. 
*/ -+ return 0; -+} -+ -+static int check_tx_head(const jnode * node) -+{ -+ struct tx_header *header = (struct tx_header *)jdata(node); -+ -+ if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) { -+ warning("zam-627", "tx head at block %s corrupted\n", -+ sprint_address(jnode_get_block(node))); -+ return RETERR(-EIO); -+ } -+ -+ return 0; -+} -+ -+static int check_wander_record(const jnode * node) -+{ -+ struct wander_record_header *RH = -+ (struct wander_record_header *)jdata(node); -+ -+ if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) != -+ 0) { -+ warning("zam-628", "wander record at block %s corrupted\n", -+ sprint_address(jnode_get_block(node))); -+ return RETERR(-EIO); -+ } -+ -+ return 0; -+} -+ -+/* fill commit_handler structure by everything what is needed for update_journal_footer */ -+static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head) -+{ -+ struct tx_header *TXH; -+ int ret; -+ -+ ret = jload(tx_head); -+ if (ret) -+ return ret; -+ -+ TXH = (struct tx_header *)jdata(tx_head); -+ -+ ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks)); -+ ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files)); -+ ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid)); -+ -+ jrelse(tx_head); -+ -+ list_add(&tx_head->capture_link, &ch->tx_list); -+ -+ return 0; -+} -+ -+/* replay one transaction: restore and write overwrite set in place */ -+static int replay_transaction(const struct super_block *s, -+ jnode * tx_head, -+ const reiser4_block_nr * log_rec_block_p, -+ const reiser4_block_nr * end_block, -+ unsigned int nr_wander_records) -+{ -+ reiser4_block_nr log_rec_block = *log_rec_block_p; -+ struct commit_handle ch; -+ LIST_HEAD(overwrite_set); -+ jnode *log; -+ int ret; -+ -+ init_commit_handle(&ch, NULL); -+ ch.overwrite_set = &overwrite_set; -+ -+ restore_commit_handle(&ch, tx_head); -+ -+ while (log_rec_block != *end_block) { -+ struct wander_record_header *header; -+ struct 
wander_entry *entry; -+ -+ int i; -+ -+ if (nr_wander_records == 0) { -+ warning("zam-631", -+ "number of wander records in the linked list" -+ " greater than number stored in tx head.\n"); -+ ret = RETERR(-EIO); -+ goto free_ow_set; -+ } -+ -+ log = reiser4_alloc_io_head(&log_rec_block); -+ if (log == NULL) -+ return RETERR(-ENOMEM); -+ -+ ret = jload(log); -+ if (ret < 0) { -+ reiser4_drop_io_head(log); -+ return ret; -+ } -+ -+ ret = check_wander_record(log); -+ if (ret) { -+ jrelse(log); -+ reiser4_drop_io_head(log); -+ return ret; -+ } -+ -+ header = (struct wander_record_header *)jdata(log); -+ log_rec_block = le64_to_cpu(get_unaligned(&header->next_block)); -+ -+ entry = (struct wander_entry *)(header + 1); -+ -+ /* restore overwrite set from wander record content */ -+ for (i = 0; i < wander_record_capacity(s); i++) { -+ reiser4_block_nr block; -+ jnode *node; -+ -+ block = le64_to_cpu(get_unaligned(&entry->wandered)); -+ if (block == 0) -+ break; -+ -+ node = reiser4_alloc_io_head(&block); -+ if (node == NULL) { -+ ret = RETERR(-ENOMEM); -+ /* -+ * FIXME-VS:??? -+ */ -+ jrelse(log); -+ reiser4_drop_io_head(log); -+ goto free_ow_set; -+ } -+ -+ ret = jload(node); -+ -+ if (ret < 0) { -+ reiser4_drop_io_head(node); -+ /* -+ * FIXME-VS:??? 
-+ */ -+ jrelse(log); -+ reiser4_drop_io_head(log); -+ goto free_ow_set; -+ } -+ -+ block = le64_to_cpu(get_unaligned(&entry->original)); -+ -+ assert("zam-603", block != 0); -+ -+ jnode_set_block(node, &block); -+ -+ list_add_tail(&node->capture_link, ch.overwrite_set); -+ -+ ++entry; -+ } -+ -+ jrelse(log); -+ reiser4_drop_io_head(log); -+ -+ --nr_wander_records; -+ } -+ -+ if (nr_wander_records != 0) { -+ warning("zam-632", "number of wander records in the linked list" -+ " less than number stored in tx head.\n"); -+ ret = RETERR(-EIO); -+ goto free_ow_set; -+ } -+ -+ { /* write wandered set in place */ -+ write_jnode_list(ch.overwrite_set, NULL, NULL, 0); -+ ret = wait_on_jnode_list(ch.overwrite_set); -+ -+ if (ret) { -+ ret = RETERR(-EIO); -+ goto free_ow_set; -+ } -+ } -+ -+ ret = update_journal_footer(&ch, 0); -+ -+ free_ow_set: -+ -+ while (!list_empty(ch.overwrite_set)) { -+ jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link); -+ list_del_init(&cur->capture_link); -+ jrelse(cur); -+ reiser4_drop_io_head(cur); -+ } -+ -+ list_del_init(&tx_head->capture_link); -+ -+ done_commit_handle(&ch); -+ -+ return ret; -+} -+ -+/* find oldest committed and not played transaction and play it. The transaction -+ * was committed and journal header block was updated but the blocks from the -+ * process of writing the atom's overwrite set in-place and updating of journal -+ * footer block were not completed. This function completes the process by -+ * recovering the atom's overwrite set from their wandered locations and writes -+ * them in-place and updating the journal footer. 
*/ -+static int replay_oldest_transaction(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ jnode *jf = sbinfo->journal_footer; -+ unsigned int total; -+ struct journal_footer *F; -+ struct tx_header *T; -+ -+ reiser4_block_nr prev_tx; -+ reiser4_block_nr last_flushed_tx; -+ reiser4_block_nr log_rec_block = 0; -+ -+ jnode *tx_head; -+ -+ int ret; -+ -+ if ((ret = jload(jf)) < 0) -+ return ret; -+ -+ F = (struct journal_footer *)jdata(jf); -+ -+ last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx)); -+ -+ jrelse(jf); -+ -+ if (sbinfo->last_committed_tx == last_flushed_tx) { -+ /* all transactions are replayed */ -+ return 0; -+ } -+ -+ prev_tx = sbinfo->last_committed_tx; -+ -+ /* searching for oldest not flushed transaction */ -+ while (1) { -+ tx_head = reiser4_alloc_io_head(&prev_tx); -+ if (!tx_head) -+ return RETERR(-ENOMEM); -+ -+ ret = jload(tx_head); -+ if (ret < 0) { -+ reiser4_drop_io_head(tx_head); -+ return ret; -+ } -+ -+ ret = check_tx_head(tx_head); -+ if (ret) { -+ jrelse(tx_head); -+ reiser4_drop_io_head(tx_head); -+ return ret; -+ } -+ -+ T = (struct tx_header *)jdata(tx_head); -+ -+ prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx)); -+ -+ if (prev_tx == last_flushed_tx) -+ break; -+ -+ jrelse(tx_head); -+ reiser4_drop_io_head(tx_head); -+ } -+ -+ total = le32_to_cpu(get_unaligned(&T->total)); -+ log_rec_block = le64_to_cpu(get_unaligned(&T->next_block)); -+ -+ pin_jnode_data(tx_head); -+ jrelse(tx_head); -+ -+ ret = -+ replay_transaction(s, tx_head, &log_rec_block, -+ jnode_get_block(tx_head), total - 1); -+ -+ unpin_jnode_data(tx_head); -+ reiser4_drop_io_head(tx_head); -+ -+ if (ret) -+ return ret; -+ return -E_REPEAT; -+} -+ -+/* The reiser4 journal current implementation was optimized to not to capture -+ super block if certain super blocks fields are modified. Currently, the set -+ is (, ). 
These fields are logged by -+ special way which includes storing them in each transaction head block at -+ atom commit time and writing that information to journal footer block at -+ atom flush time. For getting info from journal footer block to the -+ in-memory super block there is a special function -+ reiser4_journal_recover_sb_data() which should be called after disk format -+ plugin re-reads super block after journal replaying. -+*/ -+ -+/* get the information from journal footer in-memory super block */ -+int reiser4_journal_recover_sb_data(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ struct journal_footer *jf; -+ int ret; -+ -+ assert("zam-673", sbinfo->journal_footer != NULL); -+ -+ ret = jload(sbinfo->journal_footer); -+ if (ret != 0) -+ return ret; -+ -+ ret = check_journal_footer(sbinfo->journal_footer); -+ if (ret != 0) -+ goto out; -+ -+ jf = (struct journal_footer *)jdata(sbinfo->journal_footer); -+ -+ /* was there at least one flushed transaction? */ -+ if (jf->last_flushed_tx) { -+ -+ /* restore free block counter logged in this transaction */ -+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks))); -+ -+ /* restore oid allocator state */ -+ oid_init_allocator(s, -+ le64_to_cpu(get_unaligned(&jf->nr_files)), -+ le64_to_cpu(get_unaligned(&jf->next_oid))); -+ } -+ out: -+ jrelse(sbinfo->journal_footer); -+ return ret; -+} -+ -+/* reiser4 replay journal procedure */ -+int reiser4_journal_replay(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ jnode *jh, *jf; -+ struct journal_header *header; -+ int nr_tx_replayed = 0; -+ int ret; -+ -+ assert("zam-582", sbinfo != NULL); -+ -+ jh = sbinfo->journal_header; -+ jf = sbinfo->journal_footer; -+ -+ if (!jh || !jf) { -+ /* it is possible that disk layout does not support journal -+ structures, we just warn about this */ -+ warning("zam-583", -+ "journal control blocks were not loaded by disk layout plugin. 
" -+ "journal replaying is not possible.\n"); -+ return 0; -+ } -+ -+ /* Take free block count from journal footer block. The free block -+ counter value corresponds the last flushed transaction state */ -+ ret = jload(jf); -+ if (ret < 0) -+ return ret; -+ -+ ret = check_journal_footer(jf); -+ if (ret) { -+ jrelse(jf); -+ return ret; -+ } -+ -+ jrelse(jf); -+ -+ /* store last committed transaction info in reiser4 in-memory super -+ block */ -+ ret = jload(jh); -+ if (ret < 0) -+ return ret; -+ -+ ret = check_journal_header(jh); -+ if (ret) { -+ jrelse(jh); -+ return ret; -+ } -+ -+ header = (struct journal_header *)jdata(jh); -+ sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx)); -+ -+ jrelse(jh); -+ -+ /* replay committed transactions */ -+ while ((ret = replay_oldest_transaction(s)) == -E_REPEAT) -+ nr_tx_replayed++; -+ -+ return ret; -+} -+ -+/* load journal control block (either journal header or journal footer block) */ -+static int -+load_journal_control_block(jnode ** node, const reiser4_block_nr * block) -+{ -+ int ret; -+ -+ *node = reiser4_alloc_io_head(block); -+ if (!(*node)) -+ return RETERR(-ENOMEM); -+ -+ ret = jload(*node); -+ -+ if (ret) { -+ reiser4_drop_io_head(*node); -+ *node = NULL; -+ return ret; -+ } -+ -+ pin_jnode_data(*node); -+ jrelse(*node); -+ -+ return 0; -+} -+ -+/* unload journal header or footer and free jnode */ -+static void unload_journal_control_block(jnode ** node) -+{ -+ if (*node) { -+ unpin_jnode_data(*node); -+ reiser4_drop_io_head(*node); -+ *node = NULL; -+ } -+} -+ -+/* release journal control blocks */ -+void reiser4_done_journal_info(struct super_block *s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ -+ assert("zam-476", sbinfo != NULL); -+ -+ unload_journal_control_block(&sbinfo->journal_header); -+ unload_journal_control_block(&sbinfo->journal_footer); -+ rcu_barrier(); -+} -+ -+/* load journal control blocks */ -+int reiser4_init_journal_info(struct super_block 
*s) -+{ -+ reiser4_super_info_data *sbinfo = get_super_private(s); -+ journal_location *loc; -+ int ret; -+ -+ loc = &sbinfo->jloc; -+ -+ assert("zam-651", loc != NULL); -+ assert("zam-652", loc->header != 0); -+ assert("zam-653", loc->footer != 0); -+ -+ ret = load_journal_control_block(&sbinfo->journal_header, &loc->header); -+ -+ if (ret) -+ return ret; -+ -+ ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer); -+ -+ if (ret) { -+ unload_journal_control_block(&sbinfo->journal_header); -+ } -+ -+ return ret; -+} -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/wander.h linux-2.6.24/fs/reiser4/wander.h ---- linux-2.6.24.orig/fs/reiser4/wander.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/wander.h 2008-01-25 11:39:07.116254057 +0300 -@@ -0,0 +1,135 @@ -+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined (__FS_REISER4_WANDER_H__) -+#define __FS_REISER4_WANDER_H__ -+ -+#include "dformat.h" -+ -+#include /* for struct super_block */ -+ -+/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */ -+ -+#define TX_HEADER_MAGIC "TxMagic4" -+#define WANDER_RECORD_MAGIC "LogMagc4" -+ -+#define TX_HEADER_MAGIC_SIZE (8) -+#define WANDER_RECORD_MAGIC_SIZE (8) -+ -+/* journal header block format */ -+struct journal_header { -+ /* last written transaction head location */ -+ d64 last_committed_tx; -+}; -+ -+typedef struct journal_location { -+ reiser4_block_nr footer; -+ reiser4_block_nr header; -+} journal_location; -+ -+/* The wander.c head comment describes usage and semantic of all these structures */ -+/* journal footer block format */ -+struct journal_footer { -+ /* last flushed transaction location. 
*/ -+ /* This block number is no more valid after the transaction it points -+ to gets flushed, this number is used only at journal replaying time -+ for detection of the end of on-disk list of committed transactions -+ which were not flushed completely */ -+ d64 last_flushed_tx; -+ -+ /* free block counter is written in journal footer at transaction -+ flushing , not in super block because free blocks counter is logged -+ by another way than super block fields (root pointer, for -+ example). */ -+ d64 free_blocks; -+ -+ /* number of used OIDs and maximal used OID are logged separately from -+ super block */ -+ d64 nr_files; -+ d64 next_oid; -+}; -+ -+/* Each wander record (except the first one) has unified format with wander -+ record header followed by an array of log entries */ -+struct wander_record_header { -+ /* when there is no predefined location for wander records, this magic -+ string should help reiser4fsck. */ -+ char magic[WANDER_RECORD_MAGIC_SIZE]; -+ -+ /* transaction id */ -+ d64 id; -+ -+ /* total number of wander records in current transaction */ -+ d32 total; -+ -+ /* this block number in transaction */ -+ d32 serial; -+ -+ /* number of previous block in commit */ -+ d64 next_block; -+}; -+ -+/* The first wander record (transaction head) of written transaction has the -+ special format */ -+struct tx_header { -+ /* magic string makes first block in transaction different from other -+ logged blocks, it should help fsck. 
*/ -+ char magic[TX_HEADER_MAGIC_SIZE]; -+ -+ /* transaction id */ -+ d64 id; -+ -+ /* total number of records (including this first tx head) in the -+ transaction */ -+ d32 total; -+ -+ /* align next field to 8-byte boundary; this field always is zero */ -+ d32 padding; -+ -+ /* block number of previous transaction head */ -+ d64 prev_tx; -+ -+ /* next wander record location */ -+ d64 next_block; -+ -+ /* committed versions of free blocks counter */ -+ d64 free_blocks; -+ -+ /* number of used OIDs (nr_files) and maximal used OID are logged -+ separately from super block */ -+ d64 nr_files; -+ d64 next_oid; -+}; -+ -+/* A transaction gets written to disk as a set of wander records (each wander -+ record size is fs block) */ -+ -+/* As it was told above a wander The rest of wander record is filled by these log entries, unused space filled -+ by zeroes */ -+struct wander_entry { -+ d64 original; /* block original location */ -+ d64 wandered; /* block wandered location */ -+}; -+ -+/* REISER4 JOURNAL WRITER FUNCTIONS */ -+ -+extern int reiser4_write_logs(long *); -+extern int reiser4_journal_replay(struct super_block *); -+extern int reiser4_journal_recover_sb_data(struct super_block *); -+ -+extern int reiser4_init_journal_info(struct super_block *); -+extern void reiser4_done_journal_info(struct super_block *); -+ -+extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int); -+ -+#endif /* __FS_REISER4_WANDER_H__ */ -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ scroll-step: 1 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/writeout.h linux-2.6.24/fs/reiser4/writeout.h ---- linux-2.6.24.orig/fs/reiser4/writeout.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/writeout.h 2008-01-25 11:39:07.120255087 +0300 -@@ -0,0 +1,21 @@ -+/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */ -+ -+#if !defined (__FS_REISER4_WRITEOUT_H__) -+ -+#define WRITEOUT_SINGLE_STREAM (0x1) -+#define WRITEOUT_FOR_PAGE_RECLAIM (0x2) -+#define WRITEOUT_BARRIER (0x4) -+ -+extern int reiser4_get_writeout_flags(void); -+ -+#endif /* __FS_REISER4_WRITEOUT_H__ */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 80 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/znode.c linux-2.6.24/fs/reiser4/znode.c ---- linux-2.6.24.orig/fs/reiser4/znode.c 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/znode.c 2008-01-25 11:39:07.120255087 +0300 -@@ -0,0 +1,1029 @@ -+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+/* Znode manipulation functions. */ -+/* Znode is the in-memory header for a tree node. It is stored -+ separately from the node itself so that it does not get written to -+ disk. In this respect znode is like buffer head or page head. We -+ also use znodes for additional reiser4 specific purposes: -+ -+ . they are organized into tree structure which is a part of whole -+ reiser4 tree. -+ . they are used to implement node grained locking -+ . they are used to keep additional state associated with a -+ node -+ . they contain links to lists used by the transaction manager -+ -+ Znode is attached to some variable "block number" which is instance of -+ fs/reiser4/tree.h:reiser4_block_nr type. 
Znode can exist without -+ appropriate node being actually loaded in memory. Existence of znode itself -+ is regulated by reference count (->x_count) in it. Each time thread -+ acquires reference to znode through call to zget(), ->x_count is -+ incremented and decremented on call to zput(). Data (content of node) are -+ brought in memory through call to zload(), which also increments ->d_count -+ reference counter. zload can block waiting on IO. Call to zrelse() -+ decreases this counter. Also, ->c_count keeps track of number of child -+ znodes and prevents parent znode from being recycled until all of its -+ children are. ->c_count is decremented whenever child goes out of existence -+ (being actually recycled in zdestroy()) which can be some time after last -+ reference to this child dies if we support some form of LRU cache for -+ znodes. -+ -+*/ -+/* EVERY ZNODE'S STORY -+ -+ 1. His infancy. -+ -+ Once upon a time, the znode was born deep inside of zget() by call to -+ zalloc(). At the return from zget() znode had: -+ -+ . reference counter (x_count) of 1 -+ . assigned block number, marked as used in bitmap -+ . pointer to parent znode. Root znode parent pointer points -+ to its father: "fake" znode. This, in turn, has NULL parent pointer. -+ . hash table linkage -+ . no data loaded from disk -+ . no node plugin -+ . no sibling linkage -+ -+ 2. His childhood -+ -+ Each node is either brought into memory as a result of tree traversal, or -+ created afresh, creation of the root being a special case of the latter. In -+ either case it's inserted into sibling list. This will typically require -+ some ancillary tree traversing, but ultimately both sibling pointers will -+ exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in -+ zjnode.state. -+ -+ 3. His youth. -+ -+ If znode is bound to already existing node in a tree, its content is read -+ from the disk by call to zload(). 
At that moment, JNODE_LOADED bit is set -+ in zjnode.state and zdata() function starts to return non null for this -+ znode. zload() further calls zparse() that determines which node layout -+ this node is rendered in, and sets ->nplug on success. -+ -+ If znode is for new node just created, memory for it is allocated and -+ zinit_new() function is called to initialise data, according to selected -+ node layout. -+ -+ 4. His maturity. -+ -+ After this point, znode lingers in memory for some time. Threads can -+ acquire references to znode either by blocknr through call to zget(), or by -+ following a pointer to unallocated znode from internal item. Each time -+ reference to znode is obtained, x_count is increased. Thread can read/write -+ lock znode. Znode data can be loaded through calls to zload(), d_count will -+ be increased appropriately. If all references to znode are released -+ (x_count drops to 0), znode is not recycled immediately. Rather, it is -+ still cached in the hash table in the hope that it will be accessed -+ shortly. -+ -+ There are two ways in which znode existence can be terminated: -+ -+ . sudden death: node bound to this znode is removed from the tree -+ . overpopulation: znode is purged out of memory due to memory pressure -+ -+ 5. His death. -+ -+ Death is complex process. -+ -+ When we irrevocably commit ourselves to decision to remove node from the -+ tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding -+ znode. This is done either in ->kill_hook() of internal item or in -+ reiser4_kill_root() function when tree root is removed. -+ -+ At this moment znode still has: -+ -+ . locks held on it, necessary write ones -+ . references to it -+ . disk block assigned to it -+ . data loaded from the disk -+ . pending requests for lock -+ -+ But once JNODE_HEARD_BANSHEE bit set, last call to unlock_znode() does node -+ deletion. Node deletion includes two phases. 
First all ways to get -+ references to that znode (sibling and parent links and hash lookup using -+ block number stored in parent node) should be deleted -- it is done through -+ sibling_list_remove(), also we assume that nobody uses down link from -+ parent node due to its nonexistence or proper parent node locking and -+ nobody uses parent pointers from children due to absence of them. Second we -+ invalidate all pending lock requests which still are on znode's lock -+ request queue, this is done by reiser4_invalidate_lock(). Another -+ JNODE_IS_DYING znode status bit is used to invalidate pending lock requests. -+ Once it set all requesters are forced to return -EINVAL from -+ longterm_lock_znode(). Future locking attempts are not possible because all -+ ways to get references to that znode are removed already. Last, node is -+ uncaptured from transaction. -+ -+ When last reference to the dying znode is just about to be released, -+ block number for this lock is released and znode is removed from the -+ hash table. -+ -+ Now znode can be recycled. -+ -+ [it's possible to free bitmap block and remove znode from the hash -+ table when last lock is released. This will result in having -+ referenced but completely orphaned znode] -+ -+ 6. Limbo -+ -+ As have been mentioned above znodes with reference counter 0 are -+ still cached in a hash table. Once memory pressure increases they are -+ purged out of there [this requires something like LRU list for -+ efficient implementation. LRU list would also greatly simplify -+ implementation of coord cache that would in this case morph to just -+ scanning some initial segment of LRU list]. Data loaded into -+ unreferenced znode are flushed back to the durable storage if -+ necessary and memory is freed. Znodes themselves can be recycled at -+ this point too. 
-+ -+*/ -+ -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/plugin_header.h" -+#include "plugin/node/node.h" -+#include "plugin/plugin.h" -+#include "txnmgr.h" -+#include "jnode.h" -+#include "znode.h" -+#include "block_alloc.h" -+#include "tree.h" -+#include "tree_walk.h" -+#include "super.h" -+#include "reiser4.h" -+ -+#include -+#include -+#include -+#include -+ -+static z_hash_table *get_htable(reiser4_tree *, -+ const reiser4_block_nr * const blocknr); -+static z_hash_table *znode_get_htable(const znode *); -+static void zdrop(znode *); -+ -+/* hash table support */ -+ -+/* compare two block numbers for equality. Used by hash-table macros */ -+static inline int -+blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2) -+{ -+ assert("nikita-534", b1 != NULL); -+ assert("nikita-535", b2 != NULL); -+ -+ return *b1 == *b2; -+} -+ -+/* Hash znode by block number. Used by hash-table macros */ -+/* Audited by: umka (2002.06.11) */ -+static inline __u32 -+blknrhashfn(z_hash_table * table, const reiser4_block_nr * b) -+{ -+ assert("nikita-536", b != NULL); -+ -+ return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1); -+} -+ -+/* The hash table definition */ -+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get()) -+#define KFREE(ptr, size) kfree(ptr) -+TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z, -+ blknrhashfn, blknreq); -+#undef KFREE -+#undef KMALLOC -+ -+/* slab for znodes */ -+static struct kmem_cache *znode_cache; -+ -+int znode_shift_order; -+ -+/** -+ * init_znodes - create znode cache -+ * -+ * Initializes slab cache of znodes. It is part of reiser4 module initialization. 
-+ */ -+int init_znodes(void) -+{ -+ znode_cache = kmem_cache_create("znode", sizeof(znode), 0, -+ SLAB_HWCACHE_ALIGN | -+ SLAB_RECLAIM_ACCOUNT, NULL); -+ if (znode_cache == NULL) -+ return RETERR(-ENOMEM); -+ -+ for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode); -+ ++znode_shift_order); -+ --znode_shift_order; -+ return 0; -+} -+ -+/** -+ * done_znodes - delete znode cache -+ * -+ * This is called on reiser4 module unloading or system shutdown. -+ */ -+void done_znodes(void) -+{ -+ destroy_reiser4_cache(&znode_cache); -+} -+ -+/* call this to initialise tree of znodes */ -+int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ ) -+{ -+ int result; -+ assert("umka-050", tree != NULL); -+ -+ rwlock_init(&tree->dk_lock); -+ -+ result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE); -+ if (result != 0) -+ return result; -+ result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE); -+ return result; -+} -+ -+/* free this znode */ -+void zfree(znode * node /* znode to free */ ) -+{ -+ assert("nikita-465", node != NULL); -+ assert("nikita-2120", znode_page(node) == NULL); -+ assert("nikita-2301", list_empty_careful(&node->lock.owners)); -+ assert("nikita-2302", list_empty_careful(&node->lock.requestors)); -+ assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) && -+ NODE_LIST(ZJNODE(node)) == NOT_CAPTURED)); -+ assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes)); -+ assert("nikita-3293", !znode_is_right_connected(node)); -+ assert("nikita-3294", !znode_is_left_connected(node)); -+ assert("nikita-3295", node->left == NULL); -+ assert("nikita-3296", node->right == NULL); -+ -+ /* not yet phash_jnode_destroy(ZJNODE(node)); */ -+ -+ kmem_cache_free(znode_cache, node); -+} -+ -+/* call this to free tree of znodes */ -+void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ ) -+{ -+ znode *node; -+ znode *next; -+ z_hash_table *ztable; -+ -+ /* scan znode 
hash-tables and kill all znodes, then free hash tables -+ * themselves. */ -+ -+ assert("nikita-795", tree != NULL); -+ -+ ztable = &tree->zhash_table; -+ -+ if (ztable->_table != NULL) { -+ for_all_in_htable(ztable, z, node, next) { -+ node->c_count = 0; -+ node->in_parent.node = NULL; -+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0); -+ zdrop(node); -+ } -+ -+ z_hash_done(&tree->zhash_table); -+ } -+ -+ ztable = &tree->zfake_table; -+ -+ if (ztable->_table != NULL) { -+ for_all_in_htable(ztable, z, node, next) { -+ node->c_count = 0; -+ node->in_parent.node = NULL; -+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0); -+ zdrop(node); -+ } -+ -+ z_hash_done(&tree->zfake_table); -+ } -+} -+ -+/* ZNODE STRUCTURES */ -+ -+/* allocate fresh znode */ -+znode *zalloc(gfp_t gfp_flag /* allocation flag */ ) -+{ -+ znode *node; -+ -+ node = kmem_cache_alloc(znode_cache, gfp_flag); -+ return node; -+} -+ -+/* Initialize fields of znode -+ @node: znode to initialize; -+ @parent: parent znode; -+ @tree: tree we are in. */ -+void zinit(znode * node, const znode * parent, reiser4_tree * tree) -+{ -+ assert("nikita-466", node != NULL); -+ assert("umka-268", current_tree != NULL); -+ -+ memset(node, 0, sizeof *node); -+ -+ assert("umka-051", tree != NULL); -+ -+ jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK); -+ reiser4_init_lock(&node->lock); -+ init_parent_coord(&node->in_parent, parent); -+} -+ -+/* -+ * remove znode from indices. This is called jput() when last reference on -+ * znode is released. 
-+ */ -+void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree) -+{ -+ assert("nikita-2108", node != NULL); -+ assert("nikita-470", node->c_count == 0); -+ assert_rw_write_locked(&(tree->tree_lock)); -+ -+ /* remove reference to this znode from cbk cache */ -+ cbk_cache_invalidate(node, tree); -+ -+ /* update c_count of parent */ -+ if (znode_parent(node) != NULL) { -+ assert("nikita-472", znode_parent(node)->c_count > 0); -+ /* father, onto your hands I forward my spirit... */ -+ znode_parent(node)->c_count--; -+ node->in_parent.node = NULL; -+ } else { -+ /* orphaned znode?! Root? */ -+ } -+ -+ /* remove znode from hash-table */ -+ z_hash_remove_rcu(znode_get_htable(node), node); -+} -+ -+/* zdrop() -- Remove znode from the tree. -+ -+ This is called when znode is removed from the memory. */ -+static void zdrop(znode * node /* znode to finish with */ ) -+{ -+ jdrop(ZJNODE(node)); -+} -+ -+/* -+ * put znode into right place in the hash table. This is called by relocate -+ * code. 
-+ */ -+int znode_rehash(znode * node /* node to rehash */ , -+ const reiser4_block_nr * new_block_nr /* new block number */ ) -+{ -+ z_hash_table *oldtable; -+ z_hash_table *newtable; -+ reiser4_tree *tree; -+ -+ assert("nikita-2018", node != NULL); -+ -+ tree = znode_get_tree(node); -+ oldtable = znode_get_htable(node); -+ newtable = get_htable(tree, new_block_nr); -+ -+ write_lock_tree(tree); -+ /* remove znode from hash-table */ -+ z_hash_remove_rcu(oldtable, node); -+ -+ /* assertion no longer valid due to RCU */ -+ /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */ -+ -+ /* update blocknr */ -+ znode_set_block(node, new_block_nr); -+ node->zjnode.key.z = *new_block_nr; -+ -+ /* insert it into hash */ -+ z_hash_insert_rcu(newtable, node); -+ write_unlock_tree(tree); -+ return 0; -+} -+ -+/* ZNODE LOOKUP, GET, PUT */ -+ -+/* zlook() - get znode with given block_nr in a hash table or return NULL -+ -+ If result is non-NULL then the znode's x_count is incremented. Internal version -+ accepts pre-computed hash index. The hash table is accessed under caller's -+ tree->hash_lock. 
-+*/ -+znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr) -+{ -+ znode *result; -+ __u32 hash; -+ z_hash_table *htable; -+ -+ assert("jmacd-506", tree != NULL); -+ assert("jmacd-507", blocknr != NULL); -+ -+ htable = get_htable(tree, blocknr); -+ hash = blknrhashfn(htable, blocknr); -+ -+ rcu_read_lock(); -+ result = z_hash_find_index(htable, hash, blocknr); -+ -+ if (result != NULL) { -+ add_x_ref(ZJNODE(result)); -+ result = znode_rip_check(tree, result); -+ } -+ rcu_read_unlock(); -+ -+ return result; -+} -+ -+/* return hash table where znode with block @blocknr is (or should be) -+ * stored */ -+static z_hash_table *get_htable(reiser4_tree * tree, -+ const reiser4_block_nr * const blocknr) -+{ -+ z_hash_table *table; -+ if (is_disk_addr_unallocated(blocknr)) -+ table = &tree->zfake_table; -+ else -+ table = &tree->zhash_table; -+ return table; -+} -+ -+/* return hash table where znode @node is (or should be) stored */ -+static z_hash_table *znode_get_htable(const znode * node) -+{ -+ return get_htable(znode_get_tree(node), znode_get_block(node)); -+} -+ -+/* zget() - get znode from hash table, allocating it if necessary. -+ -+ First a call to zlook, locating a x-referenced znode if one -+ exists. If znode is not found, allocate new one and return. Result -+ is returned with x_count reference increased. -+ -+ LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK -+ LOCK ORDERING: NONE -+*/ -+znode *zget(reiser4_tree * tree, -+ const reiser4_block_nr * const blocknr, -+ znode * parent, tree_level level, gfp_t gfp_flag) -+{ -+ znode *result; -+ __u32 hashi; -+ -+ z_hash_table *zth; -+ -+ assert("jmacd-512", tree != NULL); -+ assert("jmacd-513", blocknr != NULL); -+ assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT); -+ -+ zth = get_htable(tree, blocknr); -+ hashi = blknrhashfn(zth, blocknr); -+ -+ /* NOTE-NIKITA address-as-unallocated-blocknr still is not -+ implemented. 
*/ -+ -+ z_hash_prefetch_bucket(zth, hashi); -+ -+ rcu_read_lock(); -+ /* Find a matching BLOCKNR in the hash table. If the znode is found, -+ we obtain an reference (x_count) but the znode remains unlocked. -+ Have to worry about race conditions later. */ -+ result = z_hash_find_index(zth, hashi, blocknr); -+ /* According to the current design, the hash table lock protects new -+ znode references. */ -+ if (result != NULL) { -+ add_x_ref(ZJNODE(result)); -+ /* NOTE-NIKITA it should be so, but special case during -+ creation of new root makes such assertion highly -+ complicated. */ -+ assert("nikita-2131", 1 || znode_parent(result) == parent || -+ (ZF_ISSET(result, JNODE_ORPHAN) -+ && (znode_parent(result) == NULL))); -+ result = znode_rip_check(tree, result); -+ } -+ -+ rcu_read_unlock(); -+ -+ if (!result) { -+ znode *shadow; -+ -+ result = zalloc(gfp_flag); -+ if (!result) { -+ return ERR_PTR(RETERR(-ENOMEM)); -+ } -+ -+ zinit(result, parent, tree); -+ ZJNODE(result)->blocknr = *blocknr; -+ ZJNODE(result)->key.z = *blocknr; -+ result->level = level; -+ -+ write_lock_tree(tree); -+ -+ shadow = z_hash_find_index(zth, hashi, blocknr); -+ if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) { -+ jnode_list_remove(ZJNODE(result)); -+ zfree(result); -+ result = shadow; -+ } else { -+ result->version = znode_build_version(tree); -+ z_hash_insert_index_rcu(zth, hashi, result); -+ -+ if (parent != NULL) -+ ++parent->c_count; -+ } -+ -+ add_x_ref(ZJNODE(result)); -+ -+ write_unlock_tree(tree); -+ } -+#if REISER4_DEBUG -+ if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0) -+ reiser4_check_block(blocknr, 1); -+#endif -+ /* Check for invalid tree level, return -EIO */ -+ if (unlikely(znode_get_level(result) != level)) { -+ warning("jmacd-504", -+ "Wrong level for cached block %llu: %i expecting %i", -+ (unsigned long long)(*blocknr), znode_get_level(result), -+ level); -+ zput(result); -+ return ERR_PTR(RETERR(-EIO)); -+ } -+ -+ assert("nikita-1227", 
znode_invariant(result)); -+ -+ return result; -+} -+ -+/* ZNODE PLUGINS/DATA */ -+ -+/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is -+ stored at the fixed offset from the beginning of the node. */ -+static node_plugin *znode_guess_plugin(const znode * node /* znode to guess -+ * plugin of */ ) -+{ -+ reiser4_tree *tree; -+ -+ assert("nikita-1053", node != NULL); -+ assert("nikita-1055", zdata(node) != NULL); -+ -+ tree = znode_get_tree(node); -+ assert("umka-053", tree != NULL); -+ -+ if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) { -+ return tree->nplug; -+ } else { -+ return node_plugin_by_disk_id -+ (tree, &((common_node_header *) zdata(node))->plugin_id); -+#ifdef GUESS_EXISTS -+ reiser4_plugin *plugin; -+ -+ /* NOTE-NIKITA add locking here when dynamic plugins will be -+ * implemented */ -+ for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) { -+ if ((plugin->u.node.guess != NULL) -+ && plugin->u.node.guess(node)) -+ return plugin; -+ } -+ warning("nikita-1057", "Cannot guess node plugin"); -+ print_znode("node", node); -+ return NULL; -+#endif -+ } -+} -+ -+/* parse node header and install ->node_plugin */ -+int zparse(znode * node /* znode to parse */ ) -+{ -+ int result; -+ -+ assert("nikita-1233", node != NULL); -+ assert("nikita-2370", zdata(node) != NULL); -+ -+ if (node->nplug == NULL) { -+ node_plugin *nplug; -+ -+ nplug = znode_guess_plugin(node); -+ if (likely(nplug != NULL)) { -+ result = nplug->parse(node); -+ if (likely(result == 0)) -+ node->nplug = nplug; -+ } else { -+ result = RETERR(-EIO); -+ } -+ } else -+ result = 0; -+ return result; -+} -+ -+/* zload with readahead */ -+int zload_ra(znode * node /* znode to load */ , ra_info_t * info) -+{ -+ int result; -+ -+ assert("nikita-484", node != NULL); -+ assert("nikita-1377", znode_invariant(node)); -+ assert("jmacd-7771", !znode_above_root(node)); -+ assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0); -+ assert("nikita-3016", 
reiser4_schedulable()); -+ -+ if (info) -+ formatted_readahead(node, info); -+ -+ result = jload(ZJNODE(node)); -+ assert("nikita-1378", znode_invariant(node)); -+ return result; -+} -+ -+/* load content of node into memory */ -+int zload(znode * node) -+{ -+ return zload_ra(node, NULL); -+} -+ -+/* call node plugin to initialise newly allocated node. */ -+int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags) -+{ -+ return jinit_new(ZJNODE(node), gfp_flags); -+} -+ -+/* drop reference to node data. When last reference is dropped, data are -+ unloaded. */ -+void zrelse(znode * node /* znode to release references to */ ) -+{ -+ assert("nikita-1381", znode_invariant(node)); -+ -+ jrelse(ZJNODE(node)); -+} -+ -+/* returns free space in node */ -+unsigned znode_free_space(znode * node /* znode to query */ ) -+{ -+ assert("nikita-852", node != NULL); -+ return node_plugin_by_node(node)->free_space(node); -+} -+ -+/* left delimiting key of znode */ -+reiser4_key *znode_get_rd_key(znode * node /* znode to query */ ) -+{ -+ assert("nikita-958", node != NULL); -+ assert_rw_locked(&(znode_get_tree(node)->dk_lock)); -+ assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk)); -+ assert("nikita-30671", node->rd_key_version != 0); -+ return &node->rd_key; -+} -+ -+/* right delimiting key of znode */ -+reiser4_key *znode_get_ld_key(znode * node /* znode to query */ ) -+{ -+ assert("nikita-974", node != NULL); -+ assert_rw_locked(&(znode_get_tree(node)->dk_lock)); -+ assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk)); -+ assert("nikita-30681", node->ld_key_version != 0); -+ return &node->ld_key; -+} -+ -+ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0); -+ ) -+ -+/* update right-delimiting key of @node */ -+reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key) -+{ -+ assert("nikita-2937", node != NULL); -+ assert("nikita-2939", key != NULL); -+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); -+ assert("nikita-3069", 
LOCK_CNT_GTZ(write_locked_dk)); -+ assert("nikita-2944", -+ znode_is_any_locked(node) || -+ znode_get_level(node) != LEAF_LEVEL || -+ keyge(key, &node->rd_key) || -+ keyeq(&node->rd_key, reiser4_min_key()) || -+ ZF_ISSET(node, JNODE_HEARD_BANSHEE)); -+ -+ node->rd_key = *key; -+ ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version)); -+ return &node->rd_key; -+} -+ -+/* update left-delimiting key of @node */ -+reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key) -+{ -+ assert("nikita-2940", node != NULL); -+ assert("nikita-2941", key != NULL); -+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); -+ assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk)); -+ assert("nikita-2943", -+ znode_is_any_locked(node) || keyeq(&node->ld_key, -+ reiser4_min_key())); -+ -+ node->ld_key = *key; -+ ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version)); -+ return &node->ld_key; -+} -+ -+/* true if @key is inside key range for @node */ -+int znode_contains_key(znode * node /* znode to look in */ , -+ const reiser4_key * key /* key to look for */ ) -+{ -+ assert("nikita-1237", node != NULL); -+ assert("nikita-1238", key != NULL); -+ -+ /* left_delimiting_key <= key <= right_delimiting_key */ -+ return keyle(znode_get_ld_key(node), key) -+ && keyle(key, znode_get_rd_key(node)); -+} -+ -+/* same as znode_contains_key(), but lock dk lock */ -+int znode_contains_key_lock(znode * node /* znode to look in */ , -+ const reiser4_key * key /* key to look for */ ) -+{ -+ int result; -+ -+ assert("umka-056", node != NULL); -+ assert("umka-057", key != NULL); -+ -+ read_lock_dk(znode_get_tree(node)); -+ result = znode_contains_key(node, key); -+ read_unlock_dk(znode_get_tree(node)); -+ return result; -+} -+ -+/* get parent pointer, assuming tree is not locked */ -+znode *znode_parent_nolock(const znode * node /* child znode */ ) -+{ -+ assert("nikita-1444", node != NULL); -+ return node->in_parent.node; -+} -+ -+/* get parent pointer of 
znode */ -+znode *znode_parent(const znode * node /* child znode */ ) -+{ -+ assert("nikita-1226", node != NULL); -+ assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree)); -+ return znode_parent_nolock(node); -+} -+ -+/* detect uber znode used to protect in-superblock tree root pointer */ -+int znode_above_root(const znode * node /* znode to query */ ) -+{ -+ assert("umka-059", node != NULL); -+ -+ return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR); -+} -+ -+/* check that @node is root---that its block number is recorder in the tree as -+ that of root node */ -+#if REISER4_DEBUG -+static int znode_is_true_root(const znode * node /* znode to query */ ) -+{ -+ assert("umka-060", node != NULL); -+ assert("umka-061", current_tree != NULL); -+ -+ return disk_addr_eq(znode_get_block(node), -+ &znode_get_tree(node)->root_block); -+} -+#endif -+ -+/* check that @node is root */ -+int znode_is_root(const znode * node /* znode to query */ ) -+{ -+ assert("nikita-1206", node != NULL); -+ -+ return znode_get_level(node) == znode_get_tree(node)->height; -+} -+ -+/* Returns true is @node was just created by zget() and wasn't ever loaded -+ into memory. */ -+/* NIKITA-HANS: yes */ -+int znode_just_created(const znode * node) -+{ -+ assert("nikita-2188", node != NULL); -+ return (znode_page(node) == NULL); -+} -+ -+/* obtain updated ->znode_epoch. See seal.c for description. 
*/ -+__u64 znode_build_version(reiser4_tree * tree) -+{ -+ __u64 result; -+ -+ spin_lock(&tree->epoch_lock); -+ result = ++tree->znode_epoch; -+ spin_unlock(&tree->epoch_lock); -+ return result; -+} -+ -+void init_load_count(load_count * dh) -+{ -+ assert("nikita-2105", dh != NULL); -+ memset(dh, 0, sizeof *dh); -+} -+ -+void done_load_count(load_count * dh) -+{ -+ assert("nikita-2106", dh != NULL); -+ if (dh->node != NULL) { -+ for (; dh->d_ref > 0; --dh->d_ref) -+ zrelse(dh->node); -+ dh->node = NULL; -+ } -+} -+ -+static int incr_load_count(load_count * dh) -+{ -+ int result; -+ -+ assert("nikita-2110", dh != NULL); -+ assert("nikita-2111", dh->node != NULL); -+ -+ result = zload(dh->node); -+ if (result == 0) -+ ++dh->d_ref; -+ return result; -+} -+ -+int incr_load_count_znode(load_count * dh, znode * node) -+{ -+ assert("nikita-2107", dh != NULL); -+ assert("nikita-2158", node != NULL); -+ assert("nikita-2109", -+ ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0))); -+ -+ dh->node = node; -+ return incr_load_count(dh); -+} -+ -+int incr_load_count_jnode(load_count * dh, jnode * node) -+{ -+ if (jnode_is_znode(node)) { -+ return incr_load_count_znode(dh, JZNODE(node)); -+ } -+ return 0; -+} -+ -+void copy_load_count(load_count * new, load_count * old) -+{ -+ int ret = 0; -+ done_load_count(new); -+ new->node = old->node; -+ new->d_ref = 0; -+ -+ while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) { -+ } -+ -+ assert("jmacd-87589", ret == 0); -+} -+ -+void move_load_count(load_count * new, load_count * old) -+{ -+ done_load_count(new); -+ new->node = old->node; -+ new->d_ref = old->d_ref; -+ old->node = NULL; -+ old->d_ref = 0; -+} -+ -+/* convert parent pointer into coord */ -+void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord) -+{ -+ assert("nikita-3204", pcoord != NULL); -+ assert("nikita-3205", coord != NULL); -+ -+ coord_init_first_unit_nocheck(coord, pcoord->node); -+ coord_set_item_pos(coord, 
pcoord->item_pos); -+ coord->between = AT_UNIT; -+} -+ -+/* pack coord into parent_coord_t */ -+void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord) -+{ -+ assert("nikita-3206", pcoord != NULL); -+ assert("nikita-3207", coord != NULL); -+ -+ pcoord->node = coord->node; -+ pcoord->item_pos = coord->item_pos; -+} -+ -+/* Initialize a parent hint pointer. (parent hint pointer is a field in znode, -+ look for comments there) */ -+void init_parent_coord(parent_coord_t * pcoord, const znode * node) -+{ -+ pcoord->node = (znode *) node; -+ pcoord->item_pos = (unsigned short)~0; -+} -+ -+#if REISER4_DEBUG -+ -+/* debugging aid: znode invariant */ -+static int znode_invariant_f(const znode * node /* znode to check */ , -+ char const **msg /* where to store error -+ * message, if any */ ) -+{ -+#define _ergo(ant, con) \ -+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con))) -+ -+#define _equi(e1, e2) \ -+ ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2))) -+ -+#define _check(exp) ((*msg) = #exp, (exp)) -+ -+ return jnode_invariant_f(ZJNODE(node), msg) && -+ /* [znode-fake] invariant */ -+ /* fake znode doesn't have a parent, and */ -+ _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) && -+ /* there is another way to express this very check, and */ -+ _ergo(znode_above_root(node), znode_parent(node) == NULL) && -+ /* it has special block number, and */ -+ _ergo(znode_get_level(node) == 0, -+ disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) && -+ /* it is the only znode with such block number, and */ -+ _ergo(!znode_above_root(node) && znode_is_loaded(node), -+ !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) && -+ /* it is parent of the tree root node */ -+ _ergo(znode_is_true_root(node), -+ znode_above_root(znode_parent(node))) && -+ /* [znode-level] invariant */ -+ /* level of parent znode is one larger than that of child, -+ except for the fake znode, and */ -+ _ergo(znode_parent(node) && 
!znode_above_root(znode_parent(node)), -+ znode_get_level(znode_parent(node)) == -+ znode_get_level(node) + 1) && -+ /* left neighbor is at the same level, and */ -+ _ergo(znode_is_left_connected(node) && node->left != NULL, -+ znode_get_level(node) == znode_get_level(node->left)) && -+ /* right neighbor is at the same level */ -+ _ergo(znode_is_right_connected(node) && node->right != NULL, -+ znode_get_level(node) == znode_get_level(node->right)) && -+ /* [znode-connected] invariant */ -+ _ergo(node->left != NULL, znode_is_left_connected(node)) && -+ _ergo(node->right != NULL, znode_is_right_connected(node)) && -+ _ergo(!znode_is_root(node) && node->left != NULL, -+ znode_is_right_connected(node->left) && -+ node->left->right == node) && -+ _ergo(!znode_is_root(node) && node->right != NULL, -+ znode_is_left_connected(node->right) && -+ node->right->left == node) && -+ /* [znode-c_count] invariant */ -+ /* for any znode, c_count of its parent is greater than 0 */ -+ _ergo(znode_parent(node) != NULL && -+ !znode_above_root(znode_parent(node)), -+ znode_parent(node)->c_count > 0) && -+ /* leaves don't have children */ -+ _ergo(znode_get_level(node) == LEAF_LEVEL, -+ node->c_count == 0) && -+ _check(node->zjnode.jnodes.prev != NULL) && -+ _check(node->zjnode.jnodes.next != NULL) && -+ /* orphan doesn't have a parent */ -+ _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) && -+ /* [znode-modify] invariant */ -+ /* if znode is not write-locked, its checksum remains -+ * invariant */ -+ /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we -+ * cannot check this. 
*/ -+ /* [znode-refs] invariant */ -+ /* only referenced znode can be long-term locked */ -+ _ergo(znode_is_locked(node), -+ atomic_read(&ZJNODE(node)->x_count) != 0); -+} -+ -+/* debugging aid: check znode invariant and panic if it doesn't hold */ -+int znode_invariant(znode * node /* znode to check */ ) -+{ -+ char const *failed_msg; -+ int result; -+ -+ assert("umka-063", node != NULL); -+ assert("umka-064", current_tree != NULL); -+ -+ spin_lock_znode(node); -+ read_lock_tree(znode_get_tree(node)); -+ result = znode_invariant_f(node, &failed_msg); -+ if (!result) { -+ /* print_znode("corrupted node", node); */ -+ warning("jmacd-555", "Condition %s failed", failed_msg); -+ } -+ read_unlock_tree(znode_get_tree(node)); -+ spin_unlock_znode(node); -+ return result; -+} -+ -+/* return non-0 iff data are loaded into znode */ -+int znode_is_loaded(const znode * node /* znode to query */ ) -+{ -+ assert("nikita-497", node != NULL); -+ return jnode_is_loaded(ZJNODE(node)); -+} -+ -+unsigned long znode_times_locked(const znode * z) -+{ -+ return z->times_locked; -+} -+ -+#endif /* REISER4_DEBUG */ -+ -+/* Make Linus happy. -+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/fs/reiser4/znode.h linux-2.6.24/fs/reiser4/znode.h ---- linux-2.6.24.orig/fs/reiser4/znode.h 1970-01-01 03:00:00.000000000 +0300 -+++ linux-2.6.24/fs/reiser4/znode.h 2008-01-25 11:39:07.120255087 +0300 -@@ -0,0 +1,434 @@ -+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by -+ * reiser4/README */ -+ -+/* Declaration of znode (Zam's node). See znode.c for more details. 
*/ -+ -+#ifndef __ZNODE_H__ -+#define __ZNODE_H__ -+ -+#include "forward.h" -+#include "debug.h" -+#include "dformat.h" -+#include "key.h" -+#include "coord.h" -+#include "plugin/node/node.h" -+#include "jnode.h" -+#include "lock.h" -+#include "readahead.h" -+ -+#include -+#include -+#include /* for PAGE_CACHE_SIZE */ -+#include -+#include -+ -+/* znode tracks its position within parent (internal item in a parent node, -+ * that contains znode's block number). */ -+typedef struct parent_coord { -+ znode *node; -+ pos_in_node_t item_pos; -+} parent_coord_t; -+ -+/* &znode - node in a reiser4 tree. -+ -+ NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce -+ cacheline pressure. -+ -+ Locking: -+ -+ Long term: data in a disk node attached to this znode are protected -+ by long term, deadlock aware lock ->lock; -+ -+ Spin lock: the following fields are protected by the spin lock: -+ -+ ->lock -+ -+ Following fields are protected by the global tree lock: -+ -+ ->left -+ ->right -+ ->in_parent -+ ->c_count -+ -+ Following fields are protected by the global delimiting key lock (dk_lock): -+ -+ ->ld_key (to update ->ld_key long-term lock on the node is also required) -+ ->rd_key -+ -+ Following fields are protected by the long term lock: -+ -+ ->nr_items -+ -+ ->node_plugin is never changed once set. This means that after code made -+ itself sure that field is valid it can be accessed without any additional -+ locking. -+ -+ ->level is immutable. -+ -+ Invariants involving this data-type: -+ -+ [znode-fake] -+ [znode-level] -+ [znode-connected] -+ [znode-c_count] -+ [znode-refs] -+ [jnode-refs] -+ [jnode-queued] -+ [znode-modify] -+ -+ For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks. -+ Suggestions for how to do that are desired.*/ -+struct znode { -+ /* Embedded jnode. */ -+ jnode zjnode; -+ -+ /* contains three subfields, node, pos_in_node, and pos_in_unit. 
-+ -+ pos_in_node and pos_in_unit are only hints that are cached to -+ speed up lookups during balancing. They are not required to be up to -+ date. Synched in find_child_ptr(). -+ -+ This value allows us to avoid expensive binary searches. -+ -+ in_parent->node points to the parent of this node, and is NOT a -+ hint. -+ */ -+ parent_coord_t in_parent; -+ -+ /* -+ * sibling list pointers -+ */ -+ -+ /* left-neighbor */ -+ znode *left; -+ /* right-neighbor */ -+ znode *right; -+ -+ /* long term lock on node content. This lock supports deadlock -+ detection. See lock.c -+ */ -+ zlock lock; -+ -+ /* You cannot remove from memory a node that has children in -+ memory. This is because we rely on the fact that parent of given -+ node can always be reached without blocking for io. When reading a -+ node into memory you must increase the c_count of its parent, when -+ removing it from memory you must decrease the c_count. This makes -+ the code simpler, and the cases where it is suboptimal are truly -+ obscure. -+ */ -+ int c_count; -+ -+ /* plugin of node attached to this znode. NULL if znode is not -+ loaded. */ -+ node_plugin *nplug; -+ -+ /* version of znode data. This is increased on each modification. This -+ * is necessary to implement seals (see seal.[ch]) efficiently. */ -+ __u64 version; -+ -+ /* left delimiting key. Necessary to efficiently perform -+ balancing with node-level locking. Kept in memory only. */ -+ reiser4_key ld_key; -+ /* right delimiting key. */ -+ reiser4_key rd_key; -+ -+ /* znode's tree level */ -+ __u16 level; -+ /* number of items in this node. This field is modified by node -+ * plugin. 
*/ -+ __u16 nr_items; -+ -+#if REISER4_DEBUG -+ void *creator; -+ reiser4_key first_key; -+ unsigned long times_locked; -+ int left_version; /* when node->left was updated */ -+ int right_version; /* when node->right was updated */ -+ int ld_key_version; /* when node->ld_key was updated */ -+ int rd_key_version; /* when node->rd_key was updated */ -+#endif -+ -+} __attribute__ ((aligned(16))); -+ -+ON_DEBUG(extern atomic_t delim_key_version; -+ ) -+ -+/* In general I think these macros should not be exposed. */ -+#define znode_is_locked(node) (lock_is_locked(&node->lock)) -+#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock)) -+#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock)) -+#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock)) -+#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock)) -+#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode)) -+/* Macros for accessing the znode state. */ -+#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f)) -+#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f)) -+#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f)) -+extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block, -+ znode * parent, tree_level level, gfp_t gfp_flag); -+extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block); -+extern int zload(znode * node); -+extern int zload_ra(znode * node, ra_info_t * info); -+extern int zinit_new(znode * node, gfp_t gfp_flags); -+extern void zrelse(znode * node); -+extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block); -+ -+/* size of data in znode */ -+static inline unsigned -+znode_size(const znode * node UNUSED_ARG /* znode to query */ ) -+{ -+ assert("nikita-1416", node != NULL); -+ return PAGE_CACHE_SIZE; -+} -+ -+extern void parent_coord_to_coord(const parent_coord_t * pcoord, -+ coord_t * coord); -+extern void coord_to_parent_coord(const coord_t * coord, -+ parent_coord_t * pcoord); -+extern void 
init_parent_coord(parent_coord_t * pcoord, const znode * node); -+ -+extern unsigned znode_free_space(znode * node); -+ -+extern reiser4_key *znode_get_rd_key(znode * node); -+extern reiser4_key *znode_get_ld_key(znode * node); -+ -+extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key); -+extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key); -+ -+/* `connected' state checks */ -+static inline int znode_is_right_connected(const znode * node) -+{ -+ return ZF_ISSET(node, JNODE_RIGHT_CONNECTED); -+} -+ -+static inline int znode_is_left_connected(const znode * node) -+{ -+ return ZF_ISSET(node, JNODE_LEFT_CONNECTED); -+} -+ -+static inline int znode_is_connected(const znode * node) -+{ -+ return znode_is_right_connected(node) && znode_is_left_connected(node); -+} -+ -+extern int znode_shift_order; -+extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr); -+extern void znode_remove(znode *, reiser4_tree *); -+extern znode *znode_parent(const znode * node); -+extern znode *znode_parent_nolock(const znode * node); -+extern int znode_above_root(const znode * node); -+extern int init_znodes(void); -+extern void done_znodes(void); -+extern int znodes_tree_init(reiser4_tree * ztree); -+extern void znodes_tree_done(reiser4_tree * ztree); -+extern int znode_contains_key(znode * node, const reiser4_key * key); -+extern int znode_contains_key_lock(znode * node, const reiser4_key * key); -+extern unsigned znode_save_free_space(znode * node); -+extern unsigned znode_recover_free_space(znode * node); -+extern znode *zalloc(gfp_t gfp_flag); -+extern void zinit(znode *, const znode * parent, reiser4_tree *); -+extern int zparse(znode * node); -+ -+extern int znode_just_created(const znode * node); -+ -+extern void zfree(znode * node); -+ -+#if REISER4_DEBUG -+extern void print_znode(const char *prefix, const znode * node); -+#else -+#define print_znode( p, n ) noop -+#endif -+ -+/* Make it look like various znode 
functions exist instead of treating znodes as -+ jnodes in znode-specific code. */ -+#define znode_page(x) jnode_page ( ZJNODE(x) ) -+#define zdata(x) jdata ( ZJNODE(x) ) -+#define znode_get_block(x) jnode_get_block ( ZJNODE(x) ) -+#define znode_created(x) jnode_created ( ZJNODE(x) ) -+#define znode_set_created(x) jnode_set_created ( ZJNODE(x) ) -+#define znode_convertible(x) jnode_convertible (ZJNODE(x)) -+#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x)) -+ -+#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) ) -+#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) ) -+#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) ) -+#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) ) -+ -+#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) ) -+#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) ) -+#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) ) -+#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) ) -+#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) ) -+ -+#if REISER4_DEBUG -+extern int znode_x_count_is_protected(const znode * node); -+extern int znode_invariant(znode * node); -+#endif -+ -+/* acquire reference to @node */ -+static inline znode *zref(znode * node) -+{ -+ /* change of x_count from 0 to 1 is protected by tree spin-lock */ -+ return JZNODE(jref(ZJNODE(node))); -+} -+ -+/* release reference to @node */ -+static inline void zput(znode * node) -+{ -+ assert("nikita-3564", znode_invariant(node)); -+ jput(ZJNODE(node)); -+} -+ -+/* get the level field for a znode */ -+static inline tree_level znode_get_level(const znode * node) -+{ -+ return node->level; -+} -+ -+/* get the level field for a jnode */ -+static inline tree_level jnode_get_level(const jnode * node) -+{ -+ if (jnode_is_znode(node)) -+ return znode_get_level(JZNODE(node)); -+ else -+ /* unformatted nodes are all at the LEAF_LEVEL and for -+ "semi-formatted" nodes like bitmaps, level doesn't 
matter. */ -+ return LEAF_LEVEL; -+} -+ -+/* true if jnode is on leaf level */ -+static inline int jnode_is_leaf(const jnode * node) -+{ -+ if (jnode_is_znode(node)) -+ return (znode_get_level(JZNODE(node)) == LEAF_LEVEL); -+ if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK) -+ return 1; -+ return 0; -+} -+ -+/* return znode's tree */ -+static inline reiser4_tree *znode_get_tree(const znode * node) -+{ -+ assert("nikita-2692", node != NULL); -+ return jnode_get_tree(ZJNODE(node)); -+} -+ -+/* resolve race with zput */ -+static inline znode *znode_rip_check(reiser4_tree * tree, znode * node) -+{ -+ jnode *j; -+ -+ j = jnode_rip_sync(tree, ZJNODE(node)); -+ if (likely(j != NULL)) -+ node = JZNODE(j); -+ else -+ node = NULL; -+ return node; -+} -+ -+#if defined(REISER4_DEBUG) -+int znode_is_loaded(const znode * node /* znode to query */ ); -+#endif -+ -+extern __u64 znode_build_version(reiser4_tree * tree); -+ -+/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We -+ must load the data for a node in many places. We could do this by simply calling -+ zload() everywhere, the difficulty arises when we must release the loaded data by -+ calling zrelse. In a function with many possible error/return paths, it requires extra -+ work to figure out which exit paths must call zrelse and those which do not. The data -+ handle automatically calls zrelse for every zload that it is responsible for. In that -+ sense, it acts much like a lock_handle. -+*/ -+typedef struct load_count { -+ znode *node; -+ int d_ref; -+} load_count; -+ -+extern void init_load_count(load_count * lc); /* Initialize a load_count set the current node to NULL. */ -+extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */ -+extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). 
*/ -+extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as -+ * incr_load_count_znode, otherwise do nothing (unformatted nodes -+ * don't require zload/zrelse treatment). */ -+extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */ -+extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */ -+ -+/* Variable initializers for load_count. */ -+#define INIT_LOAD_COUNT ( load_count * ){ .node = NULL, .d_ref = 0 } -+#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 } -+/* A convenience macro for use in assertions or debug-only code, where loaded -+ data is only required to perform the debugging check. This macro -+ encapsulates an expression inside a pair of calls to zload()/zrelse(). */ -+#define WITH_DATA( node, exp ) \ -+({ \ -+ long __with_dh_result; \ -+ znode *__with_dh_node; \ -+ \ -+ __with_dh_node = ( node ); \ -+ __with_dh_result = zload( __with_dh_node ); \ -+ if( __with_dh_result == 0 ) { \ -+ __with_dh_result = ( long )( exp ); \ -+ zrelse( __with_dh_node ); \ -+ } \ -+ __with_dh_result; \ -+}) -+ -+/* Same as above, but accepts a return value in case zload fails. 
*/ -+#define WITH_DATA_RET( node, ret, exp ) \ -+({ \ -+ int __with_dh_result; \ -+ znode *__with_dh_node; \ -+ \ -+ __with_dh_node = ( node ); \ -+ __with_dh_result = zload( __with_dh_node ); \ -+ if( __with_dh_result == 0 ) { \ -+ __with_dh_result = ( int )( exp ); \ -+ zrelse( __with_dh_node ); \ -+ } else \ -+ __with_dh_result = ( ret ); \ -+ __with_dh_result; \ -+}) -+ -+#define WITH_COORD(coord, exp) \ -+({ \ -+ coord_t *__coord; \ -+ \ -+ __coord = (coord); \ -+ coord_clear_iplug(__coord); \ -+ WITH_DATA(__coord->node, exp); \ -+}) -+ -+#if REISER4_DEBUG -+#define STORE_COUNTERS \ -+ reiser4_lock_cnt_info __entry_counters = \ -+ *reiser4_lock_counters() -+#define CHECK_COUNTERS \ -+ON_DEBUG_CONTEXT( \ -+({ \ -+ __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \ -+ __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \ -+ __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \ -+ assert("nikita-2159", \ -+ !memcmp(&__entry_counters, reiser4_lock_counters(), \ -+ sizeof __entry_counters)); \ -+}) ) -+ -+#else -+#define STORE_COUNTERS -+#define CHECK_COUNTERS noop -+#endif -+ -+/* __ZNODE_H__ */ -+#endif -+ -+/* Make Linus happy. 
-+ Local variables: -+ c-indentation-style: "K&R" -+ mode-name: "LC" -+ c-basic-offset: 8 -+ tab-width: 8 -+ fill-column: 120 -+ End: -+*/ -diff -urN linux-2.6.24.orig/include/linux/fs.h linux-2.6.24/include/linux/fs.h ---- linux-2.6.24.orig/include/linux/fs.h 2008-01-25 14:24:20.893378532 +0300 -+++ linux-2.6.24/include/linux/fs.h 2008-01-25 11:39:07.124256117 +0300 -@@ -1256,6 +1256,8 @@ - void (*clear_inode) (struct inode *); - void (*umount_begin) (struct vfsmount *, int); - -+ void (*sync_inodes) (struct super_block *sb, -+ struct writeback_control *wbc); - int (*show_options)(struct seq_file *, struct vfsmount *); - int (*show_stats)(struct seq_file *, struct vfsmount *); - #ifdef CONFIG_QUOTA -@@ -1671,6 +1673,7 @@ - extern int invalidate_inode_pages2_range(struct address_space *mapping, - pgoff_t start, pgoff_t end); - extern int write_inode_now(struct inode *, int); -+extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *); - extern int filemap_fdatawrite(struct address_space *); - extern int filemap_flush(struct address_space *); - extern int filemap_fdatawait(struct address_space *); -diff -urN linux-2.6.24.orig/mm/filemap.c linux-2.6.24/mm/filemap.c ---- linux-2.6.24.orig/mm/filemap.c 2008-01-25 14:24:21.569552179 +0300 -+++ linux-2.6.24/mm/filemap.c 2008-01-25 11:39:07.132258178 +0300 -@@ -137,6 +137,7 @@ - dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); - } - } -+EXPORT_SYMBOL(__remove_from_page_cache); - - void remove_from_page_cache(struct page *page) - { -@@ -148,6 +149,7 @@ - __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); - } -+EXPORT_SYMBOL(remove_from_page_cache); - - static int sync_page(void *word) - { -@@ -731,6 +733,7 @@ - read_unlock_irq(&mapping->tree_lock); - return ret; - } -+EXPORT_SYMBOL(add_to_page_cache_lru); - - /** - * find_get_pages_contig - gang contiguous pagecache lookup -@@ -850,6 +853,7 @@ - - ra->ra_pages /= 4; - } -+EXPORT_SYMBOL(find_get_pages); - - /** - * 
do_generic_mapping_read - generic file read routine